diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index b89e0d1..afc9c2c 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index f985ebc..f247f45 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -812,6 +812,14 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2))); { uint8_t sel = static_cast(s >> (i * 4)); uint8_t addr = sel & 0x7; + if (sel & 0x8) + { + if (addr % 2 == 1) + { + v_perm_selector.u8x4[i] = 0x8 + addr / 2; + continue; + } + } v_perm_selector.u8x4[i] = addr; } @@ -821,9 +829,14 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2))); for (size_t i = 0; i < 4; i++) { uint8_t sel = static_cast(s >> (i * 4)); + uint8_t addr = sel & 0x7; if (sel & 0x8) { - output.u8x4[i] = (output.u8x4[i] & 0x80) * 0xff; + if (addr % 2 != 1) + { + output.u8x4[i] = (output.u8x4[i] & 0x80) * 0xff; + continue; + } } }