diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 50f9d3d..e9d602c 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 553070e..e9cf904 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -19,16 +19,34 @@ extern "C" return (uint32_t)__ockl_get_local_size(member); } - int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device)); - int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos, uint32_t len) + uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device)); + uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32) { - return __ockl_bfe_i32(base, pos, len); + uint32_t pos = pos_32 & 0xFFU; + uint32_t len = len_32 & 0xFFU; + if (pos >= 32) + return 0; + // V_BFE_U32 only uses bits [4:0] for len (max value is 31) + if (len >= 32) + return base >> pos; + len = std::min(len, 31U); + return __ockl_bfe_u32(base, pos, len); } - uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device)); - uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos, uint32_t len) + int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device)); + int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32) { - return __ockl_bfe_u32(base, pos, len); + uint32_t pos = pos_32 & 0xFFU; + uint32_t len = len_32 & 0xFFU; + if (len == 0) + return 0; + if (pos >= 32) + return (base >> 31); + // V_BFE_I32 only uses bits [4:0] for len (max value is 31) + if (len >= 32) + return base >> pos; + len = std::min(len, 31U); + return __ockl_bfe_i32(base, pos, len); } // LLVM contains mentions of llvm.amdgcn.ubfe.i64 and llvm.amdgcn.sbfe.i64, diff --git a/ptx/src/test/spirv_run/bfe.ptx b/ptx/src/test/spirv_run/bfe.ptx index a01a14a..60ee8a6 100644 --- a/ptx/src/test/spirv_run/bfe.ptx +++ b/ptx/src/test/spirv_run/bfe.ptx @@ -10,28 +10,14 @@ .reg .u64 in_addr; .reg .u64 out_addr; .reg .u32 temp<3>; - .reg .b32 result<2>; - .reg .b64 temp64_0; - .reg .b32 temp64_1; - .reg .b32 temp64_2; - .reg .b64 result64_<2>; ld.param.u64 in_addr, [input]; ld.param.u64 out_addr, [output]; - ld.b64 temp64_0, [in_addr]; - ld.b32 temp64_1, [in_addr+8]; - ld.b32 temp64_2, [in_addr+16]; - ld.u32 temp0, [in_addr+24]; - ld.u32 temp1, [in_addr+28]; - ld.u32 temp2, [in_addr+32]; - //bfe.u64 result64_0, temp64_0, temp64_1, temp64_2; - bfe.s64 result64_1, temp64_0, temp64_1, temp64_2; - bfe.u32 result0, temp0, temp1, temp2; - bfe.s32 result1, temp0, temp1, temp2; - st.b64 [out_addr], result64_0; - st.b64 [out_addr], result64_1; - st.b32 [out_addr], result0; - st.b32 [out_addr], result1; + ld.u32 temp0, [in_addr]; + ld.u32 temp1, [in_addr+4]; + ld.u32 temp2, [in_addr+8]; + bfe.u32 temp0, temp0, temp1, temp2; + st.u32 [out_addr], temp0; ret; }