diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index bc375c3..0eef0b4 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 6174ec1..58e8a44 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -780,4 +780,9 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2))); return d_out; } + + uint32_t FUNC(prmt_b32)(uint32_t x, uint32_t y, uint32_t s) + { + return __byte_perm(x, y, s); + } } diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 144f5e6..d9ec666 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -511,9 +511,6 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments), ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments), ast::Instruction::BarWarp { .. } => self.emit_bar_warp(), - ast::Instruction::PrmtSlow { .. } => { - Err(error_todo_msg("PrmtSlow is not implemented yet")) - } ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments), ast::Instruction::Membar { data } => self.emit_membar(data), ast::Instruction::Trap {} => self.emit_trap(), @@ -534,7 +531,8 @@ impl<'a> MethodEmitContext<'a> { | ast::Instruction::Nanosleep { .. } | ast::Instruction::ReduxSync { .. } | ast::Instruction::LdMatrix { .. } - | ast::Instruction::Mma { .. } => return Err(error_unreachable()), + | ast::Instruction::Mma { .. } + | ast::Instruction::PrmtSlow { .. } => return Err(error_unreachable()), } } diff --git a/ptx/src/pass/replace_instructions_with_functions.rs b/ptx/src/pass/replace_instructions_with_functions.rs index f7c976e..2304538 100644 --- a/ptx/src/pass/replace_instructions_with_functions.rs +++ b/ptx/src/pass/replace_instructions_with_functions.rs @@ -519,6 +519,9 @@ fn run_instruction<'input>( i, )? } + i @ ptx_parser::Instruction::PrmtSlow { .. } => { + to_call(resolver, fn_declarations, "prmt_b32".into(), i)? + } i => i, }) } diff --git a/ptx/src/test/ll/prmt_slow.ll b/ptx/src/test/ll/prmt_slow.ll new file mode 100644 index 0000000..3943afa --- /dev/null +++ b/ptx/src/test/ll/prmt_slow.ll @@ -0,0 +1,46 @@ +declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0 + +define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"38" + +"38": ; preds = %1 + %"46" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"46", ptr addrspace(5) %"41", align 8 + %"47" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"47", ptr addrspace(5) %"42", align 8 + %"49" = load i64, ptr addrspace(5) %"41", align 8 + %"60" = inttoptr i64 %"49" to ptr + %"48" = load i32, ptr %"60", align 4 + store i32 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 8 + %"61" = inttoptr i64 %"50" to ptr + %"35" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load i32, ptr %"35", align 4 + store i32 %"51", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 8 + %"62" = inttoptr i64 %"52" to ptr + %"37" = getelementptr inbounds i8, ptr %"62", i64 8 + %"53" = load i32, ptr %"37", align 4 + store i32 %"53", ptr addrspace(5) %"45", align 4 + %"55" = load i32, ptr addrspace(5) %"43", align 4 + %"56" = load i32, ptr addrspace(5) %"44", align 4 + %"57" = load i32, ptr addrspace(5) %"45", align 4 + %"54" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57") + store i32 %"54", ptr addrspace(5) %"44", align 4 + %"58" = load i64, ptr addrspace(5) %"42", align 8 + %"59" = load i32, ptr addrspace(5) %"44", align 4 + %"63" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"63", align 4 + ret void +} + +attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 46bdd0b..5d2fc85 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -274,6 +274,11 @@ test_ptx!(const_ident, [0u16], [0u64, 0u64]); test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]); test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]); test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]); +test_ptx!( + prmt_slow, + [0x70c507d6u32, 0x6fbd4b5cu32, 30212], + [0x6fbdd65cu32] +); test_ptx!(activemask, [0u32], [1u32]); test_ptx!(membar, [152731u32], [152731u32]); test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]); diff --git a/ptx/src/test/spirv_run/prmt_slow.ptx b/ptx/src/test/spirv_run/prmt_slow.ptx new file mode 100644 index 0000000..08668ae --- /dev/null +++ b/ptx/src/test/spirv_run/prmt_slow.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry prmt_slow( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 temp1; + .reg .u32 temp2; + .reg .u32 temp3; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp1, [in_addr]; + ld.u32 temp2, [in_addr+4]; + ld.u32 temp3, [in_addr+8]; + prmt.b32 temp2, temp1, temp2, temp3; + st.u32 [out_addr], temp2; + ret; +}