diff --git a/ptx/src/test/ll/vote_all_sub.ll b/ptx/src/test/ll/vote_all_sub.ll new file mode 100644 index 0000000..5b7007d --- /dev/null +++ b/ptx/src/test/ll/vote_all_sub.ll @@ -0,0 +1,65 @@ +declare hidden i1 @__zluda_ptx_impl_vote_sync_all_pred(i1, i32) #0 + +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare hidden i32 @__zluda_ptx_impl_sreg_laneid() #0 + +define amdgpu_kernel void @vote_all_sub(ptr addrspace(4) byref(i64) %"53") #1 { + %"54" = alloca i32, align 4, addrspace(5) + %"55" = alloca i32, align 4, addrspace(5) + %"56" = alloca i1, align 1, addrspace(5) + %"57" = alloca i1, align 1, addrspace(5) + %"58" = alloca i32, align 4, addrspace(5) + %"59" = alloca i64, align 8, addrspace(5) + %"70" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"48" + +"48": ; preds = %1 + %"60" = load i64, ptr addrspace(4) %"53", align 8 + store i64 %"60", ptr addrspace(5) %"59", align 8 + %"38" = call i32 @__zluda_ptx_impl_sreg_laneid() + br label %"49" + +"49": ; preds = %"48" + store i32 %"38", ptr addrspace(5) %"54", align 4 + %"40" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + br label %"50" + +"50": ; preds = %"49" + store i32 %"40", ptr addrspace(5) %"55", align 4 + %"64" = load i32, ptr addrspace(5) %"54", align 4 + %2 = icmp eq i32 %"64", 0 + store i1 %2, ptr addrspace(5) %"56", align 1 + store i1 false, ptr addrspace(5) %"57", align 1 + %"66" = load i1, ptr addrspace(5) %"56", align 1 + br i1 %"66", label %"10", label %"19" + +"19": ; preds = %"50" + %"67" = call i1 @__zluda_ptx_impl_vote_sync_all_pred(i1 true, i32 -1) + store i1 %"67", ptr addrspace(5) %"57", align 1 + br label %"10" + +"10": ; preds = %"19", %"50" + %"69" = load i1, ptr addrspace(5) %"57", align 1 + %"68" = select i1 %"69", i32 1, i32 0 + store i32 %"68", ptr addrspace(5) %"58", align 4 + %"72" = load i32, ptr addrspace(5) %"55", align 4 + %3 = zext i32 %"72" to i64 + %"71" = mul i64 %3, 4 + store i64 %"71", ptr addrspace(5) %"70", align 8 + %"74" = load i64, ptr addrspace(5) %"59", align 8 + %"75" = load i64, ptr addrspace(5) %"70", align 8 + %"73" = add i64 %"74", %"75" + store i64 %"73", ptr addrspace(5) %"59", align 8 + %"76" = load i64, ptr addrspace(5) %"59", align 8 + %"77" = load i32, ptr addrspace(5) %"58", align 4 + %"78" = inttoptr i64 %"76" to ptr + store i32 %"77", ptr %"78", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 106a792..f631c95 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -409,6 +409,14 @@ test_ptx_warp!( 1, 1, 1, 1, 1 ] ); +test_ptx_warp!( + vote_all_sub, + [ + 0u32, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + ] +); test_ptx_warp!( vote_any, [ diff --git a/ptx/src/test/spirv_run/vote_all_sub.ptx b/ptx/src/test/spirv_run/vote_all_sub.ptx new file mode 100644 index 0000000..70dd56c --- /dev/null +++ b/ptx/src/test/spirv_run/vote_all_sub.ptx @@ -0,0 +1,35 @@ +.version 7.0 +.target sm_70 +.address_size 64 + +.visible .entry vote_all_sub( + .param .u64 output +) +{ + .reg .u32 laneid; + .reg .u32 tid; + .reg .pred first_lane; + .reg .pred result_pred; + .reg .u32 result; + .reg .u64 out_ptr; + + ld.param.u64 out_ptr, [output]; + + mov.u32 laneid, %laneid; + mov.u32 tid, %tid.x; + setp.eq.u32 first_lane, laneid, 0; + + mov.pred result_pred, 0; + + @first_lane bra EXIT; + // IMPORTANT: it is legal for membermask to be bigger than the execution mask + vote.sync.all.pred result_pred, 1, 0xFFFFFFFF; +EXIT: + selp.u32 result, 1, 0, result_pred; + + .reg .u64 out_offset; + mul.wide.u32 out_offset, tid, 4; + add.u64 out_ptr, out_ptr, out_offset; + st.u32 [out_ptr], result; + ret; +} \ No newline at end of file