From 36407dcc3a2a494ea049f8e0a1e8dd748a385bb4 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Mon, 24 Feb 2025 01:08:03 +0000 Subject: [PATCH] Update tests --- ptx/src/test/ll/activemask.ll | 28 +- ptx/src/test/ll/add.ll | 34 +-- ptx/src/test/ll/add_ftz.ll | 65 +++++ ptx/src/test/ll/add_non_coherent.ll | 34 +-- ptx/src/test/ll/add_tuning.ll | 34 +-- ptx/src/test/ll/and.ll | 46 ++-- ptx/src/test/ll/atom_add.ll | 64 ++--- ptx/src/test/ll/atom_add_float.ll | 64 ++--- ptx/src/test/ll/atom_cas.ll | 64 ++--- ptx/src/test/ll/atom_inc.ll | 64 ++--- ptx/src/test/ll/b64tof64.ll | 38 +-- ptx/src/test/ll/bfe.ll | 56 ++-- ptx/src/test/ll/bfi.ll | 66 ++--- ptx/src/test/ll/block.ll | 42 +-- ptx/src/test/ll/bra.ll | 52 ++-- ptx/src/test/ll/brev.ll | 42 +-- ptx/src/test/ll/call.ll | 71 +++--- ptx/src/test/ll/clz.ll | 42 +-- ptx/src/test/ll/const.ll | 64 ++--- ptx/src/test/ll/constant_f32.ll | 38 +-- ptx/src/test/ll/constant_negative.ll | 38 +-- ptx/src/test/ll/cos.ll | 42 +-- ptx/src/test/ll/cvt_f64_f32.ll | 40 +-- ptx/src/test/ll/cvt_rni.ll | 70 ++--- ptx/src/test/ll/cvt_rzi.ll | 70 ++--- ptx/src/test/ll/cvt_s16_s8.ll | 42 +-- ptx/src/test/ll/cvt_s32_f32.ll | 78 +++--- ptx/src/test/ll/cvt_s64_s32.ll | 40 +-- ptx/src/test/ll/cvt_sat_s_u.ll | 48 ++-- ptx/src/test/ll/cvta.ll | 44 ++-- ptx/src/test/ll/div_approx.ll | 46 ++-- ptx/src/test/ll/ex2.ll | 42 +-- ptx/src/test/ll/extern_shared.ll | 34 +-- ptx/src/test/ll/extern_shared_call.ll | 57 +++-- ptx/src/test/ll/fma.ll | 60 +++-- ptx/src/test/ll/global_array.ll | 30 ++- ptx/src/test/ll/ld_st.ll | 24 +- ptx/src/test/ll/ld_st_implicit.ll | 36 +-- ptx/src/test/ll/ld_st_offset.ll | 50 ++-- ptx/src/test/ll/lg2.ll | 42 +-- ptx/src/test/ll/local_align.ll | 26 +- ptx/src/test/ll/mad_s32.ll | 78 +++--- ptx/src/test/ll/malformed_label.ll | 53 ++++ ptx/src/test/ll/max.ll | 50 ++-- ptx/src/test/ll/membar.ll | 30 ++- ptx/src/test/ll/min.ll | 50 ++-- ptx/src/test/ll/mov.ll | 32 ++- ptx/src/test/ll/mov_address.ll | 16 +- ptx/src/test/ll/mul24.ll | 42 +-- ptx/src/test/ll/mul_ftz.ll | 46 ++-- ptx/src/test/ll/mul_hi.ll | 36 +-- ptx/src/test/ll/mul_lo.ll | 34 +-- ptx/src/test/ll/mul_non_ftz.ll | 46 ++-- ptx/src/test/ll/mul_wide.ll | 52 ++-- ptx/src/test/ll/neg.ll | 38 +-- ptx/src/test/ll/non_scalar_ptr_offset.ll | 44 ++-- ptx/src/test/ll/not.ll | 34 +-- ptx/src/test/ll/ntid.ll | 42 +-- ptx/src/test/ll/or.ll | 40 +-- ptx/src/test/ll/popc.ll | 42 +-- ptx/src/test/ll/pred_not.ll | 74 +++--- ptx/src/test/ll/prmt.ll | 50 ++-- ptx/src/test/ll/rcp.ll | 42 +-- ptx/src/test/ll/reg_local.ll | 48 ++-- ptx/src/test/ll/rem.ll | 46 ++-- ptx/src/test/ll/rsqrt.ll | 49 ++-- ptx/src/test/ll/selp.ll | 46 ++-- ptx/src/test/ll/selp_true.ll | 46 ++-- ptx/src/test/ll/setp.ll | 68 ++--- ptx/src/test/ll/setp_gt.ll | 76 +++--- ptx/src/test/ll/setp_leu.ll | 76 +++--- ptx/src/test/ll/setp_nan.ll | 254 ++++++++++--------- ptx/src/test/ll/setp_num.ll | 254 ++++++++++--------- ptx/src/test/ll/shared_ptr_32.ll | 54 ++-- ptx/src/test/ll/shared_ptr_take_address.ll | 46 ++-- ptx/src/test/ll/shared_unify_extern.ll | 82 +++--- ptx/src/test/ll/shared_unify_local.ll | 74 +++--- ptx/src/test/ll/shared_variable.ll | 36 +-- ptx/src/test/ll/shl.ll | 36 +-- ptx/src/test/ll/shr.ll | 40 +-- ptx/src/test/ll/sign_extend.ll | 30 ++- ptx/src/test/ll/sin.ll | 42 +-- ptx/src/test/ll/sqrt.ll | 42 +-- ptx/src/test/ll/stateful_ld_st_ntid.ll | 72 +++--- ptx/src/test/ll/stateful_ld_st_ntid_chain.ll | 76 +++--- ptx/src/test/ll/stateful_ld_st_ntid_sub.ll | 82 +++--- ptx/src/test/ll/stateful_ld_st_simple.ll | 42 +-- ptx/src/test/ll/stateful_neg_offset.ll | 54 ++-- ptx/src/test/ll/sub.ll | 34 +-- ptx/src/test/ll/vector.ll | 101 ++++---- ptx/src/test/ll/vector4.ll | 40 +-- ptx/src/test/ll/vector_extract.ll | 106 ++++---- ptx/src/test/ll/xor.ll | 46 ++-- 93 files changed, 2772 insertions(+), 2264 deletions(-) create mode 100644 ptx/src/test/ll/add_ftz.ll create mode 100644 ptx/src/test/ll/malformed_label.ll diff --git a/ptx/src/test/ll/activemask.ll b/ptx/src/test/ll/activemask.ll index a54bc7b..d3a37ba 100644 --- a/ptx/src/test/ll/activemask.ll +++ b/ptx/src/test/ll/activemask.ll @@ -12,21 +12,25 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { - %"35" = alloca i64, align 8, addrspace(5) - %"36" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #1 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"37" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"37", ptr addrspace(5) %"35", align 4 - %"38" = call i32 @__zluda_ptx_impl_activemask() - store i32 %"38", ptr addrspace(5) %"36", align 4 - %"39" = load i64, ptr addrspace(5) %"35", align 4 - %"40" = load i32, ptr addrspace(5) %"36", align 4 - %"41" = inttoptr i64 %"39" to ptr - store i32 %"40", ptr %"41", align 4 + br label %"44" + +"44": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"38", ptr addrspace(5) %"36", align 4 + %"39" = call i32 @__zluda_ptx_impl_activemask() + store i32 %"39", ptr addrspace(5) %"37", align 4 + %"40" = load i64, ptr addrspace(5) %"36", align 4 + %"41" = load i32, ptr addrspace(5) %"37", align 4 + %"42" = inttoptr i64 %"40" to ptr + store i32 %"41", ptr %"42", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/add.ll b/ptx/src/test/ll/add.ll index d8807e0..f213d12 100644 --- a/ptx/src/test/ll/add.ll +++ b/ptx/src/test/ll/add.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = add i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = add i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/add_ftz.ll b/ptx/src/test/ll/add_ftz.ll new file mode 100644 index 0000000..25234b4 --- /dev/null +++ b/ptx/src/test/ll/add_ftz.ll @@ -0,0 +1,65 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #1 { + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca float, align 4, addrspace(5) + %"47" = alloca float, align 4, addrspace(5) + %"48" = alloca float, align 4, addrspace(5) + %"49" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"70" + +"70": ; preds = %1 + %"50" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"50", ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(5) %"44", align 4 + %"66" = inttoptr i64 %"53" to ptr + %"52" = load float, ptr %"66", align 4 + store float %"52", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"44", align 4 + %"67" = inttoptr i64 %"54" to ptr + %"33" = getelementptr inbounds i8, ptr %"67", i64 4 + %"55" = load float, ptr %"33", align 4 + store float %"55", ptr addrspace(5) %"47", align 4 + %"57" = load float, ptr addrspace(5) %"46", align 4 + %"58" = load float, ptr addrspace(5) %"47", align 4 + %"56" = fadd float %"57", %"58" + store float %"56", ptr addrspace(5) %"48", align 4 + %"60" = load float, ptr addrspace(5) %"46", align 4 + %"61" = load float, ptr addrspace(5) %"47", align 4 + call void @llvm.amdgcn.s.setreg(i32 2305, i32 3) + %"59" = fadd float %"60", %"61" + store float %"59", ptr addrspace(5) %"49", align 4 + %"62" = load i64, ptr addrspace(5) %"45", align 4 + %"63" = load float, ptr addrspace(5) %"48", align 4 + %"68" = inttoptr i64 %"62" to ptr + store float %"63", ptr %"68", align 4 + %"64" = load i64, ptr addrspace(5) %"45", align 4 + %"69" = inttoptr i64 %"64" to ptr + %"35" = getelementptr inbounds i8, ptr %"69", i64 4 + %"65" = load float, ptr addrspace(5) %"49", align 4 + store float %"65", ptr %"35", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind willreturn } \ No newline at end of file diff --git a/ptx/src/test/ll/add_non_coherent.ll b/ptx/src/test/ll/add_non_coherent.ll index 668031d..9c18c06 100644 --- a/ptx/src/test/ll/add_non_coherent.ll +++ b/ptx/src/test/ll/add_non_coherent.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr addrspace(1) - %"44" = load i64, ptr addrspace(1) %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = add i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr addrspace(1) - store i64 %"49", ptr addrspace(1) %"51", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load i64, ptr addrspace(1) %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = add i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr addrspace(1) + store i64 %"50", ptr addrspace(1) %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/add_tuning.ll b/ptx/src/test/ll/add_tuning.ll index 0ef4636..94ca5bc 100644 --- a/ptx/src/test/ll/add_tuning.ll +++ b/ptx/src/test/ll/add_tuning.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = add i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = add i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/and.ll b/ptx/src/test/ll/and.ll index f13e3a7..55418d8 100644 --- a/ptx/src/test/ll/and.ll +++ b/ptx/src/test/ll/and.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"61" + +"61": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"46" = load i32, ptr %"55", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load i32, ptr %"31", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"56" = and i32 %"50", %"51" - store i32 %"56", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"59", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %"57" = and i32 %"51", %"52" + store i32 %"57", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"60" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"60", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_add.ll b/ptx/src/test/ll/atom_add.ll index b646974..615f521 100644 --- a/ptx/src/test/ll/atom_add.ll +++ b/ptx/src/test/ll/atom_add.ll @@ -12,44 +12,48 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca i32, align 4, addrspace(5) + %"46" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 + br label %"68" + +"68": ; preds = %1 %"47" = load i64, ptr addrspace(4) %"41", align 4 store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"60" = inttoptr i64 %"49" to ptr - %"48" = load i32, ptr %"60", align 4 - store i32 %"48", ptr addrspace(5) %"44", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"43", align 4 %"61" = inttoptr i64 %"50" to ptr - %"31" = getelementptr inbounds i8, ptr %"61", i64 4 - %"51" = load i32, ptr %"31", align 4 - store i32 %"51", ptr addrspace(5) %"45", align 4 - %"52" = load i32, ptr addrspace(5) %"44", align 4 - store i32 %"52", ptr addrspace(3) @shared_mem, align 4 - %"54" = load i32, ptr addrspace(5) %"45", align 4 - %2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"54" syncscope("agent-one-as") monotonic, align 4 - store i32 %2, ptr addrspace(5) %"44", align 4 - %"55" = load i32, ptr addrspace(3) @shared_mem, align 4 - store i32 %"55", ptr addrspace(5) %"45", align 4 - %"56" = load i64, ptr addrspace(5) %"43", align 4 - %"57" = load i32, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"56" to ptr - store i32 %"57", ptr %"65", align 4 - %"58" = load i64, ptr addrspace(5) %"43", align 4 - %"66" = inttoptr i64 %"58" to ptr - %"33" = getelementptr inbounds i8, ptr %"66", i64 4 - %"59" = load i32, ptr addrspace(5) %"45", align 4 - store i32 %"59", ptr %"33", align 4 + %"49" = load i32, ptr %"61", align 4 + store i32 %"49", ptr addrspace(5) %"45", align 4 + %"51" = load i64, ptr addrspace(5) %"43", align 4 + %"62" = inttoptr i64 %"51" to ptr + %"32" = getelementptr inbounds i8, ptr %"62", i64 4 + %"52" = load i32, ptr %"32", align 4 + store i32 %"52", ptr addrspace(5) %"46", align 4 + %"53" = load i32, ptr addrspace(5) %"45", align 4 + store i32 %"53", ptr addrspace(3) @shared_mem, align 4 + %"55" = load i32, ptr addrspace(5) %"46", align 4 + %2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"55" syncscope("agent-one-as") monotonic, align 4 + store i32 %2, ptr addrspace(5) %"45", align 4 + %"56" = load i32, ptr addrspace(3) @shared_mem, align 4 + store i32 %"56", ptr addrspace(5) %"46", align 4 + %"57" = load i64, ptr addrspace(5) %"44", align 4 + %"58" = load i32, ptr addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"57" to ptr + store i32 %"58", ptr %"66", align 4 + %"59" = load i64, ptr addrspace(5) %"44", align 4 + %"67" = inttoptr i64 %"59" to ptr + %"34" = getelementptr inbounds i8, ptr %"67", i64 4 + %"60" = load i32, ptr addrspace(5) %"46", align 4 + store i32 %"60", ptr %"34", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_add_float.ll b/ptx/src/test/ll/atom_add_float.ll index 33265a4..ad83a5a 100644 --- a/ptx/src/test/ll/atom_add_float.ll +++ b/ptx/src/test/ll/atom_add_float.ll @@ -12,44 +12,48 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca float, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca float, align 4, addrspace(5) + %"46" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 + br label %"68" + +"68": ; preds = %1 %"47" = load i64, ptr addrspace(4) %"41", align 4 store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"60" = inttoptr i64 %"49" to ptr - %"48" = load float, ptr %"60", align 4 - store float %"48", ptr addrspace(5) %"44", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"43", align 4 %"61" = inttoptr i64 %"50" to ptr - %"31" = getelementptr inbounds i8, ptr %"61", i64 4 - %"51" = load float, ptr %"31", align 4 - store float %"51", ptr addrspace(5) %"45", align 4 - %"52" = load float, ptr addrspace(5) %"44", align 4 - store float %"52", ptr addrspace(3) @shared_mem, align 4 - %"54" = load float, ptr addrspace(5) %"45", align 4 - %2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"54" syncscope("agent-one-as") monotonic, align 4 - store float %2, ptr addrspace(5) %"44", align 4 - %"55" = load float, ptr addrspace(3) @shared_mem, align 4 - store float %"55", ptr addrspace(5) %"45", align 4 - %"56" = load i64, ptr addrspace(5) %"43", align 4 - %"57" = load float, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"56" to ptr - store float %"57", ptr %"65", align 4 - %"58" = load i64, ptr addrspace(5) %"43", align 4 - %"66" = inttoptr i64 %"58" to ptr - %"33" = getelementptr inbounds i8, ptr %"66", i64 4 - %"59" = load float, ptr addrspace(5) %"45", align 4 - store float %"59", ptr %"33", align 4 + %"49" = load float, ptr %"61", align 4 + store float %"49", ptr addrspace(5) %"45", align 4 + %"51" = load i64, ptr addrspace(5) %"43", align 4 + %"62" = inttoptr i64 %"51" to ptr + %"32" = getelementptr inbounds i8, ptr %"62", i64 4 + %"52" = load float, ptr %"32", align 4 + store float %"52", ptr addrspace(5) %"46", align 4 + %"53" = load float, ptr addrspace(5) %"45", align 4 + store float %"53", ptr addrspace(3) @shared_mem, align 4 + %"55" = load float, ptr addrspace(5) %"46", align 4 + %2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"55" syncscope("agent-one-as") monotonic, align 4 + store float %2, ptr addrspace(5) %"45", align 4 + %"56" = load float, ptr addrspace(3) @shared_mem, align 4 + store float %"56", ptr addrspace(5) %"46", align 4 + %"57" = load i64, ptr addrspace(5) %"44", align 4 + %"58" = load float, ptr addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"57" to ptr + store float %"58", ptr %"66", align 4 + %"59" = load i64, ptr addrspace(5) %"44", align 4 + %"67" = inttoptr i64 %"59" to ptr + %"34" = getelementptr inbounds i8, ptr %"67", i64 4 + %"60" = load float, ptr addrspace(5) %"46", align 4 + store float %"60", ptr %"34", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_cas.ll b/ptx/src/test/ll/atom_cas.ll index 644d0cd..dee4c10 100644 --- a/ptx/src/test/ll/atom_cas.ll +++ b/ptx/src/test/ll/atom_cas.ll @@ -10,44 +10,48 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { - %"44" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #1 { %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i32, align 4, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i32, align 4, addrspace(5) + %"48" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"48" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"48", ptr addrspace(5) %"44", align 4 + br label %"69" + +"69": ; preds = %1 %"49" = load i64, ptr addrspace(4) %"43", align 4 store i64 %"49", ptr addrspace(5) %"45", align 4 - %"51" = load i64, ptr addrspace(5) %"44", align 4 - %"61" = inttoptr i64 %"51" to ptr - %"50" = load i32, ptr %"61", align 4 - store i32 %"50", ptr addrspace(5) %"46", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"50", ptr addrspace(5) %"46", align 4 + %"52" = load i64, ptr addrspace(5) %"45", align 4 %"62" = inttoptr i64 %"52" to ptr - %"30" = getelementptr inbounds i8, ptr %"62", i64 4 - %"54" = load i32, ptr addrspace(5) %"46", align 4 - %2 = cmpxchg ptr %"30", i32 %"54", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 - %"63" = extractvalue { i32, i1 } %2, 0 - store i32 %"63", ptr addrspace(5) %"46", align 4 - %"55" = load i64, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"55" to ptr - %"33" = getelementptr inbounds i8, ptr %"65", i64 4 - %"56" = load i32, ptr %"33", align 4 - store i32 %"56", ptr addrspace(5) %"47", align 4 - %"57" = load i64, ptr addrspace(5) %"45", align 4 - %"58" = load i32, ptr addrspace(5) %"46", align 4 - %"66" = inttoptr i64 %"57" to ptr - store i32 %"58", ptr %"66", align 4 - %"59" = load i64, ptr addrspace(5) %"45", align 4 - %"67" = inttoptr i64 %"59" to ptr - %"35" = getelementptr inbounds i8, ptr %"67", i64 4 - %"60" = load i32, ptr addrspace(5) %"47", align 4 - store i32 %"60", ptr %"35", align 4 + %"51" = load i32, ptr %"62", align 4 + store i32 %"51", ptr addrspace(5) %"47", align 4 + %"53" = load i64, ptr addrspace(5) %"45", align 4 + %"63" = inttoptr i64 %"53" to ptr + %"31" = getelementptr inbounds i8, ptr %"63", i64 4 + %"55" = load i32, ptr addrspace(5) %"47", align 4 + %2 = cmpxchg ptr %"31", i32 %"55", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 + %"64" = extractvalue { i32, i1 } %2, 0 + store i32 %"64", ptr addrspace(5) %"47", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"56" to ptr + %"34" = getelementptr inbounds i8, ptr %"66", i64 4 + %"57" = load i32, ptr %"34", align 4 + store i32 %"57", ptr addrspace(5) %"48", align 4 + %"58" = load i64, ptr addrspace(5) %"46", align 4 + %"59" = load i32, ptr addrspace(5) %"47", align 4 + %"67" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"67", align 4 + %"60" = load i64, ptr addrspace(5) %"46", align 4 + %"68" = inttoptr i64 %"60" to ptr + %"36" = getelementptr inbounds i8, ptr %"68", i64 4 + %"61" = load i32, ptr addrspace(5) %"48", align 4 + store i32 %"61", ptr %"36", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_inc.ll b/ptx/src/test/ll/atom_inc.ll index 88ba124..703d820 100644 --- a/ptx/src/test/ll/atom_inc.ll +++ b/ptx/src/test/ll/atom_inc.ll @@ -10,46 +10,50 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { - %"44" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #1 { %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i32, align 4, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i32, align 4, addrspace(5) %"48" = alloca i32, align 4, addrspace(5) + %"49" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"49" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"49", ptr addrspace(5) %"44", align 4 + br label %"70" + +"70": ; preds = %1 %"50" = load i64, ptr addrspace(4) %"43", align 4 store i64 %"50", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"63" = inttoptr i64 %"52" to ptr - %2 = atomicrmw uinc_wrap ptr %"63", i32 101 syncscope("agent-one-as") monotonic, align 4 - store i32 %2, ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"44", align 4 - %"64" = inttoptr i64 %"54" to ptr addrspace(1) - %3 = atomicrmw uinc_wrap ptr addrspace(1) %"64", i32 101 syncscope("agent-one-as") monotonic, align 4 - store i32 %3, ptr addrspace(5) %"47", align 4 - %"56" = load i64, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"56" to ptr - %"55" = load i32, ptr %"65", align 4 - store i32 %"55", ptr addrspace(5) %"48", align 4 + %"51" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"51", ptr addrspace(5) %"46", align 4 + %"53" = load i64, ptr addrspace(5) %"45", align 4 + %"64" = inttoptr i64 %"53" to ptr + %2 = atomicrmw uinc_wrap ptr %"64", i32 101 syncscope("agent-one-as") monotonic, align 4 + store i32 %2, ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"65" = inttoptr i64 %"55" to ptr addrspace(1) + %3 = atomicrmw uinc_wrap ptr addrspace(1) %"65", i32 101 syncscope("agent-one-as") monotonic, align 4 + store i32 %3, ptr addrspace(5) %"48", align 4 %"57" = load i64, ptr addrspace(5) %"45", align 4 - %"58" = load i32, ptr addrspace(5) %"46", align 4 %"66" = inttoptr i64 %"57" to ptr - store i32 %"58", ptr %"66", align 4 - %"59" = load i64, ptr addrspace(5) %"45", align 4 - %"67" = inttoptr i64 %"59" to ptr - %"33" = getelementptr inbounds i8, ptr %"67", i64 4 - %"60" = load i32, ptr addrspace(5) %"47", align 4 - store i32 %"60", ptr %"33", align 4 - %"61" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"61" to ptr - %"35" = getelementptr inbounds i8, ptr %"68", i64 8 - %"62" = load i32, ptr addrspace(5) %"48", align 4 - store i32 %"62", ptr %"35", align 4 + %"56" = load i32, ptr %"66", align 4 + store i32 %"56", ptr addrspace(5) %"49", align 4 + %"58" = load i64, ptr addrspace(5) %"46", align 4 + %"59" = load i32, ptr addrspace(5) %"47", align 4 + %"67" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"67", align 4 + %"60" = load i64, ptr addrspace(5) %"46", align 4 + %"68" = inttoptr i64 %"60" to ptr + %"34" = getelementptr inbounds i8, ptr %"68", i64 4 + %"61" = load i32, ptr addrspace(5) %"48", align 4 + store i32 %"61", ptr %"34", align 4 + %"62" = load i64, ptr addrspace(5) %"46", align 4 + %"69" = inttoptr i64 %"62" to ptr + %"36" = getelementptr inbounds i8, ptr %"69", i64 8 + %"63" = load i32, ptr addrspace(5) %"49", align 4 + store i32 %"63", ptr %"36", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/b64tof64.ll b/ptx/src/test/ll/b64tof64.ll index 2373b64..e9d567c 100644 --- a/ptx/src/test/ll/b64tof64.ll +++ b/ptx/src/test/ll/b64tof64.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca double, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { + %"38" = alloca double, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load double, ptr addrspace(4) %"35", align 8 - store double %"41", ptr addrspace(5) %"37", align 8 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load double, ptr addrspace(5) %"37", align 8 - %"50" = bitcast double %"44" to i64 - store i64 %"50", ptr addrspace(5) %"38", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"51" = inttoptr i64 %"46" to ptr - %"45" = load i64, ptr %"51", align 4 - store i64 %"45", ptr addrspace(5) %"40", align 4 + br label %"54" + +"54": ; preds = %1 + %"42" = load double, ptr addrspace(4) %"36", align 8 + store double %"42", ptr addrspace(5) %"38", align 8 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"40", align 4 + %"45" = load double, ptr addrspace(5) %"38", align 8 + %"51" = bitcast double %"45" to i64 + store i64 %"51", ptr addrspace(5) %"39", align 4 %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 %"52" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"52", align 4 + %"46" = load i64, ptr %"52", align 4 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"53", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bfe.ll b/ptx/src/test/ll/bfe.ll index fda252d..8659972 100644 --- a/ptx/src/test/ll/bfe.ll +++ b/ptx/src/test/ll/bfe.ll @@ -12,43 +12,47 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca i32, align 4, addrspace(5) %"46" = alloca i32, align 4, addrspace(5) + %"47" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"47" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 + br label %"66" + +"66": ; preds = %1 %"48" = load i64, ptr addrspace(4) %"41", align 4 store i64 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"61" = inttoptr i64 %"50" to ptr - %"49" = load i32, ptr %"61", align 4 - store i32 %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"49", ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(5) %"43", align 4 %"62" = inttoptr i64 %"51" to ptr - %"31" = getelementptr inbounds i8, ptr %"62", i64 4 - %"52" = load i32, ptr %"31", align 4 - store i32 %"52", ptr addrspace(5) %"45", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"63" = inttoptr i64 %"53" to ptr - %"33" = getelementptr inbounds i8, ptr %"63", i64 8 - %"54" = load i32, ptr %"33", align 4 - store i32 %"54", ptr addrspace(5) %"46", align 4 - %"56" = load i32, ptr addrspace(5) %"44", align 4 + %"50" = load i32, ptr %"62", align 4 + store i32 %"50", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"63" = inttoptr i64 %"52" to ptr + %"32" = getelementptr inbounds i8, ptr %"63", i64 4 + %"53" = load i32, ptr %"32", align 4 + store i32 %"53", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"64" = inttoptr i64 %"54" to ptr + %"34" = getelementptr inbounds i8, ptr %"64", i64 8 + %"55" = load i32, ptr %"34", align 4 + store i32 %"55", ptr addrspace(5) %"47", align 4 %"57" = load i32, ptr addrspace(5) %"45", align 4 %"58" = load i32, ptr addrspace(5) %"46", align 4 - %"55" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"56", i32 %"57", i32 %"58") - store i32 %"55", ptr addrspace(5) %"44", align 4 - %"59" = load i64, ptr addrspace(5) %"43", align 4 - %"60" = load i32, ptr addrspace(5) %"44", align 4 - %"64" = inttoptr i64 %"59" to ptr - store i32 %"60", ptr %"64", align 4 + %"59" = load i32, ptr addrspace(5) %"47", align 4 + %"56" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"57", i32 %"58", i32 %"59") + store i32 %"56", ptr addrspace(5) %"45", align 4 + %"60" = load i64, ptr addrspace(5) %"44", align 4 + %"61" = load i32, ptr addrspace(5) %"45", align 4 + %"65" = inttoptr i64 %"60" to ptr + store i32 %"61", ptr %"65", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bfi.ll b/ptx/src/test/ll/bfi.ll index ef437c1..f16bd64 100644 --- a/ptx/src/test/ll/bfi.ll +++ b/ptx/src/test/ll/bfi.ll @@ -12,50 +12,54 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i32, align 4, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i32, align 4, addrspace(5) %"49" = alloca i32, align 4, addrspace(5) %"50" = alloca i32, align 4, addrspace(5) + %"51" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"51" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"51", ptr addrspace(5) %"45", align 4 + br label %"77" + +"77": ; preds = %1 %"52" = load i64, ptr addrspace(4) %"44", align 4 store i64 %"52", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"54" to ptr - %"53" = load i32, ptr %"68", align 4 - store i32 %"53", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"46", align 4 %"69" = inttoptr i64 %"55" to ptr - %"32" = getelementptr inbounds i8, ptr %"69", i64 4 - %"56" = load i32, ptr %"32", align 4 - store i32 %"56", ptr addrspace(5) %"48", align 4 - %"57" = load i64, ptr addrspace(5) %"45", align 4 - %"70" = inttoptr i64 %"57" to ptr - %"34" = getelementptr inbounds i8, ptr %"70", i64 8 - %"58" = load i32, ptr %"34", align 4 - store i32 %"58", ptr addrspace(5) %"49", align 4 - %"59" = load i64, ptr addrspace(5) %"45", align 4 - %"71" = inttoptr i64 %"59" to ptr - %"36" = getelementptr inbounds i8, ptr %"71", i64 12 - %"60" = load i32, ptr %"36", align 4 - store i32 %"60", ptr addrspace(5) %"50", align 4 - %"62" = load i32, ptr addrspace(5) %"47", align 4 + %"54" = load i32, ptr %"69", align 4 + store i32 %"54", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"46", align 4 + %"70" = inttoptr i64 %"56" to ptr + %"33" = getelementptr inbounds i8, ptr %"70", i64 4 + %"57" = load i32, ptr %"33", align 4 + store i32 %"57", ptr addrspace(5) %"49", align 4 + %"58" = load i64, ptr addrspace(5) %"46", align 4 + %"71" = inttoptr i64 %"58" to ptr + %"35" = getelementptr inbounds i8, ptr %"71", i64 8 + %"59" = load i32, ptr %"35", align 4 + store i32 %"59", ptr addrspace(5) %"50", align 4 + %"60" = load i64, ptr addrspace(5) %"46", align 4 + %"72" = inttoptr i64 %"60" to ptr + %"37" = getelementptr inbounds i8, ptr %"72", i64 12 + %"61" = load i32, ptr %"37", align 4 + store i32 %"61", ptr addrspace(5) %"51", align 4 %"63" = load i32, ptr addrspace(5) %"48", align 4 %"64" = load i32, ptr addrspace(5) %"49", align 4 %"65" = load i32, ptr addrspace(5) %"50", align 4 - %"72" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"62", i32 %"63", i32 %"64", i32 %"65") - store i32 %"72", ptr addrspace(5) %"47", align 4 - %"66" = load i64, ptr addrspace(5) %"46", align 4 - %"67" = load i32, ptr addrspace(5) %"47", align 4 - %"75" = inttoptr i64 %"66" to ptr - store i32 %"67", ptr %"75", align 4 + %"66" = load i32, ptr addrspace(5) %"51", align 4 + %"73" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"63", i32 %"64", i32 %"65", i32 %"66") + store i32 %"73", ptr addrspace(5) %"48", align 4 + %"67" = load i64, ptr addrspace(5) %"47", align 4 + %"68" = load i32, ptr addrspace(5) %"48", align 4 + %"76" = inttoptr i64 %"67" to ptr + store i32 %"68", ptr %"76", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/block.ll b/ptx/src/test/ll/block.ll index 523d941..62e7a77 100644 --- a/ptx/src/test/ll/block.ll +++ b/ptx/src/test/ll/block.ll @@ -10,34 +10,38 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i64, align 8, addrspace(5) - %"50" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"51" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 + br label %"58" + +"58": ; preds = %1 %"45" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"46" = load i64, ptr %"55", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 store i64 %"46", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"48" = add i64 %"49", 1 - store i64 %"48", ptr addrspace(5) %"43", align 4 - %"52" = load i64, ptr addrspace(5) %"50", align 4 - %"51" = add i64 %"52", 1 - store i64 %"51", ptr addrspace(5) %"50", align 4 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"56" = inttoptr i64 %"53" to ptr - store i64 %"54", ptr %"56", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"47" = load i64, ptr %"56", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"43", align 4 + %"49" = add i64 %"50", 1 + store i64 %"49", ptr addrspace(5) %"44", align 4 + %"53" = load i64, ptr addrspace(5) %"51", align 4 + %"52" = add i64 %"53", 1 + store i64 %"52", ptr addrspace(5) %"51", align 4 + %"54" = load i64, ptr addrspace(5) %"42", align 4 + %"55" = load i64, ptr addrspace(5) %"44", align 4 + %"57" = inttoptr i64 %"54" to ptr + store i64 %"55", ptr %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bra.ll b/ptx/src/test/ll/bra.ll index 0fb9769..a14b62d 100644 --- a/ptx/src/test/ll/bra.ll +++ b/ptx/src/test/ll/bra.ll @@ -10,42 +10,46 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 + br label %"59" + +"59": ; preds = %1 %"47" = load i64, ptr addrspace(4) %"41", align 4 store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = inttoptr i64 %"49" to ptr - %"48" = load i64, ptr %"56", align 4 + %"48" = load i64, ptr addrspace(4) %"42", align 4 store i64 %"48", ptr addrspace(5) %"44", align 4 - br label %"9" + %"50" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = inttoptr i64 %"50" to ptr + %"49" = load i64, ptr %"57", align 4 + store i64 %"49", ptr addrspace(5) %"45", align 4 + br label %"10" -"9": ; preds = %1 - %"51" = load i64, ptr addrspace(5) %"44", align 4 - %"50" = add i64 %"51", 1 - store i64 %"50", ptr addrspace(5) %"45", align 4 - br label %"11" +"10": ; preds = %"59" + %"52" = load i64, ptr addrspace(5) %"45", align 4 + %"51" = add i64 %"52", 1 + store i64 %"51", ptr addrspace(5) %"46", align 4 + br label %"12" -"10": ; No predecessors! - %"53" = load i64, ptr addrspace(5) %"44", align 4 - %"52" = add i64 %"53", 2 - store i64 %"52", ptr addrspace(5) %"45", align 4 - br label %"11" +"11": ; No predecessors! + %"54" = load i64, ptr addrspace(5) %"45", align 4 + %"53" = add i64 %"54", 2 + store i64 %"53", ptr addrspace(5) %"46", align 4 + br label %"12" -"11": ; preds = %"10", %"9" - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 - %"57" = inttoptr i64 %"54" to ptr - store i64 %"55", ptr %"57", align 4 +"12": ; preds = %"11", %"10" + %"55" = load i64, ptr addrspace(5) %"44", align 4 + %"56" = load i64, ptr addrspace(5) %"46", align 4 + %"58" = inttoptr i64 %"55" to ptr + store i64 %"56", ptr %"58", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/brev.ll b/ptx/src/test/ll/brev.ll index 6f10c94..c6b6305 100644 --- a/ptx/src/test/ll/brev.ll +++ b/ptx/src/test/ll/brev.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"43" = call i32 @llvm.bitreverse.i32(i32 %"44") - store i32 %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i32, ptr %"48", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"44" = call i32 @llvm.bitreverse.i32(i32 %"45") + store i32 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store i32 %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.bitreverse.i32(i32) #1 +declare i32 @llvm.bitreverse.i32(i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/call.ll b/ptx/src/test/ll/call.ll index c9bb5ce..d13654a 100644 --- a/ptx/src/test/ll/call.ll +++ b/ptx/src/test/ll/call.ll @@ -10,57 +10,64 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define i64 @__zluda_ptx_impl_incr(i64 %"42") #0 { - %"65" = alloca i64, align 8, addrspace(5) +define i64 @incr(i64 %"43") #0 { %"66" = alloca i64, align 8, addrspace(5) %"67" = alloca i64, align 8, addrspace(5) %"68" = alloca i64, align 8, addrspace(5) + %"69" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - store i64 %"42", ptr addrspace(5) %"67", align 4 - %"69" = load i64, ptr addrspace(5) %"67", align 4 - store i64 %"69", ptr addrspace(5) %"68", align 4 - %"71" = load i64, ptr addrspace(5) %"68", align 4 - %"70" = add i64 %"71", 1 - store i64 %"70", ptr addrspace(5) %"68", align 4 - %"72" = load i64, ptr addrspace(5) %"68", align 4 - store i64 %"72", ptr addrspace(5) %"66", align 4 - %"73" = load i64, ptr addrspace(5) %"66", align 4 - store i64 %"73", ptr addrspace(5) %"65", align 4 - %2 = load i64, ptr addrspace(5) %"65", align 4 + br label %"80" + +"80": ; preds = %1 + store i64 %"43", ptr addrspace(5) %"68", align 4 + %"70" = load i64, ptr addrspace(5) %"68", align 4 + store i64 %"70", ptr addrspace(5) %"69", align 4 + %"72" = load i64, ptr addrspace(5) %"69", align 4 + %"71" = add i64 %"72", 1 + store i64 %"71", ptr addrspace(5) %"69", align 4 + %"73" = load i64, ptr addrspace(5) %"69", align 4 + store i64 %"73", ptr addrspace(5) %"67", align 4 + %"74" = load i64, ptr addrspace(5) %"67", align 4 + store i64 %"74", ptr addrspace(5) %"66", align 4 + %2 = load i64, ptr addrspace(5) %"66", align 4 ret i64 %2 } -define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 { - %"52" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"51", ptr addrspace(4) byref(i64) %"52") #1 { %"53" = alloca i64, align 8, addrspace(5) %"54" = alloca i64, align 8, addrspace(5) - %"59" = alloca i64, align 8, addrspace(5) + %"55" = alloca i64, align 8, addrspace(5) %"60" = alloca i64, align 8, addrspace(5) + %"61" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"55" = load i64, ptr addrspace(4) %"50", align 4 - store i64 %"55", ptr addrspace(5) %"52", align 4 + br label %"79" + +"79": ; preds = %1 %"56" = load i64, ptr addrspace(4) %"51", align 4 store i64 %"56", ptr addrspace(5) %"53", align 4 - %"58" = load i64, ptr addrspace(5) %"52", align 4 - %"74" = inttoptr i64 %"58" to ptr addrspace(1) - %"57" = load i64, ptr addrspace(1) %"74", align 4 + %"57" = load i64, ptr addrspace(4) %"52", align 4 store i64 %"57", ptr addrspace(5) %"54", align 4 - %"61" = load i64, ptr addrspace(5) %"54", align 4 - store i64 %"61", ptr addrspace(5) %"59", align 4 - %"39" = load i64, ptr addrspace(5) %"59", align 4 - %"40" = call i64 @__zluda_ptx_impl_incr(i64 %"39") - store i64 %"40", ptr addrspace(5) %"60", align 4 - %"62" = load i64, ptr addrspace(5) %"60", align 4 - store i64 %"62", ptr addrspace(5) %"54", align 4 - %"63" = load i64, ptr addrspace(5) %"53", align 4 + %"59" = load i64, ptr addrspace(5) %"53", align 4 + %"75" = inttoptr i64 %"59" to ptr addrspace(1) + %"58" = load i64, ptr addrspace(1) %"75", align 4 + store i64 %"58", ptr addrspace(5) %"55", align 4 + %"62" = load i64, ptr addrspace(5) %"55", align 4 + store i64 %"62", ptr addrspace(5) %"60", align 4 + %"40" = load i64, ptr addrspace(5) %"60", align 4 + %"41" = call i64 @incr(i64 %"40") + store i64 %"41", ptr addrspace(5) %"61", align 4 + %"63" = load i64, ptr addrspace(5) %"61", align 4 + store i64 %"63", ptr addrspace(5) %"55", align 4 %"64" = load i64, ptr addrspace(5) %"54", align 4 - %"77" = inttoptr i64 %"63" to ptr addrspace(1) - store i64 %"64", ptr addrspace(1) %"77", align 4 + %"65" = load i64, ptr addrspace(5) %"55", align 4 + %"78" = inttoptr i64 %"64" to ptr addrspace(1) + store i64 %"65", ptr addrspace(1) %"78", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/clz.ll b/ptx/src/test/ll/clz.ll index 160a634..16d8c0b 100644 --- a/ptx/src/test/ll/clz.ll +++ b/ptx/src/test/ll/clz.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"51" + +"51": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = call i32 @llvm.ctlz.i32(i32 %"44", i1 false) - store i32 %"48", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"49", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i32, ptr %"48", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"49" = call i32 @llvm.ctlz.i32(i32 %"45", i1 false) + store i32 %"49", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"50" = inttoptr i64 %"46" to ptr + store i32 %"47", ptr %"50", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1 +declare i32 @llvm.ctlz.i32(i32, i1 immarg) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/const.ll b/ptx/src/test/ll/const.ll index 0fbd7e0..f34142e 100644 --- a/ptx/src/test/ll/const.ll +++ b/ptx/src/test/ll/const.ll @@ -12,48 +12,52 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 { - %"52" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"51", ptr addrspace(4) byref(i64) %"52") #1 { %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i16, align 2, addrspace(5) + %"54" = alloca i64, align 8, addrspace(5) %"55" = alloca i16, align 2, addrspace(5) %"56" = alloca i16, align 2, addrspace(5) %"57" = alloca i16, align 2, addrspace(5) + %"58" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 - %"58" = load i64, ptr addrspace(4) %"50", align 4 - store i64 %"58", ptr addrspace(5) %"52", align 4 + br label %"85" + +"85": ; preds = %1 %"59" = load i64, ptr addrspace(4) %"51", align 4 store i64 %"59", ptr addrspace(5) %"53", align 4 - %"60" = load i16, ptr addrspace(4) @constparams, align 2 - store i16 %"60", ptr addrspace(5) %"54", align 2 - %"61" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 + %"60" = load i64, ptr addrspace(4) %"52", align 4 + store i64 %"60", ptr addrspace(5) %"54", align 4 + %"61" = load i16, ptr addrspace(4) @constparams, align 2 store i16 %"61", ptr addrspace(5) %"55", align 2 - %"62" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 + %"62" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 store i16 %"62", ptr addrspace(5) %"56", align 2 - %"63" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 + %"63" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 store i16 %"63", ptr addrspace(5) %"57", align 2 - %"64" = load i64, ptr addrspace(5) %"53", align 4 - %"65" = load i16, ptr addrspace(5) %"54", align 2 - %"76" = inttoptr i64 %"64" to ptr - store i16 %"65", ptr %"76", align 2 - %"66" = load i64, ptr addrspace(5) %"53", align 4 - %"78" = inttoptr i64 %"66" to ptr - %"39" = getelementptr inbounds i8, ptr %"78", i64 2 - %"67" = load i16, ptr addrspace(5) %"55", align 2 - store i16 %"67", ptr %"39", align 2 - %"68" = load i64, ptr addrspace(5) %"53", align 4 - %"80" = inttoptr i64 %"68" to ptr - %"41" = getelementptr inbounds i8, ptr %"80", i64 4 - %"69" = load i16, ptr addrspace(5) %"56", align 2 - store i16 %"69", ptr %"41", align 2 - %"70" = load i64, ptr addrspace(5) %"53", align 4 - %"82" = inttoptr i64 %"70" to ptr - %"43" = getelementptr inbounds i8, ptr %"82", i64 6 - %"71" = load i16, ptr addrspace(5) %"57", align 2 - store i16 %"71", ptr %"43", align 2 + %"64" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 + store i16 %"64", ptr addrspace(5) %"58", align 2 + %"65" = load i64, ptr addrspace(5) %"54", align 4 + %"66" = load i16, ptr addrspace(5) %"55", align 2 + %"77" = inttoptr i64 %"65" to ptr + store i16 %"66", ptr %"77", align 2 + %"67" = load i64, ptr addrspace(5) %"54", align 4 + %"79" = inttoptr i64 %"67" to ptr + %"40" = getelementptr inbounds i8, ptr %"79", i64 2 + %"68" = load i16, ptr addrspace(5) %"56", align 2 + store i16 %"68", ptr %"40", align 2 + %"69" = load i64, ptr addrspace(5) %"54", align 4 + %"81" = inttoptr i64 %"69" to ptr + %"42" = getelementptr inbounds i8, ptr %"81", i64 4 + %"70" = load i16, ptr addrspace(5) %"57", align 2 + store i16 %"70", ptr %"42", align 2 + %"71" = load i64, ptr addrspace(5) %"54", align 4 + %"83" = inttoptr i64 %"71" to ptr + %"44" = getelementptr inbounds i8, ptr %"83", i64 6 + %"72" = load i16, ptr addrspace(5) %"58", align 2 + store i16 %"72", ptr %"44", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/constant_f32.ll b/ptx/src/test/ll/constant_f32.ll index 60f625f..44e0952 100644 --- a/ptx/src/test/ll/constant_f32.ll +++ b/ptx/src/test/ll/constant_f32.ll @@ -10,29 +10,33 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca float, align 4, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 + br label %"51" + +"51": ; preds = %1 %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr - %"42" = load float, ptr %"48", align 4 - store float %"42", ptr addrspace(5) %"39", align 4 - %"45" = load float, ptr addrspace(5) %"39", align 4 - %"44" = fmul float %"45", 5.000000e-01 - store float %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"47" = load float, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"46" to ptr - store float %"47", ptr %"49", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"49" = inttoptr i64 %"44" to ptr + %"43" = load float, ptr %"49", align 4 + store float %"43", ptr addrspace(5) %"40", align 4 + %"46" = load float, ptr addrspace(5) %"40", align 4 + %"45" = fmul float %"46", 5.000000e-01 + store float %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"48" = load float, ptr addrspace(5) %"40", align 4 + %"50" = inttoptr i64 %"47" to ptr + store float %"48", ptr %"50", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/constant_negative.ll b/ptx/src/test/ll/constant_negative.ll index 201b867..7effb34 100644 --- a/ptx/src/test/ll/constant_negative.ll +++ b/ptx/src/test/ll/constant_negative.ll @@ -10,29 +10,33 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 + br label %"51" + +"51": ; preds = %1 %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr - %"42" = load i32, ptr %"48", align 4 - store i32 %"42", ptr addrspace(5) %"39", align 4 - %"45" = load i32, ptr addrspace(5) %"39", align 4 - %"44" = mul i32 %"45", -1 - store i32 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"47" = load i32, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"46" to ptr - store i32 %"47", ptr %"49", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"49" = inttoptr i64 %"44" to ptr + %"43" = load i32, ptr %"49", align 4 + store i32 %"43", ptr addrspace(5) %"40", align 4 + %"46" = load i32, ptr addrspace(5) %"40", align 4 + %"45" = mul i32 %"46", -1 + store i32 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + %"50" = inttoptr i64 %"47" to ptr + store i32 %"48", ptr %"50", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cos.ll b/ptx/src/test/ll/cos.ll index 44c0ee0..8a63479 100644 --- a/ptx/src/test/ll/cos.ll +++ b/ptx/src/test/ll/cos.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call afn float @llvm.cos.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = call afn float @llvm.cos.f32(float %"45") + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.cos.f32(float) #1 +declare float @llvm.cos.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_f64_f32.ll b/ptx/src/test/ll/cvt_f64_f32.ll index 4d5cf2c..78e9caf 100644 --- a/ptx/src/test/ll/cvt_f64_f32.ll +++ b/ptx/src/test/ll/cvt_f64_f32.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca float, align 4, addrspace(5) - %"40" = alloca double, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca float, align 4, addrspace(5) + %"41" = alloca double, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 + br label %"52" + +"52": ; preds = %1 %"42" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr addrspace(1) - %"43" = load float, ptr addrspace(1) %"49", align 4 - store float %"43", ptr addrspace(5) %"39", align 4 - %"46" = load float, ptr addrspace(5) %"39", align 4 - %"45" = fpext float %"46" to double - store double %"45", ptr addrspace(5) %"40", align 8 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load double, ptr addrspace(5) %"40", align 8 - %"50" = inttoptr i64 %"47" to ptr - store double %"48", ptr %"50", align 8 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr addrspace(1) + %"44" = load float, ptr addrspace(1) %"50", align 4 + store float %"44", ptr addrspace(5) %"40", align 4 + %"47" = load float, ptr addrspace(5) %"40", align 4 + %"46" = fpext float %"47" to double + store double %"46", ptr addrspace(5) %"41", align 8 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load double, ptr addrspace(5) %"41", align 8 + %"51" = inttoptr i64 %"48" to ptr + store double %"49", ptr %"51", align 8 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_rni.ll b/ptx/src/test/ll/cvt_rni.ll index 850b1fb..50d62e4 100644 --- a/ptx/src/test/ll/cvt_rni.ll +++ b/ptx/src/test/ll/cvt_rni.ll @@ -10,49 +10,53 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca float, align 4, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 + br label %"64" + +"64": ; preds = %1 %"46" = load i64, ptr addrspace(4) %"40", align 4 store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"48" to ptr - %"47" = load float, ptr %"59", align 4 - store float %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 %"60" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"60", i64 4 - %"50" = load float, ptr %"30", align 4 - store float %"50", ptr addrspace(5) %"44", align 4 - %"52" = load float, ptr addrspace(5) %"43", align 4 - %2 = call float @llvm.roundeven.f32(float %"52") - %"51" = freeze float %2 - store float %"51", ptr addrspace(5) %"43", align 4 - %"54" = load float, ptr addrspace(5) %"44", align 4 - %3 = call float @llvm.roundeven.f32(float %"54") - %"53" = freeze float %3 - store float %"53", ptr addrspace(5) %"44", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load float, ptr addrspace(5) %"43", align 4 - %"61" = inttoptr i64 %"55" to ptr - store float %"56", ptr %"61", align 4 - %"57" = load i64, ptr addrspace(5) %"42", align 4 - %"62" = inttoptr i64 %"57" to ptr - %"32" = getelementptr inbounds i8, ptr %"62", i64 4 - %"58" = load float, ptr addrspace(5) %"44", align 4 - store float %"58", ptr %"32", align 4 + %"48" = load float, ptr %"60", align 4 + store float %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"61" = inttoptr i64 %"50" to ptr + %"31" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load float, ptr %"31", align 4 + store float %"51", ptr addrspace(5) %"45", align 4 + %"53" = load float, ptr addrspace(5) %"44", align 4 + %2 = call float @llvm.roundeven.f32(float %"53") + %"52" = freeze float %2 + store float %"52", ptr addrspace(5) %"44", align 4 + %"55" = load float, ptr addrspace(5) %"45", align 4 + %3 = call float @llvm.roundeven.f32(float %"55") + %"54" = freeze float %3 + store float %"54", ptr addrspace(5) %"45", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = load float, ptr addrspace(5) %"44", align 4 + %"62" = inttoptr i64 %"56" to ptr + store float %"57", ptr %"62", align 4 + %"58" = load i64, ptr addrspace(5) %"43", align 4 + %"63" = inttoptr i64 %"58" to ptr + %"33" = getelementptr inbounds i8, ptr %"63", i64 4 + %"59" = load float, ptr addrspace(5) %"45", align 4 + store float %"59", ptr %"33", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.roundeven.f32(float) #1 +declare float @llvm.roundeven.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_rzi.ll b/ptx/src/test/ll/cvt_rzi.ll index 05a2d49..9d0c04d 100644 --- a/ptx/src/test/ll/cvt_rzi.ll +++ b/ptx/src/test/ll/cvt_rzi.ll @@ -10,49 +10,53 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca float, align 4, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 + br label %"64" + +"64": ; preds = %1 %"46" = load i64, ptr addrspace(4) %"40", align 4 store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"48" to ptr - %"47" = load float, ptr %"59", align 4 - store float %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 %"60" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"60", i64 4 - %"50" = load float, ptr %"30", align 4 - store float %"50", ptr addrspace(5) %"44", align 4 - %"52" = load float, ptr addrspace(5) %"43", align 4 - %2 = call float @llvm.trunc.f32(float %"52") - %"51" = freeze float %2 - store float %"51", ptr addrspace(5) %"43", align 4 - %"54" = load float, ptr addrspace(5) %"44", align 4 - %3 = call float @llvm.trunc.f32(float %"54") - %"53" = freeze float %3 - store float %"53", ptr addrspace(5) %"44", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load float, ptr addrspace(5) %"43", align 4 - %"61" = inttoptr i64 %"55" to ptr - store float %"56", ptr %"61", align 4 - %"57" = load i64, ptr addrspace(5) %"42", align 4 - %"62" = inttoptr i64 %"57" to ptr - %"32" = getelementptr inbounds i8, ptr %"62", i64 4 - %"58" = load float, ptr addrspace(5) %"44", align 4 - store float %"58", ptr %"32", align 4 + %"48" = load float, ptr %"60", align 4 + store float %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"61" = inttoptr i64 %"50" to ptr + %"31" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load float, ptr %"31", align 4 + store float %"51", ptr addrspace(5) %"45", align 4 + %"53" = load float, ptr addrspace(5) %"44", align 4 + %2 = call float @llvm.trunc.f32(float %"53") + %"52" = freeze float %2 + store float %"52", ptr addrspace(5) %"44", align 4 + %"55" = load float, ptr addrspace(5) %"45", align 4 + %3 = call float @llvm.trunc.f32(float %"55") + %"54" = freeze float %3 + store float %"54", ptr addrspace(5) %"45", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = load float, ptr addrspace(5) %"44", align 4 + %"62" = inttoptr i64 %"56" to ptr + store float %"57", ptr %"62", align 4 + %"58" = load i64, ptr addrspace(5) %"43", align 4 + %"63" = inttoptr i64 %"58" to ptr + %"33" = getelementptr inbounds i8, ptr %"63", i64 4 + %"59" = load float, ptr addrspace(5) %"45", align 4 + store float %"59", ptr %"33", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.trunc.f32(float) #1 +declare float @llvm.trunc.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_s16_s8.ll b/ptx/src/test/ll/cvt_s16_s8.ll index b36fc88..6a30984 100644 --- a/ptx/src/test/ll/cvt_s16_s8.ll +++ b/ptx/src/test/ll/cvt_s16_s8.ll @@ -10,32 +10,36 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 + br label %"54" + +"54": ; preds = %1 %"42" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr addrspace(1) - %"43" = load i32, ptr addrspace(1) %"49", align 4 - store i32 %"43", ptr addrspace(5) %"40", align 4 - %"46" = load i32, ptr addrspace(5) %"40", align 4 - %2 = trunc i32 %"46" to i8 - %"50" = sext i8 %2 to i16 - %"45" = sext i16 %"50" to i32 - store i32 %"45", ptr addrspace(5) %"39", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i32, ptr addrspace(5) %"39", align 4 - %"52" = inttoptr i64 %"47" to ptr - store i32 %"48", ptr %"52", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr addrspace(1) + %"44" = load i32, ptr addrspace(1) %"50", align 4 + store i32 %"44", ptr addrspace(5) %"41", align 4 + %"47" = load i32, ptr addrspace(5) %"41", align 4 + %2 = trunc i32 %"47" to i8 + %"51" = sext i8 %2 to i16 + %"46" = sext i16 %"51" to i32 + store i32 %"46", ptr addrspace(5) %"40", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i32, ptr addrspace(5) %"40", align 4 + %"53" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"53", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_s32_f32.ll b/ptx/src/test/ll/cvt_s32_f32.ll index 5a8e804..642f5ac 100644 --- a/ptx/src/test/ll/cvt_s32_f32.ll +++ b/ptx/src/test/ll/cvt_s32_f32.ll @@ -10,55 +10,59 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 + br label %"72" + +"72": ; preds = %1 %"46" = load i64, ptr addrspace(4) %"40", align 4 store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"60" = inttoptr i64 %"48" to ptr - %"59" = load float, ptr %"60", align 4 - %"47" = bitcast float %"59" to i32 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 %"61" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"61", i64 4 - %"62" = load float, ptr %"30", align 4 - %"50" = bitcast float %"62" to i32 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i32, ptr addrspace(5) %"43", align 4 - %"64" = bitcast i32 %"52" to float - %2 = call float @llvm.ceil.f32(float %"64") + %"60" = load float, ptr %"61", align 4 + %"48" = bitcast float %"60" to i32 + store i32 %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"50" to ptr + %"31" = getelementptr inbounds i8, ptr %"62", i64 4 + %"63" = load float, ptr %"31", align 4 + %"51" = bitcast float %"63" to i32 + store i32 %"51", ptr addrspace(5) %"45", align 4 + %"53" = load i32, ptr addrspace(5) %"44", align 4 + %"65" = bitcast i32 %"53" to float + %2 = call float @llvm.ceil.f32(float %"65") %3 = fptosi float %2 to i32 - %"63" = freeze i32 %3 - store i32 %"63", ptr addrspace(5) %"43", align 4 - %"54" = load i32, ptr addrspace(5) %"44", align 4 - %"66" = bitcast i32 %"54" to float - %4 = call float @llvm.ceil.f32(float %"66") + %"64" = freeze i32 %3 + store i32 %"64", ptr addrspace(5) %"44", align 4 + %"55" = load i32, ptr addrspace(5) %"45", align 4 + %"67" = bitcast i32 %"55" to float + %4 = call float @llvm.ceil.f32(float %"67") %5 = fptosi float %4 to i32 - %"65" = freeze i32 %5 - store i32 %"65", ptr addrspace(5) %"44", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load i32, ptr addrspace(5) %"43", align 4 - %"67" = inttoptr i64 %"55" to ptr addrspace(1) - store i32 %"56", ptr addrspace(1) %"67", align 4 - %"57" = load i64, ptr addrspace(5) %"42", align 4 - %"69" = inttoptr i64 %"57" to ptr addrspace(1) - %"32" = getelementptr inbounds i8, ptr addrspace(1) %"69", i64 4 - %"58" = load i32, ptr addrspace(5) %"44", align 4 - store i32 %"58", ptr addrspace(1) %"32", align 4 + %"66" = freeze i32 %5 + store i32 %"66", ptr addrspace(5) %"45", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = load i32, ptr addrspace(5) %"44", align 4 + %"68" = inttoptr i64 %"56" to ptr addrspace(1) + store i32 %"57", ptr addrspace(1) %"68", align 4 + %"58" = load i64, ptr addrspace(5) %"43", align 4 + %"70" = inttoptr i64 %"58" to ptr addrspace(1) + %"33" = getelementptr inbounds i8, ptr addrspace(1) %"70", i64 4 + %"59" = load i32, ptr addrspace(5) %"45", align 4 + store i32 %"59", ptr addrspace(1) %"33", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.ceil.f32(float) #1 +declare float @llvm.ceil.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_s64_s32.ll b/ptx/src/test/ll/cvt_s64_s32.ll index 5aa91b1..a0d6878 100644 --- a/ptx/src/test/ll/cvt_s64_s32.ll +++ b/ptx/src/test/ll/cvt_s64_s32.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 + br label %"54" + +"54": ; preds = %1 %"42" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"50" = inttoptr i64 %"44" to ptr - %"49" = load i32, ptr %"50", align 4 - store i32 %"49", ptr addrspace(5) %"39", align 4 - %"46" = load i32, ptr addrspace(5) %"39", align 4 - %"45" = sext i32 %"46" to i64 - store i64 %"45", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"51" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"51", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"51" = inttoptr i64 %"45" to ptr + %"50" = load i32, ptr %"51", align 4 + store i32 %"50", ptr addrspace(5) %"40", align 4 + %"47" = load i32, ptr addrspace(5) %"40", align 4 + %"46" = sext i32 %"47" to i64 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_sat_s_u.ll b/ptx/src/test/ll/cvt_sat_s_u.ll index 63954f8..dcbf7da 100644 --- a/ptx/src/test/ll/cvt_sat_s_u.ll +++ b/ptx/src/test/ll/cvt_sat_s_u.ll @@ -10,41 +10,45 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i32, align 4, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"43", ptr addrspace(5) %"38", align 4 + br label %"56" + +"56": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"53" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"53", align 4 - store i32 %"45", ptr addrspace(5) %"40", align 4 - %"48" = load i32, ptr addrspace(5) %"40", align 4 - %2 = call i32 @llvm.smax.i32(i32 %"48", i32 0) + %"45" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"47" to ptr + %"46" = load i32, ptr %"54", align 4 + store i32 %"46", ptr addrspace(5) %"41", align 4 + %"49" = load i32, ptr addrspace(5) %"41", align 4 + %2 = call i32 @llvm.smax.i32(i32 %"49", i32 0) %3 = call i32 @llvm.umin.i32(i32 %2, i32 -1) - store i32 %3, ptr addrspace(5) %"41", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - store i32 %"50", ptr addrspace(5) %"42", align 4 - %"51" = load i64, ptr addrspace(5) %"39", align 4 - %"52" = load i32, ptr addrspace(5) %"42", align 4 - %"54" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"54", align 4 + store i32 %3, ptr addrspace(5) %"42", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + store i32 %"51", ptr addrspace(5) %"43", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"43", align 4 + %"55" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"55", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smax.i32(i32, i32) #1 +declare i32 @llvm.smax.i32(i32, i32) #2 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.umin.i32(i32, i32) #1 +declare i32 @llvm.umin.i32(i32, i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvta.ll b/ptx/src/test/ll/cvta.ll index 495b312..ce44e38 100644 --- a/ptx/src/test/ll/cvta.ll +++ b/ptx/src/test/ll/cvta.ll @@ -10,34 +10,38 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"56" + +"56": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %2 = inttoptr i64 %"42" to ptr - %"49" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"49", ptr addrspace(5) %"36", align 8 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %3 = inttoptr i64 %"44" to ptr - %"51" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"51", ptr addrspace(5) %"37", align 8 - %"46" = load i64, ptr addrspace(5) %"36", align 4 - %"53" = inttoptr i64 %"46" to ptr addrspace(1) - %"45" = load float, ptr addrspace(1) %"53", align 4 - store float %"45", ptr addrspace(5) %"38", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %2 = inttoptr i64 %"43" to ptr + %"50" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"50", ptr addrspace(5) %"37", align 8 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %3 = inttoptr i64 %"45" to ptr + %"52" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"52", ptr addrspace(5) %"38", align 8 %"47" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = load float, ptr addrspace(5) %"38", align 4 %"54" = inttoptr i64 %"47" to ptr addrspace(1) - store float %"48", ptr addrspace(1) %"54", align 4 + %"46" = load float, ptr addrspace(1) %"54", align 4 + store float %"46", ptr addrspace(5) %"39", align 4 + %"48" = load i64, ptr addrspace(5) %"38", align 4 + %"49" = load float, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"48" to ptr addrspace(1) + store float %"49", ptr addrspace(1) %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/div_approx.ll b/ptx/src/test/ll/div_approx.ll index cb8cb28..b2684ce 100644 --- a/ptx/src/test/ll/div_approx.ll +++ b/ptx/src/test/ll/div_approx.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca float, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca float, align 4, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load float, ptr %"54", align 4 - store float %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load float, ptr %"30", align 4 - store float %"48", ptr addrspace(5) %"42", align 4 - %"50" = load float, ptr addrspace(5) %"41", align 4 + %"46" = load float, ptr %"55", align 4 + store float %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load float, ptr %"31", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 %"51" = load float, ptr addrspace(5) %"42", align 4 - %"49" = fdiv arcp afn float %"50", %"51" - store float %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load float, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store float %"53", ptr %"56", align 4 + %"52" = load float, ptr addrspace(5) %"43", align 4 + %"50" = fdiv arcp afn float %"51", %"52" + store float %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load float, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store float %"54", ptr %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ex2.ll b/ptx/src/test/ll/ex2.ll index 904f238..f2ef504 100644 --- a/ptx/src/test/ll/ex2.ll +++ b/ptx/src/test/ll/ex2.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.exp2.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = call float @llvm.amdgcn.exp2.f32(float %"45") + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.exp2.f32(float) #1 +declare float @llvm.amdgcn.exp2.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/extern_shared.ll b/ptx/src/test/ll/extern_shared.ll index 9b872ec..f3d3b23 100644 --- a/ptx/src/test/ll/extern_shared.ll +++ b/ptx/src/test/ll/extern_shared.ll @@ -12,30 +12,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 + br label %"53" + +"53": ; preds = %1 %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr addrspace(1) - %"42" = load i64, ptr addrspace(1) %"48", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"39", align 4 - store i64 %"44", ptr addrspace(3) @shared_mem, align 4 - %"45" = load i64, ptr addrspace(3) @shared_mem, align 4 - store i64 %"45", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"49" = inttoptr i64 %"44" to ptr addrspace(1) + %"43" = load i64, ptr addrspace(1) %"49", align 4 + store i64 %"43", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(5) %"40", align 4 + store i64 %"45", ptr addrspace(3) @shared_mem, align 4 + %"46" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"46", ptr addrspace(5) %"40", align 4 %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"51" = inttoptr i64 %"46" to ptr addrspace(1) - store i64 %"47", ptr addrspace(1) %"51", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"52" = inttoptr i64 %"47" to ptr addrspace(1) + store i64 %"48", ptr addrspace(1) %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/extern_shared_call.ll b/ptx/src/test/ll/extern_shared_call.ll index 923523b..df219c0 100644 --- a/ptx/src/test/ll/extern_shared_call.ll +++ b/ptx/src/test/ll/extern_shared_call.ll @@ -12,46 +12,53 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define void @__zluda_ptx_impl_incr_shared_2_global() #0 { - %"38" = alloca i64, align 8, addrspace(5) +define void @incr_shared_2_global() #0 { + %"39" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(3) @shared_mem, align 4 - store i64 %"39", ptr addrspace(5) %"38", align 4 - %"41" = load i64, ptr addrspace(5) %"38", align 4 - %"40" = add i64 %"41", 2 - store i64 %"40", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(5) %"38", align 4 - store i64 %"42", ptr addrspace(3) @shared_mem, align 4 + br label %"63" + +"63": ; preds = %1 + %"40" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"40", ptr addrspace(5) %"39", align 4 + %"42" = load i64, ptr addrspace(5) %"39", align 4 + %"41" = add i64 %"42", 2 + store i64 %"41", ptr addrspace(5) %"39", align 4 + %"43" = load i64, ptr addrspace(5) %"39", align 4 + store i64 %"43", ptr addrspace(3) @shared_mem, align 4 ret void } -define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"48" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"48", ptr addrspace(5) %"45", align 4 + br label %"64" + +"64": ; preds = %1 %"49" = load i64, ptr addrspace(4) %"44", align 4 store i64 %"49", ptr addrspace(5) %"46", align 4 - %"51" = load i64, ptr addrspace(5) %"45", align 4 - %"58" = inttoptr i64 %"51" to ptr addrspace(1) - %"50" = load i64, ptr addrspace(1) %"58", align 4 + %"50" = load i64, ptr addrspace(4) %"45", align 4 store i64 %"50", ptr addrspace(5) %"47", align 4 - %"52" = load i64, ptr addrspace(5) %"47", align 4 - store i64 %"52", ptr addrspace(3) @shared_mem, align 4 - call void @__zluda_ptx_impl_incr_shared_2_global() - %"53" = load i64, ptr addrspace(3) @shared_mem, align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(5) %"46", align 4 + %"52" = load i64, ptr addrspace(5) %"46", align 4 + %"59" = inttoptr i64 %"52" to ptr addrspace(1) + %"51" = load i64, ptr addrspace(1) %"59", align 4 + store i64 %"51", ptr addrspace(5) %"48", align 4 + %"53" = load i64, ptr addrspace(5) %"48", align 4 + store i64 %"53", ptr addrspace(3) @shared_mem, align 4 + call void @incr_shared_2_global() + %"54" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"54", ptr addrspace(5) %"48", align 4 %"55" = load i64, ptr addrspace(5) %"47", align 4 - %"61" = inttoptr i64 %"54" to ptr addrspace(1) - store i64 %"55", ptr addrspace(1) %"61", align 4 + %"56" = load i64, ptr addrspace(5) %"48", align 4 + %"62" = inttoptr i64 %"55" to ptr addrspace(1) + store i64 %"56", ptr addrspace(1) %"62", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/fma.ll b/ptx/src/test/ll/fma.ll index 4a454ef..6df8e2e 100644 --- a/ptx/src/test/ll/fma.ll +++ b/ptx/src/test/ll/fma.ll @@ -10,47 +10,51 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca float, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca float, align 4, addrspace(5) %"46" = alloca float, align 4, addrspace(5) + %"47" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"47" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 + br label %"66" + +"66": ; preds = %1 %"48" = load i64, ptr addrspace(4) %"41", align 4 store i64 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"61" = inttoptr i64 %"50" to ptr - %"49" = load float, ptr %"61", align 4 - store float %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"49", ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(5) %"43", align 4 %"62" = inttoptr i64 %"51" to ptr - %"31" = getelementptr inbounds i8, ptr %"62", i64 4 - %"52" = load float, ptr %"31", align 4 - store float %"52", ptr addrspace(5) %"45", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"63" = inttoptr i64 %"53" to ptr - %"33" = getelementptr inbounds i8, ptr %"63", i64 8 - %"54" = load float, ptr %"33", align 4 - store float %"54", ptr addrspace(5) %"46", align 4 - %"56" = load float, ptr addrspace(5) %"44", align 4 + %"50" = load float, ptr %"62", align 4 + store float %"50", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"63" = inttoptr i64 %"52" to ptr + %"32" = getelementptr inbounds i8, ptr %"63", i64 4 + %"53" = load float, ptr %"32", align 4 + store float %"53", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"64" = inttoptr i64 %"54" to ptr + %"34" = getelementptr inbounds i8, ptr %"64", i64 8 + %"55" = load float, ptr %"34", align 4 + store float %"55", ptr addrspace(5) %"47", align 4 %"57" = load float, ptr addrspace(5) %"45", align 4 %"58" = load float, ptr addrspace(5) %"46", align 4 - %"55" = call float @llvm.fma.f32(float %"56", float %"57", float %"58") - store float %"55", ptr addrspace(5) %"44", align 4 - %"59" = load i64, ptr addrspace(5) %"43", align 4 - %"60" = load float, ptr addrspace(5) %"44", align 4 - %"64" = inttoptr i64 %"59" to ptr - store float %"60", ptr %"64", align 4 + %"59" = load float, ptr addrspace(5) %"47", align 4 + %"56" = call float @llvm.fma.f32(float %"57", float %"58", float %"59") + store float %"56", ptr addrspace(5) %"45", align 4 + %"60" = load i64, ptr addrspace(5) %"44", align 4 + %"61" = load float, ptr addrspace(5) %"45", align 4 + %"65" = inttoptr i64 %"60" to ptr + store float %"61", ptr %"65", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.fma.f32(float, float, float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/global_array.ll b/ptx/src/test/ll/global_array.ll index fede5f7..5ee861e 100644 --- a/ptx/src/test/ll/global_array.ll +++ b/ptx/src/test/ll/global_array.ll @@ -12,25 +12,29 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"47" = inttoptr i64 %"43" to ptr addrspace(1) - %"42" = load i32, ptr addrspace(1) %"47", align 4 - store i32 %"42", ptr addrspace(5) %"39", align 4 + br label %"50" + +"50": ; preds = %1 + store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"38", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"45" = load i32, ptr addrspace(5) %"39", align 4 %"48" = inttoptr i64 %"44" to ptr addrspace(1) - store i32 %"45", ptr addrspace(1) %"48", align 4 + %"43" = load i32, ptr addrspace(1) %"48", align 4 + store i32 %"43", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(5) %"39", align 4 + %"46" = load i32, ptr addrspace(5) %"40", align 4 + %"49" = inttoptr i64 %"45" to ptr addrspace(1) + store i32 %"46", ptr addrspace(1) %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ld_st.ll b/ptx/src/test/ll/ld_st.ll index 7c37090..31b54c2 100644 --- a/ptx/src/test/ll/ld_st.ll +++ b/ptx/src/test/ll/ld_st.ll @@ -10,26 +10,30 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"48" + +"48": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"45" = inttoptr i64 %"42" to ptr - %"41" = load i64, ptr %"45", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 %"46" = inttoptr i64 %"43" to ptr - store i64 %"44", ptr %"46", align 4 + %"42" = load i64, ptr %"46", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"39", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ld_st_implicit.ll b/ptx/src/test/ll/ld_st_implicit.ll index cb4e08a..a5f886a 100644 --- a/ptx/src/test/ll/ld_st_implicit.ll +++ b/ptx/src/test/ll/ld_st_implicit.ll @@ -10,31 +10,35 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 + br label %"52" + +"52": ; preds = %1 %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 - store i64 81985529216486895, ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"44" to ptr addrspace(1) - %"47" = load float, ptr addrspace(1) %"48", align 4 - %2 = bitcast float %"47" to i32 - %"43" = zext i32 %2 to i64 - store i64 %"43", ptr addrspace(5) %"39", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + store i64 81985529216486895, ptr addrspace(5) %"40", align 4 %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 %"49" = inttoptr i64 %"45" to ptr addrspace(1) - %3 = trunc i64 %"46" to i32 - %"50" = bitcast i32 %3 to float - store float %"50", ptr addrspace(1) %"49", align 4 + %"48" = load float, ptr addrspace(1) %"49", align 4 + %2 = bitcast float %"48" to i32 + %"44" = zext i32 %2 to i64 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = inttoptr i64 %"46" to ptr addrspace(1) + %3 = trunc i64 %"47" to i32 + %"51" = bitcast i32 %3 to float + store float %"51", ptr addrspace(1) %"50", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ld_st_offset.ll b/ptx/src/test/ll/ld_st_offset.ll index 81e0c62..70d5559 100644 --- a/ptx/src/test/ll/ld_st_offset.ll +++ b/ptx/src/test/ll/ld_st_offset.ll @@ -10,37 +10,41 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 + br label %"60" + +"60": ; preds = %1 %"46" = load i64, ptr addrspace(4) %"40", align 4 store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"55" = inttoptr i64 %"48" to ptr - %"47" = load i32, ptr %"55", align 4 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 %"56" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"56", i64 4 - %"50" = load i32, ptr %"30", align 4 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"42", align 4 - %"52" = load i32, ptr addrspace(5) %"44", align 4 - %"57" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"57", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"58" = inttoptr i64 %"53" to ptr - %"32" = getelementptr inbounds i8, ptr %"58", i64 4 - %"54" = load i32, ptr addrspace(5) %"43", align 4 - store i32 %"54", ptr %"32", align 4 + %"48" = load i32, ptr %"56", align 4 + store i32 %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"50" to ptr + %"31" = getelementptr inbounds i8, ptr %"57", i64 4 + %"51" = load i32, ptr %"31", align 4 + store i32 %"51", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"53" = load i32, ptr addrspace(5) %"45", align 4 + %"58" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"58", align 4 + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"59" = inttoptr i64 %"54" to ptr + %"33" = getelementptr inbounds i8, ptr %"59", i64 4 + %"55" = load i32, ptr addrspace(5) %"44", align 4 + store i32 %"55", ptr %"33", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/lg2.ll b/ptx/src/test/ll/lg2.ll index 543ae0a..ac971d6 100644 --- a/ptx/src/test/ll/lg2.ll +++ b/ptx/src/test/ll/lg2.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.log.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = call float @llvm.amdgcn.log.f32(float %"45") + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.log.f32(float) #1 +declare float @llvm.amdgcn.log.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/local_align.ll b/ptx/src/test/ll/local_align.ll index 08c7971..b2509b9 100644 --- a/ptx/src/test/ll/local_align.ll +++ b/ptx/src/test/ll/local_align.ll @@ -10,27 +10,31 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"9" = alloca [8 x i8], align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { + %"10" = alloca [8 x i8], align 8, addrspace(5) %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 + br label %"49" + +"49": ; preds = %1 %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = inttoptr i64 %"43" to ptr - %"42" = load i64, ptr %"46", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"42", ptr addrspace(5) %"39", align 4 %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"39", align 4 %"47" = inttoptr i64 %"44" to ptr - store i64 %"45", ptr %"47", align 4 + %"43" = load i64, ptr %"47", align 4 + store i64 %"43", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"40", align 4 + %"48" = inttoptr i64 %"45" to ptr + store i64 %"46", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mad_s32.ll b/ptx/src/test/ll/mad_s32.ll index f6ea9a8..10df304 100644 --- a/ptx/src/test/ll/mad_s32.ll +++ b/ptx/src/test/ll/mad_s32.ll @@ -10,55 +10,59 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { - %"47" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #1 { %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca i32, align 4, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) %"50" = alloca i32, align 4, addrspace(5) %"51" = alloca i32, align 4, addrspace(5) %"52" = alloca i32, align 4, addrspace(5) + %"53" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 + br label %"78" + +"78": ; preds = %1 %"54" = load i64, ptr addrspace(4) %"46", align 4 store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"71" = inttoptr i64 %"56" to ptr - %"55" = load i32, ptr %"71", align 4 - store i32 %"55", ptr addrspace(5) %"50", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(4) %"47", align 4 + store i64 %"55", ptr addrspace(5) %"49", align 4 + %"57" = load i64, ptr addrspace(5) %"48", align 4 %"72" = inttoptr i64 %"57" to ptr - %"32" = getelementptr inbounds i8, ptr %"72", i64 4 - %"58" = load i32, ptr %"32", align 4 - store i32 %"58", ptr addrspace(5) %"51", align 4 - %"59" = load i64, ptr addrspace(5) %"47", align 4 - %"73" = inttoptr i64 %"59" to ptr - %"34" = getelementptr inbounds i8, ptr %"73", i64 8 - %"60" = load i32, ptr %"34", align 4 - store i32 %"60", ptr addrspace(5) %"52", align 4 - %"62" = load i32, ptr addrspace(5) %"50", align 4 + %"56" = load i32, ptr %"72", align 4 + store i32 %"56", ptr addrspace(5) %"51", align 4 + %"58" = load i64, ptr addrspace(5) %"48", align 4 + %"73" = inttoptr i64 %"58" to ptr + %"33" = getelementptr inbounds i8, ptr %"73", i64 4 + %"59" = load i32, ptr %"33", align 4 + store i32 %"59", ptr addrspace(5) %"52", align 4 + %"60" = load i64, ptr addrspace(5) %"48", align 4 + %"74" = inttoptr i64 %"60" to ptr + %"35" = getelementptr inbounds i8, ptr %"74", i64 8 + %"61" = load i32, ptr %"35", align 4 + store i32 %"61", ptr addrspace(5) %"53", align 4 %"63" = load i32, ptr addrspace(5) %"51", align 4 %"64" = load i32, ptr addrspace(5) %"52", align 4 - %2 = mul i32 %"62", %"63" - %"61" = add i32 %2, %"64" - store i32 %"61", ptr addrspace(5) %"49", align 4 - %"65" = load i64, ptr addrspace(5) %"48", align 4 - %"66" = load i32, ptr addrspace(5) %"49", align 4 - %"74" = inttoptr i64 %"65" to ptr - store i32 %"66", ptr %"74", align 4 - %"67" = load i64, ptr addrspace(5) %"48", align 4 - %"75" = inttoptr i64 %"67" to ptr - %"36" = getelementptr inbounds i8, ptr %"75", i64 4 - %"68" = load i32, ptr addrspace(5) %"49", align 4 - store i32 %"68", ptr %"36", align 4 - %"69" = load i64, ptr addrspace(5) %"48", align 4 - %"76" = inttoptr i64 %"69" to ptr - %"38" = getelementptr inbounds i8, ptr %"76", i64 8 - %"70" = load i32, ptr addrspace(5) %"49", align 4 - store i32 %"70", ptr %"38", align 4 + %"65" = load i32, ptr addrspace(5) %"53", align 4 + %2 = mul i32 %"63", %"64" + %"62" = add i32 %2, %"65" + store i32 %"62", ptr addrspace(5) %"50", align 4 + %"66" = load i64, ptr addrspace(5) %"49", align 4 + %"67" = load i32, ptr addrspace(5) %"50", align 4 + %"75" = inttoptr i64 %"66" to ptr + store i32 %"67", ptr %"75", align 4 + %"68" = load i64, ptr addrspace(5) %"49", align 4 + %"76" = inttoptr i64 %"68" to ptr + %"37" = getelementptr inbounds i8, ptr %"76", i64 4 + %"69" = load i32, ptr addrspace(5) %"50", align 4 + store i32 %"69", ptr %"37", align 4 + %"70" = load i64, ptr addrspace(5) %"49", align 4 + %"77" = inttoptr i64 %"70" to ptr + %"39" = getelementptr inbounds i8, ptr %"77", i64 8 + %"71" = load i32, ptr addrspace(5) %"50", align 4 + store i32 %"71", ptr %"39", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/malformed_label.ll b/ptx/src/test/ll/malformed_label.ll new file mode 100644 index 0000000..7e9edfb --- /dev/null +++ b/ptx/src/test/ll/malformed_label.ll @@ -0,0 +1,53 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @malformed_label(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"57" + +"57": ; preds = %1 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + br label %"10" + +"58": ; No predecessors! + %"47" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = inttoptr i64 %"47" to ptr + %"46" = load i64, ptr %"54", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + br label %"10" + +"10": ; preds = %"58", %"57" + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"55" = inttoptr i64 %"49" to ptr + %"48" = load i64, ptr %"55", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"50" = add i64 %"51", 1 + store i64 %"50", ptr addrspace(5) %"43", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"56" = inttoptr i64 %"52" to ptr + store i64 %"53", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/max.ll b/ptx/src/test/ll/max.ll index e8f58ba..4a433e9 100644 --- a/ptx/src/test/ll/max.ll +++ b/ptx/src/test/ll/max.ll @@ -10,40 +10,44 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"46" = load i32, ptr %"55", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load i32, ptr %"31", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = call i32 @llvm.smax.i32(i32 %"50", i32 %"51") - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %"50" = call i32 @llvm.smax.i32(i32 %"51", i32 %"52") + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"57", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smax.i32(i32, i32) #1 +declare i32 @llvm.smax.i32(i32, i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/membar.ll b/ptx/src/test/ll/membar.ll index 2e78f12..b641d66 100644 --- a/ptx/src/test/ll/membar.ll +++ b/ptx/src/test/ll/membar.ll @@ -10,27 +10,31 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"49" + +"49": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"46" = inttoptr i64 %"42" to ptr - %"45" = load i32, ptr %"46", align 4 - store i32 %"45", ptr addrspace(5) %"38", align 4 - fence seq_cst + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 %"47" = inttoptr i64 %"43" to ptr - store i32 %"44", ptr %"47", align 4 + %"46" = load i32, ptr %"47", align 4 + store i32 %"46", ptr addrspace(5) %"39", align 4 + fence seq_cst + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = inttoptr i64 %"44" to ptr + store i32 %"45", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/min.ll b/ptx/src/test/ll/min.ll index e868195..c490c8d 100644 --- a/ptx/src/test/ll/min.ll +++ b/ptx/src/test/ll/min.ll @@ -10,40 +10,44 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"46" = load i32, ptr %"55", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load i32, ptr %"31", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = call i32 @llvm.smin.i32(i32 %"50", i32 %"51") - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %"50" = call i32 @llvm.smin.i32(i32 %"51", i32 %"52") + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"57", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smin.i32(i32, i32) #1 +declare i32 @llvm.smin.i32(i32, i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/mov.ll b/ptx/src/test/ll/mov.ll index cf6c7ee..7f7f275 100644 --- a/ptx/src/test/ll/mov.ll +++ b/ptx/src/test/ll/mov.ll @@ -10,29 +10,33 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 + br label %"52" + +"52": ; preds = %1 %"42" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load i64, ptr %"49", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"50" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"50", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mov_address.ll b/ptx/src/test/ll/mov_address.ll index 644df01..af20fa0 100644 --- a/ptx/src/test/ll/mov_address.ll +++ b/ptx/src/test/ll/mov_address.ll @@ -10,15 +10,19 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { - %"9" = alloca [8 x i8], align 1, addrspace(5) - %"35" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #1 { + %"10" = alloca [8 x i8], align 1, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"37" = ptrtoint ptr addrspace(5) %"9" to i64 - store i64 %"37", ptr addrspace(5) %"35", align 4 + br label %"39" + +"39": ; preds = %1 + %"38" = ptrtoint ptr addrspace(5) %"10" to i64 + store i64 %"38", ptr addrspace(5) %"36", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul24.ll b/ptx/src/test/ll/mul24.ll index aae8aa0..647fbf8 100644 --- a/ptx/src/test/ll/mul24.ll +++ b/ptx/src/test/ll/mul24.ll @@ -10,34 +10,38 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i32, ptr %"50", align 4 - store i32 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i32, ptr addrspace(5) %"40", align 4 - %"46" = call i32 @llvm.amdgcn.mul.u24(i32 %"47", i32 2) - store i32 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i32 %"49", ptr %"51", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"51", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i32, ptr addrspace(5) %"41", align 4 + %"47" = call i32 @llvm.amdgcn.mul.u24(i32 %"48", i32 2) + store i32 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i32, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i32 %"50", ptr %"52", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1 +declare i32 @llvm.amdgcn.mul.u24(i32, i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_ftz.ll b/ptx/src/test/ll/mul_ftz.ll index ceacd5d..96990e1 100644 --- a/ptx/src/test/ll/mul_ftz.ll +++ b/ptx/src/test/ll/mul_ftz.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca float, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca float, align 4, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load float, ptr %"54", align 4 - store float %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load float, ptr %"30", align 4 - store float %"48", ptr addrspace(5) %"42", align 4 - %"50" = load float, ptr addrspace(5) %"41", align 4 + %"46" = load float, ptr %"55", align 4 + store float %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load float, ptr %"31", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 %"51" = load float, ptr addrspace(5) %"42", align 4 - %"49" = fmul float %"50", %"51" - store float %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load float, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store float %"53", ptr %"56", align 4 + %"52" = load float, ptr addrspace(5) %"43", align 4 + %"50" = fmul float %"51", %"52" + store float %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load float, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store float %"54", ptr %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_hi.ll b/ptx/src/test/ll/mul_hi.ll index 57ee469..ff1a404 100644 --- a/ptx/src/test/ll/mul_hi.ll +++ b/ptx/src/test/ll/mul_hi.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %2 = zext i64 %"47" to i128 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %2 = zext i64 %"48" to i128 %3 = mul i128 %2, 2 %4 = lshr i128 %3, 64 - %"46" = trunc i128 %4 to i64 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + %"47" = trunc i128 %4 to i64 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_lo.ll b/ptx/src/test/ll/mul_lo.ll index 15f39e8..93870c3 100644 --- a/ptx/src/test/ll/mul_lo.ll +++ b/ptx/src/test/ll/mul_lo.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = mul i64 %"47", 2 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = mul i64 %"48", 2 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_non_ftz.ll b/ptx/src/test/ll/mul_non_ftz.ll index ee1da37..eafbd42 100644 --- a/ptx/src/test/ll/mul_non_ftz.ll +++ b/ptx/src/test/ll/mul_non_ftz.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca float, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca float, align 4, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load float, ptr %"54", align 4 - store float %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load float, ptr %"30", align 4 - store float %"48", ptr addrspace(5) %"42", align 4 - %"50" = load float, ptr addrspace(5) %"41", align 4 + %"46" = load float, ptr %"55", align 4 + store float %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load float, ptr %"31", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 %"51" = load float, ptr addrspace(5) %"42", align 4 - %"49" = fmul float %"50", %"51" - store float %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load float, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store float %"53", ptr %"56", align 4 + %"52" = load float, ptr addrspace(5) %"43", align 4 + %"50" = fmul float %"51", %"52" + store float %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load float, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store float %"54", ptr %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_wide.ll b/ptx/src/test/ll/mul_wide.ll index 7b815d1..a5dcc1b 100644 --- a/ptx/src/test/ll/mul_wide.ll +++ b/ptx/src/test/ll/mul_wide.ll @@ -10,39 +10,43 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"45", ptr addrspace(5) %"40", align 4 + br label %"61" + +"61": ; preds = %1 %"46" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"56" = inttoptr i64 %"48" to ptr addrspace(1) - %"47" = load i32, ptr addrspace(1) %"56", align 4 - store i32 %"47", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 %"57" = inttoptr i64 %"49" to ptr addrspace(1) - %"31" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 4 - %"50" = load i32, ptr addrspace(1) %"31", align 4 - store i32 %"50", ptr addrspace(5) %"43", align 4 - %"52" = load i32, ptr addrspace(5) %"42", align 4 + %"48" = load i32, ptr addrspace(1) %"57", align 4 + store i32 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"58" = inttoptr i64 %"50" to ptr addrspace(1) + %"32" = getelementptr inbounds i8, ptr addrspace(1) %"58", i64 4 + %"51" = load i32, ptr addrspace(1) %"32", align 4 + store i32 %"51", ptr addrspace(5) %"44", align 4 %"53" = load i32, ptr addrspace(5) %"43", align 4 - %2 = sext i32 %"52" to i64 - %3 = sext i32 %"53" to i64 - %"51" = mul i64 %2, %3 - store i64 %"51", ptr addrspace(5) %"44", align 4 - %"54" = load i64, ptr addrspace(5) %"41", align 4 - %"55" = load i64, ptr addrspace(5) %"44", align 4 - %"58" = inttoptr i64 %"54" to ptr - store i64 %"55", ptr %"58", align 4 + %"54" = load i32, ptr addrspace(5) %"44", align 4 + %2 = sext i32 %"53" to i64 + %3 = sext i32 %"54" to i64 + %"52" = mul i64 %2, %3 + store i64 %"52", ptr addrspace(5) %"45", align 4 + %"55" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"59" = inttoptr i64 %"55" to ptr + store i64 %"56", ptr %"59", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/neg.ll b/ptx/src/test/ll/neg.ll index ebcedc0..59f0f7f 100644 --- a/ptx/src/test/ll/neg.ll +++ b/ptx/src/test/ll/neg.ll @@ -10,29 +10,33 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"43" = sub i32 0, %"44" - store i32 %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i32, ptr %"48", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"44" = sub i32 0, %"45" + store i32 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store i32 %"47", ptr %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/non_scalar_ptr_offset.ll b/ptx/src/test/ll/non_scalar_ptr_offset.ll index 9fabfa6..3132475 100644 --- a/ptx/src/test/ll/non_scalar_ptr_offset.ll +++ b/ptx/src/test/ll/non_scalar_ptr_offset.ll @@ -10,35 +10,39 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 + br label %"57" + +"57": ; preds = %1 %"45" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(5) %"40", align 4 - %"54" = inttoptr i64 %"46" to ptr addrspace(1) - %"31" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 - %"29" = load <2 x i32>, ptr addrspace(1) %"31", align 8 - %"47" = extractelement <2 x i32> %"29", i8 0 - %"48" = extractelement <2 x i32> %"29", i8 1 - store i32 %"47", ptr addrspace(5) %"42", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"47" = load i64, ptr addrspace(5) %"41", align 4 + %"55" = inttoptr i64 %"47" to ptr addrspace(1) + %"32" = getelementptr inbounds i8, ptr addrspace(1) %"55", i64 8 + %"30" = load <2 x i32>, ptr addrspace(1) %"32", align 8 + %"48" = extractelement <2 x i32> %"30", i8 0 + %"49" = extractelement <2 x i32> %"30", i8 1 store i32 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i32, ptr addrspace(5) %"42", align 4 + store i32 %"49", ptr addrspace(5) %"44", align 4 %"51" = load i32, ptr addrspace(5) %"43", align 4 - %"49" = add i32 %"50", %"51" - store i32 %"49", ptr addrspace(5) %"42", align 4 - %"52" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"55" = inttoptr i64 %"52" to ptr addrspace(1) - store i32 %"53", ptr addrspace(1) %"55", align 4 + %"52" = load i32, ptr addrspace(5) %"44", align 4 + %"50" = add i32 %"51", %"52" + store i32 %"50", ptr addrspace(5) %"43", align 4 + %"53" = load i64, ptr addrspace(5) %"42", align 4 + %"54" = load i32, ptr addrspace(5) %"43", align 4 + %"56" = inttoptr i64 %"53" to ptr addrspace(1) + store i32 %"54", ptr addrspace(1) %"56", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/not.ll b/ptx/src/test/ll/not.ll index 8b078d7..fba0189 100644 --- a/ptx/src/test/ll/not.ll +++ b/ptx/src/test/ll/not.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 + br label %"54" + +"54": ; preds = %1 %"42" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load i64, ptr %"49", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"50" = xor i64 %"46", -1 - store i64 %"50", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"52", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"51" = xor i64 %"47", -1 + store i64 %"51", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"53", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ntid.ll b/ptx/src/test/ll/ntid.ll index 2144bc4..be29202 100644 --- a/ptx/src/test/ll/ntid.ll +++ b/ptx/src/test/ll/ntid.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"56" + +"56": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"53" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"53", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"30" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0) - store i32 %"30", ptr addrspace(5) %"42", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"54" = inttoptr i64 %"47" to ptr + %"46" = load i32, ptr %"54", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0) + store i32 %"31", ptr addrspace(5) %"43", align 4 %"50" = load i32, ptr addrspace(5) %"42", align 4 - %"48" = add i32 %"49", %"50" - store i32 %"48", ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = load i32, ptr addrspace(5) %"41", align 4 - %"54" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"54", align 4 + %"51" = load i32, ptr addrspace(5) %"43", align 4 + %"49" = add i32 %"50", %"51" + store i32 %"49", ptr addrspace(5) %"42", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = load i32, ptr addrspace(5) %"42", align 4 + %"55" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/or.ll b/ptx/src/test/ll/or.ll index c7190b7..7229ce5 100644 --- a/ptx/src/test/ll/or.ll +++ b/ptx/src/test/ll/or.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"61" + +"61": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i64, ptr %"54", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 8 - %"48" = load i64, ptr %"30", align 4 - store i64 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr %"55", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 8 + %"49" = load i64, ptr %"31", align 4 + store i64 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = or i64 %"50", %"51" - store i64 %"56", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = or i64 %"51", %"52" + store i64 %"57", ptr addrspace(5) %"42", align 4 %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"52" to ptr - store i64 %"53", ptr %"59", align 4 + %"54" = load i64, ptr addrspace(5) %"42", align 4 + %"60" = inttoptr i64 %"53" to ptr + store i64 %"54", ptr %"60", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/popc.ll b/ptx/src/test/ll/popc.ll index e71acba..a5fc275 100644 --- a/ptx/src/test/ll/popc.ll +++ b/ptx/src/test/ll/popc.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"51" + +"51": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = call i32 @llvm.ctpop.i32(i32 %"44") - store i32 %"48", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"49", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i32, ptr %"48", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"49" = call i32 @llvm.ctpop.i32(i32 %"45") + store i32 %"49", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"50" = inttoptr i64 %"46" to ptr + store i32 %"47", ptr %"50", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.ctpop.i32(i32) #1 +declare i32 @llvm.ctpop.i32(i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/pred_not.ll b/ptx/src/test/ll/pred_not.ll index 7046c09..d3e4070 100644 --- a/ptx/src/test/ll/pred_not.ll +++ b/ptx/src/test/ll/pred_not.ll @@ -10,57 +10,61 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { - %"47" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #1 { %"48" = alloca i64, align 8, addrspace(5) %"49" = alloca i64, align 8, addrspace(5) %"50" = alloca i64, align 8, addrspace(5) %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i1, align 1, addrspace(5) + %"52" = alloca i64, align 8, addrspace(5) + %"53" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 + br label %"74" + +"74": ; preds = %1 %"54" = load i64, ptr addrspace(4) %"46", align 4 store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"70" = inttoptr i64 %"56" to ptr - %"55" = load i64, ptr %"70", align 4 + %"55" = load i64, ptr addrspace(4) %"47", align 4 store i64 %"55", ptr addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 + %"57" = load i64, ptr addrspace(5) %"48", align 4 %"71" = inttoptr i64 %"57" to ptr - %"36" = getelementptr inbounds i8, ptr %"71", i64 8 - %"58" = load i64, ptr %"36", align 4 - store i64 %"58", ptr addrspace(5) %"50", align 4 - %"60" = load i64, ptr addrspace(5) %"49", align 4 + %"56" = load i64, ptr %"71", align 4 + store i64 %"56", ptr addrspace(5) %"50", align 4 + %"58" = load i64, ptr addrspace(5) %"48", align 4 + %"72" = inttoptr i64 %"58" to ptr + %"37" = getelementptr inbounds i8, ptr %"72", i64 8 + %"59" = load i64, ptr %"37", align 4 + store i64 %"59", ptr addrspace(5) %"51", align 4 %"61" = load i64, ptr addrspace(5) %"50", align 4 - %"59" = icmp ult i64 %"60", %"61" - store i1 %"59", ptr addrspace(5) %"52", align 1 - %"63" = load i1, ptr addrspace(5) %"52", align 1 - %"62" = xor i1 %"63", true - store i1 %"62", ptr addrspace(5) %"52", align 1 - %"64" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"64", label %"15", label %"16" + %"62" = load i64, ptr addrspace(5) %"51", align 4 + %"60" = icmp ult i64 %"61", %"62" + store i1 %"60", ptr addrspace(5) %"53", align 1 + %"64" = load i1, ptr addrspace(5) %"53", align 1 + %"63" = xor i1 %"64", true + store i1 %"63", ptr addrspace(5) %"53", align 1 + %"65" = load i1, ptr addrspace(5) %"53", align 1 + br i1 %"65", label %"16", label %"17" -"15": ; preds = %1 - store i64 1, ptr addrspace(5) %"51", align 4 - br label %"16" +"16": ; preds = %"74" + store i64 1, ptr addrspace(5) %"52", align 4 + br label %"17" -"16": ; preds = %"15", %1 - %"66" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"66", label %"18", label %"17" +"17": ; preds = %"16", %"74" + %"67" = load i1, ptr addrspace(5) %"53", align 1 + br i1 %"67", label %"19", label %"18" -"17": ; preds = %"16" - store i64 2, ptr addrspace(5) %"51", align 4 - br label %"18" +"18": ; preds = %"17" + store i64 2, ptr addrspace(5) %"52", align 4 + br label %"19" -"18": ; preds = %"17", %"16" - %"68" = load i64, ptr addrspace(5) %"48", align 4 - %"69" = load i64, ptr addrspace(5) %"51", align 4 - %"72" = inttoptr i64 %"68" to ptr - store i64 %"69", ptr %"72", align 4 +"19": ; preds = %"18", %"17" + %"69" = load i64, ptr addrspace(5) %"49", align 4 + %"70" = load i64, ptr addrspace(5) %"52", align 4 + %"73" = inttoptr i64 %"69" to ptr + store i64 %"70", ptr %"73", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll index dd5b95c..d1cc00a 100644 --- a/ptx/src/test/ll/prmt.ll +++ b/ptx/src/test/ll/prmt.ll @@ -10,38 +10,42 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"61" + +"61": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"46" = load i32, ptr %"55", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load i32, ptr %"31", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i32, ptr addrspace(5) %"42", align 4 - %2 = bitcast i32 %"50" to <4 x i8> - %3 = bitcast i32 %"51" to <4 x i8> - %"56" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> - store <4 x i8> %"56", ptr addrspace(5) %"42", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"59" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"59", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %2 = bitcast i32 %"51" to <4 x i8> + %3 = bitcast i32 %"52" to <4 x i8> + %"57" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> + store <4 x i8> %"57", ptr addrspace(5) %"43", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"43", align 4 + %"60" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"60", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rcp.ll b/ptx/src/test/ll/rcp.ll index c00012a..3875cd2 100644 --- a/ptx/src/test/ll/rcp.ll +++ b/ptx/src/test/ll/rcp.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.rcp.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = call float @llvm.amdgcn.rcp.f32(float %"45") + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.rcp.f32(float) #1 +declare float @llvm.amdgcn.rcp.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/reg_local.ll b/ptx/src/test/ll/reg_local.ll index 51fe3e9..12a1694 100644 --- a/ptx/src/test/ll/reg_local.ll +++ b/ptx/src/test/ll/reg_local.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { - %"9" = alloca [8 x i8], align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #1 { + %"10" = alloca [8 x i8], align 8, addrspace(5) %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"46", ptr addrspace(5) %"43", align 4 + br label %"63" + +"63": ; preds = %1 %"47" = load i64, ptr addrspace(4) %"42", align 4 store i64 %"47", ptr addrspace(5) %"44", align 4 - %"49" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = inttoptr i64 %"49" to ptr addrspace(1) - %"54" = load i64, ptr addrspace(1) %"55", align 4 - store i64 %"54", ptr addrspace(5) %"45", align 4 - %"50" = load i64, ptr addrspace(5) %"45", align 4 - %"30" = add i64 %"50", 1 - %"56" = addrspacecast ptr addrspace(5) %"9" to ptr - store i64 %"30", ptr %"56", align 4 - %"58" = addrspacecast ptr addrspace(5) %"9" to ptr - %"32" = getelementptr inbounds i8, ptr %"58", i64 0 - %"59" = load i64, ptr %"32", align 4 - store i64 %"59", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"60" = inttoptr i64 %"52" to ptr addrspace(1) - %"34" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 0 + %"48" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"48", ptr addrspace(5) %"45", align 4 + %"50" = load i64, ptr addrspace(5) %"44", align 4 + %"56" = inttoptr i64 %"50" to ptr addrspace(1) + %"55" = load i64, ptr addrspace(1) %"56", align 4 + store i64 %"55", ptr addrspace(5) %"46", align 4 + %"51" = load i64, ptr addrspace(5) %"46", align 4 + %"31" = add i64 %"51", 1 + %"57" = addrspacecast ptr addrspace(5) %"10" to ptr + store i64 %"31", ptr %"57", align 4 + %"59" = addrspacecast ptr addrspace(5) %"10" to ptr + %"33" = getelementptr inbounds i8, ptr %"59", i64 0 + %"60" = load i64, ptr %"33", align 4 + store i64 %"60", ptr addrspace(5) %"46", align 4 %"53" = load i64, ptr addrspace(5) %"45", align 4 - store i64 %"53", ptr addrspace(1) %"34", align 4 + %"61" = inttoptr i64 %"53" to ptr addrspace(1) + %"35" = getelementptr inbounds i8, ptr addrspace(1) %"61", i64 0 + %"54" = load i64, ptr addrspace(5) %"46", align 4 + store i64 %"54", ptr addrspace(1) %"35", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rem.ll b/ptx/src/test/ll/rem.ll index 964021e..563f586 100644 --- a/ptx/src/test/ll/rem.ll +++ b/ptx/src/test/ll/rem.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"46" = load i32, ptr %"55", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load i32, ptr %"31", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = srem i32 %"50", %"51" - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %"50" = srem i32 %"51", %"52" + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rsqrt.ll b/ptx/src/test/ll/rsqrt.ll index 532a8c8..fd2592c 100644 --- a/ptx/src/test/ll/rsqrt.ll +++ b/ptx/src/test/ll/rsqrt.ll @@ -10,33 +10,42 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca double, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca double, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load double, ptr %"47", align 8 - store double %"41", ptr addrspace(5) %"38", align 8 - %"44" = load double, ptr addrspace(5) %"38", align 8 - %"43" = call double @llvm.amdgcn.rsq.f64(double %"44") - store double %"43", ptr addrspace(5) %"38", align 8 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load double, ptr addrspace(5) %"38", align 8 - %"48" = inttoptr i64 %"45" to ptr - store double %"46", ptr %"48", align 8 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load double, ptr %"48", align 8 + store double %"42", ptr addrspace(5) %"39", align 8 + %"45" = load double, ptr addrspace(5) %"39", align 8 + call void @llvm.amdgcn.s.setreg(i32 2433, i32 3) + %"44" = call double @llvm.amdgcn.rsq.f64(double %"45") + store double %"44", ptr addrspace(5) %"39", align 8 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load double, ptr addrspace(5) %"39", align 8 + %"49" = inttoptr i64 %"46" to ptr + store double %"47", ptr %"49", align 8 ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare double @llvm.amdgcn.rsq.f64(double) #1 +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare double @llvm.amdgcn.rsq.f64(double) #3 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind willreturn } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/selp.ll b/ptx/src/test/ll/selp.ll index 580754d..08c29dc 100644 --- a/ptx/src/test/ll/selp.ll +++ b/ptx/src/test/ll/selp.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i16, align 2, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i16, align 2, addrspace(5) + %"44" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 + br label %"59" + +"59": ; preds = %1 %"45" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"46" = load i16, ptr %"55", align 2 - store i16 %"46", ptr addrspace(5) %"42", align 2 - %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 %"56" = inttoptr i64 %"48" to ptr - %"30" = getelementptr inbounds i8, ptr %"56", i64 2 - %"49" = load i16, ptr %"30", align 2 - store i16 %"49", ptr addrspace(5) %"43", align 2 - %"51" = load i16, ptr addrspace(5) %"42", align 2 + %"47" = load i16, ptr %"56", align 2 + store i16 %"47", ptr addrspace(5) %"43", align 2 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"57" = inttoptr i64 %"49" to ptr + %"31" = getelementptr inbounds i8, ptr %"57", i64 2 + %"50" = load i16, ptr %"31", align 2 + store i16 %"50", ptr addrspace(5) %"44", align 2 %"52" = load i16, ptr addrspace(5) %"43", align 2 - %"50" = select i1 false, i16 %"51", i16 %"52" - store i16 %"50", ptr addrspace(5) %"42", align 2 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"54" = load i16, ptr addrspace(5) %"42", align 2 - %"57" = inttoptr i64 %"53" to ptr - store i16 %"54", ptr %"57", align 2 + %"53" = load i16, ptr addrspace(5) %"44", align 2 + %"51" = select i1 false, i16 %"52", i16 %"53" + store i16 %"51", ptr addrspace(5) %"43", align 2 + %"54" = load i64, ptr addrspace(5) %"42", align 4 + %"55" = load i16, ptr addrspace(5) %"43", align 2 + %"58" = inttoptr i64 %"54" to ptr + store i16 %"55", ptr %"58", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/selp_true.ll b/ptx/src/test/ll/selp_true.ll index 142c361..1361105 100644 --- a/ptx/src/test/ll/selp_true.ll +++ b/ptx/src/test/ll/selp_true.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i16, align 2, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i16, align 2, addrspace(5) + %"44" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 + br label %"59" + +"59": ; preds = %1 %"45" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"46" = load i16, ptr %"55", align 2 - store i16 %"46", ptr addrspace(5) %"42", align 2 - %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 %"56" = inttoptr i64 %"48" to ptr - %"30" = getelementptr inbounds i8, ptr %"56", i64 2 - %"49" = load i16, ptr %"30", align 2 - store i16 %"49", ptr addrspace(5) %"43", align 2 - %"51" = load i16, ptr addrspace(5) %"42", align 2 + %"47" = load i16, ptr %"56", align 2 + store i16 %"47", ptr addrspace(5) %"43", align 2 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"57" = inttoptr i64 %"49" to ptr + %"31" = getelementptr inbounds i8, ptr %"57", i64 2 + %"50" = load i16, ptr %"31", align 2 + store i16 %"50", ptr addrspace(5) %"44", align 2 %"52" = load i16, ptr addrspace(5) %"43", align 2 - %"50" = select i1 true, i16 %"51", i16 %"52" - store i16 %"50", ptr addrspace(5) %"42", align 2 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"54" = load i16, ptr addrspace(5) %"42", align 2 - %"57" = inttoptr i64 %"53" to ptr - store i16 %"54", ptr %"57", align 2 + %"53" = load i16, ptr addrspace(5) %"44", align 2 + %"51" = select i1 true, i16 %"52", i16 %"53" + store i16 %"51", ptr addrspace(5) %"43", align 2 + %"54" = load i64, ptr addrspace(5) %"42", align 4 + %"55" = load i16, ptr addrspace(5) %"43", align 2 + %"58" = inttoptr i64 %"54" to ptr + store i16 %"55", ptr %"58", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp.ll b/ptx/src/test/ll/setp.ll index 6625957..dd1f6c1 100644 --- a/ptx/src/test/ll/setp.ll +++ b/ptx/src/test/ll/setp.ll @@ -10,54 +10,58 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { - %"47" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #1 { %"48" = alloca i64, align 8, addrspace(5) %"49" = alloca i64, align 8, addrspace(5) %"50" = alloca i64, align 8, addrspace(5) %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i1, align 1, addrspace(5) + %"52" = alloca i64, align 8, addrspace(5) + %"53" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 + br label %"72" + +"72": ; preds = %1 %"54" = load i64, ptr addrspace(4) %"46", align 4 store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"68" = inttoptr i64 %"56" to ptr - %"55" = load i64, ptr %"68", align 4 + %"55" = load i64, ptr addrspace(4) %"47", align 4 store i64 %"55", ptr addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 + %"57" = load i64, ptr addrspace(5) %"48", align 4 %"69" = inttoptr i64 %"57" to ptr - %"36" = getelementptr inbounds i8, ptr %"69", i64 8 - %"58" = load i64, ptr %"36", align 4 - store i64 %"58", ptr addrspace(5) %"50", align 4 - %"60" = load i64, ptr addrspace(5) %"49", align 4 + %"56" = load i64, ptr %"69", align 4 + store i64 %"56", ptr addrspace(5) %"50", align 4 + %"58" = load i64, ptr addrspace(5) %"48", align 4 + %"70" = inttoptr i64 %"58" to ptr + %"37" = getelementptr inbounds i8, ptr %"70", i64 8 + %"59" = load i64, ptr %"37", align 4 + store i64 %"59", ptr addrspace(5) %"51", align 4 %"61" = load i64, ptr addrspace(5) %"50", align 4 - %"59" = icmp ult i64 %"60", %"61" - store i1 %"59", ptr addrspace(5) %"52", align 1 - %"62" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"62", label %"15", label %"16" + %"62" = load i64, ptr addrspace(5) %"51", align 4 + %"60" = icmp ult i64 %"61", %"62" + store i1 %"60", ptr addrspace(5) %"53", align 1 + %"63" = load i1, ptr addrspace(5) %"53", align 1 + br i1 %"63", label %"16", label %"17" -"15": ; preds = %1 - store i64 1, ptr addrspace(5) %"51", align 4 - br label %"16" +"16": ; preds = %"72" + store i64 1, ptr addrspace(5) %"52", align 4 + br label %"17" -"16": ; preds = %"15", %1 - %"64" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"64", label %"18", label %"17" +"17": ; preds = %"16", %"72" + %"65" = load i1, ptr addrspace(5) %"53", align 1 + br i1 %"65", label %"19", label %"18" -"17": ; preds = %"16" - store i64 2, ptr addrspace(5) %"51", align 4 - br label %"18" +"18": ; preds = %"17" + store i64 2, ptr addrspace(5) %"52", align 4 + br label %"19" -"18": ; preds = %"17", %"16" - %"66" = load i64, ptr addrspace(5) %"48", align 4 - %"67" = load i64, ptr addrspace(5) %"51", align 4 - %"70" = inttoptr i64 %"66" to ptr - store i64 %"67", ptr %"70", align 4 +"19": ; preds = %"18", %"17" + %"67" = load i64, ptr addrspace(5) %"49", align 4 + %"68" = load i64, ptr addrspace(5) %"52", align 4 + %"71" = inttoptr i64 %"67" to ptr + store i64 %"68", ptr %"71", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_gt.ll b/ptx/src/test/ll/setp_gt.ll index 4badce3..3cc7b9e 100644 --- a/ptx/src/test/ll/setp_gt.ll +++ b/ptx/src/test/ll/setp_gt.ll @@ -10,56 +10,60 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca float, align 4, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca float, align 4, addrspace(5) %"49" = alloca float, align 4, addrspace(5) - %"50" = alloca i1, align 1, addrspace(5) + %"50" = alloca float, align 4, addrspace(5) + %"51" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"51" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"51", ptr addrspace(5) %"45", align 4 + br label %"72" + +"72": ; preds = %1 %"52" = load i64, ptr addrspace(4) %"44", align 4 store i64 %"52", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"54" to ptr - %"53" = load float, ptr %"68", align 4 - store float %"53", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"46", align 4 %"69" = inttoptr i64 %"55" to ptr - %"36" = getelementptr inbounds i8, ptr %"69", i64 4 - %"56" = load float, ptr %"36", align 4 - store float %"56", ptr addrspace(5) %"48", align 4 - %"58" = load float, ptr addrspace(5) %"47", align 4 + %"54" = load float, ptr %"69", align 4 + store float %"54", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"46", align 4 + %"70" = inttoptr i64 %"56" to ptr + %"37" = getelementptr inbounds i8, ptr %"70", i64 4 + %"57" = load float, ptr %"37", align 4 + store float %"57", ptr addrspace(5) %"49", align 4 %"59" = load float, ptr addrspace(5) %"48", align 4 - %"57" = fcmp ogt float %"58", %"59" - store i1 %"57", ptr addrspace(5) %"50", align 1 - %"60" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"60", label %"15", label %"16" + %"60" = load float, ptr addrspace(5) %"49", align 4 + %"58" = fcmp ogt float %"59", %"60" + store i1 %"58", ptr addrspace(5) %"51", align 1 + %"61" = load i1, ptr addrspace(5) %"51", align 1 + br i1 %"61", label %"16", label %"17" -"15": ; preds = %1 - %"62" = load float, ptr addrspace(5) %"47", align 4 - store float %"62", ptr addrspace(5) %"49", align 4 - br label %"16" +"16": ; preds = %"72" + %"63" = load float, ptr addrspace(5) %"48", align 4 + store float %"63", ptr addrspace(5) %"50", align 4 + br label %"17" -"16": ; preds = %"15", %1 - %"63" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"63", label %"18", label %"17" +"17": ; preds = %"16", %"72" + %"64" = load i1, ptr addrspace(5) %"51", align 1 + br i1 %"64", label %"19", label %"18" -"17": ; preds = %"16" - %"65" = load float, ptr addrspace(5) %"48", align 4 - store float %"65", ptr addrspace(5) %"49", align 4 - br label %"18" +"18": ; preds = %"17" + %"66" = load float, ptr addrspace(5) %"49", align 4 + store float %"66", ptr addrspace(5) %"50", align 4 + br label %"19" -"18": ; preds = %"17", %"16" - %"66" = load i64, ptr addrspace(5) %"46", align 4 - %"67" = load float, ptr addrspace(5) %"49", align 4 - %"70" = inttoptr i64 %"66" to ptr - store float %"67", ptr %"70", align 4 +"19": ; preds = %"18", %"17" + %"67" = load i64, ptr addrspace(5) %"47", align 4 + %"68" = load float, ptr addrspace(5) %"50", align 4 + %"71" = inttoptr i64 %"67" to ptr + store float %"68", ptr %"71", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_leu.ll b/ptx/src/test/ll/setp_leu.ll index d91e569..ab5ff32 100644 --- a/ptx/src/test/ll/setp_leu.ll +++ b/ptx/src/test/ll/setp_leu.ll @@ -10,56 +10,60 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca float, align 4, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca float, align 4, addrspace(5) %"49" = alloca float, align 4, addrspace(5) - %"50" = alloca i1, align 1, addrspace(5) + %"50" = alloca float, align 4, addrspace(5) + %"51" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"51" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"51", ptr addrspace(5) %"45", align 4 + br label %"72" + +"72": ; preds = %1 %"52" = load i64, ptr addrspace(4) %"44", align 4 store i64 %"52", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"54" to ptr - %"53" = load float, ptr %"68", align 4 - store float %"53", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"46", align 4 %"69" = inttoptr i64 %"55" to ptr - %"36" = getelementptr inbounds i8, ptr %"69", i64 4 - %"56" = load float, ptr %"36", align 4 - store float %"56", ptr addrspace(5) %"48", align 4 - %"58" = load float, ptr addrspace(5) %"47", align 4 + %"54" = load float, ptr %"69", align 4 + store float %"54", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"46", align 4 + %"70" = inttoptr i64 %"56" to ptr + %"37" = getelementptr inbounds i8, ptr %"70", i64 4 + %"57" = load float, ptr %"37", align 4 + store float %"57", ptr addrspace(5) %"49", align 4 %"59" = load float, ptr addrspace(5) %"48", align 4 - %"57" = fcmp ule float %"58", %"59" - store i1 %"57", ptr addrspace(5) %"50", align 1 - %"60" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"60", label %"15", label %"16" + %"60" = load float, ptr addrspace(5) %"49", align 4 + %"58" = fcmp ule float %"59", %"60" + store i1 %"58", ptr addrspace(5) %"51", align 1 + %"61" = load i1, ptr addrspace(5) %"51", align 1 + br i1 %"61", label %"16", label %"17" -"15": ; preds = %1 - %"62" = load float, ptr addrspace(5) %"47", align 4 - store float %"62", ptr addrspace(5) %"49", align 4 - br label %"16" +"16": ; preds = %"72" + %"63" = load float, ptr addrspace(5) %"48", align 4 + store float %"63", ptr addrspace(5) %"50", align 4 + br label %"17" -"16": ; preds = %"15", %1 - %"63" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"63", label %"18", label %"17" +"17": ; preds = %"16", %"72" + %"64" = load i1, ptr addrspace(5) %"51", align 1 + br i1 %"64", label %"19", label %"18" -"17": ; preds = %"16" - %"65" = load float, ptr addrspace(5) %"48", align 4 - store float %"65", ptr addrspace(5) %"49", align 4 - br label %"18" +"18": ; preds = %"17" + %"66" = load float, ptr addrspace(5) %"49", align 4 + store float %"66", ptr addrspace(5) %"50", align 4 + br label %"19" -"18": ; preds = %"17", %"16" - %"66" = load i64, ptr addrspace(5) %"46", align 4 - %"67" = load float, ptr addrspace(5) %"49", align 4 - %"70" = inttoptr i64 %"66" to ptr - store float %"67", ptr %"70", align 4 +"19": ; preds = %"18", %"17" + %"67" = load i64, ptr addrspace(5) %"47", align 4 + %"68" = load float, ptr addrspace(5) %"50", align 4 + %"71" = inttoptr i64 %"67" to ptr + store float %"68", ptr %"71", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_nan.ll b/ptx/src/test/ll/setp_nan.ll index 15c0c2a..bd164aa 100644 --- a/ptx/src/test/ll/setp_nan.ll +++ b/ptx/src/test/ll/setp_nan.ll @@ -10,10 +10,9 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 { - %"89" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"88", ptr addrspace(4) byref(i64) %"89") #1 { %"90" = alloca i64, align 8, addrspace(5) - %"91" = alloca float, align 4, addrspace(5) + %"91" = alloca i64, align 8, addrspace(5) %"92" = alloca float, align 4, addrspace(5) %"93" = alloca float, align 4, addrspace(5) %"94" = alloca float, align 4, addrspace(5) @@ -21,154 +20,159 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"87", ptr addrs %"96" = alloca float, align 4, addrspace(5) %"97" = alloca float, align 4, addrspace(5) %"98" = alloca float, align 4, addrspace(5) - %"99" = alloca i32, align 4, addrspace(5) - %"100" = alloca i1, align 1, addrspace(5) + %"99" = alloca float, align 4, addrspace(5) + %"100" = alloca i32, align 4, addrspace(5) + %"101" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"101" = load i64, ptr addrspace(4) %"87", align 4 - store i64 %"101", ptr addrspace(5) %"89", align 4 + br label %"168" + +"168": ; preds = %1 %"102" = load i64, ptr addrspace(4) %"88", align 4 store i64 %"102", ptr addrspace(5) %"90", align 4 - %"104" = load i64, ptr addrspace(5) %"89", align 4 - %"155" = inttoptr i64 %"104" to ptr - %"103" = load float, ptr %"155", align 4 - store float %"103", ptr addrspace(5) %"91", align 4 - %"105" = load i64, ptr addrspace(5) %"89", align 4 + %"103" = load i64, ptr addrspace(4) %"89", align 4 + store i64 %"103", ptr addrspace(5) %"91", align 4 + %"105" = load i64, ptr addrspace(5) %"90", align 4 %"156" = inttoptr i64 %"105" to ptr - %"54" = getelementptr inbounds i8, ptr %"156", i64 4 - %"106" = load float, ptr %"54", align 4 - store float %"106", ptr addrspace(5) %"92", align 4 - %"107" = load i64, ptr addrspace(5) %"89", align 4 - %"157" = inttoptr i64 %"107" to ptr - %"56" = getelementptr inbounds i8, ptr %"157", i64 8 - %"108" = load float, ptr %"56", align 4 - store float %"108", ptr addrspace(5) %"93", align 4 - %"109" = load i64, ptr addrspace(5) %"89", align 4 - %"158" = inttoptr i64 %"109" to ptr - %"58" = getelementptr inbounds i8, ptr %"158", i64 12 - %"110" = load float, ptr %"58", align 4 - store float %"110", ptr addrspace(5) %"94", align 4 - %"111" = load i64, ptr addrspace(5) %"89", align 4 - %"159" = inttoptr i64 %"111" to ptr - %"60" = getelementptr inbounds i8, ptr %"159", i64 16 - %"112" = load float, ptr %"60", align 4 - store float %"112", ptr addrspace(5) %"95", align 4 - %"113" = load i64, ptr addrspace(5) %"89", align 4 - %"160" = inttoptr i64 %"113" to ptr - %"62" = getelementptr inbounds i8, ptr %"160", i64 20 - %"114" = load float, ptr %"62", align 4 - store float %"114", ptr addrspace(5) %"96", align 4 - %"115" = load i64, ptr addrspace(5) %"89", align 4 - %"161" = inttoptr i64 %"115" to ptr - %"64" = getelementptr inbounds i8, ptr %"161", i64 24 - %"116" = load float, ptr %"64", align 4 - store float %"116", ptr addrspace(5) %"97", align 4 - %"117" = load i64, ptr addrspace(5) %"89", align 4 - %"162" = inttoptr i64 %"117" to ptr - %"66" = getelementptr inbounds i8, ptr %"162", i64 28 - %"118" = load float, ptr %"66", align 4 - store float %"118", ptr addrspace(5) %"98", align 4 - %"120" = load float, ptr addrspace(5) %"91", align 4 + %"104" = load float, ptr %"156", align 4 + store float %"104", ptr addrspace(5) %"92", align 4 + %"106" = load i64, ptr addrspace(5) %"90", align 4 + %"157" = inttoptr i64 %"106" to ptr + %"55" = getelementptr inbounds i8, ptr %"157", i64 4 + %"107" = load float, ptr %"55", align 4 + store float %"107", ptr addrspace(5) %"93", align 4 + %"108" = load i64, ptr addrspace(5) %"90", align 4 + %"158" = inttoptr i64 %"108" to ptr + %"57" = getelementptr inbounds i8, ptr %"158", i64 8 + %"109" = load float, ptr %"57", align 4 + store float %"109", ptr addrspace(5) %"94", align 4 + %"110" = load i64, ptr addrspace(5) %"90", align 4 + %"159" = inttoptr i64 %"110" to ptr + %"59" = getelementptr inbounds i8, ptr %"159", i64 12 + %"111" = load float, ptr %"59", align 4 + store float %"111", ptr addrspace(5) %"95", align 4 + %"112" = load i64, ptr addrspace(5) %"90", align 4 + %"160" = inttoptr i64 %"112" to ptr + %"61" = getelementptr inbounds i8, ptr %"160", i64 16 + %"113" = load float, ptr %"61", align 4 + store float %"113", ptr addrspace(5) %"96", align 4 + %"114" = load i64, ptr addrspace(5) %"90", align 4 + %"161" = inttoptr i64 %"114" to ptr + %"63" = getelementptr inbounds i8, ptr %"161", i64 20 + %"115" = load float, ptr %"63", align 4 + store float %"115", ptr addrspace(5) %"97", align 4 + %"116" = load i64, ptr addrspace(5) %"90", align 4 + %"162" = inttoptr i64 %"116" to ptr + %"65" = getelementptr inbounds i8, ptr %"162", i64 24 + %"117" = load float, ptr %"65", align 4 + store float %"117", ptr addrspace(5) %"98", align 4 + %"118" = load i64, ptr addrspace(5) %"90", align 4 + %"163" = inttoptr i64 %"118" to ptr + %"67" = getelementptr inbounds i8, ptr %"163", i64 28 + %"119" = load float, ptr %"67", align 4 + store float %"119", ptr addrspace(5) %"99", align 4 %"121" = load float, ptr addrspace(5) %"92", align 4 - %"119" = fcmp uno float %"120", %"121" - store i1 %"119", ptr addrspace(5) %"100", align 1 - %"122" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"122", label %"21", label %"22" + %"122" = load float, ptr addrspace(5) %"93", align 4 + %"120" = fcmp uno float %"121", %"122" + store i1 %"120", ptr addrspace(5) %"101", align 1 + %"123" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"123", label %"22", label %"23" -"21": ; preds = %1 - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"22" +"22": ; preds = %"168" + store i32 1, ptr addrspace(5) %"100", align 4 + br label %"23" -"22": ; preds = %"21", %1 - %"124" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"124", label %"24", label %"23" +"23": ; preds = %"22", %"168" + %"125" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"125", label %"25", label %"24" -"23": ; preds = %"22" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"24" +"24": ; preds = %"23" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"25" -"24": ; preds = %"23", %"22" - %"126" = load i64, ptr addrspace(5) %"90", align 4 - %"127" = load i32, ptr addrspace(5) %"99", align 4 - %"163" = inttoptr i64 %"126" to ptr - store i32 %"127", ptr %"163", align 4 - %"129" = load float, ptr addrspace(5) %"93", align 4 +"25": ; preds = %"24", %"23" + %"127" = load i64, ptr addrspace(5) %"91", align 4 + %"128" = load i32, ptr addrspace(5) %"100", align 4 + %"164" = inttoptr i64 %"127" to ptr + store i32 %"128", ptr %"164", align 4 %"130" = load float, ptr addrspace(5) %"94", align 4 - %"128" = fcmp uno float %"129", %"130" - store i1 %"128", ptr addrspace(5) %"100", align 1 - %"131" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"131", label %"25", label %"26" + %"131" = load float, ptr addrspace(5) %"95", align 4 + %"129" = fcmp uno float %"130", %"131" + store i1 %"129", ptr addrspace(5) %"101", align 1 + %"132" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"132", label %"26", label %"27" -"25": ; preds = %"24" - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"26" +"26": ; preds = %"25" + store i32 1, ptr addrspace(5) %"100", align 4 + br label %"27" -"26": ; preds = %"25", %"24" - %"133" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"133", label %"28", label %"27" +"27": ; preds = %"26", %"25" + %"134" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"134", label %"29", label %"28" -"27": ; preds = %"26" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"28" +"28": ; preds = %"27" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"29" -"28": ; preds = %"27", %"26" - %"135" = load i64, ptr addrspace(5) %"90", align 4 - %"164" = inttoptr i64 %"135" to ptr - %"72" = getelementptr inbounds i8, ptr %"164", i64 4 - %"136" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"136", ptr %"72", align 4 - %"138" = load float, ptr addrspace(5) %"95", align 4 +"29": ; preds = %"28", %"27" + %"136" = load i64, ptr addrspace(5) %"91", align 4 + %"165" = inttoptr i64 %"136" to ptr + %"73" = getelementptr inbounds i8, ptr %"165", i64 4 + %"137" = load i32, ptr addrspace(5) %"100", align 4 + store i32 %"137", ptr %"73", align 4 %"139" = load float, ptr addrspace(5) %"96", align 4 - %"137" = fcmp uno float %"138", %"139" - store i1 %"137", ptr addrspace(5) %"100", align 1 - %"140" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"140", label %"29", label %"30" + %"140" = load float, ptr addrspace(5) %"97", align 4 + %"138" = fcmp uno float %"139", %"140" + store i1 %"138", ptr addrspace(5) %"101", align 1 + %"141" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"141", label %"30", label %"31" -"29": ; preds = %"28" - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"30" +"30": ; preds = %"29" + store i32 1, ptr addrspace(5) %"100", align 4 + br label %"31" -"30": ; preds = %"29", %"28" - %"142" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"142", label %"32", label %"31" +"31": ; preds = %"30", %"29" + %"143" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"143", label %"33", label %"32" -"31": ; preds = %"30" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"32" +"32": ; preds = %"31" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"33" -"32": ; preds = %"31", %"30" - %"144" = load i64, ptr addrspace(5) %"90", align 4 - %"165" = inttoptr i64 %"144" to ptr - %"76" = getelementptr inbounds i8, ptr %"165", i64 8 - %"145" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"145", ptr %"76", align 4 - %"147" = load float, ptr addrspace(5) %"97", align 4 +"33": ; preds = %"32", %"31" + %"145" = load i64, ptr addrspace(5) %"91", align 4 + %"166" = inttoptr i64 %"145" to ptr + %"77" = getelementptr inbounds i8, ptr %"166", i64 8 + %"146" = load i32, ptr addrspace(5) %"100", align 4 + store i32 %"146", ptr %"77", align 4 %"148" = load float, ptr addrspace(5) %"98", align 4 - %"146" = fcmp uno float %"147", %"148" - store i1 %"146", ptr addrspace(5) %"100", align 1 - %"149" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"149", label %"33", label %"34" + %"149" = load float, ptr addrspace(5) %"99", align 4 + %"147" = fcmp uno float %"148", %"149" + store i1 %"147", ptr addrspace(5) %"101", align 1 + %"150" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"150", label %"34", label %"35" -"33": ; preds = %"32" - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"34" +"34": ; preds = %"33" + store i32 1, ptr addrspace(5) %"100", align 4 + br label %"35" -"34": ; preds = %"33", %"32" - %"151" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"151", label %"36", label %"35" +"35": ; preds = %"34", %"33" + %"152" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"152", label %"37", label %"36" -"35": ; preds = %"34" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"36" +"36": ; preds = %"35" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"37" -"36": ; preds = %"35", %"34" - %"153" = load i64, ptr addrspace(5) %"90", align 4 - %"166" = inttoptr i64 %"153" to ptr - %"80" = getelementptr inbounds i8, ptr %"166", i64 12 - %"154" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"154", ptr %"80", align 4 +"37": ; preds = %"36", %"35" + %"154" = load i64, ptr addrspace(5) %"91", align 4 + %"167" = inttoptr i64 %"154" to ptr + %"81" = getelementptr inbounds i8, ptr %"167", i64 12 + %"155" = load i32, ptr addrspace(5) %"100", align 4 + store i32 %"155", ptr %"81", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_num.ll b/ptx/src/test/ll/setp_num.ll index c6303dc..d325931 100644 --- a/ptx/src/test/ll/setp_num.ll +++ b/ptx/src/test/ll/setp_num.ll @@ -10,10 +10,9 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 { - %"89" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"88", ptr addrspace(4) byref(i64) %"89") #1 { %"90" = alloca i64, align 8, addrspace(5) - %"91" = alloca float, align 4, addrspace(5) + %"91" = alloca i64, align 8, addrspace(5) %"92" = alloca float, align 4, addrspace(5) %"93" = alloca float, align 4, addrspace(5) %"94" = alloca float, align 4, addrspace(5) @@ -21,154 +20,159 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"87", ptr addrs %"96" = alloca float, align 4, addrspace(5) %"97" = alloca float, align 4, addrspace(5) %"98" = alloca float, align 4, addrspace(5) - %"99" = alloca i32, align 4, addrspace(5) - %"100" = alloca i1, align 1, addrspace(5) + %"99" = alloca float, align 4, addrspace(5) + %"100" = alloca i32, align 4, addrspace(5) + %"101" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"101" = load i64, ptr addrspace(4) %"87", align 4 - store i64 %"101", ptr addrspace(5) %"89", align 4 + br label %"168" + +"168": ; preds = %1 %"102" = load i64, ptr addrspace(4) %"88", align 4 store i64 %"102", ptr addrspace(5) %"90", align 4 - %"104" = load i64, ptr addrspace(5) %"89", align 4 - %"155" = inttoptr i64 %"104" to ptr - %"103" = load float, ptr %"155", align 4 - store float %"103", ptr addrspace(5) %"91", align 4 - %"105" = load i64, ptr addrspace(5) %"89", align 4 + %"103" = load i64, ptr addrspace(4) %"89", align 4 + store i64 %"103", ptr addrspace(5) %"91", align 4 + %"105" = load i64, ptr addrspace(5) %"90", align 4 %"156" = inttoptr i64 %"105" to ptr - %"54" = getelementptr inbounds i8, ptr %"156", i64 4 - %"106" = load float, ptr %"54", align 4 - store float %"106", ptr addrspace(5) %"92", align 4 - %"107" = load i64, ptr addrspace(5) %"89", align 4 - %"157" = inttoptr i64 %"107" to ptr - %"56" = getelementptr inbounds i8, ptr %"157", i64 8 - %"108" = load float, ptr %"56", align 4 - store float %"108", ptr addrspace(5) %"93", align 4 - %"109" = load i64, ptr addrspace(5) %"89", align 4 - %"158" = inttoptr i64 %"109" to ptr - %"58" = getelementptr inbounds i8, ptr %"158", i64 12 - %"110" = load float, ptr %"58", align 4 - store float %"110", ptr addrspace(5) %"94", align 4 - %"111" = load i64, ptr addrspace(5) %"89", align 4 - %"159" = inttoptr i64 %"111" to ptr - %"60" = getelementptr inbounds i8, ptr %"159", i64 16 - %"112" = load float, ptr %"60", align 4 - store float %"112", ptr addrspace(5) %"95", align 4 - %"113" = load i64, ptr addrspace(5) %"89", align 4 - %"160" = inttoptr i64 %"113" to ptr - %"62" = getelementptr inbounds i8, ptr %"160", i64 20 - %"114" = load float, ptr %"62", align 4 - store float %"114", ptr addrspace(5) %"96", align 4 - %"115" = load i64, ptr addrspace(5) %"89", align 4 - %"161" = inttoptr i64 %"115" to ptr - %"64" = getelementptr inbounds i8, ptr %"161", i64 24 - %"116" = load float, ptr %"64", align 4 - store float %"116", ptr addrspace(5) %"97", align 4 - %"117" = load i64, ptr addrspace(5) %"89", align 4 - %"162" = inttoptr i64 %"117" to ptr - %"66" = getelementptr inbounds i8, ptr %"162", i64 28 - %"118" = load float, ptr %"66", align 4 - store float %"118", ptr addrspace(5) %"98", align 4 - %"120" = load float, ptr addrspace(5) %"91", align 4 + %"104" = load float, ptr %"156", align 4 + store float %"104", ptr addrspace(5) %"92", align 4 + %"106" = load i64, ptr addrspace(5) %"90", align 4 + %"157" = inttoptr i64 %"106" to ptr + %"55" = getelementptr inbounds i8, ptr %"157", i64 4 + %"107" = load float, ptr %"55", align 4 + store float %"107", ptr addrspace(5) %"93", align 4 + %"108" = load i64, ptr addrspace(5) %"90", align 4 + %"158" = inttoptr i64 %"108" to ptr + %"57" = getelementptr inbounds i8, ptr %"158", i64 8 + %"109" = load float, ptr %"57", align 4 + store float %"109", ptr addrspace(5) %"94", align 4 + %"110" = load i64, ptr addrspace(5) %"90", align 4 + %"159" = inttoptr i64 %"110" to ptr + %"59" = getelementptr inbounds i8, ptr %"159", i64 12 + %"111" = load float, ptr %"59", align 4 + store float %"111", ptr addrspace(5) %"95", align 4 + %"112" = load i64, ptr addrspace(5) %"90", align 4 + %"160" = inttoptr i64 %"112" to ptr + %"61" = getelementptr inbounds i8, ptr %"160", i64 16 + %"113" = load float, ptr %"61", align 4 + store float %"113", ptr addrspace(5) %"96", align 4 + %"114" = load i64, ptr addrspace(5) %"90", align 4 + %"161" = inttoptr i64 %"114" to ptr + %"63" = getelementptr inbounds i8, ptr %"161", i64 20 + %"115" = load float, ptr %"63", align 4 + store float %"115", ptr addrspace(5) %"97", align 4 + %"116" = load i64, ptr addrspace(5) %"90", align 4 + %"162" = inttoptr i64 %"116" to ptr + %"65" = getelementptr inbounds i8, ptr %"162", i64 24 + %"117" = load float, ptr %"65", align 4 + store float %"117", ptr addrspace(5) %"98", align 4 + %"118" = load i64, ptr addrspace(5) %"90", align 4 + %"163" = inttoptr i64 %"118" to ptr + %"67" = getelementptr inbounds i8, ptr %"163", i64 28 + %"119" = load float, ptr %"67", align 4 + store float %"119", ptr addrspace(5) %"99", align 4 %"121" = load float, ptr addrspace(5) %"92", align 4 - %"119" = fcmp ord float %"120", %"121" - store i1 %"119", ptr addrspace(5) %"100", align 1 - %"122" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"122", label %"21", label %"22" + %"122" = load float, ptr addrspace(5) %"93", align 4 + %"120" = fcmp ord float %"121", %"122" + store i1 %"120", ptr addrspace(5) %"101", align 1 + %"123" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"123", label %"22", label %"23" -"21": ; preds = %1 - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"22" +"22": ; preds = %"168" + store i32 2, ptr addrspace(5) %"100", align 4 + br label %"23" -"22": ; preds = %"21", %1 - %"124" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"124", label %"24", label %"23" +"23": ; preds = %"22", %"168" + %"125" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"125", label %"25", label %"24" -"23": ; preds = %"22" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"24" +"24": ; preds = %"23" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"25" -"24": ; preds = %"23", %"22" - %"126" = load i64, ptr addrspace(5) %"90", align 4 - %"127" = load i32, ptr addrspace(5) %"99", align 4 - %"163" = inttoptr i64 %"126" to ptr - store i32 %"127", ptr %"163", align 4 - %"129" = load float, ptr addrspace(5) %"93", align 4 +"25": ; preds = %"24", %"23" + %"127" = load i64, ptr addrspace(5) %"91", align 4 + %"128" = load i32, ptr addrspace(5) %"100", align 4 + %"164" = inttoptr i64 %"127" to ptr + store i32 %"128", ptr %"164", align 4 %"130" = load float, ptr addrspace(5) %"94", align 4 - %"128" = fcmp ord float %"129", %"130" - store i1 %"128", ptr addrspace(5) %"100", align 1 - %"131" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"131", label %"25", label %"26" + %"131" = load float, ptr addrspace(5) %"95", align 4 + %"129" = fcmp ord float %"130", %"131" + store i1 %"129", ptr addrspace(5) %"101", align 1 + %"132" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"132", label %"26", label %"27" -"25": ; preds = %"24" - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"26" +"26": ; preds = %"25" + store i32 2, ptr addrspace(5) %"100", align 4 + br label %"27" -"26": ; preds = %"25", %"24" - %"133" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"133", label %"28", label %"27" +"27": ; preds = %"26", %"25" + %"134" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"134", label %"29", label %"28" -"27": ; preds = %"26" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"28" +"28": ; preds = %"27" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"29" -"28": ; preds = %"27", %"26" - %"135" = load i64, ptr addrspace(5) %"90", align 4 - %"164" = inttoptr i64 %"135" to ptr - %"72" = getelementptr inbounds i8, ptr %"164", i64 4 - %"136" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"136", ptr %"72", align 4 - %"138" = load float, ptr addrspace(5) %"95", align 4 +"29": ; preds = %"28", %"27" + %"136" = load i64, ptr addrspace(5) %"91", align 4 + %"165" = inttoptr i64 %"136" to ptr + %"73" = getelementptr inbounds i8, ptr %"165", i64 4 + %"137" = load i32, ptr addrspace(5) %"100", align 4 + store i32 %"137", ptr %"73", align 4 %"139" = load float, ptr addrspace(5) %"96", align 4 - %"137" = fcmp ord float %"138", %"139" - store i1 %"137", ptr addrspace(5) %"100", align 1 - %"140" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"140", label %"29", label %"30" + %"140" = load float, ptr addrspace(5) %"97", align 4 + %"138" = fcmp ord float %"139", %"140" + store i1 %"138", ptr addrspace(5) %"101", align 1 + %"141" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"141", label %"30", label %"31" -"29": ; preds = %"28" - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"30" +"30": ; preds = %"29" + store i32 2, ptr addrspace(5) %"100", align 4 + br label %"31" -"30": ; preds = %"29", %"28" - %"142" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"142", label %"32", label %"31" +"31": ; preds = %"30", %"29" + %"143" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"143", label %"33", label %"32" -"31": ; preds = %"30" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"32" +"32": ; preds = %"31" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"33" -"32": ; preds = %"31", %"30" - %"144" = load i64, ptr addrspace(5) %"90", align 4 - %"165" = inttoptr i64 %"144" to ptr - %"76" = getelementptr inbounds i8, ptr %"165", i64 8 - %"145" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"145", ptr %"76", align 4 - %"147" = load float, ptr addrspace(5) %"97", align 4 +"33": ; preds = %"32", %"31" + %"145" = load i64, ptr addrspace(5) %"91", align 4 + %"166" = inttoptr i64 %"145" to ptr + %"77" = getelementptr inbounds i8, ptr %"166", i64 8 + %"146" = load i32, ptr addrspace(5) %"100", align 4 + store i32 %"146", ptr %"77", align 4 %"148" = load float, ptr addrspace(5) %"98", align 4 - %"146" = fcmp ord float %"147", %"148" - store i1 %"146", ptr addrspace(5) %"100", align 1 - %"149" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"149", label %"33", label %"34" + %"149" = load float, ptr addrspace(5) %"99", align 4 + %"147" = fcmp ord float %"148", %"149" + store i1 %"147", ptr addrspace(5) %"101", align 1 + %"150" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"150", label %"34", label %"35" -"33": ; preds = %"32" - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"34" +"34": ; preds = %"33" + store i32 2, ptr addrspace(5) %"100", align 4 + br label %"35" -"34": ; preds = %"33", %"32" - %"151" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"151", label %"36", label %"35" +"35": ; preds = %"34", %"33" + %"152" = load i1, ptr addrspace(5) %"101", align 1 + br i1 %"152", label %"37", label %"36" -"35": ; preds = %"34" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"36" +"36": ; preds = %"35" + store i32 0, ptr addrspace(5) %"100", align 4 + br label %"37" -"36": ; preds = %"35", %"34" - %"153" = load i64, ptr addrspace(5) %"90", align 4 - %"166" = inttoptr i64 %"153" to ptr - %"80" = getelementptr inbounds i8, ptr %"166", i64 12 - %"154" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"154", ptr %"80", align 4 +"37": ; preds = %"36", %"35" + %"154" = load i64, ptr addrspace(5) %"91", align 4 + %"167" = inttoptr i64 %"154" to ptr + %"81" = getelementptr inbounds i8, ptr %"167", i64 12 + %"155" = load i32, ptr addrspace(5) %"100", align 4 + store i32 %"155", ptr %"81", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_ptr_32.ll b/ptx/src/test/ll/shared_ptr_32.ll index ecba0ca..d5ea319 100644 --- a/ptx/src/test/ll/shared_ptr_32.ll +++ b/ptx/src/test/ll/shared_ptr_32.ll @@ -12,38 +12,42 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"41", align 4 + br label %"63" + +"63": ; preds = %1 %"47" = load i64, ptr addrspace(4) %"40", align 4 store i64 %"47", ptr addrspace(5) %"42", align 4 - store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %"58" = inttoptr i64 %"50" to ptr addrspace(1) - %"49" = load i64, ptr addrspace(1) %"58", align 4 - store i64 %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i32, ptr addrspace(5) %"43", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"59" = inttoptr i32 %"51" to ptr addrspace(3) - store i64 %"52", ptr addrspace(3) %"59", align 4 - %"53" = load i32, ptr addrspace(5) %"43", align 4 - %"60" = inttoptr i32 %"53" to ptr addrspace(3) - %"32" = getelementptr inbounds i8, ptr addrspace(3) %"60", i64 0 - %"54" = load i64, ptr addrspace(3) %"32", align 4 - store i64 %"54", ptr addrspace(5) %"45", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load i64, ptr addrspace(5) %"45", align 4 - %"61" = inttoptr i64 %"55" to ptr addrspace(1) - store i64 %"56", ptr addrspace(1) %"61", align 4 + %"48" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"48", ptr addrspace(5) %"43", align 4 + store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"59" = inttoptr i64 %"51" to ptr addrspace(1) + %"50" = load i64, ptr addrspace(1) %"59", align 4 + store i64 %"50", ptr addrspace(5) %"45", align 4 + %"52" = load i32, ptr addrspace(5) %"44", align 4 + %"53" = load i64, ptr addrspace(5) %"45", align 4 + %"60" = inttoptr i32 %"52" to ptr addrspace(3) + store i64 %"53", ptr addrspace(3) %"60", align 4 + %"54" = load i32, ptr addrspace(5) %"44", align 4 + %"61" = inttoptr i32 %"54" to ptr addrspace(3) + %"33" = getelementptr inbounds i8, ptr addrspace(3) %"61", i64 0 + %"55" = load i64, ptr addrspace(3) %"33", align 4 + store i64 %"55", ptr addrspace(5) %"46", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = load i64, ptr addrspace(5) %"46", align 4 + %"62" = inttoptr i64 %"56" to ptr addrspace(1) + store i64 %"57", ptr addrspace(1) %"62", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_ptr_take_address.ll b/ptx/src/test/ll/shared_ptr_take_address.ll index a5a250d..7693f04 100644 --- a/ptx/src/test/ll/shared_ptr_take_address.ll +++ b/ptx/src/test/ll/shared_ptr_take_address.ll @@ -12,37 +12,41 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"44", ptr addrspace(5) %"39", align 4 + br label %"61" + +"61": ; preds = %1 %"45" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"45", ptr addrspace(5) %"40", align 4 - store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"56" = inttoptr i64 %"48" to ptr addrspace(1) - %"47" = load i64, ptr addrspace(1) %"56", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"46", ptr addrspace(5) %"41", align 4 + store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"57" = inttoptr i64 %"49" to ptr addrspace(1) + %"48" = load i64, ptr addrspace(1) %"57", align 4 + store i64 %"48", ptr addrspace(5) %"43", align 4 %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"57" = inttoptr i64 %"49" to ptr addrspace(3) - store i64 %"50", ptr addrspace(3) %"57", align 4 - %"52" = load i64, ptr addrspace(5) %"41", align 4 - %"58" = inttoptr i64 %"52" to ptr addrspace(3) - %"51" = load i64, ptr addrspace(3) %"58", align 4 - store i64 %"51", ptr addrspace(5) %"43", align 4 - %"53" = load i64, ptr addrspace(5) %"40", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"59" = inttoptr i64 %"53" to ptr addrspace(1) - store i64 %"54", ptr addrspace(1) %"59", align 4 + %"51" = load i64, ptr addrspace(5) %"43", align 4 + %"58" = inttoptr i64 %"50" to ptr addrspace(3) + store i64 %"51", ptr addrspace(3) %"58", align 4 + %"53" = load i64, ptr addrspace(5) %"42", align 4 + %"59" = inttoptr i64 %"53" to ptr addrspace(3) + %"52" = load i64, ptr addrspace(3) %"59", align 4 + store i64 %"52", ptr addrspace(5) %"44", align 4 + %"54" = load i64, ptr addrspace(5) %"41", align 4 + %"55" = load i64, ptr addrspace(5) %"44", align 4 + %"60" = inttoptr i64 %"54" to ptr addrspace(1) + store i64 %"55", ptr addrspace(1) %"60", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_unify_extern.ll b/ptx/src/test/ll/shared_unify_extern.ll index 68309bf..ab3c695 100644 --- a/ptx/src/test/ll/shared_unify_extern.ll +++ b/ptx/src/test/ll/shared_unify_extern.ll @@ -13,68 +13,78 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define i64 @__zluda_ptx_impl_add() #0 { - %"46" = alloca i64, align 8, addrspace(5) +define i64 @add() #0 { %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 - store i64 %"49", ptr addrspace(5) %"47", align 4 - %"50" = load i64, ptr addrspace(3) @shared_ex, align 4 + br label %"85" + +"85": ; preds = %1 + %"50" = load i64, ptr addrspace(3) @shared_mod, align 4 store i64 %"50", ptr addrspace(5) %"48", align 4 - %"52" = load i64, ptr addrspace(5) %"48", align 4 - %"53" = load i64, ptr addrspace(5) %"47", align 4 - %"75" = add i64 %"52", %"53" - store i64 %"75", ptr addrspace(5) %"46", align 4 - %2 = load i64, ptr addrspace(5) %"46", align 4 + %"51" = load i64, ptr addrspace(3) @shared_ex, align 4 + store i64 %"51", ptr addrspace(5) %"49", align 4 + %"53" = load i64, ptr addrspace(5) %"49", align 4 + %"54" = load i64, ptr addrspace(5) %"48", align 4 + %"76" = add i64 %"53", %"54" + store i64 %"76", ptr addrspace(5) %"47", align 4 + %2 = load i64, ptr addrspace(5) %"47", align 4 ret i64 %2 } -define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15") #0 { - %"54" = alloca i64, align 8, addrspace(5) +define i64 @set_shared_temp1(i64 %"15") #0 { + %"55" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"86" + +"86": ; preds = %1 store i64 %"15", ptr addrspace(3) @shared_ex, align 4 - %"55" = call i64 @__zluda_ptx_impl_add() - store i64 %"55", ptr addrspace(5) %"54", align 4 - %2 = load i64, ptr addrspace(5) %"54", align 4 + %"56" = call i64 @add() + store i64 %"56", ptr addrspace(5) %"55", align 4 + %2 = load i64, ptr addrspace(5) %"55", align 4 ret i64 %2 } -define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #0 { - %"58" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #1 { %"59" = alloca i64, align 8, addrspace(5) %"60" = alloca i64, align 8, addrspace(5) %"61" = alloca i64, align 8, addrspace(5) + %"62" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"62" = load i64, ptr addrspace(4) %"56", align 4 - store i64 %"62", ptr addrspace(5) %"58", align 4 + br label %"87" + +"87": ; preds = %1 %"63" = load i64, ptr addrspace(4) %"57", align 4 store i64 %"63", ptr addrspace(5) %"59", align 4 - %"65" = load i64, ptr addrspace(5) %"58", align 4 - %"78" = inttoptr i64 %"65" to ptr addrspace(1) - %"64" = load i64, ptr addrspace(1) %"78", align 4 + %"64" = load i64, ptr addrspace(4) %"58", align 4 store i64 %"64", ptr addrspace(5) %"60", align 4 - %"66" = load i64, ptr addrspace(5) %"58", align 4 + %"66" = load i64, ptr addrspace(5) %"59", align 4 %"79" = inttoptr i64 %"66" to ptr addrspace(1) - %"39" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8 - %"67" = load i64, ptr addrspace(1) %"39", align 4 - store i64 %"67", ptr addrspace(5) %"61", align 4 - %"68" = load i64, ptr addrspace(5) %"61", align 4 - store i64 %"68", ptr addrspace(3) @shared_mod, align 4 - %"70" = load i64, ptr addrspace(5) %"60", align 4 - %"81" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"70") - store i64 %"81", ptr addrspace(5) %"61", align 4 - %"71" = load i64, ptr addrspace(5) %"59", align 4 - %"72" = load i64, ptr addrspace(5) %"61", align 4 - %"83" = inttoptr i64 %"71" to ptr - store i64 %"72", ptr %"83", align 4 + %"65" = load i64, ptr addrspace(1) %"79", align 4 + store i64 %"65", ptr addrspace(5) %"61", align 4 + %"67" = load i64, ptr addrspace(5) %"59", align 4 + %"80" = inttoptr i64 %"67" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"80", i64 8 + %"68" = load i64, ptr addrspace(1) %"40", align 4 + store i64 %"68", ptr addrspace(5) %"62", align 4 + %"69" = load i64, ptr addrspace(5) %"62", align 4 + store i64 %"69", ptr addrspace(3) @shared_mod, align 4 + %"71" = load i64, ptr addrspace(5) %"61", align 4 + %"82" = call i64 @set_shared_temp1(i64 %"71") + store i64 %"82", ptr addrspace(5) %"62", align 4 + %"72" = load i64, ptr addrspace(5) %"60", align 4 + %"73" = load i64, ptr addrspace(5) %"62", align 4 + %"84" = inttoptr i64 %"72" to ptr + store i64 %"73", ptr %"84", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_unify_local.ll b/ptx/src/test/ll/shared_unify_local.ll index 56a5bbb..c54e782 100644 --- a/ptx/src/test/ll/shared_unify_local.ll +++ b/ptx/src/test/ll/shared_unify_local.ll @@ -13,65 +13,75 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define i64 @__zluda_ptx_impl_add(i64 %"10") #0 { - %"47" = alloca i64, align 8, addrspace(5) +define i64 @add(i64 %"10") #0 { %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"81" + +"81": ; preds = %1 store i64 %"10", ptr addrspace(3) @shared_mod, align 4 - %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 - store i64 %"49", ptr addrspace(5) %"48", align 4 + %"50" = load i64, ptr addrspace(3) @shared_mod, align 4 + store i64 %"50", ptr addrspace(5) %"49", align 4 %"101" = load i64, ptr addrspace(3) @shared_ex, align 4 - %"51" = load i64, ptr addrspace(5) %"48", align 4 - %"72" = add i64 %"101", %"51" - store i64 %"72", ptr addrspace(5) %"47", align 4 - %2 = load i64, ptr addrspace(5) %"47", align 4 + %"52" = load i64, ptr addrspace(5) %"49", align 4 + %"73" = add i64 %"101", %"52" + store i64 %"73", ptr addrspace(5) %"48", align 4 + %2 = load i64, ptr addrspace(5) %"48", align 4 ret i64 %2 } -define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15", i64 %"16") #0 { - %"52" = alloca i64, align 8, addrspace(5) +define i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 { + %"53" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"82" + +"82": ; preds = %1 store i64 %"15", ptr addrspace(3) @shared_ex, align 4 - %"53" = call i64 @__zluda_ptx_impl_add(i64 %"16") - store i64 %"53", ptr addrspace(5) %"52", align 4 - %2 = load i64, ptr addrspace(5) %"52", align 4 + %"54" = call i64 @add(i64 %"16") + store i64 %"54", ptr addrspace(5) %"53", align 4 + %2 = load i64, ptr addrspace(5) %"53", align 4 ret i64 %2 } -define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { - %"56" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 { %"57" = alloca i64, align 8, addrspace(5) %"58" = alloca i64, align 8, addrspace(5) %"59" = alloca i64, align 8, addrspace(5) + %"60" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"60" = load i64, ptr addrspace(4) %"54", align 4 - store i64 %"60", ptr addrspace(5) %"56", align 4 + br label %"83" + +"83": ; preds = %1 %"61" = load i64, ptr addrspace(4) %"55", align 4 store i64 %"61", ptr addrspace(5) %"57", align 4 - %"63" = load i64, ptr addrspace(5) %"56", align 4 - %"75" = inttoptr i64 %"63" to ptr addrspace(1) - %"62" = load i64, ptr addrspace(1) %"75", align 4 + %"62" = load i64, ptr addrspace(4) %"56", align 4 store i64 %"62", ptr addrspace(5) %"58", align 4 - %"64" = load i64, ptr addrspace(5) %"56", align 4 + %"64" = load i64, ptr addrspace(5) %"57", align 4 %"76" = inttoptr i64 %"64" to ptr addrspace(1) - %"40" = getelementptr inbounds i8, ptr addrspace(1) %"76", i64 8 - %"65" = load i64, ptr addrspace(1) %"40", align 4 - store i64 %"65", ptr addrspace(5) %"59", align 4 - %"67" = load i64, ptr addrspace(5) %"58", align 4 + %"63" = load i64, ptr addrspace(1) %"76", align 4 + store i64 %"63", ptr addrspace(5) %"59", align 4 + %"65" = load i64, ptr addrspace(5) %"57", align 4 + %"77" = inttoptr i64 %"65" to ptr addrspace(1) + %"41" = getelementptr inbounds i8, ptr addrspace(1) %"77", i64 8 + %"66" = load i64, ptr addrspace(1) %"41", align 4 + store i64 %"66", ptr addrspace(5) %"60", align 4 %"68" = load i64, ptr addrspace(5) %"59", align 4 - %"77" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"67", i64 %"68") - store i64 %"77", ptr addrspace(5) %"59", align 4 - %"69" = load i64, ptr addrspace(5) %"57", align 4 - %"70" = load i64, ptr addrspace(5) %"59", align 4 - %"79" = inttoptr i64 %"69" to ptr - store i64 %"70", ptr %"79", align 4 + %"69" = load i64, ptr addrspace(5) %"60", align 4 + %"78" = call i64 @set_shared_temp1(i64 %"68", i64 %"69") + store i64 %"78", ptr addrspace(5) %"60", align 4 + %"70" = load i64, ptr addrspace(5) %"58", align 4 + %"71" = load i64, ptr addrspace(5) %"60", align 4 + %"80" = inttoptr i64 %"70" to ptr + store i64 %"71", ptr %"80", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_variable.ll b/ptx/src/test/ll/shared_variable.ll index f71fcc8..8801d05 100644 --- a/ptx/src/test/ll/shared_variable.ll +++ b/ptx/src/test/ll/shared_variable.ll @@ -12,31 +12,35 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"55" + +"55": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr addrspace(1) - %"44" = load i64, ptr addrspace(1) %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"40", align 4 - store i64 %"46", ptr addrspace(3) @shared_mem1, align 4 - %"47" = load i64, ptr addrspace(3) @shared_mem1, align 4 - store i64 %"47", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = inttoptr i64 %"48" to ptr addrspace(1) - store i64 %"49", ptr addrspace(1) %"53", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load i64, ptr addrspace(1) %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"41", align 4 + store i64 %"47", ptr addrspace(3) @shared_mem1, align 4 + %"48" = load i64, ptr addrspace(3) @shared_mem1, align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"54" = inttoptr i64 %"49" to ptr addrspace(1) + store i64 %"50", ptr addrspace(1) %"54", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shl.ll b/ptx/src/test/ll/shl.ll index 1b0d8bf..c98b581 100644 --- a/ptx/src/test/ll/shl.ll +++ b/ptx/src/test/ll/shl.ll @@ -10,31 +10,35 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"55" + +"55": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %2 = shl i64 %"47", 2 - %"51" = select i1 false, i64 0, i64 %2 - store i64 %"51", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"53", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %2 = shl i64 %"48", 2 + %"52" = select i1 false, i64 0, i64 %2 + store i64 %"52", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"54" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"54", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shr.ll b/ptx/src/test/ll/shr.ll index 6b2cecd..a1bfa20 100644 --- a/ptx/src/test/ll/shr.ll +++ b/ptx/src/test/ll/shr.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 + br label %"51" + +"51": ; preds = %1 %"41" = load i64, ptr addrspace(4) %"36", align 4 store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr - %"42" = load i32, ptr %"48", align 4 - store i32 %"42", ptr addrspace(5) %"39", align 4 - %"45" = load i32, ptr addrspace(5) %"39", align 4 - %2 = ashr i32 %"45", 1 - %"44" = select i1 false, i32 0, i32 %2 - store i32 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"47" = load i32, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"46" to ptr - store i32 %"47", ptr %"49", align 4 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"49" = inttoptr i64 %"44" to ptr + %"43" = load i32, ptr %"49", align 4 + store i32 %"43", ptr addrspace(5) %"40", align 4 + %"46" = load i32, ptr addrspace(5) %"40", align 4 + %2 = ashr i32 %"46", 1 + %"45" = select i1 false, i32 0, i32 %2 + store i32 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + %"50" = inttoptr i64 %"47" to ptr + store i32 %"48", ptr %"50", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sign_extend.ll b/ptx/src/test/ll/sign_extend.ll index 0a29187..871e141 100644 --- a/ptx/src/test/ll/sign_extend.ll +++ b/ptx/src/test/ll/sign_extend.ll @@ -10,27 +10,31 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"49" + +"49": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"46" = inttoptr i64 %"42" to ptr - %"45" = load i16, ptr %"46", align 2 - %"41" = sext i16 %"45" to i32 - store i32 %"41", ptr addrspace(5) %"38", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 %"47" = inttoptr i64 %"43" to ptr - store i32 %"44", ptr %"47", align 4 + %"46" = load i16, ptr %"47", align 2 + %"42" = sext i16 %"46" to i32 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = inttoptr i64 %"44" to ptr + store i32 %"45", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sin.ll b/ptx/src/test/ll/sin.ll index 656dbad..98eb44d 100644 --- a/ptx/src/test/ll/sin.ll +++ b/ptx/src/test/ll/sin.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call afn float @llvm.sin.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = call afn float @llvm.sin.f32(float %"45") + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.sin.f32(float) #1 +declare float @llvm.sin.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/sqrt.ll b/ptx/src/test/ll/sqrt.ll index fe56dfe..af1df8c 100644 --- a/ptx/src/test/ll/sqrt.ll +++ b/ptx/src/test/ll/sqrt.ll @@ -10,33 +10,37 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 + br label %"50" + +"50": ; preds = %1 %"40" = load i64, ptr addrspace(4) %"35", align 4 store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.sqrt.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = call float @llvm.amdgcn.sqrt.f32(float %"45") + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.sqrt.f32(float) #1 +declare float @llvm.amdgcn.sqrt.f32(float) #2 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid.ll b/ptx/src/test/ll/stateful_ld_st_ntid.ll index cbdb89a..480d90a 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid.ll @@ -10,49 +10,53 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"64" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"64", ptr addrspace(5) %"40", align 4 + br label %"73" + +"73": ; preds = %1 %"65" = load i64, ptr addrspace(4) %"39", align 4 store i64 %"65", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %2 = inttoptr i64 %"48" to ptr - %"47" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"47", ptr addrspace(5) %"40", align 8 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %3 = inttoptr i64 %"50" to ptr - %"49" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"49", ptr addrspace(5) %"41", align 8 - %"31" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) - store i32 %"31", ptr addrspace(5) %"42", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"52" = zext i32 %"53" to i64 - store i64 %"52", ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"40", align 4 - %"56" = load i64, ptr addrspace(5) %"43", align 4 - %"66" = add i64 %"55", %"56" - store i64 %"66", ptr addrspace(5) %"40", align 4 - %"58" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = load i64, ptr addrspace(5) %"43", align 4 - %"68" = add i64 %"58", %"59" - store i64 %"68", ptr addrspace(5) %"41", align 4 - %"61" = load i64, ptr addrspace(5) %"40", align 4 - %"70" = inttoptr i64 %"61" to ptr addrspace(1) - %"60" = load i64, ptr addrspace(1) %"70", align 4 - store i64 %"60", ptr addrspace(5) %"44", align 4 + %"66" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"66", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %2 = inttoptr i64 %"49" to ptr + %"48" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"48", ptr addrspace(5) %"41", align 8 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %3 = inttoptr i64 %"51" to ptr + %"50" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"50", ptr addrspace(5) %"42", align 8 + %"32" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + store i32 %"32", ptr addrspace(5) %"43", align 4 + %"54" = load i32, ptr addrspace(5) %"43", align 4 + %"53" = zext i32 %"54" to i64 + store i64 %"53", ptr addrspace(5) %"44", align 4 + %"56" = load i64, ptr addrspace(5) %"41", align 4 + %"57" = load i64, ptr addrspace(5) %"44", align 4 + %"67" = add i64 %"56", %"57" + store i64 %"67", ptr addrspace(5) %"41", align 4 + %"59" = load i64, ptr addrspace(5) %"42", align 4 + %"60" = load i64, ptr addrspace(5) %"44", align 4 + %"69" = add i64 %"59", %"60" + store i64 %"69", ptr addrspace(5) %"42", align 4 %"62" = load i64, ptr addrspace(5) %"41", align 4 - %"63" = load i64, ptr addrspace(5) %"44", align 4 %"71" = inttoptr i64 %"62" to ptr addrspace(1) - store i64 %"63", ptr addrspace(1) %"71", align 4 + %"61" = load i64, ptr addrspace(1) %"71", align 4 + store i64 %"61", ptr addrspace(5) %"45", align 4 + %"63" = load i64, ptr addrspace(5) %"42", align 4 + %"64" = load i64, ptr addrspace(5) %"45", align 4 + %"72" = inttoptr i64 %"63" to ptr addrspace(1) + store i64 %"64", ptr addrspace(1) %"72", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll index 1ac5a5f..748b329 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll @@ -10,53 +10,57 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { - %"44" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #1 { %"45" = alloca i64, align 8, addrspace(5) %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i32, align 4, addrspace(5) - %"51" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) + %"51" = alloca i32, align 4, addrspace(5) %"52" = alloca i64, align 8, addrspace(5) + %"53" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"72" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"72", ptr addrspace(5) %"44", align 4 + br label %"81" + +"81": ; preds = %1 %"73" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"73", ptr addrspace(5) %"47", align 4 - %"56" = load i64, ptr addrspace(5) %"44", align 4 - %2 = inttoptr i64 %"56" to ptr - %"55" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"55", ptr addrspace(5) %"45", align 8 - %"58" = load i64, ptr addrspace(5) %"47", align 4 - %3 = inttoptr i64 %"58" to ptr - %"57" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"57", ptr addrspace(5) %"48", align 8 - %"35" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) - store i32 %"35", ptr addrspace(5) %"50", align 4 - %"61" = load i32, ptr addrspace(5) %"50", align 4 - %"60" = zext i32 %"61" to i64 - store i64 %"60", ptr addrspace(5) %"51", align 4 - %"63" = load i64, ptr addrspace(5) %"45", align 4 - %"64" = load i64, ptr addrspace(5) %"51", align 4 - %"74" = add i64 %"63", %"64" - store i64 %"74", ptr addrspace(5) %"46", align 4 - %"66" = load i64, ptr addrspace(5) %"48", align 4 - %"67" = load i64, ptr addrspace(5) %"51", align 4 - %"76" = add i64 %"66", %"67" - store i64 %"76", ptr addrspace(5) %"49", align 4 - %"69" = load i64, ptr addrspace(5) %"46", align 4 - %"78" = inttoptr i64 %"69" to ptr addrspace(1) - %"68" = load i64, ptr addrspace(1) %"78", align 4 - store i64 %"68", ptr addrspace(5) %"52", align 4 - %"70" = load i64, ptr addrspace(5) %"49", align 4 - %"71" = load i64, ptr addrspace(5) %"52", align 4 + store i64 %"73", ptr addrspace(5) %"45", align 4 + %"74" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"74", ptr addrspace(5) %"48", align 4 + %"57" = load i64, ptr addrspace(5) %"45", align 4 + %2 = inttoptr i64 %"57" to ptr + %"56" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"56", ptr addrspace(5) %"46", align 8 + %"59" = load i64, ptr addrspace(5) %"48", align 4 + %3 = inttoptr i64 %"59" to ptr + %"58" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"58", ptr addrspace(5) %"49", align 8 + %"36" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + store i32 %"36", ptr addrspace(5) %"51", align 4 + %"62" = load i32, ptr addrspace(5) %"51", align 4 + %"61" = zext i32 %"62" to i64 + store i64 %"61", ptr addrspace(5) %"52", align 4 + %"64" = load i64, ptr addrspace(5) %"46", align 4 + %"65" = load i64, ptr addrspace(5) %"52", align 4 + %"75" = add i64 %"64", %"65" + store i64 %"75", ptr addrspace(5) %"47", align 4 + %"67" = load i64, ptr addrspace(5) %"49", align 4 + %"68" = load i64, ptr addrspace(5) %"52", align 4 + %"77" = add i64 %"67", %"68" + store i64 %"77", ptr addrspace(5) %"50", align 4 + %"70" = load i64, ptr addrspace(5) %"47", align 4 %"79" = inttoptr i64 %"70" to ptr addrspace(1) - store i64 %"71", ptr addrspace(1) %"79", align 4 + %"69" = load i64, ptr addrspace(1) %"79", align 4 + store i64 %"69", ptr addrspace(5) %"53", align 4 + %"71" = load i64, ptr addrspace(5) %"50", align 4 + %"72" = load i64, ptr addrspace(5) %"53", align 4 + %"80" = inttoptr i64 %"71" to ptr addrspace(1) + store i64 %"72", ptr addrspace(1) %"80", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll index 8a07146..9cbd6b9 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll @@ -10,55 +10,59 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { - %"48" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 { %"49" = alloca i64, align 8, addrspace(5) %"50" = alloca i64, align 8, addrspace(5) %"51" = alloca i64, align 8, addrspace(5) %"52" = alloca i64, align 8, addrspace(5) %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i32, align 4, addrspace(5) - %"55" = alloca i64, align 8, addrspace(5) + %"54" = alloca i64, align 8, addrspace(5) + %"55" = alloca i32, align 4, addrspace(5) %"56" = alloca i64, align 8, addrspace(5) + %"57" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"76" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"76", ptr addrspace(5) %"48", align 4 + br label %"87" + +"87": ; preds = %1 %"77" = load i64, ptr addrspace(4) %"47", align 4 - store i64 %"77", ptr addrspace(5) %"51", align 4 - %"60" = load i64, ptr addrspace(5) %"48", align 4 - %2 = inttoptr i64 %"60" to ptr - %"59" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"59", ptr addrspace(5) %"49", align 8 - %"62" = load i64, ptr addrspace(5) %"51", align 4 - %3 = inttoptr i64 %"62" to ptr - %"61" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"61", ptr addrspace(5) %"52", align 8 - %"35" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) - store i32 %"35", ptr addrspace(5) %"54", align 4 - %"65" = load i32, ptr addrspace(5) %"54", align 4 - %"64" = zext i32 %"65" to i64 - store i64 %"64", ptr addrspace(5) %"55", align 4 - %"67" = load i64, ptr addrspace(5) %"49", align 4 - %"68" = load i64, ptr addrspace(5) %"55", align 4 - %"78" = sub i64 %"67", %"68" - store i64 %"78", ptr addrspace(5) %"50", align 4 - %"70" = load i64, ptr addrspace(5) %"52", align 4 - %"71" = load i64, ptr addrspace(5) %"55", align 4 - %"81" = sub i64 %"70", %"71" - store i64 %"81", ptr addrspace(5) %"53", align 4 - %"72" = load i64, ptr addrspace(5) %"50", align 4 - %"84" = inttoptr i64 %"72" to ptr addrspace(1) - %"37" = getelementptr inbounds i8, ptr addrspace(1) %"84", i64 0 - %"73" = load i64, ptr addrspace(1) %"37", align 4 - store i64 %"73", ptr addrspace(5) %"56", align 4 - %"74" = load i64, ptr addrspace(5) %"53", align 4 - %"85" = inttoptr i64 %"74" to ptr addrspace(1) - %"39" = getelementptr inbounds i8, ptr addrspace(1) %"85", i64 0 - %"75" = load i64, ptr addrspace(5) %"56", align 4 - store i64 %"75", ptr addrspace(1) %"39", align 4 + store i64 %"77", ptr addrspace(5) %"49", align 4 + %"78" = load i64, ptr addrspace(4) %"48", align 4 + store i64 %"78", ptr addrspace(5) %"52", align 4 + %"61" = load i64, ptr addrspace(5) %"49", align 4 + %2 = inttoptr i64 %"61" to ptr + %"60" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"60", ptr addrspace(5) %"50", align 8 + %"63" = load i64, ptr addrspace(5) %"52", align 4 + %3 = inttoptr i64 %"63" to ptr + %"62" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"62", ptr addrspace(5) %"53", align 8 + %"36" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + store i32 %"36", ptr addrspace(5) %"55", align 4 + %"66" = load i32, ptr addrspace(5) %"55", align 4 + %"65" = zext i32 %"66" to i64 + store i64 %"65", ptr addrspace(5) %"56", align 4 + %"68" = load i64, ptr addrspace(5) %"50", align 4 + %"69" = load i64, ptr addrspace(5) %"56", align 4 + %"79" = sub i64 %"68", %"69" + store i64 %"79", ptr addrspace(5) %"51", align 4 + %"71" = load i64, ptr addrspace(5) %"53", align 4 + %"72" = load i64, ptr addrspace(5) %"56", align 4 + %"82" = sub i64 %"71", %"72" + store i64 %"82", ptr addrspace(5) %"54", align 4 + %"73" = load i64, ptr addrspace(5) %"51", align 4 + %"85" = inttoptr i64 %"73" to ptr addrspace(1) + %"38" = getelementptr inbounds i8, ptr addrspace(1) %"85", i64 0 + %"74" = load i64, ptr addrspace(1) %"38", align 4 + store i64 %"74", ptr addrspace(5) %"57", align 4 + %"75" = load i64, ptr addrspace(5) %"54", align 4 + %"86" = inttoptr i64 %"75" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"86", i64 0 + %"76" = load i64, ptr addrspace(5) %"57", align 4 + store i64 %"76", ptr addrspace(1) %"40", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_simple.ll b/ptx/src/test/ll/stateful_ld_st_simple.ll index 09d064b..94761f3 100644 --- a/ptx/src/test/ll/stateful_ld_st_simple.ll +++ b/ptx/src/test/ll/stateful_ld_st_simple.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"43", ptr addrspace(5) %"38", align 4 + br label %"60" + +"60": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %2 = inttoptr i64 %"46" to ptr - %"53" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"53", ptr addrspace(5) %"40", align 8 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %3 = inttoptr i64 %"48" to ptr - %"55" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"55", ptr addrspace(5) %"41", align 8 - %"50" = load i64, ptr addrspace(5) %"40", align 4 - %"57" = inttoptr i64 %"50" to ptr addrspace(1) - %"49" = load i64, ptr addrspace(1) %"57", align 4 - store i64 %"49", ptr addrspace(5) %"42", align 4 + %"45" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %2 = inttoptr i64 %"47" to ptr + %"54" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"54", ptr addrspace(5) %"41", align 8 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %3 = inttoptr i64 %"49" to ptr + %"56" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"56", ptr addrspace(5) %"42", align 8 %"51" = load i64, ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"42", align 4 %"58" = inttoptr i64 %"51" to ptr addrspace(1) - store i64 %"52", ptr addrspace(1) %"58", align 4 + %"50" = load i64, ptr addrspace(1) %"58", align 4 + store i64 %"50", ptr addrspace(5) %"43", align 4 + %"52" = load i64, ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"59" = inttoptr i64 %"52" to ptr addrspace(1) + store i64 %"53", ptr addrspace(1) %"59", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_neg_offset.ll b/ptx/src/test/ll/stateful_neg_offset.ll index 38abb0d..26e1537 100644 --- a/ptx/src/test/ll/stateful_neg_offset.ll +++ b/ptx/src/test/ll/stateful_neg_offset.ll @@ -10,45 +10,49 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i64, align 8, addrspace(5) %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"45", ptr addrspace(5) %"39", align 4 + br label %"68" + +"68": ; preds = %1 %"46" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"46", ptr addrspace(5) %"40", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %2 = inttoptr i64 %"48" to ptr - %"61" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"61", ptr addrspace(5) %"41", align 8 - %"50" = load i64, ptr addrspace(5) %"40", align 4 - %3 = inttoptr i64 %"50" to ptr - %"63" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"63", ptr addrspace(5) %"42", align 8 - %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %2 = inttoptr i64 %"49" to ptr + %"62" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"62", ptr addrspace(5) %"42", align 8 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %3 = inttoptr i64 %"51" to ptr + %"64" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"64", ptr addrspace(5) %"43", align 8 %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"51" = add i64 %"52", %"53" - store i64 %"51", ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"52" = add i64 %"53", %"54" + store i64 %"52", ptr addrspace(5) %"44", align 4 %"56" = load i64, ptr addrspace(5) %"42", align 4 - %"54" = sub i64 %"55", %"56" - store i64 %"54", ptr addrspace(5) %"43", align 4 - %"58" = load i64, ptr addrspace(5) %"41", align 4 - %"65" = inttoptr i64 %"58" to ptr addrspace(1) - %"57" = load i64, ptr addrspace(1) %"65", align 4 - store i64 %"57", ptr addrspace(5) %"44", align 4 + %"57" = load i64, ptr addrspace(5) %"43", align 4 + %"55" = sub i64 %"56", %"57" + store i64 %"55", ptr addrspace(5) %"44", align 4 %"59" = load i64, ptr addrspace(5) %"42", align 4 - %"60" = load i64, ptr addrspace(5) %"44", align 4 %"66" = inttoptr i64 %"59" to ptr addrspace(1) - store i64 %"60", ptr addrspace(1) %"66", align 4 + %"58" = load i64, ptr addrspace(1) %"66", align 4 + store i64 %"58", ptr addrspace(5) %"45", align 4 + %"60" = load i64, ptr addrspace(5) %"43", align 4 + %"61" = load i64, ptr addrspace(5) %"45", align 4 + %"67" = inttoptr i64 %"60" to ptr addrspace(1) + store i64 %"61", ptr addrspace(1) %"67", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sub.ll b/ptx/src/test/ll/sub.ll index 31b5801..564b376 100644 --- a/ptx/src/test/ll/sub.ll +++ b/ptx/src/test/ll/sub.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"53" + +"53": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = sub i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"47" = sub i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector.ll b/ptx/src/test/ll/vector.ll index e909c7a..c26834c 100644 --- a/ptx/src/test/ll/vector.ll +++ b/ptx/src/test/ll/vector.ll @@ -10,70 +10,77 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define <2 x i32> @__zluda_ptx_impl_impl(<2 x i32> %"9") #0 { - %"49" = alloca <2 x i32>, align 8, addrspace(5) +define <2 x i32> @impl(<2 x i32> %"9") #0 { %"50" = alloca <2 x i32>, align 8, addrspace(5) - %"51" = alloca i32, align 4, addrspace(5) + %"51" = alloca <2 x i32>, align 8, addrspace(5) %"52" = alloca i32, align 4, addrspace(5) + %"53" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"37" = extractelement <2 x i32> %"9", i8 0 - store i32 %"37", ptr addrspace(5) %"51", align 4 - %"38" = extractelement <2 x i32> %"9", i8 1 + br label %"91" + +"91": ; preds = %1 + %"38" = extractelement <2 x i32> %"9", i8 0 store i32 %"38", ptr addrspace(5) %"52", align 4 - %"56" = load i32, ptr addrspace(5) %"51", align 4 + %"39" = extractelement <2 x i32> %"9", i8 1 + store i32 %"39", ptr addrspace(5) %"53", align 4 %"57" = load i32, ptr addrspace(5) %"52", align 4 - %"55" = add i32 %"56", %"57" - store i32 %"55", ptr addrspace(5) %"52", align 4 - %"58" = load i32, ptr addrspace(5) %"52", align 4 - %"60" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"59" = insertelement <2 x i32> %"60", i32 %"58", i8 0 - store <2 x i32> %"59", ptr addrspace(5) %"50", align 8 - %"61" = load i32, ptr addrspace(5) %"52", align 4 - %"63" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"62" = insertelement <2 x i32> %"63", i32 %"61", i8 1 - store <2 x i32> %"62", ptr addrspace(5) %"50", align 8 - %"64" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"42" = extractelement <2 x i32> %"64", i8 1 - %"66" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"65" = insertelement <2 x i32> %"66", i32 %"42", i8 0 - store <2 x i32> %"65", ptr addrspace(5) %"50", align 8 - %"68" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - store <2 x i32> %"68", ptr addrspace(5) %"49", align 8 - %2 = load <2 x i32>, ptr addrspace(5) %"49", align 8 + %"58" = load i32, ptr addrspace(5) %"53", align 4 + %"56" = add i32 %"57", %"58" + store i32 %"56", ptr addrspace(5) %"53", align 4 + %"59" = load i32, ptr addrspace(5) %"53", align 4 + %"61" = load <2 x i32>, ptr addrspace(5) %"51", align 8 + %"60" = insertelement <2 x i32> %"61", i32 %"59", i8 0 + store <2 x i32> %"60", ptr addrspace(5) %"51", align 8 + %"62" = load i32, ptr addrspace(5) %"53", align 4 + %"64" = load <2 x i32>, ptr addrspace(5) %"51", align 8 + %"63" = insertelement <2 x i32> %"64", i32 %"62", i8 1 + store <2 x i32> %"63", ptr addrspace(5) %"51", align 8 + %"65" = load <2 x i32>, ptr addrspace(5) %"51", align 8 + %"43" = extractelement <2 x i32> %"65", i8 1 + %"67" = load <2 x i32>, ptr addrspace(5) %"51", align 8 + %"66" = insertelement <2 x i32> %"67", i32 %"43", i8 0 + store <2 x i32> %"66", ptr addrspace(5) %"51", align 8 + %"69" = load <2 x i32>, ptr addrspace(5) %"51", align 8 + store <2 x i32> %"69", ptr addrspace(5) %"50", align 8 + %2 = load <2 x i32>, ptr addrspace(5) %"50", align 8 ret <2 x i32> %2 } -define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"69", ptr addrspace(4) byref(i64) %"70") #0 { - %"71" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"70", ptr addrspace(4) byref(i64) %"71") #1 { %"72" = alloca i64, align 8, addrspace(5) - %"73" = alloca <2 x i32>, align 8, addrspace(5) - %"74" = alloca i32, align 4, addrspace(5) + %"73" = alloca i64, align 8, addrspace(5) + %"74" = alloca <2 x i32>, align 8, addrspace(5) %"75" = alloca i32, align 4, addrspace(5) - %"76" = alloca i64, align 8, addrspace(5) + %"76" = alloca i32, align 4, addrspace(5) + %"77" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"77" = load i64, ptr addrspace(4) %"69", align 4 - store i64 %"77", ptr addrspace(5) %"71", align 4 + br label %"92" + +"92": ; preds = %1 %"78" = load i64, ptr addrspace(4) %"70", align 4 store i64 %"78", ptr addrspace(5) %"72", align 4 - %"80" = load i64, ptr addrspace(5) %"71", align 4 - %"87" = inttoptr i64 %"80" to ptr - %"79" = load <2 x i32>, ptr %"87", align 8 - store <2 x i32> %"79", ptr addrspace(5) %"73", align 8 - %"82" = load <2 x i32>, ptr addrspace(5) %"73", align 8 - %"81" = call <2 x i32> @__zluda_ptx_impl_impl(<2 x i32> %"82") - store <2 x i32> %"81", ptr addrspace(5) %"73", align 8 - %"84" = load <2 x i32>, ptr addrspace(5) %"73", align 8 - %"88" = bitcast <2 x i32> %"84" to i64 - store i64 %"88", ptr addrspace(5) %"76", align 4 - %"85" = load i64, ptr addrspace(5) %"72", align 4 - %"86" = load <2 x i32>, ptr addrspace(5) %"73", align 8 - %"89" = inttoptr i64 %"85" to ptr - store <2 x i32> %"86", ptr %"89", align 8 + %"79" = load i64, ptr addrspace(4) %"71", align 4 + store i64 %"79", ptr addrspace(5) %"73", align 4 + %"81" = load i64, ptr addrspace(5) %"72", align 4 + %"88" = inttoptr i64 %"81" to ptr + %"80" = load <2 x i32>, ptr %"88", align 8 + store <2 x i32> %"80", ptr addrspace(5) %"74", align 8 + %"83" = load <2 x i32>, ptr addrspace(5) %"74", align 8 + %"82" = call <2 x i32> @impl(<2 x i32> %"83") + store <2 x i32> %"82", ptr addrspace(5) %"74", align 8 + %"85" = load <2 x i32>, ptr addrspace(5) %"74", align 8 + %"89" = bitcast <2 x i32> %"85" to i64 + store i64 %"89", ptr addrspace(5) %"77", align 4 + %"86" = load i64, ptr addrspace(5) %"73", align 4 + %"87" = load <2 x i32>, ptr addrspace(5) %"74", align 8 + %"90" = inttoptr i64 %"86" to ptr + store <2 x i32> %"87", ptr %"90", align 8 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector4.ll b/ptx/src/test/ll/vector4.ll index 1b8ce24..7f7bdf6 100644 --- a/ptx/src/test/ll/vector4.ll +++ b/ptx/src/test/ll/vector4.ll @@ -10,30 +10,34 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca <4 x i32>, align 16, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca <4 x i32>, align 16, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 + br label %"55" + +"55": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 4 store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load <4 x i32>, ptr %"50", align 16 - store <4 x i32> %"44", ptr addrspace(5) %"40", align 16 - %"46" = load <4 x i32>, ptr addrspace(5) %"40", align 16 - %"29" = extractelement <4 x i32> %"46", i8 3 - store i32 %"29", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 - %"53" = inttoptr i64 %"48" to ptr - store i32 %"49", ptr %"53", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load <4 x i32>, ptr %"51", align 16 + store <4 x i32> %"45", ptr addrspace(5) %"41", align 16 + %"47" = load <4 x i32>, ptr addrspace(5) %"41", align 16 + %"30" = extractelement <4 x i32> %"47", i8 3 + store i32 %"30", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = load i32, ptr addrspace(5) %"42", align 4 + %"54" = inttoptr i64 %"49" to ptr + store i32 %"50", ptr %"54", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector_extract.ll b/ptx/src/test/ll/vector_extract.ll index a106da8..44b0e23 100644 --- a/ptx/src/test/ll/vector_extract.ll +++ b/ptx/src/test/ll/vector_extract.ll @@ -10,86 +10,90 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { - %"46" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #1 { %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i16, align 2, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) %"49" = alloca i16, align 2, addrspace(5) %"50" = alloca i16, align 2, addrspace(5) %"51" = alloca i16, align 2, addrspace(5) - %"52" = alloca <4 x i16>, align 8, addrspace(5) + %"52" = alloca i16, align 2, addrspace(5) + %"53" = alloca <4 x i16>, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"53", ptr addrspace(5) %"46", align 4 + br label %"94" + +"94": ; preds = %1 %"54" = load i64, ptr addrspace(4) %"45", align 4 store i64 %"54", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"46", align 4 - %"83" = inttoptr i64 %"55" to ptr addrspace(1) - %"32" = load <4 x i8>, ptr addrspace(1) %"83", align 4 - %"84" = extractelement <4 x i8> %"32", i8 0 - %"85" = extractelement <4 x i8> %"32", i8 1 - %"86" = extractelement <4 x i8> %"32", i8 2 - %"87" = extractelement <4 x i8> %"32", i8 3 - %"56" = zext i8 %"84" to i16 + %"55" = load i64, ptr addrspace(4) %"46", align 4 + store i64 %"55", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"47", align 4 + %"84" = inttoptr i64 %"56" to ptr addrspace(1) + %"33" = load <4 x i8>, ptr addrspace(1) %"84", align 4 + %"85" = extractelement <4 x i8> %"33", i8 0 + %"86" = extractelement <4 x i8> %"33", i8 1 + %"87" = extractelement <4 x i8> %"33", i8 2 + %"88" = extractelement <4 x i8> %"33", i8 3 %"57" = zext i8 %"85" to i16 %"58" = zext i8 %"86" to i16 %"59" = zext i8 %"87" to i16 - store i16 %"56", ptr addrspace(5) %"48", align 2 + %"60" = zext i8 %"88" to i16 store i16 %"57", ptr addrspace(5) %"49", align 2 store i16 %"58", ptr addrspace(5) %"50", align 2 store i16 %"59", ptr addrspace(5) %"51", align 2 - %"60" = load i16, ptr addrspace(5) %"49", align 2 + store i16 %"60", ptr addrspace(5) %"52", align 2 %"61" = load i16, ptr addrspace(5) %"50", align 2 %"62" = load i16, ptr addrspace(5) %"51", align 2 - %"63" = load i16, ptr addrspace(5) %"48", align 2 - %2 = insertelement <4 x i16> undef, i16 %"60", i8 0 - %3 = insertelement <4 x i16> %2, i16 %"61", i8 1 - %4 = insertelement <4 x i16> %3, i16 %"62", i8 2 - %"33" = insertelement <4 x i16> %4, i16 %"63", i8 3 - store <4 x i16> %"33", ptr addrspace(5) %"52", align 8 - %"65" = load <4 x i16>, ptr addrspace(5) %"52", align 8 - %"66" = extractelement <4 x i16> %"65", i8 0 - %"67" = extractelement <4 x i16> %"65", i8 1 - %"68" = extractelement <4 x i16> %"65", i8 2 - %"69" = extractelement <4 x i16> %"65", i8 3 - store i16 %"66", ptr addrspace(5) %"50", align 2 + %"63" = load i16, ptr addrspace(5) %"52", align 2 + %"64" = load i16, ptr addrspace(5) %"49", align 2 + %2 = insertelement <4 x i16> undef, i16 %"61", i8 0 + %3 = insertelement <4 x i16> %2, i16 %"62", i8 1 + %4 = insertelement <4 x i16> %3, i16 %"63", i8 2 + %"34" = insertelement <4 x i16> %4, i16 %"64", i8 3 + store <4 x i16> %"34", ptr addrspace(5) %"53", align 8 + %"66" = load <4 x i16>, ptr addrspace(5) %"53", align 8 + %"67" = extractelement <4 x i16> %"66", i8 0 + %"68" = extractelement <4 x i16> %"66", i8 1 + %"69" = extractelement <4 x i16> %"66", i8 2 + %"70" = extractelement <4 x i16> %"66", i8 3 store i16 %"67", ptr addrspace(5) %"51", align 2 - store i16 %"68", ptr addrspace(5) %"48", align 2 + store i16 %"68", ptr addrspace(5) %"52", align 2 store i16 %"69", ptr addrspace(5) %"49", align 2 - %"70" = load i16, ptr addrspace(5) %"50", align 2 + store i16 %"70", ptr addrspace(5) %"50", align 2 %"71" = load i16, ptr addrspace(5) %"51", align 2 - %"72" = load i16, ptr addrspace(5) %"48", align 2 + %"72" = load i16, ptr addrspace(5) %"52", align 2 %"73" = load i16, ptr addrspace(5) %"49", align 2 - %5 = insertelement <4 x i16> undef, i16 %"70", i8 0 - %6 = insertelement <4 x i16> %5, i16 %"71", i8 1 - %7 = insertelement <4 x i16> %6, i16 %"72", i8 2 - %"36" = insertelement <4 x i16> %7, i16 %"73", i8 3 - %"74" = extractelement <4 x i16> %"36", i8 0 - %"75" = extractelement <4 x i16> %"36", i8 1 - %"76" = extractelement <4 x i16> %"36", i8 2 - %"77" = extractelement <4 x i16> %"36", i8 3 - store i16 %"74", ptr addrspace(5) %"51", align 2 - store i16 %"75", ptr addrspace(5) %"48", align 2 + %"74" = load i16, ptr addrspace(5) %"50", align 2 + %5 = insertelement <4 x i16> undef, i16 %"71", i8 0 + %6 = insertelement <4 x i16> %5, i16 %"72", i8 1 + %7 = insertelement <4 x i16> %6, i16 %"73", i8 2 + %"37" = insertelement <4 x i16> %7, i16 %"74", i8 3 + %"75" = extractelement <4 x i16> %"37", i8 0 + %"76" = extractelement <4 x i16> %"37", i8 1 + %"77" = extractelement <4 x i16> %"37", i8 2 + %"78" = extractelement <4 x i16> %"37", i8 3 + store i16 %"75", ptr addrspace(5) %"52", align 2 store i16 %"76", ptr addrspace(5) %"49", align 2 store i16 %"77", ptr addrspace(5) %"50", align 2 - %"78" = load i16, ptr addrspace(5) %"48", align 2 + store i16 %"78", ptr addrspace(5) %"51", align 2 %"79" = load i16, ptr addrspace(5) %"49", align 2 %"80" = load i16, ptr addrspace(5) %"50", align 2 %"81" = load i16, ptr addrspace(5) %"51", align 2 - %"88" = trunc i16 %"78" to i8 + %"82" = load i16, ptr addrspace(5) %"52", align 2 %"89" = trunc i16 %"79" to i8 %"90" = trunc i16 %"80" to i8 %"91" = trunc i16 %"81" to i8 - %8 = insertelement <4 x i8> undef, i8 %"88", i8 0 - %9 = insertelement <4 x i8> %8, i8 %"89", i8 1 - %10 = insertelement <4 x i8> %9, i8 %"90", i8 2 - %"37" = insertelement <4 x i8> %10, i8 %"91", i8 3 - %"82" = load i64, ptr addrspace(5) %"47", align 4 - %"92" = inttoptr i64 %"82" to ptr addrspace(1) - store <4 x i8> %"37", ptr addrspace(1) %"92", align 4 + %"92" = trunc i16 %"82" to i8 + %8 = insertelement <4 x i8> undef, i8 %"89", i8 0 + %9 = insertelement <4 x i8> %8, i8 %"90", i8 1 + %10 = insertelement <4 x i8> %9, i8 %"91", i8 2 + %"38" = insertelement <4 x i8> %10, i8 %"92", i8 3 + %"83" = load i64, ptr addrspace(5) %"48", align 4 + %"93" = inttoptr i64 %"83" to ptr addrspace(1) + store <4 x i8> %"38", ptr addrspace(1) %"93", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/xor.ll b/ptx/src/test/ll/xor.ll index 859decb..c84ff02 100644 --- a/ptx/src/test/ll/xor.ll +++ b/ptx/src/test/ll/xor.ll @@ -10,36 +10,40 @@ declare i32 @__zluda_ptx_impl_sreg_clock() #0 declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 -define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #1 { %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 + br label %"58" + +"58": ; preds = %1 %"44" = load i64, ptr addrspace(4) %"38", align 4 store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"46" = load i32, ptr %"55", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"49" = load i32, ptr %"31", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = xor i32 %"50", %"51" - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %"50" = xor i32 %"51", %"52" + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file