diff --git a/Cargo.lock b/Cargo.lock index 66e9625..0635659 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -296,6 +296,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "dynasm" version = "1.2.3" @@ -701,6 +707,16 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "pretty_assertions" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d" +dependencies = [ + "diff", + "yansi", +] + [[package]] name = "prettyplease" version = "0.2.25" @@ -787,6 +803,7 @@ dependencies = [ "microlp", "paste", "petgraph", + "pretty_assertions", "ptx_parser", "quick-error", "rustc-hash 2.0.0", diff --git a/ptx/Cargo.toml b/ptx/Cargo.toml index 143b562..6cf9c79 100644 --- a/ptx/Cargo.toml +++ b/ptx/Cargo.toml @@ -27,3 +27,4 @@ comgr = { path = "../comgr" } tempfile = "3" paste = "1.0" cuda-driver-sys = "0.3.0" +pretty_assertions = "1.4.1" \ No newline at end of file diff --git a/ptx/src/pass/emit_llvm.rs b/ptx/src/pass/emit_llvm.rs index 90c6b8b..73d7ced 100644 --- a/ptx/src/pass/emit_llvm.rs +++ b/ptx/src/pass/emit_llvm.rs @@ -65,17 +65,24 @@ impl Drop for Context { } } -struct Module(LLVMModuleRef); +pub struct Module(LLVMModuleRef, Context); impl Module { - fn new(ctx: &Context, name: &CStr) -> Self { - Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) }) + fn new(ctx: Context, name: &CStr) -> Self { + Self( + unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) }, + ctx, + ) } fn get(&self) -> LLVMModuleRef { self.0 } + fn context(&self) -> &Context { + &self.1 + } + fn verify(&self) -> Result<(), Message> { let mut err = ptr::null_mut(); let error = unsafe { @@ -92,10 +99,15 @@ impl Module { } } - fn write_bitcode_to_memory(&self) -> MemoryBuffer { + pub fn write_bitcode_to_memory(&self) -> MemoryBuffer { let memory_buffer = unsafe { LLVMWriteBitcodeToMemoryBuffer(self.get()) }; MemoryBuffer(memory_buffer) } + + pub fn print_module_to_string(&self) -> Message { + let asm = unsafe { LLVMPrintModuleToString(self.get()) }; + Message(unsafe { CStr::from_ptr(asm) }) + } } impl Drop for Module { @@ -130,7 +142,7 @@ impl Drop for Builder { } } -struct Message(&'static CStr); +pub struct Message(&'static CStr); impl Drop for Message { fn drop(&mut self) { @@ -146,6 +158,12 @@ impl std::fmt::Debug for Message { } } +impl Message { + pub fn to_str(&self) -> &str { + self.0.to_str().unwrap().trim() + } +} + pub struct MemoryBuffer(LLVMMemoryBufferRef); impl Drop for MemoryBuffer { @@ -169,10 +187,10 @@ impl Deref for MemoryBuffer { pub(super) fn run<'input>( id_defs: GlobalStringIdentResolver2<'input>, directives: Vec, SpirvWord>>, -) -> Result { +) -> Result { let context = Context::new(); - let module = Module::new(&context, LLVM_UNNAMED); - let mut emit_ctx = ModuleEmitContext::new(&context, &module, &id_defs); + let module = Module::new(context, LLVM_UNNAMED); + let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs); for directive in directives { match directive { Directive2::Variable(linking, variable) => emit_ctx.emit_global(linking, variable)?, @@ -182,7 +200,7 @@ pub(super) fn run<'input>( if let Err(err) = module.verify() { panic!("{:?}", err); } - Ok(module.write_bitcode_to_memory()) + Ok(module) } struct ModuleEmitContext<'a, 'input> { @@ -194,11 +212,8 @@ struct ModuleEmitContext<'a, 'input> { } impl<'a, 'input> ModuleEmitContext<'a, 'input> { - fn new( - context: &Context, - module: &Module, - id_defs: &'a GlobalStringIdentResolver2<'input>, - ) -> Self { + fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self { + let context = module.context(); ModuleEmitContext { context: context.get(), module: module.get(), @@ -546,6 +561,7 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments), ast::Instruction::St { data, arguments } => self.emit_st(data, arguments), ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments), + ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments), ast::Instruction::Setp { data, arguments } => self.emit_setp(data, arguments), ast::Instruction::SetpBool { .. } => todo!(), ast::Instruction::Not { data, arguments } => self.emit_not(data, arguments), @@ -2225,6 +2241,25 @@ impl<'a> MethodEmitContext<'a> { Ok(()) } + fn emit_mul24( + &mut self, + data: ast::Mul24Details, + arguments: ast::Mul24Args, + ) -> Result<(), TranslateError> { + let src1 = self.resolver.value(arguments.src1)?; + let src2 = self.resolver.value(arguments.src2)?; + self.emit_intrinsic( + c"llvm.amdgcn.mul.u24", + Some(arguments.dst), + Some(&ast::Type::Scalar(data.type_)), + vec![ + (src1, get_scalar_type(self.context, data.type_)), + (src2, get_scalar_type(self.context, data.type_)), + ], + )?; + Ok(()) + } + fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> { let intrinsic = c"llvm.amdgcn.s.setreg"; let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32); diff --git a/ptx/src/pass/insert_ftz_control.rs b/ptx/src/pass/insert_ftz_control.rs index 25eca97..a1d4a7a 100644 --- a/ptx/src/pass/insert_ftz_control.rs +++ b/ptx/src/pass/insert_ftz_control.rs @@ -987,6 +987,7 @@ fn get_modes(inst: &ast::Instruction) -> InstructionModes { | ast::Instruction::Bar { .. } | ast::Instruction::Cvta { .. } | ast::Instruction::Atom { .. } + | ast::Instruction::Mul24 { .. } | ast::Instruction::AtomCas { .. } => InstructionModes::none(), ast::Instruction::Add { data: ast::ArithDetails::Integer(_), diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index 9eda5f3..1a094fb 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -65,7 +65,7 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result, } diff --git a/ptx/src/test/ll/activemask.ll b/ptx/src/test/ll/activemask.ll new file mode 100644 index 0000000..a54bc7b --- /dev/null +++ b/ptx/src/test/ll/activemask.ll @@ -0,0 +1,32 @@ +declare i32 @__zluda_ptx_impl_activemask() #0 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"37" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"37", ptr addrspace(5) %"35", align 4 + %"38" = call i32 @__zluda_ptx_impl_activemask() + store i32 %"38", ptr addrspace(5) %"36", align 4 + %"39" = load i64, ptr addrspace(5) %"35", align 4 + %"40" = load i32, ptr addrspace(5) %"36", align 4 + %"41" = inttoptr i64 %"39" to ptr + store i32 %"40", ptr %"41", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/add.ll b/ptx/src/test/ll/add.ll new file mode 100644 index 0000000..d8807e0 --- /dev/null +++ b/ptx/src/test/ll/add.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = add i64 %"47", 1 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/add_non_coherent.ll b/ptx/src/test/ll/add_non_coherent.ll new file mode 100644 index 0000000..668031d --- /dev/null +++ b/ptx/src/test/ll/add_non_coherent.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr addrspace(1) + %"44" = load i64, ptr addrspace(1) %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = add i64 %"47", 1 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr addrspace(1) + store i64 %"49", ptr addrspace(1) %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/add_tuning.ll b/ptx/src/test/ll/add_tuning.ll new file mode 100644 index 0000000..0ef4636 --- /dev/null +++ b/ptx/src/test/ll/add_tuning.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = add i64 %"47", 1 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/and.ll b/ptx/src/test/ll/and.ll new file mode 100644 index 0000000..f13e3a7 --- /dev/null +++ b/ptx/src/test/ll/and.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"30", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"56" = and i32 %"50", %"51" + store i32 %"56", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"41", align 4 + %"59" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"59", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/atom_add.ll b/ptx/src/test/ll/atom_add.ll new file mode 100644 index 0000000..b646974 --- /dev/null +++ b/ptx/src/test/ll/atom_add.ll @@ -0,0 +1,55 @@ +@shared_mem = external addrspace(3) global [1024 x i8], align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 + %"60" = inttoptr i64 %"49" to ptr + %"48" = load i32, ptr %"60", align 4 + store i32 %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"61" = inttoptr i64 %"50" to ptr + %"31" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load i32, ptr %"31", align 4 + store i32 %"51", ptr addrspace(5) %"45", align 4 + %"52" = load i32, ptr addrspace(5) %"44", align 4 + store i32 %"52", ptr addrspace(3) @shared_mem, align 4 + %"54" = load i32, ptr addrspace(5) %"45", align 4 + %2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"54" syncscope("agent-one-as") monotonic, align 4 + store i32 %2, ptr addrspace(5) %"44", align 4 + %"55" = load i32, ptr addrspace(3) @shared_mem, align 4 + store i32 %"55", ptr addrspace(5) %"45", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = load i32, ptr addrspace(5) %"44", align 4 + %"65" = inttoptr i64 %"56" to ptr + store i32 %"57", ptr %"65", align 4 + %"58" = load i64, ptr addrspace(5) %"43", align 4 + %"66" = inttoptr i64 %"58" to ptr + %"33" = getelementptr inbounds i8, ptr %"66", i64 4 + %"59" = load i32, ptr addrspace(5) %"45", align 4 + store i32 %"59", ptr %"33", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/atom_add_float.ll b/ptx/src/test/ll/atom_add_float.ll new file mode 100644 index 0000000..33265a4 --- /dev/null +++ b/ptx/src/test/ll/atom_add_float.ll @@ -0,0 +1,55 @@ +@shared_mem = external addrspace(3) global [1024 x i8], align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 + %"60" = inttoptr i64 %"49" to ptr + %"48" = load float, ptr %"60", align 4 + store float %"48", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"61" = inttoptr i64 %"50" to ptr + %"31" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load float, ptr %"31", align 4 + store float %"51", ptr addrspace(5) %"45", align 4 + %"52" = load float, ptr addrspace(5) %"44", align 4 + store float %"52", ptr addrspace(3) @shared_mem, align 4 + %"54" = load float, ptr addrspace(5) %"45", align 4 + %2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"54" syncscope("agent-one-as") monotonic, align 4 + store float %2, ptr addrspace(5) %"44", align 4 + %"55" = load float, ptr addrspace(3) @shared_mem, align 4 + store float %"55", ptr addrspace(5) %"45", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"57" = load float, ptr addrspace(5) %"44", align 4 + %"65" = inttoptr i64 %"56" to ptr + store float %"57", ptr %"65", align 4 + %"58" = load i64, ptr addrspace(5) %"43", align 4 + %"66" = inttoptr i64 %"58" to ptr + %"33" = getelementptr inbounds i8, ptr %"66", i64 4 + %"59" = load float, ptr addrspace(5) %"45", align 4 + store float %"59", ptr %"33", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/atom_cas.ll b/ptx/src/test/ll/atom_cas.ll new file mode 100644 index 0000000..644d0cd --- /dev/null +++ b/ptx/src/test/ll/atom_cas.ll @@ -0,0 +1,53 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i32, align 4, addrspace(5) + %"47" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"48" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"48", ptr addrspace(5) %"44", align 4 + %"49" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"49", ptr addrspace(5) %"45", align 4 + %"51" = load i64, ptr addrspace(5) %"44", align 4 + %"61" = inttoptr i64 %"51" to ptr + %"50" = load i32, ptr %"61", align 4 + store i32 %"50", ptr addrspace(5) %"46", align 4 + %"52" = load i64, ptr addrspace(5) %"44", align 4 + %"62" = inttoptr i64 %"52" to ptr + %"30" = getelementptr inbounds i8, ptr %"62", i64 4 + %"54" = load i32, ptr addrspace(5) %"46", align 4 + %2 = cmpxchg ptr %"30", i32 %"54", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 + %"63" = extractvalue { i32, i1 } %2, 0 + store i32 %"63", ptr addrspace(5) %"46", align 4 + %"55" = load i64, ptr addrspace(5) %"44", align 4 + %"65" = inttoptr i64 %"55" to ptr + %"33" = getelementptr inbounds i8, ptr %"65", i64 4 + %"56" = load i32, ptr %"33", align 4 + store i32 %"56", ptr addrspace(5) %"47", align 4 + %"57" = load i64, ptr addrspace(5) %"45", align 4 + %"58" = load i32, ptr addrspace(5) %"46", align 4 + %"66" = inttoptr i64 %"57" to ptr + store i32 %"58", ptr %"66", align 4 + %"59" = load i64, ptr addrspace(5) %"45", align 4 + %"67" = inttoptr i64 %"59" to ptr + %"35" = getelementptr inbounds i8, ptr %"67", i64 4 + %"60" = load i32, ptr addrspace(5) %"47", align 4 + store i32 %"60", ptr %"35", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/atom_inc.ll b/ptx/src/test/ll/atom_inc.ll new file mode 100644 index 0000000..88ba124 --- /dev/null +++ b/ptx/src/test/ll/atom_inc.ll @@ -0,0 +1,55 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i32, align 4, addrspace(5) + %"47" = alloca i32, align 4, addrspace(5) + %"48" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"49" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"49", ptr addrspace(5) %"44", align 4 + %"50" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"50", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(5) %"44", align 4 + %"63" = inttoptr i64 %"52" to ptr + %2 = atomicrmw uinc_wrap ptr %"63", i32 101 syncscope("agent-one-as") monotonic, align 4 + store i32 %2, ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"44", align 4 + %"64" = inttoptr i64 %"54" to ptr addrspace(1) + %3 = atomicrmw uinc_wrap ptr addrspace(1) %"64", i32 101 syncscope("agent-one-as") monotonic, align 4 + store i32 %3, ptr addrspace(5) %"47", align 4 + %"56" = load i64, ptr addrspace(5) %"44", align 4 + %"65" = inttoptr i64 %"56" to ptr + %"55" = load i32, ptr %"65", align 4 + store i32 %"55", ptr addrspace(5) %"48", align 4 + %"57" = load i64, ptr addrspace(5) %"45", align 4 + %"58" = load i32, ptr addrspace(5) %"46", align 4 + %"66" = inttoptr i64 %"57" to ptr + store i32 %"58", ptr %"66", align 4 + %"59" = load i64, ptr addrspace(5) %"45", align 4 + %"67" = inttoptr i64 %"59" to ptr + %"33" = getelementptr inbounds i8, ptr %"67", i64 4 + %"60" = load i32, ptr addrspace(5) %"47", align 4 + store i32 %"60", ptr %"33", align 4 + %"61" = load i64, ptr addrspace(5) %"45", align 4 + %"68" = inttoptr i64 %"61" to ptr + %"35" = getelementptr inbounds i8, ptr %"68", i64 8 + %"62" = load i32, ptr addrspace(5) %"48", align 4 + store i32 %"62", ptr %"35", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/b64tof64.ll b/ptx/src/test/ll/b64tof64.ll new file mode 100644 index 0000000..2373b64 --- /dev/null +++ b/ptx/src/test/ll/b64tof64.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca double, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"41" = load double, ptr addrspace(4) %"35", align 8 + store double %"41", ptr addrspace(5) %"37", align 8 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load double, ptr addrspace(5) %"37", align 8 + %"50" = bitcast double %"44" to i64 + store i64 %"50", ptr addrspace(5) %"38", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"51" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"51", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"52" = inttoptr i64 %"47" to ptr + store i64 %"48", ptr %"52", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/bfe.ll b/ptx/src/test/ll/bfe.ll new file mode 100644 index 0000000..fda252d --- /dev/null +++ b/ptx/src/test/ll/bfe.ll @@ -0,0 +1,54 @@ +declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) + %"46" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"47" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"61" = inttoptr i64 %"50" to ptr + %"49" = load i32, ptr %"61", align 4 + store i32 %"49", ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"51" to ptr + %"31" = getelementptr inbounds i8, ptr %"62", i64 4 + %"52" = load i32, ptr %"31", align 4 + store i32 %"52", ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = inttoptr i64 %"53" to ptr + %"33" = getelementptr inbounds i8, ptr %"63", i64 8 + %"54" = load i32, ptr %"33", align 4 + store i32 %"54", ptr addrspace(5) %"46", align 4 + %"56" = load i32, ptr addrspace(5) %"44", align 4 + %"57" = load i32, ptr addrspace(5) %"45", align 4 + %"58" = load i32, ptr addrspace(5) %"46", align 4 + %"55" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"56", i32 %"57", i32 %"58") + store i32 %"55", ptr addrspace(5) %"44", align 4 + %"59" = load i64, ptr addrspace(5) %"43", align 4 + %"60" = load i32, ptr addrspace(5) %"44", align 4 + %"64" = inttoptr i64 %"59" to ptr + store i32 %"60", ptr %"64", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/bfi.ll b/ptx/src/test/ll/bfi.ll new file mode 100644 index 0000000..ef437c1 --- /dev/null +++ b/ptx/src/test/ll/bfi.ll @@ -0,0 +1,61 @@ +declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i32, align 4, addrspace(5) + %"48" = alloca i32, align 4, addrspace(5) + %"49" = alloca i32, align 4, addrspace(5) + %"50" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"51" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"52", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"45", align 4 + %"68" = inttoptr i64 %"54" to ptr + %"53" = load i32, ptr %"68", align 4 + store i32 %"53", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"69" = inttoptr i64 %"55" to ptr + %"32" = getelementptr inbounds i8, ptr %"69", i64 4 + %"56" = load i32, ptr %"32", align 4 + store i32 %"56", ptr addrspace(5) %"48", align 4 + %"57" = load i64, ptr addrspace(5) %"45", align 4 + %"70" = inttoptr i64 %"57" to ptr + %"34" = getelementptr inbounds i8, ptr %"70", i64 8 + %"58" = load i32, ptr %"34", align 4 + store i32 %"58", ptr addrspace(5) %"49", align 4 + %"59" = load i64, ptr addrspace(5) %"45", align 4 + %"71" = inttoptr i64 %"59" to ptr + %"36" = getelementptr inbounds i8, ptr %"71", i64 12 + %"60" = load i32, ptr %"36", align 4 + store i32 %"60", ptr addrspace(5) %"50", align 4 + %"62" = load i32, ptr addrspace(5) %"47", align 4 + %"63" = load i32, ptr addrspace(5) %"48", align 4 + %"64" = load i32, ptr addrspace(5) %"49", align 4 + %"65" = load i32, ptr addrspace(5) %"50", align 4 + %"72" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"62", i32 %"63", i32 %"64", i32 %"65") + store i32 %"72", ptr addrspace(5) %"47", align 4 + %"66" = load i64, ptr addrspace(5) %"46", align 4 + %"67" = load i32, ptr addrspace(5) %"47", align 4 + %"75" = inttoptr i64 %"66" to ptr + store i32 %"67", ptr %"75", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/block.ll b/ptx/src/test/ll/block.ll new file mode 100644 index 0000000..523d941 --- /dev/null +++ b/ptx/src/test/ll/block.ll @@ -0,0 +1,43 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"46" = load i64, ptr %"55", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 + %"48" = add i64 %"49", 1 + store i64 %"48", ptr addrspace(5) %"43", align 4 + %"52" = load i64, ptr addrspace(5) %"50", align 4 + %"51" = add i64 %"52", 1 + store i64 %"51", ptr addrspace(5) %"50", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"56" = inttoptr i64 %"53" to ptr + store i64 %"54", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/bra.ll b/ptx/src/test/ll/bra.ll new file mode 100644 index 0000000..0fb9769 --- /dev/null +++ b/ptx/src/test/ll/bra.ll @@ -0,0 +1,51 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"47" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = inttoptr i64 %"49" to ptr + %"48" = load i64, ptr %"56", align 4 + store i64 %"48", ptr addrspace(5) %"44", align 4 + br label %"9" + +"9": ; preds = %1 + %"51" = load i64, ptr addrspace(5) %"44", align 4 + %"50" = add i64 %"51", 1 + store i64 %"50", ptr addrspace(5) %"45", align 4 + br label %"11" + +"10": ; No predecessors! + %"53" = load i64, ptr addrspace(5) %"44", align 4 + %"52" = add i64 %"53", 2 + store i64 %"52", ptr addrspace(5) %"45", align 4 + br label %"11" + +"11": ; preds = %"10", %"9" + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"57" = inttoptr i64 %"54" to ptr + store i64 %"55", ptr %"57", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/brev.ll b/ptx/src/test/ll/brev.ll new file mode 100644 index 0000000..6f10c94 --- /dev/null +++ b/ptx/src/test/ll/brev.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"47", align 4 + store i32 %"41", ptr addrspace(5) %"38", align 4 + %"44" = load i32, ptr addrspace(5) %"38", align 4 + %"43" = call i32 @llvm.bitreverse.i32(i32 %"44") + store i32 %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load i32, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store i32 %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.bitreverse.i32(i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/call.ll b/ptx/src/test/ll/call.ll new file mode 100644 index 0000000..c9bb5ce --- /dev/null +++ b/ptx/src/test/ll/call.ll @@ -0,0 +1,66 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define i64 @__zluda_ptx_impl_incr(i64 %"42") #0 { + %"65" = alloca i64, align 8, addrspace(5) + %"66" = alloca i64, align 8, addrspace(5) + %"67" = alloca i64, align 8, addrspace(5) + %"68" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"42", ptr addrspace(5) %"67", align 4 + %"69" = load i64, ptr addrspace(5) %"67", align 4 + store i64 %"69", ptr addrspace(5) %"68", align 4 + %"71" = load i64, ptr addrspace(5) %"68", align 4 + %"70" = add i64 %"71", 1 + store i64 %"70", ptr addrspace(5) %"68", align 4 + %"72" = load i64, ptr addrspace(5) %"68", align 4 + store i64 %"72", ptr addrspace(5) %"66", align 4 + %"73" = load i64, ptr addrspace(5) %"66", align 4 + store i64 %"73", ptr addrspace(5) %"65", align 4 + %2 = load i64, ptr addrspace(5) %"65", align 4 + ret i64 %2 +} + +define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 { + %"52" = alloca i64, align 8, addrspace(5) + %"53" = alloca i64, align 8, addrspace(5) + %"54" = alloca i64, align 8, addrspace(5) + %"59" = alloca i64, align 8, addrspace(5) + %"60" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"55" = load i64, ptr addrspace(4) %"50", align 4 + store i64 %"55", ptr addrspace(5) %"52", align 4 + %"56" = load i64, ptr addrspace(4) %"51", align 4 + store i64 %"56", ptr addrspace(5) %"53", align 4 + %"58" = load i64, ptr addrspace(5) %"52", align 4 + %"74" = inttoptr i64 %"58" to ptr addrspace(1) + %"57" = load i64, ptr addrspace(1) %"74", align 4 + store i64 %"57", ptr addrspace(5) %"54", align 4 + %"61" = load i64, ptr addrspace(5) %"54", align 4 + store i64 %"61", ptr addrspace(5) %"59", align 4 + %"39" = load i64, ptr addrspace(5) %"59", align 4 + %"40" = call i64 @__zluda_ptx_impl_incr(i64 %"39") + store i64 %"40", ptr addrspace(5) %"60", align 4 + %"62" = load i64, ptr addrspace(5) %"60", align 4 + store i64 %"62", ptr addrspace(5) %"54", align 4 + %"63" = load i64, ptr addrspace(5) %"53", align 4 + %"64" = load i64, ptr addrspace(5) %"54", align 4 + %"77" = inttoptr i64 %"63" to ptr addrspace(1) + store i64 %"64", ptr addrspace(1) %"77", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/clz.ll b/ptx/src/test/ll/clz.ll new file mode 100644 index 0000000..160a634 --- /dev/null +++ b/ptx/src/test/ll/clz.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"47", align 4 + store i32 %"41", ptr addrspace(5) %"38", align 4 + %"44" = load i32, ptr addrspace(5) %"38", align 4 + %"48" = call i32 @llvm.ctlz.i32(i32 %"44", i1 false) + store i32 %"48", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load i32, ptr addrspace(5) %"38", align 4 + %"49" = inttoptr i64 %"45" to ptr + store i32 %"46", ptr %"49", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/const.ll b/ptx/src/test/ll/const.ll new file mode 100644 index 0000000..0fbd7e0 --- /dev/null +++ b/ptx/src/test/ll/const.ll @@ -0,0 +1,59 @@ +@constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 { + %"52" = alloca i64, align 8, addrspace(5) + %"53" = alloca i64, align 8, addrspace(5) + %"54" = alloca i16, align 2, addrspace(5) + %"55" = alloca i16, align 2, addrspace(5) + %"56" = alloca i16, align 2, addrspace(5) + %"57" = alloca i16, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + %"58" = load i64, ptr addrspace(4) %"50", align 4 + store i64 %"58", ptr addrspace(5) %"52", align 4 + %"59" = load i64, ptr addrspace(4) %"51", align 4 + store i64 %"59", ptr addrspace(5) %"53", align 4 + %"60" = load i16, ptr addrspace(4) @constparams, align 2 + store i16 %"60", ptr addrspace(5) %"54", align 2 + %"61" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 + store i16 %"61", ptr addrspace(5) %"55", align 2 + %"62" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 + store i16 %"62", ptr addrspace(5) %"56", align 2 + %"63" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 + store i16 %"63", ptr addrspace(5) %"57", align 2 + %"64" = load i64, ptr addrspace(5) %"53", align 4 + %"65" = load i16, ptr addrspace(5) %"54", align 2 + %"76" = inttoptr i64 %"64" to ptr + store i16 %"65", ptr %"76", align 2 + %"66" = load i64, ptr addrspace(5) %"53", align 4 + %"78" = inttoptr i64 %"66" to ptr + %"39" = getelementptr inbounds i8, ptr %"78", i64 2 + %"67" = load i16, ptr addrspace(5) %"55", align 2 + store i16 %"67", ptr %"39", align 2 + %"68" = load i64, ptr addrspace(5) %"53", align 4 + %"80" = inttoptr i64 %"68" to ptr + %"41" = getelementptr inbounds i8, ptr %"80", i64 4 + %"69" = load i16, ptr addrspace(5) %"56", align 2 + store i16 %"69", ptr %"41", align 2 + %"70" = load i64, ptr addrspace(5) %"53", align 4 + %"82" = inttoptr i64 %"70" to ptr + %"43" = getelementptr inbounds i8, ptr %"82", i64 6 + %"71" = load i16, ptr addrspace(5) %"57", align 2 + store i16 %"71", ptr %"43", align 2 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/constant_f32.ll b/ptx/src/test/ll/constant_f32.ll new file mode 100644 index 0000000..60f625f --- /dev/null +++ b/ptx/src/test/ll/constant_f32.ll @@ -0,0 +1,38 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load float, ptr %"48", align 4 + store float %"42", ptr addrspace(5) %"39", align 4 + %"45" = load float, ptr addrspace(5) %"39", align 4 + %"44" = fmul float %"45", 5.000000e-01 + store float %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load float, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store float %"47", ptr %"49", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/constant_negative.ll b/ptx/src/test/ll/constant_negative.ll new file mode 100644 index 0000000..201b867 --- /dev/null +++ b/ptx/src/test/ll/constant_negative.ll @@ -0,0 +1,38 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i32, ptr %"48", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"44" = mul i32 %"45", -1 + store i32 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store i32 %"47", ptr %"49", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/cos.ll b/ptx/src/test/ll/cos.ll new file mode 100644 index 0000000..44c0ee0 --- /dev/null +++ b/ptx/src/test/ll/cos.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"47", align 4 + store float %"41", ptr addrspace(5) %"38", align 4 + %"44" = load float, ptr addrspace(5) %"38", align 4 + %"43" = call afn float @llvm.cos.f32(float %"44") + store float %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load float, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store float %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.cos.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/cvt_f64_f32.ll b/ptx/src/test/ll/cvt_f64_f32.ll new file mode 100644 index 0000000..4d5cf2c --- /dev/null +++ b/ptx/src/test/ll/cvt_f64_f32.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) + %"40" = alloca double, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr addrspace(1) + %"43" = load float, ptr addrspace(1) %"49", align 4 + store float %"43", ptr addrspace(5) %"39", align 4 + %"46" = load float, ptr addrspace(5) %"39", align 4 + %"45" = fpext float %"46" to double + store double %"45", ptr addrspace(5) %"40", align 8 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"48" = load double, ptr addrspace(5) %"40", align 8 + %"50" = inttoptr i64 %"47" to ptr + store double %"48", ptr %"50", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/cvt_rni.ll b/ptx/src/test/ll/cvt_rni.ll new file mode 100644 index 0000000..850b1fb --- /dev/null +++ b/ptx/src/test/ll/cvt_rni.ll @@ -0,0 +1,58 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"59" = inttoptr i64 %"48" to ptr + %"47" = load float, ptr %"59", align 4 + store float %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"60" = inttoptr i64 %"49" to ptr + %"30" = getelementptr inbounds i8, ptr %"60", i64 4 + %"50" = load float, ptr %"30", align 4 + store float %"50", ptr addrspace(5) %"44", align 4 + %"52" = load float, ptr addrspace(5) %"43", align 4 + %2 = call float @llvm.roundeven.f32(float %"52") + %"51" = freeze float %2 + store float %"51", ptr addrspace(5) %"43", align 4 + %"54" = load float, ptr addrspace(5) %"44", align 4 + %3 = call float @llvm.roundeven.f32(float %"54") + %"53" = freeze float %3 + store float %"53", ptr addrspace(5) %"44", align 4 + %"55" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = load float, ptr addrspace(5) %"43", align 4 + %"61" = inttoptr i64 %"55" to ptr + store float %"56", ptr %"61", align 4 + %"57" = load i64, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"57" to ptr + %"32" = getelementptr inbounds i8, ptr %"62", i64 4 + %"58" = load float, ptr addrspace(5) %"44", align 4 + store float %"58", ptr %"32", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.roundeven.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/cvt_rzi.ll b/ptx/src/test/ll/cvt_rzi.ll new file mode 100644 index 0000000..05a2d49 --- /dev/null +++ b/ptx/src/test/ll/cvt_rzi.ll @@ -0,0 +1,58 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"59" = inttoptr i64 %"48" to ptr + %"47" = load float, ptr %"59", align 4 + store float %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"60" = inttoptr i64 %"49" to ptr + %"30" = getelementptr inbounds i8, ptr %"60", i64 4 + %"50" = load float, ptr %"30", align 4 + store float %"50", ptr addrspace(5) %"44", align 4 + %"52" = load float, ptr addrspace(5) %"43", align 4 + %2 = call float @llvm.trunc.f32(float %"52") + %"51" = freeze float %2 + store float %"51", ptr addrspace(5) %"43", align 4 + %"54" = load float, ptr addrspace(5) %"44", align 4 + %3 = call float @llvm.trunc.f32(float %"54") + %"53" = freeze float %3 + store float %"53", ptr addrspace(5) %"44", align 4 + %"55" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = load float, ptr addrspace(5) %"43", align 4 + %"61" = inttoptr i64 %"55" to ptr + store float %"56", ptr %"61", align 4 + %"57" = load i64, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"57" to ptr + %"32" = getelementptr inbounds i8, ptr %"62", i64 4 + %"58" = load float, ptr addrspace(5) %"44", align 4 + store float %"58", ptr %"32", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.trunc.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/cvt_s16_s8.ll b/ptx/src/test/ll/cvt_s16_s8.ll new file mode 100644 index 0000000..b36fc88 --- /dev/null +++ b/ptx/src/test/ll/cvt_s16_s8.ll @@ -0,0 +1,41 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr addrspace(1) + %"43" = load i32, ptr addrspace(1) %"49", align 4 + store i32 %"43", ptr addrspace(5) %"40", align 4 + %"46" = load i32, ptr addrspace(5) %"40", align 4 + %2 = trunc i32 %"46" to i8 + %"50" = sext i8 %2 to i16 + %"45" = sext i16 %"50" to i32 + store i32 %"45", ptr addrspace(5) %"39", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"48" = load i32, ptr addrspace(5) %"39", align 4 + %"52" = inttoptr i64 %"47" to ptr + store i32 %"48", ptr %"52", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/cvt_s32_f32.ll b/ptx/src/test/ll/cvt_s32_f32.ll new file mode 100644 index 0000000..5a8e804 --- /dev/null +++ b/ptx/src/test/ll/cvt_s32_f32.ll @@ -0,0 +1,64 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"60" = inttoptr i64 %"48" to ptr + %"59" = load float, ptr %"60", align 4 + %"47" = bitcast float %"59" to i32 + store i32 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"61" = inttoptr i64 %"49" to ptr + %"30" = getelementptr inbounds i8, ptr %"61", i64 4 + %"62" = load float, ptr %"30", align 4 + %"50" = bitcast float %"62" to i32 + store i32 %"50", ptr addrspace(5) %"44", align 4 + %"52" = load i32, ptr addrspace(5) %"43", align 4 + %"64" = bitcast i32 %"52" to float + %2 = call float @llvm.ceil.f32(float %"64") + %3 = fptosi float %2 to i32 + %"63" = freeze i32 %3 + store i32 %"63", ptr addrspace(5) %"43", align 4 + %"54" = load i32, ptr addrspace(5) %"44", align 4 + %"66" = bitcast i32 %"54" to float + %4 = call float @llvm.ceil.f32(float %"66") + %5 = fptosi float %4 to i32 + %"65" = freeze i32 %5 + store i32 %"65", ptr addrspace(5) %"44", align 4 + %"55" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = load i32, ptr addrspace(5) %"43", align 4 + %"67" = inttoptr i64 %"55" to ptr addrspace(1) + store i32 %"56", ptr addrspace(1) %"67", align 4 + %"57" = load i64, ptr addrspace(5) %"42", align 4 + %"69" = inttoptr i64 %"57" to ptr addrspace(1) + %"32" = getelementptr inbounds i8, ptr addrspace(1) %"69", i64 4 + %"58" = load i32, ptr addrspace(5) %"44", align 4 + store i32 %"58", ptr addrspace(1) %"32", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.ceil.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/cvt_s64_s32.ll b/ptx/src/test/ll/cvt_s64_s32.ll new file mode 100644 index 0000000..5aa91b1 --- /dev/null +++ b/ptx/src/test/ll/cvt_s64_s32.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"50" = inttoptr i64 %"44" to ptr + %"49" = load i32, ptr %"50", align 4 + store i32 %"49", ptr addrspace(5) %"39", align 4 + %"46" = load i32, ptr addrspace(5) %"39", align 4 + %"45" = sext i32 %"46" to i64 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"51" = inttoptr i64 %"47" to ptr + store i64 %"48", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/cvt_sat_s_u.ll b/ptx/src/test/ll/cvt_sat_s_u.ll new file mode 100644 index 0000000..63954f8 --- /dev/null +++ b/ptx/src/test/ll/cvt_sat_s_u.ll @@ -0,0 +1,50 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"53" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"53", align 4 + store i32 %"45", ptr addrspace(5) %"40", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + %2 = call i32 @llvm.smax.i32(i32 %"48", i32 0) + %3 = call i32 @llvm.umin.i32(i32 %2, i32 -1) + store i32 %3, ptr addrspace(5) %"41", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"51" = load i64, ptr addrspace(5) %"39", align 4 + %"52" = load i32, ptr addrspace(5) %"42", align 4 + %"54" = inttoptr i64 %"51" to ptr + store i32 %"52", ptr %"54", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.umin.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/cvta.ll b/ptx/src/test/ll/cvta.ll new file mode 100644 index 0000000..495b312 --- /dev/null +++ b/ptx/src/test/ll/cvta.ll @@ -0,0 +1,43 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %2 = inttoptr i64 %"42" to ptr + %"49" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"49", ptr addrspace(5) %"36", align 8 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %3 = inttoptr i64 %"44" to ptr + %"51" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"51", ptr addrspace(5) %"37", align 8 + %"46" = load i64, ptr addrspace(5) %"36", align 4 + %"53" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load float, ptr addrspace(1) %"53", align 4 + store float %"45", ptr addrspace(5) %"38", align 4 + %"47" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = load float, ptr addrspace(5) %"38", align 4 + %"54" = inttoptr i64 %"47" to ptr addrspace(1) + store float %"48", ptr addrspace(1) %"54", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/div_approx.ll b/ptx/src/test/ll/div_approx.ll new file mode 100644 index 0000000..cb8cb28 --- /dev/null +++ b/ptx/src/test/ll/div_approx.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca float, align 4, addrspace(5) + %"42" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load float, ptr %"54", align 4 + store float %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load float, ptr %"30", align 4 + store float %"48", ptr addrspace(5) %"42", align 4 + %"50" = load float, ptr addrspace(5) %"41", align 4 + %"51" = load float, ptr addrspace(5) %"42", align 4 + %"49" = fdiv arcp afn float %"50", %"51" + store float %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load float, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store float %"53", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/ex2.ll b/ptx/src/test/ll/ex2.ll new file mode 100644 index 0000000..904f238 --- /dev/null +++ b/ptx/src/test/ll/ex2.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"47", align 4 + store float %"41", ptr addrspace(5) %"38", align 4 + %"44" = load float, ptr addrspace(5) %"38", align 4 + %"43" = call float @llvm.amdgcn.exp2.f32(float %"44") + store float %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load float, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store float %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.exp2.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/extern_shared.ll b/ptx/src/test/ll/extern_shared.ll new file mode 100644 index 0000000..9b872ec --- /dev/null +++ b/ptx/src/test/ll/extern_shared.ll @@ -0,0 +1,41 @@ +@shared_mem = external addrspace(3) global [0 x i32] + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr addrspace(1) + %"42" = load i64, ptr addrspace(1) %"48", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"39", align 4 + store i64 %"44", ptr addrspace(3) @shared_mem, align 4 + %"45" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"45", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"46" to ptr addrspace(1) + store i64 %"47", ptr addrspace(1) %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/extern_shared_call.ll b/ptx/src/test/ll/extern_shared_call.ll new file mode 100644 index 0000000..923523b --- /dev/null +++ b/ptx/src/test/ll/extern_shared_call.ll @@ -0,0 +1,57 @@ +@shared_mem = external addrspace(3) global [0 x i32], align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define void @__zluda_ptx_impl_incr_shared_2_global() #0 { + %"38" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"39", ptr addrspace(5) %"38", align 4 + %"41" = load i64, ptr addrspace(5) %"38", align 4 + %"40" = add i64 %"41", 2 + store i64 %"40", ptr addrspace(5) %"38", align 4 + %"42" = load i64, ptr addrspace(5) %"38", align 4 + store i64 %"42", ptr addrspace(3) @shared_mem, align 4 + ret void +} + +define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"48" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"48", ptr addrspace(5) %"45", align 4 + %"49" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"49", ptr addrspace(5) %"46", align 4 + %"51" = load i64, ptr addrspace(5) %"45", align 4 + %"58" = inttoptr i64 %"51" to ptr addrspace(1) + %"50" = load i64, ptr addrspace(1) %"58", align 4 + store i64 %"50", ptr addrspace(5) %"47", align 4 + %"52" = load i64, ptr addrspace(5) %"47", align 4 + store i64 %"52", ptr addrspace(3) @shared_mem, align 4 + call void @__zluda_ptx_impl_incr_shared_2_global() + %"53" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"54" = load i64, ptr addrspace(5) %"46", align 4 + %"55" = load i64, ptr addrspace(5) %"47", align 4 + %"61" = inttoptr i64 %"54" to ptr addrspace(1) + store i64 %"55", ptr addrspace(1) %"61", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/fma.ll b/ptx/src/test/ll/fma.ll new file mode 100644 index 0000000..4a454ef --- /dev/null +++ b/ptx/src/test/ll/fma.ll @@ -0,0 +1,56 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) + %"46" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"47" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"61" = inttoptr i64 %"50" to ptr + %"49" = load float, ptr %"61", align 4 + store float %"49", ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"51" to ptr + %"31" = getelementptr inbounds i8, ptr %"62", i64 4 + %"52" = load float, ptr %"31", align 4 + store float %"52", ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = inttoptr i64 %"53" to ptr + %"33" = getelementptr inbounds i8, ptr %"63", i64 8 + %"54" = load float, ptr %"33", align 4 + store float %"54", ptr addrspace(5) %"46", align 4 + %"56" = load float, ptr addrspace(5) %"44", align 4 + %"57" = load float, ptr addrspace(5) %"45", align 4 + %"58" = load float, ptr addrspace(5) %"46", align 4 + %"55" = call float @llvm.fma.f32(float %"56", float %"57", float %"58") + store float %"55", ptr addrspace(5) %"44", align 4 + %"59" = load i64, ptr addrspace(5) %"43", align 4 + %"60" = load float, ptr addrspace(5) %"44", align 4 + %"64" = inttoptr i64 %"59" to ptr + store float %"60", ptr %"64", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.fma.f32(float, float, float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/global_array.ll b/ptx/src/test/ll/global_array.ll new file mode 100644 index 0000000..fede5f7 --- /dev/null +++ b/ptx/src/test/ll/global_array.ll @@ -0,0 +1,36 @@ +@foobar = addrspace(1) global [4 x i32] [i32 1, i32 0, i32 0, i32 0] + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"43" to ptr addrspace(1) + %"42" = load i32, ptr addrspace(1) %"47", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = inttoptr i64 %"44" to ptr addrspace(1) + store i32 %"45", ptr addrspace(1) %"48", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/ld_st.ll b/ptx/src/test/ll/ld_st.ll new file mode 100644 index 0000000..7c37090 --- /dev/null +++ b/ptx/src/test/ll/ld_st.ll @@ -0,0 +1,35 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"45" = inttoptr i64 %"42" to ptr + %"41" = load i64, ptr %"45", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"46" = inttoptr i64 %"43" to ptr + store i64 %"44", ptr %"46", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/ld_st_implicit.ll b/ptx/src/test/ll/ld_st_implicit.ll new file mode 100644 index 0000000..cb4e08a --- /dev/null +++ b/ptx/src/test/ll/ld_st_implicit.ll @@ -0,0 +1,40 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + store i64 81985529216486895, ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"44" to ptr addrspace(1) + %"47" = load float, ptr addrspace(1) %"48", align 4 + %2 = bitcast float %"47" to i32 + %"43" = zext i32 %2 to i64 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"45" to ptr addrspace(1) + %3 = trunc i64 %"46" to i32 + %"50" = bitcast i32 %3 to float + store float %"50", ptr addrspace(1) %"49", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/ld_st_offset.ll b/ptx/src/test/ll/ld_st_offset.ll new file mode 100644 index 0000000..81e0c62 --- /dev/null +++ b/ptx/src/test/ll/ld_st_offset.ll @@ -0,0 +1,46 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"41", align 4 + %"55" = inttoptr i64 %"48" to ptr + %"47" = load i32, ptr %"55", align 4 + store i32 %"47", ptr addrspace(5) %"43", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"49" to ptr + %"30" = getelementptr inbounds i8, ptr %"56", i64 4 + %"50" = load i32, ptr %"30", align 4 + store i32 %"50", ptr addrspace(5) %"44", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"52" = load i32, ptr addrspace(5) %"44", align 4 + %"57" = inttoptr i64 %"51" to ptr + store i32 %"52", ptr %"57", align 4 + %"53" = load i64, ptr addrspace(5) %"42", align 4 + %"58" = inttoptr i64 %"53" to ptr + %"32" = getelementptr inbounds i8, ptr %"58", i64 4 + %"54" = load i32, ptr addrspace(5) %"43", align 4 + store i32 %"54", ptr %"32", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/lg2.ll b/ptx/src/test/ll/lg2.ll new file mode 100644 index 0000000..543ae0a --- /dev/null +++ b/ptx/src/test/ll/lg2.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"47", align 4 + store float %"41", ptr addrspace(5) %"38", align 4 + %"44" = load float, ptr addrspace(5) %"38", align 4 + %"43" = call float @llvm.amdgcn.log.f32(float %"44") + store float %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load float, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store float %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.log.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/local_align.ll b/ptx/src/test/ll/local_align.ll new file mode 100644 index 0000000..08c7971 --- /dev/null +++ b/ptx/src/test/ll/local_align.ll @@ -0,0 +1,36 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"9" = alloca [8 x i8], align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = inttoptr i64 %"43" to ptr + %"42" = load i64, ptr %"46", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"39", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mad_s32.ll b/ptx/src/test/ll/mad_s32.ll new file mode 100644 index 0000000..f6ea9a8 --- /dev/null +++ b/ptx/src/test/ll/mad_s32.ll @@ -0,0 +1,64 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i32, align 4, addrspace(5) + %"50" = alloca i32, align 4, addrspace(5) + %"51" = alloca i32, align 4, addrspace(5) + %"52" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"53" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"54" = load i64, ptr addrspace(4) %"46", align 4 + store i64 %"54", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"47", align 4 + %"71" = inttoptr i64 %"56" to ptr + %"55" = load i32, ptr %"71", align 4 + store i32 %"55", ptr addrspace(5) %"50", align 4 + %"57" = load i64, ptr addrspace(5) %"47", align 4 + %"72" = inttoptr i64 %"57" to ptr + %"32" = getelementptr inbounds i8, ptr %"72", i64 4 + %"58" = load i32, ptr %"32", align 4 + store i32 %"58", ptr addrspace(5) %"51", align 4 + %"59" = load i64, ptr addrspace(5) %"47", align 4 + %"73" = inttoptr i64 %"59" to ptr + %"34" = getelementptr inbounds i8, ptr %"73", i64 8 + %"60" = load i32, ptr %"34", align 4 + store i32 %"60", ptr addrspace(5) %"52", align 4 + %"62" = load i32, ptr addrspace(5) %"50", align 4 + %"63" = load i32, ptr addrspace(5) %"51", align 4 + %"64" = load i32, ptr addrspace(5) %"52", align 4 + %2 = mul i32 %"62", %"63" + %"61" = add i32 %2, %"64" + store i32 %"61", ptr addrspace(5) %"49", align 4 + %"65" = load i64, ptr addrspace(5) %"48", align 4 + %"66" = load i32, ptr addrspace(5) %"49", align 4 + %"74" = inttoptr i64 %"65" to ptr + store i32 %"66", ptr %"74", align 4 + %"67" = load i64, ptr addrspace(5) %"48", align 4 + %"75" = inttoptr i64 %"67" to ptr + %"36" = getelementptr inbounds i8, ptr %"75", i64 4 + %"68" = load i32, ptr addrspace(5) %"49", align 4 + store i32 %"68", ptr %"36", align 4 + %"69" = load i64, ptr addrspace(5) %"48", align 4 + %"76" = inttoptr i64 %"69" to ptr + %"38" = getelementptr inbounds i8, ptr %"76", i64 8 + %"70" = load i32, ptr addrspace(5) %"49", align 4 + store i32 %"70", ptr %"38", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/max.ll b/ptx/src/test/ll/max.ll new file mode 100644 index 0000000..e8f58ba --- /dev/null +++ b/ptx/src/test/ll/max.ll @@ -0,0 +1,49 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"30", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"49" = call i32 @llvm.smax.i32(i32 %"50", i32 %"51") + store i32 %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"56", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/membar.ll b/ptx/src/test/ll/membar.ll new file mode 100644 index 0000000..2e78f12 --- /dev/null +++ b/ptx/src/test/ll/membar.ll @@ -0,0 +1,36 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"46" = inttoptr i64 %"42" to ptr + %"45" = load i32, ptr %"46", align 4 + store i32 %"45", ptr addrspace(5) %"38", align 4 + fence seq_cst + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"44" = load i32, ptr addrspace(5) %"38", align 4 + %"47" = inttoptr i64 %"43" to ptr + store i32 %"44", ptr %"47", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/min.ll b/ptx/src/test/ll/min.ll new file mode 100644 index 0000000..e868195 --- /dev/null +++ b/ptx/src/test/ll/min.ll @@ -0,0 +1,49 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"30", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"49" = call i32 @llvm.smin.i32(i32 %"50", i32 %"51") + store i32 %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"56", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/mov.ll b/ptx/src/test/ll/mov.ll new file mode 100644 index 0000000..cf6c7ee --- /dev/null +++ b/ptx/src/test/ll/mov.ll @@ -0,0 +1,38 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr + %"43" = load i64, ptr %"49", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + store i64 %"46", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"50" = inttoptr i64 %"47" to ptr + store i64 %"48", ptr %"50", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mov_address.ll b/ptx/src/test/ll/mov_address.ll new file mode 100644 index 0000000..644df01 --- /dev/null +++ b/ptx/src/test/ll/mov_address.ll @@ -0,0 +1,24 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"9" = alloca [8 x i8], align 1, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"37" = ptrtoint ptr addrspace(5) %"9" to i64 + store i64 %"37", ptr addrspace(5) %"35", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mul24.ll b/ptx/src/test/ll/mul24.ll new file mode 100644 index 0000000..aae8aa0 --- /dev/null +++ b/ptx/src/test/ll/mul24.ll @@ -0,0 +1,43 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i32, ptr %"50", align 4 + store i32 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i32, ptr addrspace(5) %"40", align 4 + %"46" = call i32 @llvm.amdgcn.mul.u24(i32 %"47", i32 2) + store i32 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"51", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_ftz.ll b/ptx/src/test/ll/mul_ftz.ll new file mode 100644 index 0000000..ceacd5d --- /dev/null +++ b/ptx/src/test/ll/mul_ftz.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca float, align 4, addrspace(5) + %"42" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load float, ptr %"54", align 4 + store float %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load float, ptr %"30", align 4 + store float %"48", ptr addrspace(5) %"42", align 4 + %"50" = load float, ptr addrspace(5) %"41", align 4 + %"51" = load float, ptr addrspace(5) %"42", align 4 + %"49" = fmul float %"50", %"51" + store float %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load float, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store float %"53", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mul_hi.ll b/ptx/src/test/ll/mul_hi.ll new file mode 100644 index 0000000..57ee469 --- /dev/null +++ b/ptx/src/test/ll/mul_hi.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %2 = zext i64 %"47" to i128 + %3 = mul i128 %2, 2 + %4 = lshr i128 %3, 64 + %"46" = trunc i128 %4 to i64 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mul_lo.ll b/ptx/src/test/ll/mul_lo.ll new file mode 100644 index 0000000..15f39e8 --- /dev/null +++ b/ptx/src/test/ll/mul_lo.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = mul i64 %"47", 2 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mul_non_ftz.ll b/ptx/src/test/ll/mul_non_ftz.ll new file mode 100644 index 0000000..ee1da37 --- /dev/null +++ b/ptx/src/test/ll/mul_non_ftz.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca float, align 4, addrspace(5) + %"42" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load float, ptr %"54", align 4 + store float %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load float, ptr %"30", align 4 + store float %"48", ptr addrspace(5) %"42", align 4 + %"50" = load float, ptr addrspace(5) %"41", align 4 + %"51" = load float, ptr addrspace(5) %"42", align 4 + %"49" = fmul float %"50", %"51" + store float %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load float, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store float %"53", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/mul_wide.ll b/ptx/src/test/ll/mul_wide.ll new file mode 100644 index 0000000..7b815d1 --- /dev/null +++ b/ptx/src/test/ll/mul_wide.ll @@ -0,0 +1,48 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"45" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr addrspace(1) + %"47" = load i32, ptr addrspace(1) %"56", align 4 + store i32 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"40", align 4 + %"57" = inttoptr i64 %"49" to ptr addrspace(1) + %"31" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 4 + %"50" = load i32, ptr addrspace(1) %"31", align 4 + store i32 %"50", ptr addrspace(5) %"43", align 4 + %"52" = load i32, ptr addrspace(5) %"42", align 4 + %"53" = load i32, ptr addrspace(5) %"43", align 4 + %2 = sext i32 %"52" to i64 + %3 = sext i32 %"53" to i64 + %"51" = mul i64 %2, %3 + store i64 %"51", ptr addrspace(5) %"44", align 4 + %"54" = load i64, ptr addrspace(5) %"41", align 4 + %"55" = load i64, ptr addrspace(5) %"44", align 4 + %"58" = inttoptr i64 %"54" to ptr + store i64 %"55", ptr %"58", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/neg.ll b/ptx/src/test/ll/neg.ll new file mode 100644 index 0000000..ebcedc0 --- /dev/null +++ b/ptx/src/test/ll/neg.ll @@ -0,0 +1,38 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"47", align 4 + store i32 %"41", ptr addrspace(5) %"38", align 4 + %"44" = load i32, ptr addrspace(5) %"38", align 4 + %"43" = sub i32 0, %"44" + store i32 %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load i32, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store i32 %"46", ptr %"48", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/non_scalar_ptr_offset.ll b/ptx/src/test/ll/non_scalar_ptr_offset.ll new file mode 100644 index 0000000..9fabfa6 --- /dev/null +++ b/ptx/src/test/ll/non_scalar_ptr_offset.ll @@ -0,0 +1,44 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(5) %"40", align 4 + %"54" = inttoptr i64 %"46" to ptr addrspace(1) + %"31" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 + %"29" = load <2 x i32>, ptr addrspace(1) %"31", align 8 + %"47" = extractelement <2 x i32> %"29", i8 0 + %"48" = extractelement <2 x i32> %"29", i8 1 + store i32 %"47", ptr addrspace(5) %"42", align 4 + store i32 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i32, ptr addrspace(5) %"42", align 4 + %"51" = load i32, ptr addrspace(5) %"43", align 4 + %"49" = add i32 %"50", %"51" + store i32 %"49", ptr addrspace(5) %"42", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = load i32, ptr addrspace(5) %"42", align 4 + %"55" = inttoptr i64 %"52" to ptr addrspace(1) + store i32 %"53", ptr addrspace(1) %"55", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/not.ll b/ptx/src/test/ll/not.ll new file mode 100644 index 0000000..8b078d7 --- /dev/null +++ b/ptx/src/test/ll/not.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr + %"43" = load i64, ptr %"49", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"50" = xor i64 %"46", -1 + store i64 %"50", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"52" = inttoptr i64 %"47" to ptr + store i64 %"48", ptr %"52", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/ntid.ll b/ptx/src/test/ll/ntid.ll new file mode 100644 index 0000000..2144bc4 --- /dev/null +++ b/ptx/src/test/ll/ntid.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"53" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"53", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"30" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0) + store i32 %"30", ptr addrspace(5) %"42", align 4 + %"49" = load i32, ptr addrspace(5) %"41", align 4 + %"50" = load i32, ptr addrspace(5) %"42", align 4 + %"48" = add i32 %"49", %"50" + store i32 %"48", ptr addrspace(5) %"41", align 4 + %"51" = load i64, ptr addrspace(5) %"40", align 4 + %"52" = load i32, ptr addrspace(5) %"41", align 4 + %"54" = inttoptr i64 %"51" to ptr + store i32 %"52", ptr %"54", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/or.ll b/ptx/src/test/ll/or.ll new file mode 100644 index 0000000..c7190b7 --- /dev/null +++ b/ptx/src/test/ll/or.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i64, ptr %"54", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 8 + %"48" = load i64, ptr %"30", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = or i64 %"50", %"51" + store i64 %"56", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"59" = inttoptr i64 %"52" to ptr + store i64 %"53", ptr %"59", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/popc.ll b/ptx/src/test/ll/popc.ll new file mode 100644 index 0000000..e71acba --- /dev/null +++ b/ptx/src/test/ll/popc.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"47", align 4 + store i32 %"41", ptr addrspace(5) %"38", align 4 + %"44" = load i32, ptr addrspace(5) %"38", align 4 + %"48" = call i32 @llvm.ctpop.i32(i32 %"44") + store i32 %"48", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load i32, ptr addrspace(5) %"38", align 4 + %"49" = inttoptr i64 %"45" to ptr + store i32 %"46", ptr %"49", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.ctpop.i32(i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/pred_not.ll b/ptx/src/test/ll/pred_not.ll new file mode 100644 index 0000000..7046c09 --- /dev/null +++ b/ptx/src/test/ll/pred_not.ll @@ -0,0 +1,66 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) + %"51" = alloca i64, align 8, addrspace(5) + %"52" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + %"53" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"54" = load i64, ptr addrspace(4) %"46", align 4 + store i64 %"54", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"47", align 4 + %"70" = inttoptr i64 %"56" to ptr + %"55" = load i64, ptr %"70", align 4 + store i64 %"55", ptr addrspace(5) %"49", align 4 + %"57" = load i64, ptr addrspace(5) %"47", align 4 + %"71" = inttoptr i64 %"57" to ptr + %"36" = getelementptr inbounds i8, ptr %"71", i64 8 + %"58" = load i64, ptr %"36", align 4 + store i64 %"58", ptr addrspace(5) %"50", align 4 + %"60" = load i64, ptr addrspace(5) %"49", align 4 + %"61" = load i64, ptr addrspace(5) %"50", align 4 + %"59" = icmp ult i64 %"60", %"61" + store i1 %"59", ptr addrspace(5) %"52", align 1 + %"63" = load i1, ptr addrspace(5) %"52", align 1 + %"62" = xor i1 %"63", true + store i1 %"62", ptr addrspace(5) %"52", align 1 + %"64" = load i1, ptr addrspace(5) %"52", align 1 + br i1 %"64", label %"15", label %"16" + +"15": ; preds = %1 + store i64 1, ptr addrspace(5) %"51", align 4 + br label %"16" + +"16": ; preds = %"15", %1 + %"66" = load i1, ptr addrspace(5) %"52", align 1 + br i1 %"66", label %"18", label %"17" + +"17": ; preds = %"16" + store i64 2, ptr addrspace(5) %"51", align 4 + br label %"18" + +"18": ; preds = %"17", %"16" + %"68" = load i64, ptr addrspace(5) %"48", align 4 + %"69" = load i64, ptr addrspace(5) %"51", align 4 + %"72" = inttoptr i64 %"68" to ptr + store i64 %"69", ptr %"72", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll new file mode 100644 index 0000000..dd5b95c --- /dev/null +++ b/ptx/src/test/ll/prmt.ll @@ -0,0 +1,47 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"30", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %2 = bitcast i32 %"50" to <4 x i8> + %3 = bitcast i32 %"51" to <4 x i8> + %"56" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> + store <4 x i8> %"56", ptr addrspace(5) %"42", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"42", align 4 + %"59" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"59", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/rcp.ll b/ptx/src/test/ll/rcp.ll new file mode 100644 index 0000000..c00012a --- /dev/null +++ b/ptx/src/test/ll/rcp.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"47", align 4 + store float %"41", ptr addrspace(5) %"38", align 4 + %"44" = load float, ptr addrspace(5) %"38", align 4 + %"43" = call float @llvm.amdgcn.rcp.f32(float %"44") + store float %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load float, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store float %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.rcp.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/reg_local.ll b/ptx/src/test/ll/reg_local.ll new file mode 100644 index 0000000..51fe3e9 --- /dev/null +++ b/ptx/src/test/ll/reg_local.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { + %"9" = alloca [8 x i8], align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"46" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"46", ptr addrspace(5) %"43", align 4 + %"47" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"47", ptr addrspace(5) %"44", align 4 + %"49" = load i64, ptr addrspace(5) %"43", align 4 + %"55" = inttoptr i64 %"49" to ptr addrspace(1) + %"54" = load i64, ptr addrspace(1) %"55", align 4 + store i64 %"54", ptr addrspace(5) %"45", align 4 + %"50" = load i64, ptr addrspace(5) %"45", align 4 + %"30" = add i64 %"50", 1 + %"56" = addrspacecast ptr addrspace(5) %"9" to ptr + store i64 %"30", ptr %"56", align 4 + %"58" = addrspacecast ptr addrspace(5) %"9" to ptr + %"32" = getelementptr inbounds i8, ptr %"58", i64 0 + %"59" = load i64, ptr %"32", align 4 + store i64 %"59", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(5) %"44", align 4 + %"60" = inttoptr i64 %"52" to ptr addrspace(1) + %"34" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 0 + %"53" = load i64, ptr addrspace(5) %"45", align 4 + store i64 %"53", ptr addrspace(1) %"34", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/rem.ll b/ptx/src/test/ll/rem.ll new file mode 100644 index 0000000..964021e --- /dev/null +++ b/ptx/src/test/ll/rem.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"30", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"49" = srem i32 %"50", %"51" + store i32 %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/rsqrt.ll b/ptx/src/test/ll/rsqrt.ll new file mode 100644 index 0000000..532a8c8 --- /dev/null +++ b/ptx/src/test/ll/rsqrt.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca double, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load double, ptr %"47", align 8 + store double %"41", ptr addrspace(5) %"38", align 8 + %"44" = load double, ptr addrspace(5) %"38", align 8 + %"43" = call double @llvm.amdgcn.rsq.f64(double %"44") + store double %"43", ptr addrspace(5) %"38", align 8 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load double, ptr addrspace(5) %"38", align 8 + %"48" = inttoptr i64 %"45" to ptr + store double %"46", ptr %"48", align 8 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare double @llvm.amdgcn.rsq.f64(double) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/selp.ll b/ptx/src/test/ll/selp.ll new file mode 100644 index 0000000..580754d --- /dev/null +++ b/ptx/src/test/ll/selp.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i16, align 2, addrspace(5) + %"43" = alloca i16, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"46" = load i16, ptr %"55", align 2 + store i16 %"46", ptr addrspace(5) %"42", align 2 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"30" = getelementptr inbounds i8, ptr %"56", i64 2 + %"49" = load i16, ptr %"30", align 2 + store i16 %"49", ptr addrspace(5) %"43", align 2 + %"51" = load i16, ptr addrspace(5) %"42", align 2 + %"52" = load i16, ptr addrspace(5) %"43", align 2 + %"50" = select i1 false, i16 %"51", i16 %"52" + store i16 %"50", ptr addrspace(5) %"42", align 2 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i16, ptr addrspace(5) %"42", align 2 + %"57" = inttoptr i64 %"53" to ptr + store i16 %"54", ptr %"57", align 2 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/selp_true.ll b/ptx/src/test/ll/selp_true.ll new file mode 100644 index 0000000..142c361 --- /dev/null +++ b/ptx/src/test/ll/selp_true.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i16, align 2, addrspace(5) + %"43" = alloca i16, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"46" = load i16, ptr %"55", align 2 + store i16 %"46", ptr addrspace(5) %"42", align 2 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr + %"30" = getelementptr inbounds i8, ptr %"56", i64 2 + %"49" = load i16, ptr %"30", align 2 + store i16 %"49", ptr addrspace(5) %"43", align 2 + %"51" = load i16, ptr addrspace(5) %"42", align 2 + %"52" = load i16, ptr addrspace(5) %"43", align 2 + %"50" = select i1 true, i16 %"51", i16 %"52" + store i16 %"50", ptr addrspace(5) %"42", align 2 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i16, ptr addrspace(5) %"42", align 2 + %"57" = inttoptr i64 %"53" to ptr + store i16 %"54", ptr %"57", align 2 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/setp.ll b/ptx/src/test/ll/setp.ll new file mode 100644 index 0000000..6625957 --- /dev/null +++ b/ptx/src/test/ll/setp.ll @@ -0,0 +1,63 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) + %"51" = alloca i64, align 8, addrspace(5) + %"52" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + %"53" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"53", ptr addrspace(5) %"47", align 4 + %"54" = load i64, ptr addrspace(4) %"46", align 4 + store i64 %"54", ptr addrspace(5) %"48", align 4 + %"56" = load i64, ptr addrspace(5) %"47", align 4 + %"68" = inttoptr i64 %"56" to ptr + %"55" = load i64, ptr %"68", align 4 + store i64 %"55", ptr addrspace(5) %"49", align 4 + %"57" = load i64, ptr addrspace(5) %"47", align 4 + %"69" = inttoptr i64 %"57" to ptr + %"36" = getelementptr inbounds i8, ptr %"69", i64 8 + %"58" = load i64, ptr %"36", align 4 + store i64 %"58", ptr addrspace(5) %"50", align 4 + %"60" = load i64, ptr addrspace(5) %"49", align 4 + %"61" = load i64, ptr addrspace(5) %"50", align 4 + %"59" = icmp ult i64 %"60", %"61" + store i1 %"59", ptr addrspace(5) %"52", align 1 + %"62" = load i1, ptr addrspace(5) %"52", align 1 + br i1 %"62", label %"15", label %"16" + +"15": ; preds = %1 + store i64 1, ptr addrspace(5) %"51", align 4 + br label %"16" + +"16": ; preds = %"15", %1 + %"64" = load i1, ptr addrspace(5) %"52", align 1 + br i1 %"64", label %"18", label %"17" + +"17": ; preds = %"16" + store i64 2, ptr addrspace(5) %"51", align 4 + br label %"18" + +"18": ; preds = %"17", %"16" + %"66" = load i64, ptr addrspace(5) %"48", align 4 + %"67" = load i64, ptr addrspace(5) %"51", align 4 + %"70" = inttoptr i64 %"66" to ptr + store i64 %"67", ptr %"70", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/setp_gt.ll b/ptx/src/test/ll/setp_gt.ll new file mode 100644 index 0000000..4badce3 --- /dev/null +++ b/ptx/src/test/ll/setp_gt.ll @@ -0,0 +1,65 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca float, align 4, addrspace(5) + %"48" = alloca float, align 4, addrspace(5) + %"49" = alloca float, align 4, addrspace(5) + %"50" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + %"51" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"52", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"45", align 4 + %"68" = inttoptr i64 %"54" to ptr + %"53" = load float, ptr %"68", align 4 + store float %"53", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"69" = inttoptr i64 %"55" to ptr + %"36" = getelementptr inbounds i8, ptr %"69", i64 4 + %"56" = load float, ptr %"36", align 4 + store float %"56", ptr addrspace(5) %"48", align 4 + %"58" = load float, ptr addrspace(5) %"47", align 4 + %"59" = load float, ptr addrspace(5) %"48", align 4 + %"57" = fcmp ogt float %"58", %"59" + store i1 %"57", ptr addrspace(5) %"50", align 1 + %"60" = load i1, ptr addrspace(5) %"50", align 1 + br i1 %"60", label %"15", label %"16" + +"15": ; preds = %1 + %"62" = load float, ptr addrspace(5) %"47", align 4 + store float %"62", ptr addrspace(5) %"49", align 4 + br label %"16" + +"16": ; preds = %"15", %1 + %"63" = load i1, ptr addrspace(5) %"50", align 1 + br i1 %"63", label %"18", label %"17" + +"17": ; preds = %"16" + %"65" = load float, ptr addrspace(5) %"48", align 4 + store float %"65", ptr addrspace(5) %"49", align 4 + br label %"18" + +"18": ; preds = %"17", %"16" + %"66" = load i64, ptr addrspace(5) %"46", align 4 + %"67" = load float, ptr addrspace(5) %"49", align 4 + %"70" = inttoptr i64 %"66" to ptr + store float %"67", ptr %"70", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/setp_leu.ll b/ptx/src/test/ll/setp_leu.ll new file mode 100644 index 0000000..d91e569 --- /dev/null +++ b/ptx/src/test/ll/setp_leu.ll @@ -0,0 +1,65 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca float, align 4, addrspace(5) + %"48" = alloca float, align 4, addrspace(5) + %"49" = alloca float, align 4, addrspace(5) + %"50" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + %"51" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"52", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(5) %"45", align 4 + %"68" = inttoptr i64 %"54" to ptr + %"53" = load float, ptr %"68", align 4 + store float %"53", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"45", align 4 + %"69" = inttoptr i64 %"55" to ptr + %"36" = getelementptr inbounds i8, ptr %"69", i64 4 + %"56" = load float, ptr %"36", align 4 + store float %"56", ptr addrspace(5) %"48", align 4 + %"58" = load float, ptr addrspace(5) %"47", align 4 + %"59" = load float, ptr addrspace(5) %"48", align 4 + %"57" = fcmp ule float %"58", %"59" + store i1 %"57", ptr addrspace(5) %"50", align 1 + %"60" = load i1, ptr addrspace(5) %"50", align 1 + br i1 %"60", label %"15", label %"16" + +"15": ; preds = %1 + %"62" = load float, ptr addrspace(5) %"47", align 4 + store float %"62", ptr addrspace(5) %"49", align 4 + br label %"16" + +"16": ; preds = %"15", %1 + %"63" = load i1, ptr addrspace(5) %"50", align 1 + br i1 %"63", label %"18", label %"17" + +"17": ; preds = %"16" + %"65" = load float, ptr addrspace(5) %"48", align 4 + store float %"65", ptr addrspace(5) %"49", align 4 + br label %"18" + +"18": ; preds = %"17", %"16" + %"66" = load i64, ptr addrspace(5) %"46", align 4 + %"67" = load float, ptr addrspace(5) %"49", align 4 + %"70" = inttoptr i64 %"66" to ptr + store float %"67", ptr %"70", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/setp_nan.ll b/ptx/src/test/ll/setp_nan.ll new file mode 100644 index 0000000..15c0c2a --- /dev/null +++ b/ptx/src/test/ll/setp_nan.ll @@ -0,0 +1,174 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 { + %"89" = alloca i64, align 8, addrspace(5) + %"90" = alloca i64, align 8, addrspace(5) + %"91" = alloca float, align 4, addrspace(5) + %"92" = alloca float, align 4, addrspace(5) + %"93" = alloca float, align 4, addrspace(5) + %"94" = alloca float, align 4, addrspace(5) + %"95" = alloca float, align 4, addrspace(5) + %"96" = alloca float, align 4, addrspace(5) + %"97" = alloca float, align 4, addrspace(5) + %"98" = alloca float, align 4, addrspace(5) + %"99" = alloca i32, align 4, addrspace(5) + %"100" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + %"101" = load i64, ptr addrspace(4) %"87", align 4 + store i64 %"101", ptr addrspace(5) %"89", align 4 + %"102" = load i64, ptr addrspace(4) %"88", align 4 + store i64 %"102", ptr addrspace(5) %"90", align 4 + %"104" = load i64, ptr addrspace(5) %"89", align 4 + %"155" = inttoptr i64 %"104" to ptr + %"103" = load float, ptr %"155", align 4 + store float %"103", ptr addrspace(5) %"91", align 4 + %"105" = load i64, ptr addrspace(5) %"89", align 4 + %"156" = inttoptr i64 %"105" to ptr + %"54" = getelementptr inbounds i8, ptr %"156", i64 4 + %"106" = load float, ptr %"54", align 4 + store float %"106", ptr addrspace(5) %"92", align 4 + %"107" = load i64, ptr addrspace(5) %"89", align 4 + %"157" = inttoptr i64 %"107" to ptr + %"56" = getelementptr inbounds i8, ptr %"157", i64 8 + %"108" = load float, ptr %"56", align 4 + store float %"108", ptr addrspace(5) %"93", align 4 + %"109" = load i64, ptr addrspace(5) %"89", align 4 + %"158" = inttoptr i64 %"109" to ptr + %"58" = getelementptr inbounds i8, ptr %"158", i64 12 + %"110" = load float, ptr %"58", align 4 + store float %"110", ptr addrspace(5) %"94", align 4 + %"111" = load i64, ptr addrspace(5) %"89", align 4 + %"159" = inttoptr i64 %"111" to ptr + %"60" = getelementptr inbounds i8, ptr %"159", i64 16 + %"112" = load float, ptr %"60", align 4 + store float %"112", ptr addrspace(5) %"95", align 4 + %"113" = load i64, ptr addrspace(5) %"89", align 4 + %"160" = inttoptr i64 %"113" to ptr + %"62" = getelementptr inbounds i8, ptr %"160", i64 20 + %"114" = load float, ptr %"62", align 4 + store float %"114", ptr addrspace(5) %"96", align 4 + %"115" = load i64, ptr addrspace(5) %"89", align 4 + %"161" = inttoptr i64 %"115" to ptr + %"64" = getelementptr inbounds i8, ptr %"161", i64 24 + %"116" = load float, ptr %"64", align 4 + store float %"116", ptr addrspace(5) %"97", align 4 + %"117" = load i64, ptr addrspace(5) %"89", align 4 + %"162" = inttoptr i64 %"117" to ptr + %"66" = getelementptr inbounds i8, ptr %"162", i64 28 + %"118" = load float, ptr %"66", align 4 + store float %"118", ptr addrspace(5) %"98", align 4 + %"120" = load float, ptr addrspace(5) %"91", align 4 + %"121" = load float, ptr addrspace(5) %"92", align 4 + %"119" = fcmp uno float %"120", %"121" + store i1 %"119", ptr addrspace(5) %"100", align 1 + %"122" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"122", label %"21", label %"22" + +"21": ; preds = %1 + store i32 1, ptr addrspace(5) %"99", align 4 + br label %"22" + +"22": ; preds = %"21", %1 + %"124" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"124", label %"24", label %"23" + +"23": ; preds = %"22" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"24" + +"24": ; preds = %"23", %"22" + %"126" = load i64, ptr addrspace(5) %"90", align 4 + %"127" = load i32, ptr addrspace(5) %"99", align 4 + %"163" = inttoptr i64 %"126" to ptr + store i32 %"127", ptr %"163", align 4 + %"129" = load float, ptr addrspace(5) %"93", align 4 + %"130" = load float, ptr addrspace(5) %"94", align 4 + %"128" = fcmp uno float %"129", %"130" + store i1 %"128", ptr addrspace(5) %"100", align 1 + %"131" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"131", label %"25", label %"26" + +"25": ; preds = %"24" + store i32 1, ptr addrspace(5) %"99", align 4 + br label %"26" + +"26": ; preds = %"25", %"24" + %"133" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"133", label %"28", label %"27" + +"27": ; preds = %"26" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"28" + +"28": ; preds = %"27", %"26" + %"135" = load i64, ptr addrspace(5) %"90", align 4 + %"164" = inttoptr i64 %"135" to ptr + %"72" = getelementptr inbounds i8, ptr %"164", i64 4 + %"136" = load i32, ptr addrspace(5) %"99", align 4 + store i32 %"136", ptr %"72", align 4 + %"138" = load float, ptr addrspace(5) %"95", align 4 + %"139" = load float, ptr addrspace(5) %"96", align 4 + %"137" = fcmp uno float %"138", %"139" + store i1 %"137", ptr addrspace(5) %"100", align 1 + %"140" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"140", label %"29", label %"30" + +"29": ; preds = %"28" + store i32 1, ptr addrspace(5) %"99", align 4 + br label %"30" + +"30": ; preds = %"29", %"28" + %"142" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"142", label %"32", label %"31" + +"31": ; preds = %"30" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"32" + +"32": ; preds = %"31", %"30" + %"144" = load i64, ptr addrspace(5) %"90", align 4 + %"165" = inttoptr i64 %"144" to ptr + %"76" = getelementptr inbounds i8, ptr %"165", i64 8 + %"145" = load i32, ptr addrspace(5) %"99", align 4 + store i32 %"145", ptr %"76", align 4 + %"147" = load float, ptr addrspace(5) %"97", align 4 + %"148" = load float, ptr addrspace(5) %"98", align 4 + %"146" = fcmp uno float %"147", %"148" + store i1 %"146", ptr addrspace(5) %"100", align 1 + %"149" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"149", label %"33", label %"34" + +"33": ; preds = %"32" + store i32 1, ptr addrspace(5) %"99", align 4 + br label %"34" + +"34": ; preds = %"33", %"32" + %"151" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"151", label %"36", label %"35" + +"35": ; preds = %"34" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"36" + +"36": ; preds = %"35", %"34" + %"153" = load i64, ptr addrspace(5) %"90", align 4 + %"166" = inttoptr i64 %"153" to ptr + %"80" = getelementptr inbounds i8, ptr %"166", i64 12 + %"154" = load i32, ptr addrspace(5) %"99", align 4 + store i32 %"154", ptr %"80", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/setp_num.ll b/ptx/src/test/ll/setp_num.ll new file mode 100644 index 0000000..c6303dc --- /dev/null +++ b/ptx/src/test/ll/setp_num.ll @@ -0,0 +1,174 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 { + %"89" = alloca i64, align 8, addrspace(5) + %"90" = alloca i64, align 8, addrspace(5) + %"91" = alloca float, align 4, addrspace(5) + %"92" = alloca float, align 4, addrspace(5) + %"93" = alloca float, align 4, addrspace(5) + %"94" = alloca float, align 4, addrspace(5) + %"95" = alloca float, align 4, addrspace(5) + %"96" = alloca float, align 4, addrspace(5) + %"97" = alloca float, align 4, addrspace(5) + %"98" = alloca float, align 4, addrspace(5) + %"99" = alloca i32, align 4, addrspace(5) + %"100" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + %"101" = load i64, ptr addrspace(4) %"87", align 4 + store i64 %"101", ptr addrspace(5) %"89", align 4 + %"102" = load i64, ptr addrspace(4) %"88", align 4 + store i64 %"102", ptr addrspace(5) %"90", align 4 + %"104" = load i64, ptr addrspace(5) %"89", align 4 + %"155" = inttoptr i64 %"104" to ptr + %"103" = load float, ptr %"155", align 4 + store float %"103", ptr addrspace(5) %"91", align 4 + %"105" = load i64, ptr addrspace(5) %"89", align 4 + %"156" = inttoptr i64 %"105" to ptr + %"54" = getelementptr inbounds i8, ptr %"156", i64 4 + %"106" = load float, ptr %"54", align 4 + store float %"106", ptr addrspace(5) %"92", align 4 + %"107" = load i64, ptr addrspace(5) %"89", align 4 + %"157" = inttoptr i64 %"107" to ptr + %"56" = getelementptr inbounds i8, ptr %"157", i64 8 + %"108" = load float, ptr %"56", align 4 + store float %"108", ptr addrspace(5) %"93", align 4 + %"109" = load i64, ptr addrspace(5) %"89", align 4 + %"158" = inttoptr i64 %"109" to ptr + %"58" = getelementptr inbounds i8, ptr %"158", i64 12 + %"110" = load float, ptr %"58", align 4 + store float %"110", ptr addrspace(5) %"94", align 4 + %"111" = load i64, ptr addrspace(5) %"89", align 4 + %"159" = inttoptr i64 %"111" to ptr + %"60" = getelementptr inbounds i8, ptr %"159", i64 16 + %"112" = load float, ptr %"60", align 4 + store float %"112", ptr addrspace(5) %"95", align 4 + %"113" = load i64, ptr addrspace(5) %"89", align 4 + %"160" = inttoptr i64 %"113" to ptr + %"62" = getelementptr inbounds i8, ptr %"160", i64 20 + %"114" = load float, ptr %"62", align 4 + store float %"114", ptr addrspace(5) %"96", align 4 + %"115" = load i64, ptr addrspace(5) %"89", align 4 + %"161" = inttoptr i64 %"115" to ptr + %"64" = getelementptr inbounds i8, ptr %"161", i64 24 + %"116" = load float, ptr %"64", align 4 + store float %"116", ptr addrspace(5) %"97", align 4 + %"117" = load i64, ptr addrspace(5) %"89", align 4 + %"162" = inttoptr i64 %"117" to ptr + %"66" = getelementptr inbounds i8, ptr %"162", i64 28 + %"118" = load float, ptr %"66", align 4 + store float %"118", ptr addrspace(5) %"98", align 4 + %"120" = load float, ptr addrspace(5) %"91", align 4 + %"121" = load float, ptr addrspace(5) %"92", align 4 + %"119" = fcmp ord float %"120", %"121" + store i1 %"119", ptr addrspace(5) %"100", align 1 + %"122" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"122", label %"21", label %"22" + +"21": ; preds = %1 + store i32 2, ptr addrspace(5) %"99", align 4 + br label %"22" + +"22": ; preds = %"21", %1 + %"124" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"124", label %"24", label %"23" + +"23": ; preds = %"22" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"24" + +"24": ; preds = %"23", %"22" + %"126" = load i64, ptr addrspace(5) %"90", align 4 + %"127" = load i32, ptr addrspace(5) %"99", align 4 + %"163" = inttoptr i64 %"126" to ptr + store i32 %"127", ptr %"163", align 4 + %"129" = load float, ptr addrspace(5) %"93", align 4 + %"130" = load float, ptr addrspace(5) %"94", align 4 + %"128" = fcmp ord float %"129", %"130" + store i1 %"128", ptr addrspace(5) %"100", align 1 + %"131" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"131", label %"25", label %"26" + +"25": ; preds = %"24" + store i32 2, ptr addrspace(5) %"99", align 4 + br label %"26" + +"26": ; preds = %"25", %"24" + %"133" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"133", label %"28", label %"27" + +"27": ; preds = %"26" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"28" + +"28": ; preds = %"27", %"26" + %"135" = load i64, ptr addrspace(5) %"90", align 4 + %"164" = inttoptr i64 %"135" to ptr + %"72" = getelementptr inbounds i8, ptr %"164", i64 4 + %"136" = load i32, ptr addrspace(5) %"99", align 4 + store i32 %"136", ptr %"72", align 4 + %"138" = load float, ptr addrspace(5) %"95", align 4 + %"139" = load float, ptr addrspace(5) %"96", align 4 + %"137" = fcmp ord float %"138", %"139" + store i1 %"137", ptr addrspace(5) %"100", align 1 + %"140" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"140", label %"29", label %"30" + +"29": ; preds = %"28" + store i32 2, ptr addrspace(5) %"99", align 4 + br label %"30" + +"30": ; preds = %"29", %"28" + %"142" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"142", label %"32", label %"31" + +"31": ; preds = %"30" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"32" + +"32": ; preds = %"31", %"30" + %"144" = load i64, ptr addrspace(5) %"90", align 4 + %"165" = inttoptr i64 %"144" to ptr + %"76" = getelementptr inbounds i8, ptr %"165", i64 8 + %"145" = load i32, ptr addrspace(5) %"99", align 4 + store i32 %"145", ptr %"76", align 4 + %"147" = load float, ptr addrspace(5) %"97", align 4 + %"148" = load float, ptr addrspace(5) %"98", align 4 + %"146" = fcmp ord float %"147", %"148" + store i1 %"146", ptr addrspace(5) %"100", align 1 + %"149" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"149", label %"33", label %"34" + +"33": ; preds = %"32" + store i32 2, ptr addrspace(5) %"99", align 4 + br label %"34" + +"34": ; preds = %"33", %"32" + %"151" = load i1, ptr addrspace(5) %"100", align 1 + br i1 %"151", label %"36", label %"35" + +"35": ; preds = %"34" + store i32 0, ptr addrspace(5) %"99", align 4 + br label %"36" + +"36": ; preds = %"35", %"34" + %"153" = load i64, ptr addrspace(5) %"90", align 4 + %"166" = inttoptr i64 %"153" to ptr + %"80" = getelementptr inbounds i8, ptr %"166", i64 12 + %"154" = load i32, ptr addrspace(5) %"99", align 4 + store i32 %"154", ptr %"80", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shared_ptr_32.ll b/ptx/src/test/ll/shared_ptr_32.ll new file mode 100644 index 0000000..ecba0ca --- /dev/null +++ b/ptx/src/test/ll/shared_ptr_32.ll @@ -0,0 +1,49 @@ +@shared_mem1 = external addrspace(3) global [128 x i8], align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"46" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"47", ptr addrspace(5) %"42", align 4 + store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"58" = inttoptr i64 %"50" to ptr addrspace(1) + %"49" = load i64, ptr addrspace(1) %"58", align 4 + store i64 %"49", ptr addrspace(5) %"44", align 4 + %"51" = load i32, ptr addrspace(5) %"43", align 4 + %"52" = load i64, ptr addrspace(5) %"44", align 4 + %"59" = inttoptr i32 %"51" to ptr addrspace(3) + store i64 %"52", ptr addrspace(3) %"59", align 4 + %"53" = load i32, ptr addrspace(5) %"43", align 4 + %"60" = inttoptr i32 %"53" to ptr addrspace(3) + %"32" = getelementptr inbounds i8, ptr addrspace(3) %"60", i64 0 + %"54" = load i64, ptr addrspace(3) %"32", align 4 + store i64 %"54", ptr addrspace(5) %"45", align 4 + %"55" = load i64, ptr addrspace(5) %"42", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"61" = inttoptr i64 %"55" to ptr addrspace(1) + store i64 %"56", ptr addrspace(1) %"61", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shared_ptr_take_address.ll b/ptx/src/test/ll/shared_ptr_take_address.ll new file mode 100644 index 0000000..a5a250d --- /dev/null +++ b/ptx/src/test/ll/shared_ptr_take_address.ll @@ -0,0 +1,48 @@ +@shared_mem = external addrspace(3) global [0 x i8], align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"44" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"56" = inttoptr i64 %"48" to ptr addrspace(1) + %"47" = load i64, ptr addrspace(1) %"56", align 4 + store i64 %"47", ptr addrspace(5) %"42", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"50" = load i64, ptr addrspace(5) %"42", align 4 + %"57" = inttoptr i64 %"49" to ptr addrspace(3) + store i64 %"50", ptr addrspace(3) %"57", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"58" = inttoptr i64 %"52" to ptr addrspace(3) + %"51" = load i64, ptr addrspace(3) %"58", align 4 + store i64 %"51", ptr addrspace(5) %"43", align 4 + %"53" = load i64, ptr addrspace(5) %"40", align 4 + %"54" = load i64, ptr addrspace(5) %"43", align 4 + %"59" = inttoptr i64 %"53" to ptr addrspace(1) + store i64 %"54", ptr addrspace(1) %"59", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shared_unify_extern.ll b/ptx/src/test/ll/shared_unify_extern.ll new file mode 100644 index 0000000..68309bf --- /dev/null +++ b/ptx/src/test/ll/shared_unify_extern.ll @@ -0,0 +1,80 @@ +@shared_ex = external addrspace(3) global [0 x i32] +@shared_mod = external addrspace(3) global [4 x i32] + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define i64 @__zluda_ptx_impl_add() #0 { + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 + store i64 %"49", ptr addrspace(5) %"47", align 4 + %"50" = load i64, ptr addrspace(3) @shared_ex, align 4 + store i64 %"50", ptr addrspace(5) %"48", align 4 + %"52" = load i64, ptr addrspace(5) %"48", align 4 + %"53" = load i64, ptr addrspace(5) %"47", align 4 + %"75" = add i64 %"52", %"53" + store i64 %"75", ptr addrspace(5) %"46", align 4 + %2 = load i64, ptr addrspace(5) %"46", align 4 + ret i64 %2 +} + +define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15") #0 { + %"54" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"15", ptr addrspace(3) @shared_ex, align 4 + %"55" = call i64 @__zluda_ptx_impl_add() + store i64 %"55", ptr addrspace(5) %"54", align 4 + %2 = load i64, ptr addrspace(5) %"54", align 4 + ret i64 %2 +} + +define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #0 { + %"58" = alloca i64, align 8, addrspace(5) + %"59" = alloca i64, align 8, addrspace(5) + %"60" = alloca i64, align 8, addrspace(5) + %"61" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"62" = load i64, ptr addrspace(4) %"56", align 4 + store i64 %"62", ptr addrspace(5) %"58", align 4 + %"63" = load i64, ptr addrspace(4) %"57", align 4 + store i64 %"63", ptr addrspace(5) %"59", align 4 + %"65" = load i64, ptr addrspace(5) %"58", align 4 + %"78" = inttoptr i64 %"65" to ptr addrspace(1) + %"64" = load i64, ptr addrspace(1) %"78", align 4 + store i64 %"64", ptr addrspace(5) %"60", align 4 + %"66" = load i64, ptr addrspace(5) %"58", align 4 + %"79" = inttoptr i64 %"66" to ptr addrspace(1) + %"39" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8 + %"67" = load i64, ptr addrspace(1) %"39", align 4 + store i64 %"67", ptr addrspace(5) %"61", align 4 + %"68" = load i64, ptr addrspace(5) %"61", align 4 + store i64 %"68", ptr addrspace(3) @shared_mod, align 4 + %"70" = load i64, ptr addrspace(5) %"60", align 4 + %"81" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"70") + store i64 %"81", ptr addrspace(5) %"61", align 4 + %"71" = load i64, ptr addrspace(5) %"59", align 4 + %"72" = load i64, ptr addrspace(5) %"61", align 4 + %"83" = inttoptr i64 %"71" to ptr + store i64 %"72", ptr %"83", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shared_unify_local.ll b/ptx/src/test/ll/shared_unify_local.ll new file mode 100644 index 0000000..56a5bbb --- /dev/null +++ b/ptx/src/test/ll/shared_unify_local.ll @@ -0,0 +1,77 @@ +@shared_ex = external addrspace(3) global [0 x i32] +@shared_mod = external addrspace(3) global i64, align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define i64 @__zluda_ptx_impl_add(i64 %"10") #0 { + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"10", ptr addrspace(3) @shared_mod, align 4 + %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 + store i64 %"49", ptr addrspace(5) %"48", align 4 + %"101" = load i64, ptr addrspace(3) @shared_ex, align 4 + %"51" = load i64, ptr addrspace(5) %"48", align 4 + %"72" = add i64 %"101", %"51" + store i64 %"72", ptr addrspace(5) %"47", align 4 + %2 = load i64, ptr addrspace(5) %"47", align 4 + ret i64 %2 +} + +define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15", i64 %"16") #0 { + %"52" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"15", ptr addrspace(3) @shared_ex, align 4 + %"53" = call i64 @__zluda_ptx_impl_add(i64 %"16") + store i64 %"53", ptr addrspace(5) %"52", align 4 + %2 = load i64, ptr addrspace(5) %"52", align 4 + ret i64 %2 +} + +define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { + %"56" = alloca i64, align 8, addrspace(5) + %"57" = alloca i64, align 8, addrspace(5) + %"58" = alloca i64, align 8, addrspace(5) + %"59" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"60" = load i64, ptr addrspace(4) %"54", align 4 + store i64 %"60", ptr addrspace(5) %"56", align 4 + %"61" = load i64, ptr addrspace(4) %"55", align 4 + store i64 %"61", ptr addrspace(5) %"57", align 4 + %"63" = load i64, ptr addrspace(5) %"56", align 4 + %"75" = inttoptr i64 %"63" to ptr addrspace(1) + %"62" = load i64, ptr addrspace(1) %"75", align 4 + store i64 %"62", ptr addrspace(5) %"58", align 4 + %"64" = load i64, ptr addrspace(5) %"56", align 4 + %"76" = inttoptr i64 %"64" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"76", i64 8 + %"65" = load i64, ptr addrspace(1) %"40", align 4 + store i64 %"65", ptr addrspace(5) %"59", align 4 + %"67" = load i64, ptr addrspace(5) %"58", align 4 + %"68" = load i64, ptr addrspace(5) %"59", align 4 + %"77" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"67", i64 %"68") + store i64 %"77", ptr addrspace(5) %"59", align 4 + %"69" = load i64, ptr addrspace(5) %"57", align 4 + %"70" = load i64, ptr addrspace(5) %"59", align 4 + %"79" = inttoptr i64 %"69" to ptr + store i64 %"70", ptr %"79", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shared_variable.ll b/ptx/src/test/ll/shared_variable.ll new file mode 100644 index 0000000..f71fcc8 --- /dev/null +++ b/ptx/src/test/ll/shared_variable.ll @@ -0,0 +1,42 @@ +@shared_mem1 = external addrspace(3) global [128 x i8], align 4 + +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr addrspace(1) + %"44" = load i64, ptr addrspace(1) %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"40", align 4 + store i64 %"46", ptr addrspace(3) @shared_mem1, align 4 + %"47" = load i64, ptr addrspace(3) @shared_mem1, align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = inttoptr i64 %"48" to ptr addrspace(1) + store i64 %"49", ptr addrspace(1) %"53", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shl.ll b/ptx/src/test/ll/shl.ll new file mode 100644 index 0000000..1b0d8bf --- /dev/null +++ b/ptx/src/test/ll/shl.ll @@ -0,0 +1,40 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %2 = shl i64 %"47", 2 + %"51" = select i1 false, i64 0, i64 %2 + store i64 %"51", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"53", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/shr.ll b/ptx/src/test/ll/shr.ll new file mode 100644 index 0000000..6b2cecd --- /dev/null +++ b/ptx/src/test/ll/shr.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"41" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i32, ptr %"48", align 4 + store i32 %"42", ptr addrspace(5) %"39", align 4 + %"45" = load i32, ptr addrspace(5) %"39", align 4 + %2 = ashr i32 %"45", 1 + %"44" = select i1 false, i32 0, i32 %2 + store i32 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store i32 %"47", ptr %"49", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/sign_extend.ll b/ptx/src/test/ll/sign_extend.ll new file mode 100644 index 0000000..0a29187 --- /dev/null +++ b/ptx/src/test/ll/sign_extend.ll @@ -0,0 +1,36 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"46" = inttoptr i64 %"42" to ptr + %"45" = load i16, ptr %"46", align 2 + %"41" = sext i16 %"45" to i32 + store i32 %"41", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(5) %"37", align 4 + %"44" = load i32, ptr addrspace(5) %"38", align 4 + %"47" = inttoptr i64 %"43" to ptr + store i32 %"44", ptr %"47", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/sin.ll b/ptx/src/test/ll/sin.ll new file mode 100644 index 0000000..656dbad --- /dev/null +++ b/ptx/src/test/ll/sin.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"47", align 4 + store float %"41", ptr addrspace(5) %"38", align 4 + %"44" = load float, ptr addrspace(5) %"38", align 4 + %"43" = call afn float @llvm.sin.f32(float %"44") + store float %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load float, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store float %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.sin.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/sqrt.ll b/ptx/src/test/ll/sqrt.ll new file mode 100644 index 0000000..fe56dfe --- /dev/null +++ b/ptx/src/test/ll/sqrt.ll @@ -0,0 +1,42 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"39" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"39", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"40", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"47", align 4 + store float %"41", ptr addrspace(5) %"38", align 4 + %"44" = load float, ptr addrspace(5) %"38", align 4 + %"43" = call float @llvm.amdgcn.sqrt.f32(float %"44") + store float %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load float, ptr addrspace(5) %"38", align 4 + %"48" = inttoptr i64 %"45" to ptr + store float %"46", ptr %"48", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.sqrt.f32(float) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/ptx/src/test/ll/stateful_ld_st_ntid.ll b/ptx/src/test/ll/stateful_ld_st_ntid.ll new file mode 100644 index 0000000..cbdb89a --- /dev/null +++ b/ptx/src/test/ll/stateful_ld_st_ntid.ll @@ -0,0 +1,58 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"64" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"64", ptr addrspace(5) %"40", align 4 + %"65" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"65", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %2 = inttoptr i64 %"48" to ptr + %"47" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"47", ptr addrspace(5) %"40", align 8 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %3 = inttoptr i64 %"50" to ptr + %"49" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"49", ptr addrspace(5) %"41", align 8 + %"31" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + store i32 %"31", ptr addrspace(5) %"42", align 4 + %"53" = load i32, ptr addrspace(5) %"42", align 4 + %"52" = zext i32 %"53" to i64 + store i64 %"52", ptr addrspace(5) %"43", align 4 + %"55" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = load i64, ptr addrspace(5) %"43", align 4 + %"66" = add i64 %"55", %"56" + store i64 %"66", ptr addrspace(5) %"40", align 4 + %"58" = load i64, ptr addrspace(5) %"41", align 4 + %"59" = load i64, ptr addrspace(5) %"43", align 4 + %"68" = add i64 %"58", %"59" + store i64 %"68", ptr addrspace(5) %"41", align 4 + %"61" = load i64, ptr addrspace(5) %"40", align 4 + %"70" = inttoptr i64 %"61" to ptr addrspace(1) + %"60" = load i64, ptr addrspace(1) %"70", align 4 + store i64 %"60", ptr addrspace(5) %"44", align 4 + %"62" = load i64, ptr addrspace(5) %"41", align 4 + %"63" = load i64, ptr addrspace(5) %"44", align 4 + %"71" = inttoptr i64 %"62" to ptr addrspace(1) + store i64 %"63", ptr addrspace(1) %"71", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll new file mode 100644 index 0000000..1ac5a5f --- /dev/null +++ b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll @@ -0,0 +1,62 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i32, align 4, addrspace(5) + %"51" = alloca i64, align 8, addrspace(5) + %"52" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"72" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"72", ptr addrspace(5) %"44", align 4 + %"73" = load i64, ptr addrspace(4) %"43", align 4 + store i64 %"73", ptr addrspace(5) %"47", align 4 + %"56" = load i64, ptr addrspace(5) %"44", align 4 + %2 = inttoptr i64 %"56" to ptr + %"55" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"55", ptr addrspace(5) %"45", align 8 + %"58" = load i64, ptr addrspace(5) %"47", align 4 + %3 = inttoptr i64 %"58" to ptr + %"57" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"57", ptr addrspace(5) %"48", align 8 + %"35" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + store i32 %"35", ptr addrspace(5) %"50", align 4 + %"61" = load i32, ptr addrspace(5) %"50", align 4 + %"60" = zext i32 %"61" to i64 + store i64 %"60", ptr addrspace(5) %"51", align 4 + %"63" = load i64, ptr addrspace(5) %"45", align 4 + %"64" = load i64, ptr addrspace(5) %"51", align 4 + %"74" = add i64 %"63", %"64" + store i64 %"74", ptr addrspace(5) %"46", align 4 + %"66" = load i64, ptr addrspace(5) %"48", align 4 + %"67" = load i64, ptr addrspace(5) %"51", align 4 + %"76" = add i64 %"66", %"67" + store i64 %"76", ptr addrspace(5) %"49", align 4 + %"69" = load i64, ptr addrspace(5) %"46", align 4 + %"78" = inttoptr i64 %"69" to ptr addrspace(1) + %"68" = load i64, ptr addrspace(1) %"78", align 4 + store i64 %"68", ptr addrspace(5) %"52", align 4 + %"70" = load i64, ptr addrspace(5) %"49", align 4 + %"71" = load i64, ptr addrspace(5) %"52", align 4 + %"79" = inttoptr i64 %"70" to ptr addrspace(1) + store i64 %"71", ptr addrspace(1) %"79", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll new file mode 100644 index 0000000..8a07146 --- /dev/null +++ b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll @@ -0,0 +1,64 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) + %"51" = alloca i64, align 8, addrspace(5) + %"52" = alloca i64, align 8, addrspace(5) + %"53" = alloca i64, align 8, addrspace(5) + %"54" = alloca i32, align 4, addrspace(5) + %"55" = alloca i64, align 8, addrspace(5) + %"56" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"76" = load i64, ptr addrspace(4) %"46", align 4 + store i64 %"76", ptr addrspace(5) %"48", align 4 + %"77" = load i64, ptr addrspace(4) %"47", align 4 + store i64 %"77", ptr addrspace(5) %"51", align 4 + %"60" = load i64, ptr addrspace(5) %"48", align 4 + %2 = inttoptr i64 %"60" to ptr + %"59" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"59", ptr addrspace(5) %"49", align 8 + %"62" = load i64, ptr addrspace(5) %"51", align 4 + %3 = inttoptr i64 %"62" to ptr + %"61" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"61", ptr addrspace(5) %"52", align 8 + %"35" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + store i32 %"35", ptr addrspace(5) %"54", align 4 + %"65" = load i32, ptr addrspace(5) %"54", align 4 + %"64" = zext i32 %"65" to i64 + store i64 %"64", ptr addrspace(5) %"55", align 4 + %"67" = load i64, ptr addrspace(5) %"49", align 4 + %"68" = load i64, ptr addrspace(5) %"55", align 4 + %"78" = sub i64 %"67", %"68" + store i64 %"78", ptr addrspace(5) %"50", align 4 + %"70" = load i64, ptr addrspace(5) %"52", align 4 + %"71" = load i64, ptr addrspace(5) %"55", align 4 + %"81" = sub i64 %"70", %"71" + store i64 %"81", ptr addrspace(5) %"53", align 4 + %"72" = load i64, ptr addrspace(5) %"50", align 4 + %"84" = inttoptr i64 %"72" to ptr addrspace(1) + %"37" = getelementptr inbounds i8, ptr addrspace(1) %"84", i64 0 + %"73" = load i64, ptr addrspace(1) %"37", align 4 + store i64 %"73", ptr addrspace(5) %"56", align 4 + %"74" = load i64, ptr addrspace(5) %"53", align 4 + %"85" = inttoptr i64 %"74" to ptr addrspace(1) + %"39" = getelementptr inbounds i8, ptr addrspace(1) %"85", i64 0 + %"75" = load i64, ptr addrspace(5) %"56", align 4 + store i64 %"75", ptr addrspace(1) %"39", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/stateful_ld_st_simple.ll b/ptx/src/test/ll/stateful_ld_st_simple.ll new file mode 100644 index 0000000..09d064b --- /dev/null +++ b/ptx/src/test/ll/stateful_ld_st_simple.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %2 = inttoptr i64 %"46" to ptr + %"53" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"53", ptr addrspace(5) %"40", align 8 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %3 = inttoptr i64 %"48" to ptr + %"55" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"55", ptr addrspace(5) %"41", align 8 + %"50" = load i64, ptr addrspace(5) %"40", align 4 + %"57" = inttoptr i64 %"50" to ptr addrspace(1) + %"49" = load i64, ptr addrspace(1) %"57", align 4 + store i64 %"49", ptr addrspace(5) %"42", align 4 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"42", align 4 + %"58" = inttoptr i64 %"51" to ptr addrspace(1) + store i64 %"52", ptr addrspace(1) %"58", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/stateful_neg_offset.ll b/ptx/src/test/ll/stateful_neg_offset.ll new file mode 100644 index 0000000..38abb0d --- /dev/null +++ b/ptx/src/test/ll/stateful_neg_offset.ll @@ -0,0 +1,54 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"45" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"45", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"46", ptr addrspace(5) %"40", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %2 = inttoptr i64 %"48" to ptr + %"61" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"61", ptr addrspace(5) %"41", align 8 + %"50" = load i64, ptr addrspace(5) %"40", align 4 + %3 = inttoptr i64 %"50" to ptr + %"63" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"63", ptr addrspace(5) %"42", align 8 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"53" = load i64, ptr addrspace(5) %"42", align 4 + %"51" = add i64 %"52", %"53" + store i64 %"51", ptr addrspace(5) %"43", align 4 + %"55" = load i64, ptr addrspace(5) %"41", align 4 + %"56" = load i64, ptr addrspace(5) %"42", align 4 + %"54" = sub i64 %"55", %"56" + store i64 %"54", ptr addrspace(5) %"43", align 4 + %"58" = load i64, ptr addrspace(5) %"41", align 4 + %"65" = inttoptr i64 %"58" to ptr addrspace(1) + %"57" = load i64, ptr addrspace(1) %"65", align 4 + store i64 %"57", ptr addrspace(5) %"44", align 4 + %"59" = load i64, ptr addrspace(5) %"42", align 4 + %"60" = load i64, ptr addrspace(5) %"44", align 4 + %"66" = inttoptr i64 %"59" to ptr addrspace(1) + store i64 %"60", ptr addrspace(1) %"66", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/sub.ll b/ptx/src/test/ll/sub.ll new file mode 100644 index 0000000..31b5801 --- /dev/null +++ b/ptx/src/test/ll/sub.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = sub i64 %"47", 1 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/vector.ll b/ptx/src/test/ll/vector.ll new file mode 100644 index 0000000..e909c7a --- /dev/null +++ b/ptx/src/test/ll/vector.ll @@ -0,0 +1,79 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define <2 x i32> @__zluda_ptx_impl_impl(<2 x i32> %"9") #0 { + %"49" = alloca <2 x i32>, align 8, addrspace(5) + %"50" = alloca <2 x i32>, align 8, addrspace(5) + %"51" = alloca i32, align 4, addrspace(5) + %"52" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"37" = extractelement <2 x i32> %"9", i8 0 + store i32 %"37", ptr addrspace(5) %"51", align 4 + %"38" = extractelement <2 x i32> %"9", i8 1 + store i32 %"38", ptr addrspace(5) %"52", align 4 + %"56" = load i32, ptr addrspace(5) %"51", align 4 + %"57" = load i32, ptr addrspace(5) %"52", align 4 + %"55" = add i32 %"56", %"57" + store i32 %"55", ptr addrspace(5) %"52", align 4 + %"58" = load i32, ptr addrspace(5) %"52", align 4 + %"60" = load <2 x i32>, ptr addrspace(5) %"50", align 8 + %"59" = insertelement <2 x i32> %"60", i32 %"58", i8 0 + store <2 x i32> %"59", ptr addrspace(5) %"50", align 8 + %"61" = load i32, ptr addrspace(5) %"52", align 4 + %"63" = load <2 x i32>, ptr addrspace(5) %"50", align 8 + %"62" = insertelement <2 x i32> %"63", i32 %"61", i8 1 + store <2 x i32> %"62", ptr addrspace(5) %"50", align 8 + %"64" = load <2 x i32>, ptr addrspace(5) %"50", align 8 + %"42" = extractelement <2 x i32> %"64", i8 1 + %"66" = load <2 x i32>, ptr addrspace(5) %"50", align 8 + %"65" = insertelement <2 x i32> %"66", i32 %"42", i8 0 + store <2 x i32> %"65", ptr addrspace(5) %"50", align 8 + %"68" = load <2 x i32>, ptr addrspace(5) %"50", align 8 + store <2 x i32> %"68", ptr addrspace(5) %"49", align 8 + %2 = load <2 x i32>, ptr addrspace(5) %"49", align 8 + ret <2 x i32> %2 +} + +define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"69", ptr addrspace(4) byref(i64) %"70") #0 { + %"71" = alloca i64, align 8, addrspace(5) + %"72" = alloca i64, align 8, addrspace(5) + %"73" = alloca <2 x i32>, align 8, addrspace(5) + %"74" = alloca i32, align 4, addrspace(5) + %"75" = alloca i32, align 4, addrspace(5) + %"76" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"77" = load i64, ptr addrspace(4) %"69", align 4 + store i64 %"77", ptr addrspace(5) %"71", align 4 + %"78" = load i64, ptr addrspace(4) %"70", align 4 + store i64 %"78", ptr addrspace(5) %"72", align 4 + %"80" = load i64, ptr addrspace(5) %"71", align 4 + %"87" = inttoptr i64 %"80" to ptr + %"79" = load <2 x i32>, ptr %"87", align 8 + store <2 x i32> %"79", ptr addrspace(5) %"73", align 8 + %"82" = load <2 x i32>, ptr addrspace(5) %"73", align 8 + %"81" = call <2 x i32> @__zluda_ptx_impl_impl(<2 x i32> %"82") + store <2 x i32> %"81", ptr addrspace(5) %"73", align 8 + %"84" = load <2 x i32>, ptr addrspace(5) %"73", align 8 + %"88" = bitcast <2 x i32> %"84" to i64 + store i64 %"88", ptr addrspace(5) %"76", align 4 + %"85" = load i64, ptr addrspace(5) %"72", align 4 + %"86" = load <2 x i32>, ptr addrspace(5) %"73", align 8 + %"89" = inttoptr i64 %"85" to ptr + store <2 x i32> %"86", ptr %"89", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/vector4.ll b/ptx/src/test/ll/vector4.ll new file mode 100644 index 0000000..1b8ce24 --- /dev/null +++ b/ptx/src/test/ll/vector4.ll @@ -0,0 +1,39 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca <4 x i32>, align 16, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load <4 x i32>, ptr %"50", align 16 + store <4 x i32> %"44", ptr addrspace(5) %"40", align 16 + %"46" = load <4 x i32>, ptr addrspace(5) %"40", align 16 + %"29" = extractelement <4 x i32> %"46", i8 3 + store i32 %"29", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i32, ptr addrspace(5) %"41", align 4 + %"53" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"53", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/vector_extract.ll b/ptx/src/test/ll/vector_extract.ll new file mode 100644 index 0000000..a106da8 --- /dev/null +++ b/ptx/src/test/ll/vector_extract.ll @@ -0,0 +1,95 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) + %"48" = alloca i16, align 2, addrspace(5) + %"49" = alloca i16, align 2, addrspace(5) + %"50" = alloca i16, align 2, addrspace(5) + %"51" = alloca i16, align 2, addrspace(5) + %"52" = alloca <4 x i16>, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + %"53" = load i64, ptr addrspace(4) %"44", align 4 + store i64 %"53", ptr addrspace(5) %"46", align 4 + %"54" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"54", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"46", align 4 + %"83" = inttoptr i64 %"55" to ptr addrspace(1) + %"32" = load <4 x i8>, ptr addrspace(1) %"83", align 4 + %"84" = extractelement <4 x i8> %"32", i8 0 + %"85" = extractelement <4 x i8> %"32", i8 1 + %"86" = extractelement <4 x i8> %"32", i8 2 + %"87" = extractelement <4 x i8> %"32", i8 3 + %"56" = zext i8 %"84" to i16 + %"57" = zext i8 %"85" to i16 + %"58" = zext i8 %"86" to i16 + %"59" = zext i8 %"87" to i16 + store i16 %"56", ptr addrspace(5) %"48", align 2 + store i16 %"57", ptr addrspace(5) %"49", align 2 + store i16 %"58", ptr addrspace(5) %"50", align 2 + store i16 %"59", ptr addrspace(5) %"51", align 2 + %"60" = load i16, ptr addrspace(5) %"49", align 2 + %"61" = load i16, ptr addrspace(5) %"50", align 2 + %"62" = load i16, ptr addrspace(5) %"51", align 2 + %"63" = load i16, ptr addrspace(5) %"48", align 2 + %2 = insertelement <4 x i16> undef, i16 %"60", i8 0 + %3 = insertelement <4 x i16> %2, i16 %"61", i8 1 + %4 = insertelement <4 x i16> %3, i16 %"62", i8 2 + %"33" = insertelement <4 x i16> %4, i16 %"63", i8 3 + store <4 x i16> %"33", ptr addrspace(5) %"52", align 8 + %"65" = load <4 x i16>, ptr addrspace(5) %"52", align 8 + %"66" = extractelement <4 x i16> %"65", i8 0 + %"67" = extractelement <4 x i16> %"65", i8 1 + %"68" = extractelement <4 x i16> %"65", i8 2 + %"69" = extractelement <4 x i16> %"65", i8 3 + store i16 %"66", ptr addrspace(5) %"50", align 2 + store i16 %"67", ptr addrspace(5) %"51", align 2 + store i16 %"68", ptr addrspace(5) %"48", align 2 + store i16 %"69", ptr addrspace(5) %"49", align 2 + %"70" = load i16, ptr addrspace(5) %"50", align 2 + %"71" = load i16, ptr addrspace(5) %"51", align 2 + %"72" = load i16, ptr addrspace(5) %"48", align 2 + %"73" = load i16, ptr addrspace(5) %"49", align 2 + %5 = insertelement <4 x i16> undef, i16 %"70", i8 0 + %6 = insertelement <4 x i16> %5, i16 %"71", i8 1 + %7 = insertelement <4 x i16> %6, i16 %"72", i8 2 + %"36" = insertelement <4 x i16> %7, i16 %"73", i8 3 + %"74" = extractelement <4 x i16> %"36", i8 0 + %"75" = extractelement <4 x i16> %"36", i8 1 + %"76" = extractelement <4 x i16> %"36", i8 2 + %"77" = extractelement <4 x i16> %"36", i8 3 + store i16 %"74", ptr addrspace(5) %"51", align 2 + store i16 %"75", ptr addrspace(5) %"48", align 2 + store i16 %"76", ptr addrspace(5) %"49", align 2 + store i16 %"77", ptr addrspace(5) %"50", align 2 + %"78" = load i16, ptr addrspace(5) %"48", align 2 + %"79" = load i16, ptr addrspace(5) %"49", align 2 + %"80" = load i16, ptr addrspace(5) %"50", align 2 + %"81" = load i16, ptr addrspace(5) %"51", align 2 + %"88" = trunc i16 %"78" to i8 + %"89" = trunc i16 %"79" to i8 + %"90" = trunc i16 %"80" to i8 + %"91" = trunc i16 %"81" to i8 + %8 = insertelement <4 x i8> undef, i8 %"88", i8 0 + %9 = insertelement <4 x i8> %8, i8 %"89", i8 1 + %10 = insertelement <4 x i8> %9, i8 %"90", i8 2 + %"37" = insertelement <4 x i8> %10, i8 %"91", i8 3 + %"82" = load i64, ptr addrspace(5) %"47", align 4 + %"92" = inttoptr i64 %"82" to ptr addrspace(1) + store <4 x i8> %"37", ptr addrspace(1) %"92", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/ll/xor.ll b/ptx/src/test/ll/xor.ll new file mode 100644 index 0000000..859decb --- /dev/null +++ b/ptx/src/test/ll/xor.ll @@ -0,0 +1,45 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"39", align 4 + %"54" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"47" to ptr + %"30" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"30", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"49" = xor i32 %"50", %"51" + store i32 %"49", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"41", align 4 + %"56" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"56", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index e1c1670..e5bcf40 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -1,11 +1,16 @@ use crate::pass; use hip_runtime_sys::hipError_t; +use std::env; use std::error; use std::ffi::{CStr, CString}; -use std::fmt; -use std::fmt::{Debug, Display, Formatter}; +use std::fmt::{self, Debug, Display, Formatter}; +use std::fs::{self, File}; +use std::io::Write; use std::mem; -use std::{ptr, str}; +use std::path::Path; +use std::ptr; +use std::str; +use pretty_assertions; macro_rules! test_ptx { ($fn_name:ident, $input:expr, $output:expr) => { @@ -28,6 +33,15 @@ macro_rules! test_ptx { test_cuda_assert(stringify!($fn_name), ptx, &input, &mut output) } } + + paste::item! { + #[test] + fn [<$fn_name _llvm>]() -> Result<(), Box> { + let ptx = include_str!(concat!(stringify!($fn_name), ".ptx")); + let ll = include_str!(concat!("../ll/", stringify!($fn_name), ".ll")).trim(); + test_llvm_assert(stringify!($fn_name), ptx, &ll) + } + } }; ($fn_name:ident) => {}; @@ -39,6 +53,7 @@ test_ptx!(mov, [1u64], [1u64]); test_ptx!(mul_lo, [1u64], [2u64]); test_ptx!(mul_hi, [u64::max_value()], [1u64]); test_ptx!(add, [1u64], [2u64]); +test_ptx!(mul24, [10u32], [20u32]); test_ptx!(setp, [10u64, 11u64], [1u64, 0u64]); test_ptx!(setp_gt, [f32::NAN, 1f32], [1f32]); test_ptx!(setp_leu, [1f32, f32::NAN], [1f32]); @@ -230,6 +245,32 @@ fn test_hip_assert< Ok(()) } +fn test_llvm_assert< + 'a, +>( + name: &str, + ptx_text: &'a str, + expected_ll: &str +) -> Result<(), Box> { + let ast = ptx_parser::parse_module_checked(ptx_text).unwrap(); + let llvm_ir = pass::to_llvm_module(ast).unwrap(); + let actual_ll = llvm_ir.llvm_ir.print_module_to_string(); + let actual_ll = actual_ll.to_str(); + if actual_ll != expected_ll { + let output_dir = env::var("TEST_PTX_LLVM_FAIL_DIR"); + if let Ok(output_dir) = output_dir { + let output_dir = Path::new(&output_dir); + fs::create_dir_all(&output_dir).unwrap(); + let output_file = output_dir.join(format!("{}.ll", name)); + let mut output_file = File::create(output_file).unwrap(); + output_file.write_all(actual_ll.as_bytes()).unwrap(); + } + let comparison = pretty_assertions::StrComparison::new(actual_ll, expected_ll); + panic!("assertion failed: `(left == right)`\n\n{}", comparison); + } + Ok(()) +} + fn test_cuda_assert< 'a, Input: From + Debug + Copy + PartialEq, @@ -311,7 +352,7 @@ fn run_hip + Copy + Debug, Output: From + Copy + Debug + Def unsafe { hipGetDevicePropertiesR0600(&mut dev_props, dev) }.unwrap(); let elf_module = comgr::compile_bitcode( unsafe { CStr::from_ptr(dev_props.gcnArchName.as_ptr()) }, - &*module.llvm_ir, + &*module.llvm_ir.write_bitcode_to_memory(), module.linked_bitcode(), ) .unwrap(); diff --git a/ptx/src/test/spirv_run/mul24.ptx b/ptx/src/test/spirv_run/mul24.ptx new file mode 100644 index 0000000..53c1224 --- /dev/null +++ b/ptx/src/test/spirv_run/mul24.ptx @@ -0,0 +1,22 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry mul24( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 temp; + .reg .u32 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp, [in_addr]; + mul24.lo.u32 temp2, temp, 2; + st.u32 [out_addr], temp2; + ret; +} diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs index c2776c8..4d9f23d 100644 --- a/ptx_parser/src/ast.rs +++ b/ptx_parser/src/ast.rs @@ -2,7 +2,7 @@ use super::{ AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp, StateSpace, VectorPrefix, }; -use crate::{PtxError, PtxParserState}; +use crate::{PtxError, PtxParserState, Mul24Control}; use bitflags::bitflags; use std::{alloc::Layout, cmp::Ordering, num::NonZeroU8}; @@ -87,6 +87,15 @@ ptx_parser_macros::generate_instruction_type!( src2: T, } }, + Mul24 { + type: { Type::from(data.type_) }, + data: Mul24Details, + arguments: { + dst: T, + src1: T, + src2: T, + } + }, Setp { data: SetpData, arguments: { @@ -1185,6 +1194,13 @@ pub enum MulIntControl { Wide, } + +#[derive(Copy, Clone)] +pub struct Mul24Details { + pub type_: ScalarType, + pub control: Mul24Control, +} + pub struct SetpData { pub type_: ScalarType, pub flush_to_zero: Option, diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index ca40f63..da46a8c 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -1652,6 +1652,9 @@ derive_parser!( #[derive(Copy, Clone, PartialEq, Eq, Hash)] pub enum AtomSemantics { } + #[derive(Copy, Clone, PartialEq, Eq, Hash)] + pub enum Mul24Control { } + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov mov{.vec}.type d, a => { Instruction::Mov { @@ -3383,6 +3386,19 @@ derive_parser!( Instruction::Ret { data: RetData { uniform: uni } } } + mul24.mode.type d, a, b => { + ast::Instruction::Mul24 { + data: ast::Mul24Details { + control: mode, + type_ + }, + arguments: Mul24Args { dst: d, src1: a, src2: b } + } + } + + .mode: Mul24Control = { .hi, .lo }; + .type: ScalarType = { .u32, .s32 }; + ); #[cfg(test)] diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index b469a89..a881e16 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -30,7 +30,7 @@ pub(crate) fn load_data(module: &mut CUmodule, image: *const std::ffi::c_void) - unsafe { hipGetDevicePropertiesR0600(&mut props, dev) }?; let elf_module = comgr::compile_bitcode( unsafe { CStr::from_ptr(props.gcnArchName.as_ptr()) }, - &*llvm_module.llvm_ir, + &*llvm_module.llvm_ir.write_bitcode_to_memory(), llvm_module.linked_bitcode(), ) .map_err(|_| CUerror::UNKNOWN)?;