diff --git a/Cargo.lock b/Cargo.lock index e00478e..5726bb3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,6 +52,12 @@ dependencies = [ "serde", ] +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + [[package]] name = "beef" version = "0.5.2" @@ -328,6 +334,12 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.9" @@ -344,6 +356,12 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "fnv" version = "1.0.7" @@ -373,6 +391,12 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403" +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "heck" version = "0.5.0" @@ -383,6 +407,29 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" name = "hip_runtime-sys" version = "0.0.0" +[[package]] +name = "indexmap" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "int-enum" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a37a9c11c6ecfec8b9bed97337dfecff3686d02ba8f52e8addad2829d047128" +dependencies = [ + "proc-macro2", + "proc-macro2-diagnostics", + "quote", + "syn 2.0.89", + "version_check", +] + [[package]] name = "itertools" version = "0.13.0" @@ -499,6 +546,16 @@ dependencies = [ "libc", ] +[[package]] +name = "matrixmultiply" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "memchr" version = "2.7.4" @@ -514,12 +571,37 @@ dependencies = [ "libc", ] +[[package]] +name = "microlp" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edaa5264bc1f7668bc12e10757f8f529a526656c796cc2106cf2be10c5b8d483" +dependencies = [ + "log", + "sprs", +] + [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "nom" version = "7.1.3" @@ -530,6 +612,33 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + 
"num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "num_enum" version = "0.4.3" @@ -567,6 +676,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "plain" version = "0.2.3" @@ -579,6 +698,15 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "pretty_assertions" version = "1.4.1" @@ -647,6 +775,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", + "version_check", + "yansi", +] + [[package]] name = "ptx" version = "0.0.0" @@ -657,8 +798,11 @@ dependencies = [ "cuda-driver-sys", "half", 
"hip_runtime-sys", + "int-enum", "llvm_zluda", + "microlp", "paste", + "petgraph", "pretty_assertions", "ptx_parser", "quick-error", @@ -667,6 +811,7 @@ dependencies = [ "strum_macros", "tempfile", "thiserror 1.0.64", + "unwrap_or", ] [[package]] @@ -720,6 +865,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "regex" version = "1.11.0" @@ -859,6 +1010,24 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "sprs" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bff8419009a08f6cb7519a602c5590241fbff1446bcc823c07af15386eb801b" +dependencies = [ + "ndarray", + "num-complex", + "num-traits", + "smallvec", +] + [[package]] name = "strum" version = "0.26.3" @@ -980,6 +1149,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unwrap_or" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f2fe1f049979528ce97d8a4672f984f8846fc9975be0cf14ca798116d724c4a" + [[package]] name = "version_check" version = "0.9.5" diff --git a/comgr/src/lib.rs b/comgr/src/lib.rs index 0ff838b..ac407ef 100644 --- a/comgr/src/lib.rs +++ b/comgr/src/lib.rs @@ -133,21 +133,26 @@ pub fn compile_bitcode( &linking_info, amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC, )?; - let link_with_device_libs_info = ActionInfo::new()?; - 
link_with_device_libs_info.set_isa_name(gcn_arch)?; - link_with_device_libs_info.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?; - // This makes no sense, but it makes ockl linking work - link_with_device_libs_info - .set_options([c"-Xclang", c"-mno-link-builtin-bitcode-postopt"].into_iter())?; - let with_device_libs = do_action( - &linked_data_set, - &link_with_device_libs_info, - amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, - )?; - let compile_action_info = ActionInfo::new()?; - compile_action_info.set_isa_name(gcn_arch)?; - let common_options = [c"-O3", c"-mno-wavefrontsize64", c"-mcumode"].into_iter(); + let compile_to_exec = ActionInfo::new()?; + compile_to_exec.set_isa_name(gcn_arch)?; + compile_to_exec.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?; + let common_options = [ + // This makes no sense, but it makes ockl linking work + c"-Xclang", + c"-mno-link-builtin-bitcode-postopt", + // Otherwise LLVM omits dynamic fp mode for ockl functions during linking + // and then fails to inline them + c"-Xclang", + c"-fdenormal-fp-math=dynamic", + c"-O3", + c"-mno-wavefrontsize64", + c"-mcumode", + // Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr + // c"-fsave-optimization-record=yaml", + ] + .into_iter(); let opt_options = if cfg!(debug_assertions) { + //[c"-g", c"-mllvm", c"-print-before-all", c"", c""] [c"-g", c"", c"", c"", c""] } else { [ @@ -159,19 +164,14 @@ pub fn compile_bitcode( c"-inlinehint-threshold=3250", ] }; - compile_action_info.set_options(common_options.chain(opt_options))?; - let reloc_data_set = do_action( - &with_device_libs, - &compile_action_info, - amd_comgr_action_kind_t::AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, - )?; + compile_to_exec.set_options(common_options.chain(opt_options))?; let exec_data_set = do_action( - &reloc_data_set, - &compile_action_info, - 
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + &linked_data_set, + &compile_to_exec, + amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE, )?; let executable = - exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?; + exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?; executable.copy_content() } diff --git a/ptx/Cargo.toml b/ptx/Cargo.toml index 08ae693..2876539 100644 --- a/ptx/Cargo.toml +++ b/ptx/Cargo.toml @@ -17,6 +17,10 @@ bitflags = "1.2" rustc-hash = "2.0.0" strum = "0.26" strum_macros = "0.26" +petgraph = "0.7.1" +microlp = "0.2.10" +int-enum = "1.1" +unwrap_or = "1.0.1" [dev-dependencies] hip_runtime-sys = { path = "../ext/hip_runtime-sys" } diff --git a/ptx/src/pass/deparamize_functions.rs b/ptx/src/pass/deparamize_functions.rs index 15125b0..e203394 100644 --- a/ptx/src/pass/deparamize_functions.rs +++ b/ptx/src/pass/deparamize_functions.rs @@ -2,8 +2,8 @@ use super::*; pub(super) fn run<'a, 'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directives: Vec, SpirvWord>>, -) -> Result, SpirvWord>>, TranslateError> { + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { directives .into_iter() .map(|directive| run_directive(resolver, directive)) @@ -12,8 +12,8 @@ pub(super) fn run<'a, 'input>( fn run_directive<'input>( resolver: &mut GlobalStringIdentResolver2, - directive: Directive2<'input, ast::Instruction, SpirvWord>, -) -> Result, SpirvWord>, TranslateError> { + directive: Directive2, SpirvWord>, +) -> Result, SpirvWord>, TranslateError> { Ok(match directive { var @ Directive2::Variable(..) 
=> var, Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?), @@ -22,13 +22,13 @@ fn run_directive<'input>( fn run_method<'input>( resolver: &mut GlobalStringIdentResolver2, - mut method: Function2<'input, ast::Instruction, SpirvWord>, -) -> Result, SpirvWord>, TranslateError> { + mut method: Function2, SpirvWord>, +) -> Result, SpirvWord>, TranslateError> { let is_declaration = method.body.is_none(); let mut body = Vec::new(); let mut remap_returns = Vec::new(); - if !method.func_decl.name.is_kernel() { - for arg in method.func_decl.return_arguments.iter_mut() { + if !method.is_kernel { + for arg in method.return_arguments.iter_mut() { match arg.state_space { ptx_parser::StateSpace::Param => { arg.state_space = ptx_parser::StateSpace::Reg; @@ -51,7 +51,7 @@ fn run_method<'input>( _ => return Err(error_unreachable()), } } - for arg in method.func_decl.input_arguments.iter_mut() { + for arg in method.input_arguments.iter_mut() { match arg.state_space { ptx_parser::StateSpace::Param => { arg.state_space = ptx_parser::StateSpace::Reg; @@ -95,14 +95,7 @@ fn run_method<'input>( Ok::<_, TranslateError>(body) }) .transpose()?; - Ok(Function2 { - func_decl: method.func_decl, - globals: method.globals, - body, - import_as: method.import_as, - tuning: method.tuning, - linkage: method.linkage, - }) + Ok(Function2 { body, ..method }) } fn run_statement<'input>( diff --git a/ptx/src/pass/emit_llvm.rs b/ptx/src/pass/emit_llvm.rs index 8b43f3e..5a5dd80 100644 --- a/ptx/src/pass/emit_llvm.rs +++ b/ptx/src/pass/emit_llvm.rs @@ -69,7 +69,10 @@ pub struct Module(LLVMModuleRef, Context); impl Module { fn new(ctx: Context, name: &CStr) -> Self { - Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) }, ctx) + Self( + unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) }, + ctx, + ) } fn get(&self) -> LLVMModuleRef { @@ -183,9 +186,10 @@ impl Deref for MemoryBuffer { pub(super) fn run<'input>( id_defs: 
GlobalStringIdentResolver2<'input>, - directives: Vec, SpirvWord>>, + directives: Vec, SpirvWord>>, ) -> Result { - let module = Module::new(Context::new(), LLVM_UNNAMED); + let context = Context::new(); + let module = Module::new(context, LLVM_UNNAMED); let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs); for directive in directives { match directive { @@ -208,10 +212,7 @@ struct ModuleEmitContext<'a, 'input> { } impl<'a, 'input> ModuleEmitContext<'a, 'input> { - fn new( - module: &Module, - id_defs: &'a GlobalStringIdentResolver2<'input>, - ) -> Self { + fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self { let context = module.context(); ModuleEmitContext { context: context.get(), @@ -232,24 +233,20 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { fn emit_method( &mut self, - method: Function2<'input, ast::Instruction, SpirvWord>, + method: Function2, SpirvWord>, ) -> Result<(), TranslateError> { - let func_decl = method.func_decl; let name = method .import_as .as_deref() - .or_else(|| match func_decl.name { - ast::MethodName::Kernel(name) => Some(name), - ast::MethodName::Func(id) => self.id_defs.ident_map[&id].name.as_deref(), - }) + .or_else(|| self.id_defs.ident_map[&method.name].name.as_deref()) .ok_or_else(|| error_unreachable())?; let name = CString::new(name).map_err(|_| error_unreachable())?; let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) }; if fn_ == ptr::null_mut() { let fn_type = get_function_type( self.context, - func_decl.return_arguments.iter().map(|v| &v.v_type), - func_decl + method.return_arguments.iter().map(|v| &v.v_type), + method .input_arguments .iter() .map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)), @@ -259,15 +256,28 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { self.emit_fn_attribute(fn_, "uniform-work-group-size", "true"); self.emit_fn_attribute(fn_, "no-trapping-math", "true"); } - if let ast::MethodName::Func(name) = func_decl.name { - 
self.resolver.register(name, fn_); + if !method.is_kernel { + self.resolver.register(method.name, fn_); + self.emit_fn_attribute(fn_, "denormal-fp-math-f32", "dynamic"); + self.emit_fn_attribute(fn_, "denormal-fp-math", "dynamic"); + } else { + self.emit_fn_attribute( + fn_, + "denormal-fp-math-f32", + llvm_ftz(method.flush_to_zero_f32), + ); + self.emit_fn_attribute( + fn_, + "denormal-fp-math", + llvm_ftz(method.flush_to_zero_f16f64), + ); } - for (i, param) in func_decl.input_arguments.iter().enumerate() { + for (i, param) in method.input_arguments.iter().enumerate() { let value = unsafe { LLVMGetParam(fn_, i as u32) }; let name = self.resolver.get_or_add(param.name); unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) }; self.resolver.register(param.name, value); - if func_decl.name.is_kernel() { + if method.is_kernel { let attr_kind = unsafe { LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len()) }; @@ -281,7 +291,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) }; } } - let call_conv = if func_decl.name.is_kernel() { + let call_conv = if method.is_kernel { Self::kernel_call_convention() } else { Self::func_call_convention() @@ -296,7 +306,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) }; unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) }; let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder); - for var in func_decl.return_arguments { + for var in method.return_arguments { method_emitter.emit_variable(var)?; } for statement in statements.iter() { @@ -304,6 +314,17 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { method_emitter.emit_label_initial(*label); } } + let mut statements = statements.into_iter(); + if let Some(Statement::Label(label)) = statements.next() { + method_emitter.emit_label_delayed(label)?; + } else { + return 
Err(error_unreachable()); + } + method_emitter.emit_kernel_rounding_prelude( + method.is_kernel, + method.rounding_mode_f32, + method.rounding_mode_f16f64, + )?; for statement in statements { method_emitter.emit_statement(statement)?; } @@ -431,6 +452,14 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { } } +fn llvm_ftz(ftz: bool) -> &'static str { + if ftz { + "preserve-sign" + } else { + "ieee" + } +} + fn get_input_argument_type( context: LLVMContextRef, v_type: &ast::Type, @@ -487,9 +516,32 @@ impl<'a> MethodEmitContext<'a> { Statement::FunctionPointer(_) => todo!(), Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?, Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?, + Statement::SetMode(mode_reg) => self.emit_set_mode(mode_reg)?, }) } + // This should be a kernel attribute, but sadly AMDGPU LLVM target does + // not support attribute for it. So we have to set it as the first + // instruction in the body of a kernel + fn emit_kernel_rounding_prelude( + &mut self, + is_kernel: bool, + rounding_mode_f32: ast::RoundingMode, + rounding_mode_f16f64: ast::RoundingMode, + ) -> Result<(), TranslateError> { + if is_kernel { + if rounding_mode_f32 != ast::RoundingMode::NearestEven + || rounding_mode_f16f64 != ast::RoundingMode::NearestEven + { + self.emit_set_mode(ModeRegister::Rounding { + f32: rounding_mode_f32, + f16f64: rounding_mode_f16f64, + })?; + } + } + Ok(()) + } + fn emit_variable(&mut self, var: ast::Variable) -> Result<(), TranslateError> { let alloca = unsafe { LLVMZludaBuildAlloca( @@ -1143,7 +1195,7 @@ impl<'a> MethodEmitContext<'a> { let cos = self.emit_intrinsic( c"llvm.cos.f32", Some(arguments.dst), - &ast::ScalarType::F32.into(), + Some(&ast::ScalarType::F32.into()), vec![(self.resolver.value(arguments.src)?, llvm_f32)], )?; unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) } @@ -1396,7 +1448,7 @@ impl<'a> MethodEmitContext<'a> { let sin = self.emit_intrinsic( c"llvm.sin.f32", 
Some(arguments.dst), - &ast::ScalarType::F32.into(), + Some(&ast::ScalarType::F32.into()), vec![(self.resolver.value(arguments.src)?, llvm_f32)], )?; unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) } @@ -1407,12 +1459,12 @@ impl<'a> MethodEmitContext<'a> { &mut self, name: &CStr, dst: Option, - return_type: &ast::Type, + return_type: Option<&ast::Type>, arguments: Vec<(LLVMValueRef, LLVMTypeRef)>, ) -> Result { let fn_type = get_function_type( self.context, - iter::once(return_type), + return_type.into_iter(), arguments.iter().map(|(_, type_)| Ok(*type_)), )?; let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) }; @@ -1573,7 +1625,7 @@ impl<'a> MethodEmitContext<'a> { return self.emit_cvt_float_to_int( data.from, data.to, - integer_rounding.unwrap_or(ast::RoundingMode::NearestEven), + integer_rounding, arguments, Some(LLVMBuildFPToSI), ) @@ -1631,7 +1683,7 @@ impl<'a> MethodEmitContext<'a> { let clamped = self.emit_intrinsic( c"llvm.umin", None, - &from.into(), + Some(&from.into()), vec![ (self.resolver.value(arguments.src)?, from_llvm), (max, from_llvm), @@ -1661,7 +1713,7 @@ impl<'a> MethodEmitContext<'a> { let zero_clamped = self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) }, None, - &from.into(), + Some(&from.into()), vec![ (self.resolver.value(arguments.src)?, from_llvm), (zero, from_llvm), @@ -1680,7 +1732,7 @@ impl<'a> MethodEmitContext<'a> { let fully_clamped = self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) }, None, - &from.into(), + Some(&from.into()), vec![(zero_clamped, from_llvm), (max, from_llvm)], )?; let resize_fn = if to.layout().size() >= from.layout().size() { @@ -1720,7 +1772,7 @@ impl<'a> MethodEmitContext<'a> { let rounded_float = self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) }, None, - &from.into(), + Some(&from.into()), vec![( 
self.resolver.value(arguments.src)?, get_scalar_type(self.context, from), @@ -1789,7 +1841,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( intrinsic, Some(arguments.dst), - &data.type_.into(), + Some(&data.type_.into()), vec![(self.resolver.value(arguments.src)?, type_)], )?; Ok(()) @@ -1810,7 +1862,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( intrinsic, Some(arguments.dst), - &data.type_.into(), + Some(&data.type_.into()), vec![(self.resolver.value(arguments.src)?, type_)], )?; Ok(()) @@ -1832,7 +1884,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( intrinsic, Some(arguments.dst), - &data.type_.into(), + Some(&data.type_.into()), vec![(self.resolver.value(arguments.src)?, type_)], )?; Ok(()) @@ -1954,7 +2006,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( intrinsic, Some(arguments.dst), - &data.type_.into(), + Some(&data.type_.into()), vec![( self.resolver.value(arguments.src)?, get_scalar_type(self.context, data.type_), @@ -1971,7 +2023,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( c"llvm.amdgcn.log.f32", Some(arguments.dst), - &ast::ScalarType::F32.into(), + Some(&ast::ScalarType::F32.into()), vec![( self.resolver.value(arguments.src)?, get_scalar_type(self.context, ast::ScalarType::F32.into()), @@ -2026,7 +2078,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( intrinsic, Some(arguments.dst), - &type_.into(), + Some(&type_.into()), vec![(self.resolver.value(arguments.src)?, llvm_type)], )?; Ok(()) @@ -2050,7 +2102,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) }, Some(arguments.dst), - &data.type_().into(), + Some(&data.type_().into()), vec![ (self.resolver.value(arguments.src1)?, llvm_type), (self.resolver.value(arguments.src2)?, llvm_type), @@ -2077,7 +2129,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) }, Some(arguments.dst), - 
&data.type_().into(), + Some(&data.type_().into()), vec![ (self.resolver.value(arguments.src1)?, llvm_type), (self.resolver.value(arguments.src2)?, llvm_type), @@ -2095,7 +2147,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) }, Some(arguments.dst), - &data.type_.into(), + Some(&data.type_.into()), vec![ ( self.resolver.value(arguments.src1)?, @@ -2216,7 +2268,7 @@ impl<'a> MethodEmitContext<'a> { self.emit_intrinsic( unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) }, Some(arguments.dst), - &data.type_.into(), + Some(&data.type_.into()), intrinsic_arguments, )?; Ok(()) @@ -2229,13 +2281,69 @@ impl<'a> MethodEmitContext<'a> { ) -> Result<(), TranslateError> { let src1 = self.resolver.value(arguments.src1)?; let src2 = self.resolver.value(arguments.src2)?; - self.emit_intrinsic(c"llvm.amdgcn.mul.u24", Some(arguments.dst), &ast::Type::Scalar(data.type_), vec![ - (src1, get_scalar_type(self.context, data.type_)), - (src2, get_scalar_type(self.context, data.type_)), - ])?; + self.emit_intrinsic( + c"llvm.amdgcn.mul.u24", + Some(arguments.dst), + Some(&ast::Type::Scalar(data.type_)), + vec![ + (src1, get_scalar_type(self.context, data.type_)), + (src2, get_scalar_type(self.context, data.type_)), + ], + )?; Ok(()) } - + + fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> { + fn hwreg(reg: u32, offset: u32, size: u32) -> u32 { + reg | (offset << 6) | ((size - 1) << 11) + } + fn denormal_to_value(ftz: bool) -> u32 { + if ftz { + 0 + } else { + 3 + } + } + fn rounding_to_value(ftz: ast::RoundingMode) -> u32 { + match ftz { + ptx_parser::RoundingMode::NearestEven => 0, + ptx_parser::RoundingMode::Zero => 3, + ptx_parser::RoundingMode::NegativeInf => 2, + ptx_parser::RoundingMode::PositiveInf => 1, + } + } + fn merge_regs(f32: u32, f16f64: u32) -> u32 { + f32 | f16f64 << 2 + } + let intrinsic = c"llvm.amdgcn.s.setreg"; + let (hwreg, value) = 
match mode_reg { + ModeRegister::Denormal { f32, f16f64 } => { + let hwreg = hwreg(1, 4, 4); + let f32 = denormal_to_value(f32); + let f16f64 = denormal_to_value(f16f64); + let value = merge_regs(f32, f16f64); + (hwreg, value) + } + ModeRegister::Rounding { f32, f16f64 } => { + let hwreg = hwreg(1, 0, 4); + let f32 = rounding_to_value(f32); + let f16f64 = rounding_to_value(f16f64); + let value = merge_regs(f32, f16f64); + (hwreg, value) + } + }; + let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32); + let hwreg_llvm = unsafe { LLVMConstInt(llvm_i32, hwreg as _, 0) }; + let value_llvm = unsafe { LLVMConstInt(llvm_i32, value as _, 0) }; + self.emit_intrinsic( + intrinsic, + None, + None, + vec![(hwreg_llvm, llvm_i32), (value_llvm, llvm_i32)], + )?; + Ok(()) + } + /* // Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding` // Should be available in LLVM 19 diff --git a/ptx/src/pass/expand_operands.rs b/ptx/src/pass/expand_operands.rs index f2de786..a9ede33 100644 --- a/ptx/src/pass/expand_operands.rs +++ b/ptx/src/pass/expand_operands.rs @@ -2,8 +2,8 @@ use super::*; pub(super) fn run<'a, 'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directives: Vec>, -) -> Result, SpirvWord>>, TranslateError> { + directives: Vec, +) -> Result, SpirvWord>>, TranslateError> { directives .into_iter() .map(|directive| run_directive(resolver, directive)) @@ -13,11 +13,10 @@ pub(super) fn run<'a, 'input>( fn run_directive<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, directive: Directive2< - 'input, ast::Instruction>, ast::ParsedOperand, >, -) -> Result, SpirvWord>, TranslateError> { +) -> Result, SpirvWord>, TranslateError> { Ok(match directive { Directive2::Variable(linking, var) => Directive2::Variable(linking, var), Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?), @@ -27,11 +26,10 @@ fn run_directive<'input>( fn run_method<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, method: 
Function2< - 'input, ast::Instruction>, ast::ParsedOperand, >, -) -> Result, SpirvWord>, TranslateError> { +) -> Result, SpirvWord>, TranslateError> { let body = method .body .map(|statements| { @@ -43,12 +41,18 @@ fn run_method<'input>( }) .transpose()?; Ok(Function2 { - func_decl: method.func_decl, - globals: method.globals, body, + return_arguments: method.return_arguments, + name: method.name, + input_arguments: method.input_arguments, import_as: method.import_as, tuning: method.tuning, linkage: method.linkage, + is_kernel: method.is_kernel, + flush_to_zero_f32: method.flush_to_zero_f32, + flush_to_zero_f16f64: method.flush_to_zero_f16f64, + rounding_mode_f32: method.rounding_mode_f32, + rounding_mode_f16f64: method.rounding_mode_f16f64, }) } diff --git a/ptx/src/pass/fix_special_registers2.rs b/ptx/src/pass/fix_special_registers2.rs index 8c3b794..78e66c9 100644 --- a/ptx/src/pass/fix_special_registers2.rs +++ b/ptx/src/pass/fix_special_registers2.rs @@ -1,30 +1,33 @@ use super::*; pub(super) fn run<'a, 'input>( - resolver: &mut GlobalStringIdentResolver2<'input>, + resolver: &'a mut GlobalStringIdentResolver2<'input>, special_registers: &'a SpecialRegistersMap2, - directives: Vec>, -) -> Result>, TranslateError> { - let declarations = SpecialRegistersMap2::generate_declarations(resolver); - let mut result = Vec::with_capacity(declarations.len() + directives.len()); + directives: Vec, +) -> Result, TranslateError> { + let mut result = Vec::with_capacity(SpecialRegistersMap2::len() + directives.len()); let mut sreg_to_function = - FxHashMap::with_capacity_and_hasher(declarations.len(), Default::default()); - for (sreg, declaration) in declarations { - let name = if let ast::MethodName::Func(name) = declaration.name { - name - } else { - return Err(error_unreachable()); - }; - result.push(UnconditionalDirective::Method(UnconditionalFunction { - func_decl: declaration, - globals: Vec::new(), - body: None, - import_as: None, - tuning: Vec::new(), - linkage: 
ast::LinkingDirective::EXTERN, - })); - sreg_to_function.insert(sreg, name); - } + FxHashMap::with_capacity_and_hasher(SpecialRegistersMap2::len(), Default::default()); + SpecialRegistersMap2::foreach_declaration( + resolver, + |sreg, (return_arguments, name, input_arguments)| { + result.push(UnconditionalDirective::Method(UnconditionalFunction { + return_arguments, + name, + input_arguments, + body: None, + import_as: None, + tuning: Vec::new(), + linkage: ast::LinkingDirective::EXTERN, + is_kernel: false, + flush_to_zero_f32: false, + flush_to_zero_f16f64: false, + rounding_mode_f32: ptx_parser::RoundingMode::NearestEven, + rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven, + })); + sreg_to_function.insert(sreg, name); + }, + ); let mut visitor = SpecialRegisterResolver { resolver, special_registers, @@ -39,8 +42,8 @@ pub(super) fn run<'a, 'input>( fn run_directive<'a, 'input>( visitor: &mut SpecialRegisterResolver<'a, 'input>, - directive: UnconditionalDirective<'input>, -) -> Result, TranslateError> { + directive: UnconditionalDirective, +) -> Result { Ok(match directive { var @ Directive2::Variable(..) 
=> var, Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?), @@ -49,8 +52,8 @@ fn run_directive<'a, 'input>( fn run_method<'a, 'input>( visitor: &mut SpecialRegisterResolver<'a, 'input>, - method: UnconditionalFunction<'input>, -) -> Result, TranslateError> { + method: UnconditionalFunction, +) -> Result { let body = method .body .map(|statements| { @@ -61,14 +64,7 @@ fn run_method<'a, 'input>( Ok::<_, TranslateError>(result) }) .transpose()?; - Ok(Function2 { - func_decl: method.func_decl, - globals: method.globals, - body, - import_as: method.import_as, - tuning: method.tuning, - linkage: method.linkage, - }) + Ok(Function2 { body, ..method }) } fn run_statement<'a, 'input>( diff --git a/ptx/src/pass/hoist_globals.rs b/ptx/src/pass/hoist_globals.rs index 718c052..654a7e9 100644 --- a/ptx/src/pass/hoist_globals.rs +++ b/ptx/src/pass/hoist_globals.rs @@ -1,8 +1,8 @@ use super::*; pub(super) fn run<'input>( - directives: Vec, SpirvWord>>, -) -> Result, SpirvWord>>, TranslateError> { + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { let mut result = Vec::with_capacity(directives.len()); for mut directive in directives.into_iter() { run_directive(&mut result, &mut directive)?; @@ -12,8 +12,8 @@ pub(super) fn run<'input>( } fn run_directive<'input>( - result: &mut Vec, SpirvWord>>, - directive: &mut Directive2<'input, ptx_parser::Instruction, SpirvWord>, + result: &mut Vec, SpirvWord>>, + directive: &mut Directive2, SpirvWord>, ) -> Result<(), TranslateError> { match directive { Directive2::Variable(..) 
=> {} @@ -23,8 +23,8 @@ fn run_directive<'input>( } fn run_function<'input>( - result: &mut Vec, SpirvWord>>, - function: &mut Function2<'input, ptx_parser::Instruction, SpirvWord>, + result: &mut Vec, SpirvWord>>, + function: &mut Function2, SpirvWord>, ) { function.body = function.body.take().map(|statements| { statements diff --git a/ptx/src/pass/insert_explicit_load_store.rs b/ptx/src/pass/insert_explicit_load_store.rs index 702f733..935e78d 100644 --- a/ptx/src/pass/insert_explicit_load_store.rs +++ b/ptx/src/pass/insert_explicit_load_store.rs @@ -11,8 +11,8 @@ use super::*; // pass, so we do nothing there pub(super) fn run<'a, 'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directives: Vec, SpirvWord>>, -) -> Result, SpirvWord>>, TranslateError> { + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { directives .into_iter() .map(|directive| run_directive(resolver, directive)) @@ -21,8 +21,8 @@ pub(super) fn run<'a, 'input>( fn run_directive<'a, 'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directive: Directive2<'input, ast::Instruction, SpirvWord>, -) -> Result, SpirvWord>, TranslateError> { + directive: Directive2, SpirvWord>, +) -> Result, SpirvWord>, TranslateError> { Ok(match directive { var @ Directive2::Variable(..) 
=> var, Directive2::Method(method) => { @@ -34,12 +34,11 @@ fn run_directive<'a, 'input>( fn run_method<'a, 'input>( mut visitor: InsertMemSSAVisitor<'a, 'input>, - method: Function2<'input, ast::Instruction, SpirvWord>, -) -> Result, SpirvWord>, TranslateError> { - let mut func_decl = method.func_decl; - let is_kernel = func_decl.name.is_kernel(); + mut method: Function2, SpirvWord>, +) -> Result, SpirvWord>, TranslateError> { + let is_kernel = method.is_kernel; if is_kernel { - for arg in func_decl.input_arguments.iter_mut() { + for arg in method.input_arguments.iter_mut() { let old_name = arg.name; let old_space = arg.state_space; let new_space = ast::StateSpace::ParamEntry; @@ -51,10 +50,10 @@ fn run_method<'a, 'input>( arg.state_space = new_space; } }; - for arg in func_decl.return_arguments.iter_mut() { + for arg in method.return_arguments.iter_mut() { visitor.visit_variable(arg)?; } - let return_arguments = &func_decl.return_arguments[..]; + let return_arguments = &method.return_arguments[..]; let body = method .body .map(move |statements| { @@ -65,14 +64,7 @@ fn run_method<'a, 'input>( Ok::<_, TranslateError>(result) }) .transpose()?; - Ok(Function2 { - func_decl: func_decl, - globals: method.globals, - body, - import_as: method.import_as, - tuning: method.tuning, - linkage: method.linkage, - }) + Ok(Function2 { body, ..method }) } fn run_statement<'a, 'input>( diff --git a/ptx/src/pass/insert_implicit_conversions2.rs b/ptx/src/pass/insert_implicit_conversions2.rs index 4f738f5..9f8b01c 100644 --- a/ptx/src/pass/insert_implicit_conversions2.rs +++ b/ptx/src/pass/insert_implicit_conversions2.rs @@ -19,8 +19,8 @@ use ptx_parser as ast; */ pub(super) fn run<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directives: Vec, SpirvWord>>, -) -> Result, SpirvWord>>, TranslateError> { + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { directives .into_iter() .map(|directive| run_directive(resolver, directive)) @@ -29,8 +29,8 @@ 
pub(super) fn run<'input>( fn run_directive<'a, 'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directive: Directive2<'input, ast::Instruction, SpirvWord>, -) -> Result, SpirvWord>, TranslateError> { + directive: Directive2, SpirvWord>, +) -> Result, SpirvWord>, TranslateError> { Ok(match directive { var @ Directive2::Variable(..) => var, Directive2::Method(mut method) => { diff --git a/ptx/src/pass/instruction_mode_to_global_mode/call_with_mode.ptx b/ptx/src/pass/instruction_mode_to_global_mode/call_with_mode.ptx new file mode 100644 index 0000000..506145a --- /dev/null +++ b/ptx/src/pass/instruction_mode_to_global_mode/call_with_mode.ptx @@ -0,0 +1,29 @@ +.version 6.5 +.target sm_50 +.address_size 64 + +.func use_modes(); + +.visible .entry kernel() +{ + .reg .f32 temp; + + add.rz.ftz.f32 temp, temp, temp; + call use_modes; + add.rp.ftz.f32 temp, temp, temp; + ret; +} + +.func use_modes() +{ + .reg .f32 temp; + .reg .pred pred; + @pred bra SET_RM; + @!pred bra SET_RZ; +SET_RM: + add.rm.f32 temp, temp, temp; + ret; +SET_RZ: + add.rz.f32 temp, temp, temp; + ret; +} diff --git a/ptx/src/pass/instruction_mode_to_global_mode/fold_denormal.ptx b/ptx/src/pass/instruction_mode_to_global_mode/fold_denormal.ptx new file mode 100644 index 0000000..1fa161a --- /dev/null +++ b/ptx/src/pass/instruction_mode_to_global_mode/fold_denormal.ptx @@ -0,0 +1,15 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry add() +{ + .reg .f32 temp<3>; + + add.ftz.f16 temp2, temp1, temp0; + add.ftz.f32 temp2, temp1, temp0; + + add.f16 temp2, temp1, temp0; + add.f32 temp2, temp1, temp0; + ret; +} diff --git a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs new file mode 100644 index 0000000..c2b9672 --- /dev/null +++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs @@ -0,0 +1,1907 @@ +use super::BrachCondition; +use super::Directive2; +use super::Function2; +use super::GlobalStringIdentResolver2; +use 
super::ModeRegister; +use super::SpirvWord; +use super::Statement; +use super::TranslateError; +use crate::pass::error_unreachable; +use microlp::OptimizationDirection; +use microlp::Problem; +use microlp::Variable; +use petgraph::graph::NodeIndex; +use petgraph::visit::IntoNodeReferences; +use petgraph::Direction; +use petgraph::Graph; +use ptx_parser as ast; +use rustc_hash::FxHashMap; +use rustc_hash::FxHashSet; +use std::hash::Hash; +use std::iter; +use std::mem; +use strum::EnumCount; +use strum_macros::{EnumCount, VariantArray}; +use unwrap_or::unwrap_some_or; + +#[derive(Default, PartialEq, Eq, Clone, Copy, Debug, VariantArray, EnumCount)] +enum DenormalMode { + #[default] + FlushToZero, + Preserve, +} + +impl DenormalMode { + fn from_ftz(ftz: bool) -> Self { + if ftz { + DenormalMode::FlushToZero + } else { + DenormalMode::Preserve + } + } + + fn to_ftz(self) -> bool { + match self { + DenormalMode::FlushToZero => true, + DenormalMode::Preserve => false, + } + } +} + +impl Into for DenormalMode { + fn into(self) -> bool { + self.to_ftz() + } +} + +impl Into for DenormalMode { + fn into(self) -> usize { + self as usize + } +} + +#[derive(Default, PartialEq, Eq, Clone, Copy, Debug, VariantArray, EnumCount)] +enum RoundingMode { + #[default] + NearestEven, + Zero, + NegativeInf, + PositiveInf, +} + +impl RoundingMode { + fn to_ast(self) -> ast::RoundingMode { + match self { + RoundingMode::NearestEven => ast::RoundingMode::NearestEven, + RoundingMode::Zero => ast::RoundingMode::Zero, + RoundingMode::NegativeInf => ast::RoundingMode::NegativeInf, + RoundingMode::PositiveInf => ast::RoundingMode::PositiveInf, + } + } + + fn from_ast(rnd: ast::RoundingMode) -> Self { + match rnd { + ast::RoundingMode::NearestEven => RoundingMode::NearestEven, + ast::RoundingMode::Zero => RoundingMode::Zero, + ast::RoundingMode::NegativeInf => RoundingMode::NegativeInf, + ast::RoundingMode::PositiveInf => RoundingMode::PositiveInf, + } + } +} + +impl Into for RoundingMode { + fn 
into(self) -> ast::RoundingMode { + self.to_ast() + } +} + +impl Into for RoundingMode { + fn into(self) -> usize { + self as usize + } +} + +struct InstructionModes { + denormal_f32: Option, + denormal_f16f64: Option, + rounding_f32: Option, + rounding_f16f64: Option, +} + +struct ResolvedInstructionModes { + denormal_f32: Resolved, + denormal_f16f64: Resolved, + rounding_f32: Resolved, + rounding_f16f64: Resolved, +} + +impl InstructionModes { + fn fold_into(self, entry: &mut Self, exit: &mut Self) { + fn set_if_none(source: &mut Option, value: Option) { + match (*source, value) { + (None, Some(x)) => *source = Some(x), + _ => {} + } + } + fn set_if_any(source: &mut Option, value: Option) { + if let Some(x) = value { + *source = Some(x); + } + } + set_if_none(&mut entry.denormal_f32, self.denormal_f32); + set_if_none(&mut entry.denormal_f16f64, self.denormal_f16f64); + set_if_none(&mut entry.rounding_f32, self.rounding_f32); + set_if_none(&mut entry.rounding_f16f64, self.rounding_f16f64); + set_if_any(&mut exit.denormal_f32, self.denormal_f32); + set_if_any(&mut exit.denormal_f16f64, self.denormal_f16f64); + set_if_any(&mut exit.rounding_f32, self.rounding_f32); + set_if_any(&mut exit.rounding_f16f64, self.rounding_f16f64); + } + + fn none() -> Self { + Self { + denormal_f32: None, + denormal_f16f64: None, + rounding_f32: None, + rounding_f16f64: None, + } + } + + fn new( + type_: ast::ScalarType, + denormal: Option, + rounding: Option, + ) -> Self { + if type_ != ast::ScalarType::F32 { + Self { + denormal_f16f64: denormal, + rounding_f16f64: rounding, + ..Self::none() + } + } else { + Self { + denormal_f32: denormal, + rounding_f32: rounding, + ..Self::none() + } + } + } + + fn mixed_ftz_f32( + type_: ast::ScalarType, + denormal: Option, + rounding: Option, + ) -> Self { + if type_ != ast::ScalarType::F32 { + Self { + denormal_f16f64: denormal, + rounding_f32: rounding, + ..Self::none() + } + } else { + Self { + denormal_f32: denormal, + rounding_f32: rounding, 
+ ..Self::none() + } + } + } + + fn from_arith_float(arith: &ast::ArithFloat) -> InstructionModes { + let denormal = arith.flush_to_zero.map(DenormalMode::from_ftz); + let rounding = Some(RoundingMode::from_ast(arith.rounding)); + InstructionModes::new(arith.type_, denormal, rounding) + } + + fn from_ftz(type_: ast::ScalarType, ftz: Option) -> Self { + Self::new(type_, ftz.map(DenormalMode::from_ftz), None) + } + + fn from_ftz_f32(ftz: bool) -> Self { + Self::new( + ast::ScalarType::F32, + Some(DenormalMode::from_ftz(ftz)), + None, + ) + } + + fn from_rcp(data: ast::RcpData) -> InstructionModes { + let rounding = match data.kind { + ast::RcpKind::Approx => None, + ast::RcpKind::Compliant(rnd) => Some(RoundingMode::from_ast(rnd)), + }; + let denormal = data.flush_to_zero.map(DenormalMode::from_ftz); + InstructionModes::new(data.type_, denormal, rounding) + } + + fn from_cvt(cvt: &ast::CvtDetails) -> InstructionModes { + match cvt.mode { + ast::CvtMode::ZeroExtend + | ast::CvtMode::SignExtend + | ast::CvtMode::Truncate + | ast::CvtMode::Bitcast + | ast::CvtMode::SaturateUnsignedToSigned + | ast::CvtMode::SaturateSignedToUnsigned => Self::none(), + ast::CvtMode::FPExtend { flush_to_zero } => { + Self::from_ftz(ast::ScalarType::F32, flush_to_zero) + } + ast::CvtMode::FPTruncate { + rounding, + flush_to_zero, + } + | ast::CvtMode::FPRound { + integer_rounding: rounding, + flush_to_zero, + } => Self::mixed_ftz_f32( + cvt.to, + flush_to_zero.map(DenormalMode::from_ftz), + Some(RoundingMode::from_ast(rounding)), + ), + // float to int contains rounding field, but it's not a rounding + // mode but rather round-to-int operation that will be applied + ast::CvtMode::SignedFromFP { flush_to_zero, .. } + | ast::CvtMode::UnsignedFromFP { flush_to_zero, .. 
} => { + Self::new(cvt.from, flush_to_zero.map(DenormalMode::from_ftz), None) + } + ast::CvtMode::FPFromSigned(rnd) | ast::CvtMode::FPFromUnsigned(rnd) => { + Self::new(cvt.to, None, Some(RoundingMode::from_ast(rnd))) + } + } + } +} + +struct ControlFlowGraph { + entry_points: FxHashMap, + basic_blocks: FxHashMap, + // map function -> return label + call_returns: FxHashMap>, + // map function -> return basic block + functions_rets: FxHashMap, + graph: Graph, +} + +impl ControlFlowGraph { + fn new() -> Self { + Self { + entry_points: FxHashMap::default(), + basic_blocks: FxHashMap::default(), + call_returns: FxHashMap::default(), + functions_rets: FxHashMap::default(), + graph: Graph::new(), + } + } + + fn add_entry_basic_block(&mut self, label: SpirvWord) -> NodeIndex { + let idx = self.graph.add_node(Node::entry(label)); + assert_eq!(self.entry_points.insert(label, idx), None); + idx + } + + fn get_or_add_basic_block(&mut self, label: SpirvWord) -> NodeIndex { + self.basic_blocks.get(&label).copied().unwrap_or_else(|| { + let idx = self.graph.add_node(Node::new(label)); + self.basic_blocks.insert(label, idx); + idx + }) + } + + fn add_jump(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex { + let to = self.get_or_add_basic_block(to); + self.graph.add_edge(from, to, ()); + to + } + + fn set_modes(&mut self, node: NodeIndex, entry: InstructionModes, exit: InstructionModes) { + let node = &mut self.graph[node]; + node.denormal_f32.entry = entry.denormal_f32.map(ExtendedMode::BasicBlock); + node.denormal_f16f64.entry = entry.denormal_f16f64.map(ExtendedMode::BasicBlock); + node.rounding_f32.entry = entry.rounding_f32.map(ExtendedMode::BasicBlock); + node.rounding_f16f64.entry = entry.rounding_f16f64.map(ExtendedMode::BasicBlock); + node.denormal_f32.exit = exit.denormal_f32.map(ExtendedMode::BasicBlock); + node.denormal_f16f64.exit = exit.denormal_f16f64.map(ExtendedMode::BasicBlock); + node.rounding_f32.exit = exit.rounding_f32.map(ExtendedMode::BasicBlock); + 
node.rounding_f16f64.exit = exit.rounding_f16f64.map(ExtendedMode::BasicBlock); + } + + // Our control flow graph expresses function calls as edges in the graph. + // While building the graph it's always possible to create the edge from + // caller basic block to a function, but it's impossible to construct an + // edge from the function return basic block to after-call basic block in + // caller (the function might have been just a declaration for now). + // That's why we collect: + // * Which basic blocks does a function return to + // * What are the function's return basic blocks + // and then, after visiting all functions, we add the missing edges here + fn fixup_function_calls(&mut self) -> Result<(), TranslateError> { + for (fn_, follow_on_labels) in self.call_returns.iter() { + let connecting_bb = match self.functions_rets.get(fn_) { + Some(return_bb) => *return_bb, + // function is just a declaration + None => *self.basic_blocks.get(fn_).ok_or_else(error_unreachable)?, + }; + for follow_on_label in follow_on_labels { + self.graph.add_edge(connecting_bb, *follow_on_label, ()); + } + } + Ok(()) + } +} + +struct ResolvedControlFlowGraph { + basic_blocks: FxHashMap, + // map function -> return basic block + functions_rets: FxHashMap, + graph: Graph, +} + +impl ResolvedControlFlowGraph { + // This function takes the initial control flow graph. Initial control flow + // graph only has mode values for basic blocks if any instruction in the + // given basic block requires a mode. All the other basic blocks have no + // value. This pass resolves the values for all basic blocks. If a basic + // block sets no value and there are multiple incoming edges from + // basic blocks with different values then the value is set to a special + // value "Conflict".
+ // After this pass every basic block either has a concrete value or "Conflict" + fn new( + cfg: ControlFlowGraph, + f32_denormal_kernels: &FxHashMap, + f16f64_denormal_kernels: &FxHashMap, + f32_rounding_kernels: &FxHashMap, + f16f64_rounding_kernels: &FxHashMap, + ) -> Result { + fn get_incoming_mode( + cfg: &ControlFlowGraph, + kernels: &FxHashMap, + node: NodeIndex, + mut exit_getter: impl FnMut(&Node) -> Option>, + ) -> Result, TranslateError> { + let mut mode: Option = None; + let mut visited = iter::once(node).collect::>(); + let mut to_visit = cfg + .graph + .neighbors_directed(node, Direction::Incoming) + .map(|x| x) + .collect::>(); + while let Some(node) = to_visit.pop() { + if !visited.insert(node) { + continue; + } + let node_data = &cfg.graph[node]; + match (mode, exit_getter(node_data)) { + (_, None) => { + for next in cfg.graph.neighbors_directed(node, Direction::Incoming) { + if !visited.contains(&next) { + to_visit.push(next); + } + } + } + (existing_mode, Some(new_mode)) => { + let new_mode = match new_mode { + ExtendedMode::BasicBlock(new_mode) => new_mode, + ExtendedMode::Entry(kernel) => { + kernels.get(&kernel).copied().unwrap_or_default() + } + }; + if let Some(existing_mode) = existing_mode { + if existing_mode != new_mode { + return Ok(Resolved::Conflict); + } + } + mode = Some(new_mode); + } + } + } + // This should happen only for orphaned basic blocks + mode.map(Resolved::Value).ok_or_else(error_unreachable) + } + fn resolve_mode( + cfg: &ControlFlowGraph, + kernels: &FxHashMap, + node: NodeIndex, + exit_getter: impl FnMut(&Node) -> Option>, + mode: &Mode, + ) -> Result, TranslateError> { + let entry = match mode.entry { + Some(ExtendedMode::Entry(kernel)) => { + Resolved::Value(kernels.get(&kernel).copied().unwrap_or_default()) + } + Some(ExtendedMode::BasicBlock(bb)) => Resolved::Value(bb), + None => get_incoming_mode(cfg, kernels, node, exit_getter)?, + }; + let exit = match mode.entry { + Some(ExtendedMode::BasicBlock(bb)) => 
Resolved::Value(bb), + Some(ExtendedMode::Entry(_)) | None => entry, + }; + Ok(ResolvedMode { entry, exit }) + } + fn resolve_node_impl( + cfg: &ControlFlowGraph, + f32_denormal_kernels: &FxHashMap, + f16f64_denormal_kernels: &FxHashMap, + f32_rounding_kernels: &FxHashMap, + f16f64_rounding_kernels: &FxHashMap, + index: NodeIndex, + node: &Node, + ) -> Result { + let denormal_f32 = resolve_mode( + cfg, + f32_denormal_kernels, + index, + |node| node.denormal_f32.exit, + &node.denormal_f32, + )?; + let denormal_f16f64 = resolve_mode( + cfg, + f16f64_denormal_kernels, + index, + |node| node.denormal_f16f64.exit, + &node.denormal_f16f64, + )?; + let rounding_f32 = resolve_mode( + cfg, + f32_rounding_kernels, + index, + |node| node.rounding_f32.exit, + &node.rounding_f32, + )?; + let rounding_f16f64 = resolve_mode( + cfg, + f16f64_rounding_kernels, + index, + |node| node.rounding_f16f64.exit, + &node.rounding_f16f64, + )?; + Ok(ResolvedNode { + label: node.label, + denormal_f32, + denormal_f16f64, + rounding_f32, + rounding_f16f64, + }) + } + fn resolve_node( + cfg: &ControlFlowGraph, + f32_denormal_kernels: &FxHashMap, + f16f64_denormal_kernels: &FxHashMap, + f32_rounding_kernels: &FxHashMap, + f16f64_rounding_kernels: &FxHashMap, + index: NodeIndex, + node: &Node, + error: &mut bool, + ) -> ResolvedNode { + match resolve_node_impl( + cfg, + f32_denormal_kernels, + f16f64_denormal_kernels, + f32_rounding_kernels, + f16f64_rounding_kernels, + index, + node, + ) { + Ok(node) => node, + Err(_) => { + *error = true; + ResolvedNode { + label: SpirvWord(u32::MAX), + denormal_f32: ResolvedMode { + entry: Resolved::Conflict, + exit: Resolved::Conflict, + }, + denormal_f16f64: ResolvedMode { + entry: Resolved::Conflict, + exit: Resolved::Conflict, + }, + rounding_f32: ResolvedMode { + entry: Resolved::Conflict, + exit: Resolved::Conflict, + }, + rounding_f16f64: ResolvedMode { + entry: Resolved::Conflict, + exit: Resolved::Conflict, + }, + } + } + } + } + let mut error = false; 
+ let graph = cfg.graph.map( + |index, node| { + resolve_node( + &cfg, + f32_denormal_kernels, + f16f64_denormal_kernels, + f32_rounding_kernels, + f16f64_rounding_kernels, + index, + node, + &mut error, + ) + }, + |_, ()| (), + ); + if error { + Err(error_unreachable()) + } else { + Ok(Self { + basic_blocks: cfg.basic_blocks, + functions_rets: cfg.functions_rets, + graph, + }) + } + } +} + +#[derive(Clone, Copy)] +//#[cfg_attr(test, derive(Debug))] +#[derive(Debug)] +struct Mode { + entry: Option>, + exit: Option>, +} + +impl Mode { + fn new() -> Self { + Self { + entry: None, + exit: None, + } + } + + fn entry(label: SpirvWord) -> Self { + Self { + entry: Some(ExtendedMode::Entry(label)), + exit: Some(ExtendedMode::Entry(label)), + } + } +} + +#[derive(Copy, Clone)] +struct ResolvedMode { + entry: Resolved, + exit: Resolved, +} + +//#[cfg_attr(test, derive(Debug))] +#[derive(Debug)] +struct Node { + label: SpirvWord, + denormal_f32: Mode, + denormal_f16f64: Mode, + rounding_f32: Mode, + rounding_f16f64: Mode, +} + +struct ResolvedNode { + label: SpirvWord, + denormal_f32: ResolvedMode, + denormal_f16f64: ResolvedMode, + rounding_f32: ResolvedMode, + rounding_f16f64: ResolvedMode, +} + +impl Node { + fn entry(label: SpirvWord) -> Self { + Self { + label, + denormal_f32: Mode::entry(label), + denormal_f16f64: Mode::entry(label), + rounding_f32: Mode::entry(label), + rounding_f16f64: Mode::entry(label), + } + } + + fn new(label: SpirvWord) -> Self { + Self { + label, + denormal_f32: Mode::new(), + denormal_f16f64: Mode::new(), + rounding_f32: Mode::new(), + rounding_f16f64: Mode::new(), + } + } +} + +// This pass converts instruction-scoped modes (denormal, rounding) in PTX +// to globally-scoped modes as expected by AMD GPUs.
+// As a simplified example this pass converts this instruction: +// add.ftz.rn.f32 %r1, %r2, %r3; +// to: +// set_ftz_mode true; +// set_rnd_mode rn; +// add.ftz.rn.f32 %r1, %r2, %r3; +pub(crate) fn run<'input>( + flat_resolver: &mut GlobalStringIdentResolver2<'input>, + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { + let cfg = create_control_flow_graph(&directives)?; + let (denormal_f32, denormal_f16f64, rounding_f32, rounding_f16f64) = + compute_minimal_mode_insertions(&cfg); + let temp = compute_full_mode_insertions( + flat_resolver, + &directives, + cfg, + denormal_f32, + denormal_f16f64, + rounding_f32, + rounding_f16f64, + )?; + apply_global_mode_controls(directives, temp) +} + +// For every basic block this pass computes: +// - Name of mode prologue basic blocks. Mode prologue is a basic block which +// contains a single instruction that sets mode to the desired value. It will +// be later inserted just before the basic block and all jumps that require +// mode change will go through this basic block +// - Entry mode: what is the mode for both f32 and f16f64 at the first instruction. +// This will be used when emitting instructions in the basic block. When we +// emit an instruction we get its modes, check if they are different and if so +// decide: do we emit a new mode set statement or fold into the previous mode set. +// We don't need to compute exit mode for every basic block because this will be +// computed naturally when emitting instructions in a basic block. +// The only exception is exit mode for returning (containing instruction `ret;`) +// basic blocks for functions. +// We need this information to handle call instructions correctly.
+fn compute_full_mode_insertions( + flat_resolver: &mut GlobalStringIdentResolver2, + directives: &Vec, SpirvWord>>, + cfg: ControlFlowGraph, + denormal_f32: MandatoryModeInsertions, + denormal_f16f64: MandatoryModeInsertions, + rounding_f32: MandatoryModeInsertions, + rounding_f16f64: MandatoryModeInsertions, +) -> Result { + let cfg = ResolvedControlFlowGraph::new( + cfg, + &denormal_f32.kernels, + &denormal_f16f64.kernels, + &rounding_f32.kernels, + &rounding_f16f64.kernels, + )?; + join_modes( + flat_resolver, + directives, + cfg, + denormal_f32, + denormal_f16f64, + rounding_f32, + rounding_f16f64, + ) +} + +// This function takes the control flow graph and for each global mode computes: +// * Which basic blocks have an incoming edge from at least one basic block with +// different mode. That means that we will later need to insert a mode +// "prologue": an artificial basic block which sets the mode to the desired +// value. All mode-changing edges will be redirected to that basic block +// * What is the initial value for the mode in a kernel. Note that this only +// computes the initial value if the value is observed by a basic block.
+// For some kernels the initial value does not matter and in that case a later +// pass should use default value +fn compute_minimal_mode_insertions( + cfg: &ControlFlowGraph, +) -> ( + MandatoryModeInsertions, + MandatoryModeInsertions, + MandatoryModeInsertions, + MandatoryModeInsertions, +) { + let rounding_f32 = compute_single_mode_insertions(cfg, |node| node.rounding_f32); + let denormal_f32 = compute_single_mode_insertions(cfg, |node| node.denormal_f32); + let denormal_f16f64 = compute_single_mode_insertions(cfg, |node| node.denormal_f16f64); + let rounding_f16f64 = compute_single_mode_insertions(cfg, |node| node.rounding_f16f64); + let denormal_f32 = + optimize_mode_insertions::(denormal_f32); + let denormal_f16f64 = + optimize_mode_insertions::(denormal_f16f64); + let rounding_f32 = + optimize_mode_insertions::(rounding_f32); + let rounding_f16f64: MandatoryModeInsertions = + optimize_mode_insertions::(rounding_f16f64); + (denormal_f32, denormal_f16f64, rounding_f32, rounding_f16f64) +} + +// This function creates control flow graph for the whole module. This control +// flow graph expresses function calls as edges in the control flow graph +fn create_control_flow_graph( + directives: &Vec, SpirvWord>>, +) -> Result { + let mut cfg = ControlFlowGraph::new(); + for directive in directives.iter() { + match directive { + super::Directive2::Method(Function2 { + name, + body: Some(body), + is_kernel, + .. + }) => { + let (mut bb_state, mut body_iter) = + BasicBlockState::new(&mut cfg, *name, body, *is_kernel)?; + while let Some(statement) = body_iter.next() { + match statement { + Statement::Instruction(ast::Instruction::Bra { arguments }) => { + bb_state.end(&[arguments.src]); + } + Statement::Instruction(ast::Instruction::Call { + arguments: ast::CallArgs { func, .. }, + .. 
+ }) => { + let after_call_label = match body_iter.next() { + Some(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src }, + })) => *src, + _ => return Err(error_unreachable()), + }; + bb_state.record_call(*func, after_call_label)?; + } + Statement::RetValue(..) + | Statement::Instruction(ast::Instruction::Ret { .. }) => { + if !is_kernel { + bb_state.record_ret(*name)?; + } + } + Statement::Label(label) => { + bb_state.start(*label); + } + Statement::Conditional(BrachCondition { + if_true, if_false, .. + }) => { + bb_state.end(&[*if_true, *if_false]); + } + Statement::Instruction(instruction) => { + let modes = get_modes(instruction); + bb_state.append(modes); + } + _ => {} + } + } + } + _ => {} + } + } + cfg.fixup_function_calls()?; + Ok(cfg) +} + +fn join_modes( + flat_resolver: &mut super::GlobalStringIdentResolver2, + directives: &Vec, super::SpirvWord>>, + cfg: ResolvedControlFlowGraph, + mandatory_denormal_f32: MandatoryModeInsertions, + mandatory_denormal_f16f64: MandatoryModeInsertions, + mandatory_rounding_f32: MandatoryModeInsertions, + mandatory_rounding_f16f64: MandatoryModeInsertions, +) -> Result { + let basic_blocks = cfg + .graph + .node_weights() + .map(|basic_block| { + let denormal_prologue = if mandatory_denormal_f32 + .basic_blocks + .contains(&basic_block.label) + || mandatory_denormal_f16f64 + .basic_blocks + .contains(&basic_block.label) + { + Some(flat_resolver.register_unnamed(None)) + } else { + None + }; + let rounding_prologue = if mandatory_rounding_f32 + .basic_blocks + .contains(&basic_block.label) + || mandatory_rounding_f16f64 + .basic_blocks + .contains(&basic_block.label) + { + Some(flat_resolver.register_unnamed(None)) + } else { + None + }; + let dual_prologue = if denormal_prologue.is_some() && rounding_prologue.is_some() { + Some(flat_resolver.register_unnamed(None)) + } else { + None + }; + let denormal = BasicBlockEntryState { + prologue: denormal_prologue, + twin_mode: TwinMode { + f32: 
basic_block.denormal_f32.entry, + f16f64: basic_block.denormal_f16f64.entry, + }, + }; + let rounding = BasicBlockEntryState { + prologue: rounding_prologue, + twin_mode: TwinMode { + f32: basic_block.rounding_f32.entry, + f16f64: basic_block.rounding_f16f64.entry, + }, + }; + Ok(( + basic_block.label, + FullBasicBlockEntryState { + dual_prologue, + denormal, + rounding, + }, + )) + }) + .collect::, _>>()?; + let functions_exit_modes = directives + .iter() + .filter_map(|directive| match directive { + Directive2::Method(Function2 { + name, + body: None, + is_kernel: false, + .. + }) => { + let fn_bb = match cfg.basic_blocks.get(name) { + Some(bb) => bb, + None => return None, + }; + let weights = cfg.graph.node_weight(*fn_bb).unwrap(); + let modes = ResolvedInstructionModes { + denormal_f32: weights.denormal_f32.exit.map(DenormalMode::to_ftz), + denormal_f16f64: weights.denormal_f16f64.exit.map(DenormalMode::to_ftz), + rounding_f32: weights.rounding_f32.exit.map(RoundingMode::to_ast), + rounding_f16f64: weights.rounding_f16f64.exit.map(RoundingMode::to_ast), + }; + Some(Ok((*name, modes))) + } + Directive2::Method(Function2 { + name, + body: Some(_), + is_kernel: false, + .. 
+ }) => { + let ret_bb = cfg.functions_rets.get(name).unwrap(); + let weights = cfg.graph.node_weight(*ret_bb).unwrap(); + let modes = ResolvedInstructionModes { + denormal_f32: weights.denormal_f32.exit.map(DenormalMode::to_ftz), + denormal_f16f64: weights.denormal_f16f64.exit.map(DenormalMode::to_ftz), + rounding_f32: weights.rounding_f32.exit.map(RoundingMode::to_ast), + rounding_f16f64: weights.rounding_f16f64.exit.map(RoundingMode::to_ast), + }; + Some(Ok((*name, modes))) + } + _ => None, + }) + .collect::, _>>()?; + Ok(FullModeInsertion { + basic_blocks, + functions_exit_modes, + }) +} + +struct FullModeInsertion { + basic_blocks: FxHashMap, + functions_exit_modes: FxHashMap, +} + +struct FullBasicBlockEntryState { + dual_prologue: Option, + denormal: BasicBlockEntryState, + rounding: BasicBlockEntryState, +} + +#[derive(Clone, Copy)] +struct BasicBlockEntryState { + prologue: Option, + twin_mode: TwinMode>, +} + +#[derive(Clone, Copy)] +struct TwinMode { + f32: T, + f16f64: T, +} + +// This function goes through every method, every basic block, every instruction +// and based on computed information inserts: +// * Instructions that change global mode +// * Insert additional "prelude" basic blocks that sets mode +// * Redirect some jumps to "prelude" basic blocks +fn apply_global_mode_controls( + directives: Vec, SpirvWord>>, + global_modes: FullModeInsertion, +) -> Result, SpirvWord>>, TranslateError> { + directives + .into_iter() + .map(|directive| { + let (mut method, initial_mode) = match directive { + Directive2::Variable(..) | Directive2::Method(Function2 { body: None, .. }) => { + return Ok(directive); + } + Directive2::Method( + mut method @ Function2 { + name, + body: Some(_), + .. 
+ }, + ) => { + let initial_mode = global_modes + .basic_blocks + .get(&name) + .ok_or_else(error_unreachable)?; + let denormal_mode = initial_mode.denormal.twin_mode; + let rounding_mode = initial_mode.rounding.twin_mode; + method.flush_to_zero_f32 = + denormal_mode.f32.ok_or_else(error_unreachable)?.to_ftz(); + method.flush_to_zero_f16f64 = + denormal_mode.f16f64.ok_or_else(error_unreachable)?.to_ftz(); + method.rounding_mode_f32 = + rounding_mode.f32.ok_or_else(error_unreachable)?.to_ast(); + method.rounding_mode_f16f64 = + rounding_mode.f16f64.ok_or_else(error_unreachable)?.to_ast(); + (method, initial_mode) + } + }; + check_function_prelude(&method, &global_modes)?; + let old_body = method.body.take().unwrap(); + let mut result = Vec::with_capacity(old_body.len()); + let mut bb_state = BasicBlockControlState::new(&global_modes, initial_mode); + let mut old_body = old_body.into_iter(); + while let Some(mut statement) = old_body.next() { + let mut call_target = None; + match &mut statement { + Statement::Label(label) => { + bb_state.start(*label, &mut result)?; + } + Statement::Instruction(ast::Instruction::Call { + arguments: ast::CallArgs { func, .. }, + .. + }) => { + bb_state.redirect_jump(func)?; + call_target = Some(*func); + } + Statement::Conditional(BrachCondition { + if_true, if_false, .. 
+ }) => { + bb_state.redirect_jump(if_true)?; + bb_state.redirect_jump(if_false)?; + } + Statement::Instruction(ast::Instruction::Bra { + arguments: ptx_parser::BraArgs { src }, + }) => { + bb_state.redirect_jump(src)?; + } + Statement::Instruction(instruction) => { + let modes = get_modes(&instruction); + bb_state.insert(&mut result, modes)?; + } + _ => {} + } + result.push(statement); + if let Some(call_target) = call_target { + let mut post_call_bra = old_body.next().ok_or_else(error_unreachable)?; + if let Statement::Instruction(ast::Instruction::Bra { + arguments: + ast::BraArgs { + src: ref mut post_call_label, + }, + }) = post_call_bra + { + let node_exit_mode = global_modes + .functions_exit_modes + .get(&call_target) + .ok_or_else(error_unreachable)?; + redirect_jump_impl( + &bb_state.global_modes, + node_exit_mode, + post_call_label, + )?; + result.push(post_call_bra); + } else { + return Err(error_unreachable()); + } + } + } + method.body = Some(result); + Ok(Directive2::Method(method)) + }) + .collect::, _>>() +} + +fn check_function_prelude( + method: &Function2, SpirvWord>, + global_modes: &FullModeInsertion, +) -> Result<(), TranslateError> { + let fn_mode_state = global_modes + .basic_blocks + .get(&method.name) + .ok_or_else(error_unreachable)?; + // A function should never have a prelude. Preludes happen only if there + // is an edge in the control flow graph that requires a mode change. 
+ // Since functions never have a mode setting instructions that means they + // only pass the mode from incoming edges to outgoing edges + if fn_mode_state.dual_prologue.is_some() + || fn_mode_state.denormal.prologue.is_some() + || fn_mode_state.rounding.prologue.is_some() + { + return Err(error_unreachable()); + } + Ok(()) +} + +struct BasicBlockControlState<'a> { + global_modes: &'a FullModeInsertion, + denormal_f32: RegisterState, + denormal_f16f64: RegisterState, + rounding_f32: RegisterState, + rounding_f16f64: RegisterState, +} + +#[derive(Clone, Copy)] +struct RegisterState { + current_value: Resolved, + // This is slightly subtle: this value is Some iff there's a SetMode in this + // basic block setting this mode, but on which no instruciton relies + last_foldable: Option, +} + +impl RegisterState { + fn new(value: Resolved) -> RegisterState + where + U: Into, + { + RegisterState { + current_value: value.map(Into::into), + last_foldable: None, + } + } +} + +impl<'a> BasicBlockControlState<'a> { + fn new(global_modes: &'a FullModeInsertion, initial_mode: &FullBasicBlockEntryState) -> Self { + let denormal_f32 = RegisterState::new(initial_mode.denormal.twin_mode.f32); + let denormal_f16f64 = RegisterState::new(initial_mode.denormal.twin_mode.f16f64); + let rounding_f32 = RegisterState::new(initial_mode.rounding.twin_mode.f32); + let rounding_f16f64 = RegisterState::new(initial_mode.rounding.twin_mode.f16f64); + BasicBlockControlState { + global_modes, + denormal_f32, + denormal_f16f64, + rounding_f32, + rounding_f16f64, + } + } + + fn start( + &mut self, + basic_block: SpirvWord, + statements: &mut Vec, SpirvWord>>, + ) -> Result<(), TranslateError> { + let bb_state = self + .global_modes + .basic_blocks + .get(&basic_block) + .ok_or_else(error_unreachable)?; + + let denormal_f32 = RegisterState::new(bb_state.denormal.twin_mode.f32); + let denormal_f16f64 = RegisterState::new(bb_state.denormal.twin_mode.f16f64); + self.denormal_f32 = denormal_f32; + 
self.denormal_f16f64 = denormal_f16f64; + let rounding_f32 = RegisterState::new(bb_state.rounding.twin_mode.f32); + let rounding_f16f64 = RegisterState::new(bb_state.rounding.twin_mode.f16f64); + self.rounding_f32 = rounding_f32; + self.rounding_f16f64 = rounding_f16f64; + if let Some(prologue) = bb_state.dual_prologue { + statements.push(Statement::Label(prologue)); + statements.push(Statement::SetMode(ModeRegister::Denormal { + f32: bb_state.denormal.twin_mode.f32.unwrap_or_default().to_ftz(), + f16f64: bb_state + .denormal + .twin_mode + .f16f64 + .unwrap_or_default() + .to_ftz(), + })); + statements.push(Statement::SetMode(ModeRegister::Rounding { + f32: bb_state.rounding.twin_mode.f32.unwrap_or_default().to_ast(), + f16f64: bb_state + .rounding + .twin_mode + .f16f64 + .unwrap_or_default() + .to_ast(), + })); + statements.push(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: basic_block }, + })); + } + if let Some(prologue) = bb_state.denormal.prologue { + statements.push(Statement::Label(prologue)); + statements.push(Statement::SetMode(ModeRegister::Denormal { + f32: bb_state.denormal.twin_mode.f32.unwrap_or_default().to_ftz(), + f16f64: bb_state + .denormal + .twin_mode + .f16f64 + .unwrap_or_default() + .to_ftz(), + })); + statements.push(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: basic_block }, + })); + } + if let Some(prologue) = bb_state.rounding.prologue { + statements.push(Statement::Label(prologue)); + statements.push(Statement::SetMode(ModeRegister::Rounding { + f32: bb_state.rounding.twin_mode.f32.unwrap_or_default().to_ast(), + f16f64: bb_state + .rounding + .twin_mode + .f16f64 + .unwrap_or_default() + .to_ast(), + })); + statements.push(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: basic_block }, + })); + } + Ok(()) + } + + fn insert( + &mut self, + result: &mut Vec, SpirvWord>>, + modes: InstructionModes, + ) -> Result<(), TranslateError> { + 
self.insert_one::(result, modes.denormal_f32.map(DenormalMode::to_ftz))?; + self.insert_one::( + result, + modes.denormal_f16f64.map(DenormalMode::to_ftz), + )?; + self.insert_one::(result, modes.rounding_f32.map(RoundingMode::to_ast))?; + self.insert_one::( + result, + modes.rounding_f16f64.map(RoundingMode::to_ast), + )?; + Ok(()) + } + + fn insert_one( + &mut self, + result: &mut Vec, SpirvWord>>, + mode: Option, + ) -> Result<(), TranslateError> { + fn set_fold_index(bb: &mut BasicBlockControlState, index: Option) { + let mut reg = View::get_register(bb); + reg.last_foldable = index; + View::set_register(bb, reg); + } + let new_mode = unwrap_some_or!(mode, return Ok(())); + let register_state = View::get_register(self); + match register_state.current_value { + Resolved::Conflict => { + return Err(error_unreachable()); + } + Resolved::Value(old) if old == new_mode => { + set_fold_index::(self, None); + } + _ => match register_state.last_foldable { + // fold successful + Some(index) => { + if let Some(Statement::SetMode(mode_set)) = result.get_mut(index) { + View::set_single_mode(mode_set, new_mode)?; + set_fold_index::(self, None); + } else { + return Err(error_unreachable()); + } + } + // fold failed, insert new instruction + None => { + result.push(Statement::SetMode(View::new_mode( + new_mode, + View::TwinView::get_register(self) + .current_value + .unwrap_or(View::ComputeValue::default().into()), + ))); + View::set_register( + self, + RegisterState { + current_value: Resolved::Value(new_mode), + last_foldable: None, + }, + ); + set_fold_index::(self, Some(result.len() - 1)); + } + }, + } + Ok(()) + } + + fn redirect_jump(&self, jump_target: &mut SpirvWord) -> Result<(), TranslateError> { + let current_mode = ResolvedInstructionModes { + denormal_f32: self.denormal_f32.current_value, + denormal_f16f64: self.denormal_f16f64.current_value, + rounding_f32: self.rounding_f32.current_value, + rounding_f16f64: self.rounding_f16f64.current_value, + }; + 
redirect_jump_impl(self.global_modes, ¤t_mode, jump_target) + } +} + +fn redirect_jump_impl( + global_modes: &FullModeInsertion, + current_mode: &ResolvedInstructionModes, + jump_target: &mut SpirvWord, +) -> Result<(), TranslateError> { + let target = global_modes + .basic_blocks + .get(jump_target) + .ok_or_else(error_unreachable)?; + let jump_to_denormal_prelude = current_mode + .denormal_f32 + .mode_change(target.denormal.twin_mode.f32.map(DenormalMode::to_ftz)) + || current_mode + .denormal_f16f64 + .mode_change(target.denormal.twin_mode.f16f64.map(DenormalMode::to_ftz)); + let jump_to_rounding_prelude = current_mode + .rounding_f32 + .mode_change(target.rounding.twin_mode.f32.map(RoundingMode::to_ast)) + || current_mode + .rounding_f16f64 + .mode_change(target.rounding.twin_mode.f16f64.map(RoundingMode::to_ast)); + match (jump_to_denormal_prelude, jump_to_rounding_prelude) { + (true, false) => { + *jump_target = target.denormal.prologue.ok_or_else(error_unreachable)?; + } + (false, true) => { + *jump_target = target.rounding.prologue.ok_or_else(error_unreachable)?; + } + (true, true) => { + *jump_target = target.dual_prologue.ok_or_else(error_unreachable)?; + } + (false, false) => {} + } + Ok(()) +} + +#[derive(Copy, Clone)] +enum Resolved { + Conflict, + Value(T), +} + +impl Resolved { + fn unwrap_or_default(self) -> T { + match self { + Resolved::Conflict => T::default(), + Resolved::Value(t) => t, + } + } +} + +impl Resolved { + fn mode_change(self, target: Self) -> bool { + match (self, target) { + (Resolved::Conflict, Resolved::Conflict) => false, + (Resolved::Conflict, Resolved::Value(_)) => true, + (Resolved::Value(_), Resolved::Conflict) => false, + (Resolved::Value(x), Resolved::Value(y)) => x != y, + } + } +} + +impl Resolved { + fn unwrap_or(self, if_fail: T) -> T { + match self { + Resolved::Conflict => if_fail, + Resolved::Value(t) => t, + } + } + + fn map(self, f: F) -> Resolved + where + F: FnOnce(T) -> U, + { + match self { + 
Resolved::Value(x) => Resolved::Value(f(x)), + Resolved::Conflict => Resolved::Conflict, + } + } + + fn ok_or_else(self, err: F) -> Result + where + F: FnOnce() -> E, + { + match self { + Resolved::Value(v) => Ok(v), + Resolved::Conflict => Err(err()), + } + } +} + +trait ModeView { + type ComputeValue: Default + Into; + type Value: PartialEq + Eq + Copy + Clone; + type TwinView: ModeView; + + fn get_register(bb: &BasicBlockControlState) -> RegisterState; + fn set_register(bb: &mut BasicBlockControlState, reg: RegisterState); + fn new_mode(t: Self::Value, other: Self::Value) -> ModeRegister; + fn set_single_mode(reg: &mut ModeRegister, x: Self::Value) -> Result<(), TranslateError>; +} + +struct DenormalF32View; + +impl ModeView for DenormalF32View { + type ComputeValue = DenormalMode; + type Value = bool; + type TwinView = DenormalF16F64View; + + fn get_register(bb: &BasicBlockControlState) -> RegisterState { + bb.denormal_f32 + } + + fn set_register(bb: &mut BasicBlockControlState, reg: RegisterState) { + bb.denormal_f32 = reg; + } + + fn new_mode(f32: Self::Value, f16f64: Self::Value) -> ModeRegister { + ModeRegister::Denormal { f32, f16f64 } + } + + fn set_single_mode(reg: &mut ModeRegister, x: Self::Value) -> Result<(), TranslateError> { + match reg { + ModeRegister::Denormal { f32, f16f64: _ } => *f32 = x, + ModeRegister::Rounding { .. 
} => return Err(error_unreachable()), + } + Ok(()) + } +} + +struct DenormalF16F64View; + +impl ModeView for DenormalF16F64View { + type ComputeValue = DenormalMode; + type Value = bool; + type TwinView = DenormalF32View; + + fn get_register(bb: &BasicBlockControlState) -> RegisterState { + bb.denormal_f16f64 + } + + fn set_register(bb: &mut BasicBlockControlState, reg: RegisterState) { + bb.denormal_f16f64 = reg; + } + + fn new_mode(f16f64: Self::Value, f32: Self::Value) -> ModeRegister { + ModeRegister::Denormal { f32, f16f64 } + } + + fn set_single_mode(reg: &mut ModeRegister, x: Self::Value) -> Result<(), TranslateError> { + match reg { + ModeRegister::Denormal { f32: _, f16f64 } => *f16f64 = x, + ModeRegister::Rounding { .. } => return Err(error_unreachable()), + } + Ok(()) + } +} + +struct RoundingF32View; + +impl ModeView for RoundingF32View { + type ComputeValue = RoundingMode; + type Value = ast::RoundingMode; + type TwinView = RoundingF16F64View; + + fn get_register(bb: &BasicBlockControlState) -> RegisterState { + bb.rounding_f32 + } + + fn set_register(bb: &mut BasicBlockControlState, reg: RegisterState) { + bb.rounding_f32 = reg; + } + + fn new_mode(f32: Self::Value, f16f64: Self::Value) -> ModeRegister { + ModeRegister::Rounding { f32, f16f64 } + } + + fn set_single_mode(reg: &mut ModeRegister, x: Self::Value) -> Result<(), TranslateError> { + match reg { + ModeRegister::Rounding { f32, f16f64: _ } => *f32 = x, + ModeRegister::Denormal { .. 
} => return Err(error_unreachable()), + } + Ok(()) + } +} + +struct RoundingF16F64View; + +impl ModeView for RoundingF16F64View { + type ComputeValue = RoundingMode; + type Value = ast::RoundingMode; + type TwinView = RoundingF32View; + + fn get_register(bb: &BasicBlockControlState) -> RegisterState { + bb.rounding_f16f64 + } + + fn set_register(bb: &mut BasicBlockControlState, reg: RegisterState) { + bb.rounding_f16f64 = reg; + } + + fn new_mode(f16f64: Self::Value, f32: Self::Value) -> ModeRegister { + ModeRegister::Rounding { f32, f16f64 } + } + + fn set_single_mode(reg: &mut ModeRegister, x: Self::Value) -> Result<(), TranslateError> { + match reg { + ModeRegister::Rounding { f32: _, f16f64 } => *f16f64 = x, + ModeRegister::Denormal { .. } => return Err(error_unreachable()), + } + Ok(()) + } +} + +struct BasicBlockState<'a> { + cfg: &'a mut ControlFlowGraph, + node_index: Option, + // If it's a kernel basic block then we don't track entry instruction mode + entry: InstructionModes, + exit: InstructionModes, +} + +impl<'a> BasicBlockState<'a> { + #[must_use] + fn new<'x>( + cfg: &'a mut ControlFlowGraph, + fn_name: SpirvWord, + body: &'x Vec, SpirvWord>>, + is_kernel: bool, + ) -> Result< + ( + BasicBlockState<'a>, + std::iter::Peekable< + impl Iterator, SpirvWord>>, + >, + ), + TranslateError, + > { + let entry_index = if is_kernel { + cfg.add_entry_basic_block(fn_name) + } else { + cfg.get_or_add_basic_block(fn_name) + }; + let mut body_iter = body.iter(); + let mut bb_state = Self { + cfg, + node_index: None, + entry: InstructionModes::none(), + exit: InstructionModes::none(), + }; + match body_iter.next() { + Some(Statement::Label(label)) => { + bb_state.cfg.add_jump(entry_index, *label); + bb_state.start(*label); + } + _ => return Err(error_unreachable()), + }; + Ok((bb_state, body_iter.peekable())) + } + + fn start(&mut self, label: SpirvWord) { + self.end(&[]); + self.node_index = Some(self.cfg.get_or_add_basic_block(label)); + } + + fn end(&mut self, 
jumps: &[SpirvWord]) -> Option { + let node_index = self.node_index.take(); + let node_index = match node_index { + Some(x) => x, + None => return None, + }; + for target in jumps { + self.cfg.add_jump(node_index, *target); + } + self.cfg.set_modes( + node_index, + mem::replace(&mut self.entry, InstructionModes::none()), + mem::replace(&mut self.exit, InstructionModes::none()), + ); + Some(node_index) + } + + fn record_call( + &mut self, + fn_call: SpirvWord, + after_call_label: SpirvWord, + ) -> Result<(), TranslateError> { + self.end(&[fn_call]).ok_or_else(error_unreachable)?; + let after_call_label = self.cfg.get_or_add_basic_block(after_call_label); + let call_returns = self + .cfg + .call_returns + .entry(fn_call) + .or_insert_with(|| Vec::new()); + call_returns.push(after_call_label); + Ok(()) + } + + fn record_ret(&mut self, fn_name: SpirvWord) -> Result<(), TranslateError> { + let node_index = self.node_index.ok_or_else(error_unreachable)?; + let previous_function_ret = self.cfg.functions_rets.insert(fn_name, node_index); + // This pass relies on there being only a single `ret;` in a function + if previous_function_ret.is_some() { + return Err(error_unreachable()); + } + Ok(()) + } + + fn append(&mut self, modes: InstructionModes) { + modes.fold_into(&mut self.entry, &mut self.exit); + } +} + +impl<'a> Drop for BasicBlockState<'a> { + fn drop(&mut self) { + self.end(&[]); + } +} + +fn compute_single_mode_insertions( + graph: &ControlFlowGraph, + mut getter: impl FnMut(&Node) -> Mode, +) -> PartialModeInsertion { + let mut must_insert_mode = FxHashSet::::default(); + let mut maybe_insert_mode = FxHashMap::default(); + let mut remaining = graph + .graph + .node_references() + .rev() + .filter_map(|(index, node)| { + getter(node) + .entry + .as_ref() + .map(|mode| match mode { + ExtendedMode::BasicBlock(mode) => Some((index, node.label, *mode)), + ExtendedMode::Entry(_) => None, + }) + .flatten() + }) + .collect::>(); + 'next_basic_block: while let 
Some((index, node_id, expected_mode)) = remaining.pop() { + let mut to_visit = + UniqueVec::new(graph.graph.neighbors_directed(index, Direction::Incoming)); + let mut visited = FxHashSet::default(); + while let Some(current) = to_visit.pop() { + if !visited.insert(current) { + continue; + } + let exit_mode = getter(graph.graph.node_weight(current).unwrap()).exit; + match exit_mode { + None => { + for predecessor in graph.graph.neighbors_directed(current, Direction::Incoming) + { + if !visited.contains(&predecessor) { + to_visit.push(predecessor); + } + } + } + Some(ExtendedMode::BasicBlock(mode)) => { + if mode != expected_mode { + maybe_insert_mode.remove(&node_id); + must_insert_mode.insert(node_id); + continue 'next_basic_block; + } + } + Some(ExtendedMode::Entry(kernel)) => match maybe_insert_mode.entry(node_id) { + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert((expected_mode, iter::once(kernel).collect::>())); + } + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.get_mut().1.insert(kernel); + } + }, + } + } + } + PartialModeInsertion { + bb_must_insert_mode: must_insert_mode, + bb_maybe_insert_mode: maybe_insert_mode, + } +} + +#[derive(Debug)] +struct PartialModeInsertion { + bb_must_insert_mode: FxHashSet, + bb_maybe_insert_mode: FxHashMap)>, +} + +// Only returns kernel mode insertions if a kernel is relevant to the optimization problem +fn optimize_mode_insertions< + T: Copy + Into + strum::VariantArray + std::fmt::Debug + Default, + const N: usize, +>( + partial: PartialModeInsertion, +) -> MandatoryModeInsertions { + let mut problem = Problem::new(OptimizationDirection::Maximize); + let mut kernel_modes = FxHashMap::default(); + let basic_block_variables = partial + .bb_maybe_insert_mode + .into_iter() + .map(|(basic_block, (value, entry_points))| { + let modes = entry_points + .iter() + .map(|entry_point| { + let kernel_modes = kernel_modes + .entry(*entry_point) + .or_insert_with(|| one_of::(&mut problem)); + 
kernel_modes[value.into()] + }) + .collect::>(); + let bb = and(&mut problem, &*modes); + (basic_block, bb) + }) + .collect::>(); + // TODO: add fallback on Error + let solution = problem.solve().unwrap(); + let mut basic_blocks = partial.bb_must_insert_mode; + for (basic_block, variable) in basic_block_variables { + if solution[variable] < 0.5 { + basic_blocks.insert(basic_block); + } + } + let mut kernels = FxHashMap::default(); + 'iterate_kernels: for (kernel, modes) in kernel_modes { + for (mode, var) in modes.into_iter().enumerate() { + if solution[var] > 0.5 { + kernels.insert(kernel, T::VARIANTS[mode]); + continue 'iterate_kernels; + } + } + } + MandatoryModeInsertions { + basic_blocks, + kernels, + } +} + +fn and(problem: &mut Problem, variables: &[Variable]) -> Variable { + let result = problem.add_binary_var(1.0); + for var in variables { + problem.add_constraint( + &[(result, 1.0), (*var, -1.0)], + microlp::ComparisonOp::Le, + 0.0, + ); + } + problem.add_constraint( + iter::once((result, 1.0)).chain(variables.iter().map(|var| (*var, -1.0))), + microlp::ComparisonOp::Ge, + -((variables.len() - 1) as f64), + ); + result +} + +fn one_of(problem: &mut Problem) -> [Variable; N] { + let result = std::array::from_fn(|_| problem.add_binary_var(0.0)); + problem.add_constraint( + result.into_iter().map(|var| (var, 1.0)), + microlp::ComparisonOp::Eq, + 1.0, + ); + result +} + +struct MandatoryModeInsertions { + basic_blocks: FxHashSet, + kernels: FxHashMap, +} + +#[derive(Eq, PartialEq, Clone, Copy)] +//#[cfg_attr(test, derive(Debug))] +#[derive(Debug)] +enum ExtendedMode { + BasicBlock(T), + Entry(SpirvWord), +} + +struct UniqueVec { + set: FxHashSet, + vec: Vec, +} + +impl UniqueVec { + fn new(iter: impl Iterator) -> Self { + let mut set = FxHashSet::default(); + let mut vec = Vec::new(); + for item in iter { + if set.contains(&item) { + continue; + } + set.insert(item); + vec.push(item); + } + Self { set, vec } + } + + fn pop(&mut self) -> Option { + if let 
Some(t) = self.vec.pop() { + assert!(self.set.remove(&t)); + Some(t) + } else { + None + } + } + + fn push(&mut self, t: T) -> bool { + if self.set.insert(t) { + self.vec.push(t); + true + } else { + false + } + } +} + +fn get_modes(inst: &ast::Instruction) -> InstructionModes { + match inst { + // TODO: review it when implementing virtual calls + ast::Instruction::Call { .. } + | ast::Instruction::Mov { .. } + | ast::Instruction::Ld { .. } + | ast::Instruction::St { .. } + | ast::Instruction::PrmtSlow { .. } + | ast::Instruction::Prmt { .. } + | ast::Instruction::Activemask { .. } + | ast::Instruction::Membar { .. } + | ast::Instruction::Trap {} + | ast::Instruction::Not { .. } + | ast::Instruction::Or { .. } + | ast::Instruction::And { .. } + | ast::Instruction::Bra { .. } + | ast::Instruction::Clz { .. } + | ast::Instruction::Brev { .. } + | ast::Instruction::Popc { .. } + | ast::Instruction::Xor { .. } + | ast::Instruction::Rem { .. } + | ast::Instruction::Bfe { .. } + | ast::Instruction::Bfi { .. } + | ast::Instruction::Shr { .. } + | ast::Instruction::Shl { .. } + | ast::Instruction::Selp { .. } + | ast::Instruction::Ret { .. } + | ast::Instruction::Bar { .. } + | ast::Instruction::Cvta { .. } + | ast::Instruction::Atom { .. } + | ast::Instruction::Mul24 { .. } + | ast::Instruction::AtomCas { .. } => InstructionModes::none(), + ast::Instruction::Add { + data: ast::ArithDetails::Integer(_), + .. + } + | ast::Instruction::Sub { + data: ast::ArithDetails::Integer(..), + .. + } + | ast::Instruction::Mul { + data: ast::MulDetails::Integer { .. }, + .. + } + | ast::Instruction::Mad { + data: ast::MadDetails::Integer { .. }, + .. + } + | ast::Instruction::Min { + data: ast::MinMaxDetails::Signed(..) | ast::MinMaxDetails::Unsigned(..), + .. + } + | ast::Instruction::Max { + data: ast::MinMaxDetails::Signed(..) | ast::MinMaxDetails::Unsigned(..), + .. + } + | ast::Instruction::Div { + data: ast::DivDetails::Signed(..) | ast::DivDetails::Unsigned(..), + .. 
+ } => InstructionModes::none(), + ast::Instruction::Fma { data, .. } + | ast::Instruction::Sub { + data: ast::ArithDetails::Float(data), + .. + } + | ast::Instruction::Mul { + data: ast::MulDetails::Float(data), + .. + } + | ast::Instruction::Mad { + data: ast::MadDetails::Float(data), + .. + } + | ast::Instruction::Add { + data: ast::ArithDetails::Float(data), + .. + } => InstructionModes::from_arith_float(data), + ast::Instruction::Setp { + data: + ast::SetpData { + type_, + flush_to_zero, + .. + }, + .. + } + | ast::Instruction::SetpBool { + data: + ast::SetpBoolData { + base: + ast::SetpData { + type_, + flush_to_zero, + .. + }, + .. + }, + .. + } + | ast::Instruction::Neg { + data: ast::TypeFtz { + type_, + flush_to_zero, + }, + .. + } + | ast::Instruction::Ex2 { + data: ast::TypeFtz { + type_, + flush_to_zero, + }, + .. + } + | ast::Instruction::Rsqrt { + data: ast::TypeFtz { + type_, + flush_to_zero, + }, + .. + } + | ast::Instruction::Abs { + data: ast::TypeFtz { + type_, + flush_to_zero, + }, + .. + } + | ast::Instruction::Min { + data: + ast::MinMaxDetails::Float(ast::MinMaxFloat { + type_, + flush_to_zero, + .. + }), + .. + } + | ast::Instruction::Max { + data: + ast::MinMaxDetails::Float(ast::MinMaxFloat { + type_, + flush_to_zero, + .. + }), + .. + } + | ast::Instruction::Div { + data: + ast::DivDetails::Float(ast::DivFloatDetails { + type_, + flush_to_zero, + .. + }), + .. + } => InstructionModes::from_ftz(*type_, *flush_to_zero), + ast::Instruction::Sin { data, .. } + | ast::Instruction::Cos { data, .. } + | ast::Instruction::Lg2 { data, .. } => InstructionModes::from_ftz_f32(data.flush_to_zero), + ast::Instruction::Rcp { data, .. } | ast::Instruction::Sqrt { data, .. } => { + InstructionModes::from_rcp(*data) + } + ast::Instruction::Cvt { data, .. 
} => InstructionModes::from_cvt(data), + } +} + +#[cfg(test)] +mod test; diff --git a/ptx/src/pass/instruction_mode_to_global_mode/test.rs b/ptx/src/pass/instruction_mode_to_global_mode/test.rs new file mode 100644 index 0000000..78d1d66 --- /dev/null +++ b/ptx/src/pass/instruction_mode_to_global_mode/test.rs @@ -0,0 +1,399 @@ +use super::*; +use int_enum::IntEnum; +use strum::EnumCount; + +#[repr(usize)] +#[derive(IntEnum, Eq, PartialEq, Copy, Clone, Debug)] +enum Bool { + False = 0, + True = 1, +} + +fn ftz() -> InstructionModes { + InstructionModes { + denormal_f32: Some(DenormalMode::FlushToZero), + denormal_f16f64: None, + rounding_f32: None, + rounding_f16f64: None, + } +} + +fn preserve() -> InstructionModes { + InstructionModes { + denormal_f32: Some(DenormalMode::Preserve), + denormal_f16f64: None, + rounding_f32: None, + rounding_f16f64: None, + } +} + +#[test] +fn transitive_mixed() { + let mut graph = ControlFlowGraph::new(); + let entry_id = SpirvWord(1); + let false_id = SpirvWord(2); + let empty_id = SpirvWord(3); + let false2_id = SpirvWord(4); + let entry = graph.add_entry_basic_block(entry_id); + graph.add_jump(entry, false_id); + let false_ = graph.get_or_add_basic_block(false_id); + graph.set_modes(false_, ftz(), ftz()); + graph.add_jump(false_, empty_id); + let empty = graph.get_or_add_basic_block(empty_id); + graph.add_jump(empty, false2_id); + let false2_ = graph.get_or_add_basic_block(false2_id); + graph.set_modes(false2_, ftz(), ftz()); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); + assert_eq!(partial_result.bb_must_insert_mode.len(), 0); + assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1); + assert_eq!( + partial_result.bb_maybe_insert_mode[&false_id], + (DenormalMode::FlushToZero, iter::once(entry_id).collect()) + ); + + let result = optimize_mode_insertions::(partial_result); + assert_eq!(result.basic_blocks.len(), 0); + assert_eq!(result.kernels.len(), 1); + 
assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero); +} + +#[test] +fn transitive_change_twice() { + let mut graph = ControlFlowGraph::new(); + let entry_id = SpirvWord(1); + let false_id = SpirvWord(2); + let empty_id = SpirvWord(3); + let true_id = SpirvWord(4); + let entry = graph.add_entry_basic_block(entry_id); + graph.add_jump(entry, false_id); + let false_ = graph.get_or_add_basic_block(false_id); + graph.set_modes(false_, ftz(), ftz()); + graph.add_jump(false_, empty_id); + let empty = graph.get_or_add_basic_block(empty_id); + graph.add_jump(empty, true_id); + let true_ = graph.get_or_add_basic_block(true_id); + graph.set_modes(true_, preserve(), preserve()); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); + assert_eq!(partial_result.bb_must_insert_mode.len(), 1); + assert!(partial_result.bb_must_insert_mode.contains(&true_id)); + assert_eq!(partial_result.bb_maybe_insert_mode.len(), 1); + assert_eq!( + partial_result.bb_maybe_insert_mode[&false_id], + (DenormalMode::FlushToZero, iter::once(entry_id).collect()) + ); + + let result = optimize_mode_insertions::(partial_result); + assert_eq!(result.basic_blocks, iter::once(true_id).collect()); + assert_eq!(result.kernels.len(), 1); + assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero); +} + +#[test] +fn transitive_change() { + let mut graph = ControlFlowGraph::new(); + let entry_id = SpirvWord(1); + let empty_id = SpirvWord(2); + let true_id = SpirvWord(3); + let entry = graph.add_entry_basic_block(entry_id); + graph.add_jump(entry, empty_id); + let empty = graph.get_or_add_basic_block(empty_id); + graph.add_jump(empty, true_id); + let true_ = graph.get_or_add_basic_block(true_id); + graph.set_modes(true_, preserve(), preserve()); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); + assert_eq!(partial_result.bb_must_insert_mode.len(), 0); + assert_eq!(partial_result.bb_maybe_insert_mode.len(), 
1); + assert_eq!( + partial_result.bb_maybe_insert_mode[&true_id], + (DenormalMode::Preserve, iter::once(entry_id).collect()) + ); + + let result = optimize_mode_insertions::(partial_result); + assert_eq!(result.basic_blocks.len(), 0); + assert_eq!(result.kernels.len(), 1); + assert_eq!(result.kernels[&entry_id], DenormalMode::Preserve); +} + +#[test] +fn codependency() { + let mut graph = ControlFlowGraph::new(); + let entry_id = SpirvWord(1); + let left_f_id = SpirvWord(2); + let right_f_id = SpirvWord(3); + let left_none_id = SpirvWord(4); + let mid_none_id = SpirvWord(5); + let right_none_id = SpirvWord(6); + let entry = graph.add_entry_basic_block(entry_id); + graph.add_jump(entry, left_f_id); + graph.add_jump(entry, right_f_id); + let left_f = graph.get_or_add_basic_block(left_f_id); + graph.set_modes(left_f, ftz(), ftz()); + let right_f = graph.get_or_add_basic_block(right_f_id); + graph.set_modes(right_f, ftz(), ftz()); + graph.add_jump(left_f, left_none_id); + let left_none = graph.get_or_add_basic_block(left_none_id); + graph.add_jump(right_f, right_none_id); + let right_none = graph.get_or_add_basic_block(right_none_id); + graph.add_jump(left_none, mid_none_id); + graph.add_jump(right_none, mid_none_id); + let mid_none = graph.get_or_add_basic_block(mid_none_id); + graph.add_jump(mid_none, left_none_id); + graph.add_jump(mid_none, right_none_id); + //println!( + // "{:?}", + // petgraph::dot::Dot::with_config(&graph.graph, &[petgraph::dot::Config::EdgeNoLabel]) + //); + let partial_result = super::compute_single_mode_insertions(&graph, |node| node.denormal_f32); + assert_eq!(partial_result.bb_must_insert_mode.len(), 0); + assert_eq!(partial_result.bb_maybe_insert_mode.len(), 2); + assert_eq!( + partial_result.bb_maybe_insert_mode[&left_f_id], + (DenormalMode::FlushToZero, iter::once(entry_id).collect()) + ); + assert_eq!( + partial_result.bb_maybe_insert_mode[&right_f_id], + (DenormalMode::FlushToZero, iter::once(entry_id).collect()) + ); + + let result 
= optimize_mode_insertions::(partial_result); + assert_eq!(result.basic_blocks.len(), 0); + assert_eq!(result.kernels.len(), 1); + assert_eq!(result.kernels[&entry_id], DenormalMode::FlushToZero); +} + +static FOLD_DENORMAL_PTX: &'static str = include_str!("fold_denormal.ptx"); + +#[test] +fn fold_denormal() { + let method = compile_methods(FOLD_DENORMAL_PTX).pop().unwrap(); + assert_eq!(true, method.flush_to_zero_f32); + assert_eq!(true, method.flush_to_zero_f16f64); + let method_body = method.body.unwrap(); + assert!(matches!( + &*method_body, + [ + Statement::Label(..), + Statement::Variable(..), + Statement::Variable(..), + Statement::Variable(..), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::SetMode(ModeRegister::Denormal { + f32: false, + f16f64: false + }), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Ret { .. 
}), + ] + )); +} + +fn compile_methods(ptx: &str) -> Vec, SpirvWord>> { + use crate::pass::*; + + let module = ptx_parser::parse_module_checked(ptx).unwrap(); + let mut flat_resolver = GlobalStringIdentResolver2::new(SpirvWord(1)); + let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver); + let directives = normalize_identifiers2::run(&mut scoped_resolver, module.directives).unwrap(); + let directives = normalize_predicates2::run(&mut flat_resolver, directives).unwrap(); + let directives = expand_operands::run(&mut flat_resolver, directives).unwrap(); + let directives = normalize_basic_blocks::run(&mut flat_resolver, directives).unwrap(); + let directives = super::run(&mut flat_resolver, directives).unwrap(); + directives + .into_iter() + .filter_map(|s| match s { + Directive2::Method(m) => Some(m), + _ => None, + }) + .collect::>() +} + +static CALL_WITH_MODE_PTX: &'static str = include_str!("call_with_mode.ptx"); + +#[test] +fn call_with_mode() { + let methods = compile_methods(CALL_WITH_MODE_PTX); + + assert!(matches!(methods[0].body, None)); + + let method_1 = methods[1].body.as_ref().unwrap(); + assert!(matches!( + &**method_1, + [ + Statement::Label(..), + Statement::Variable(..), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Call { .. }), + Statement::Instruction(ast::Instruction::Bra { .. }), + Statement::Label(..), + // Dual prelude + Statement::SetMode(ModeRegister::Denormal { + f32: true, + f16f64: true + }), + Statement::SetMode(ModeRegister::Rounding { + f32: ast::RoundingMode::PositiveInf, + f16f64: ast::RoundingMode::NearestEven + }), + Statement::Instruction(ast::Instruction::Bra { .. }), + // Denormal prelude + Statement::Label(..), + Statement::SetMode(ModeRegister::Denormal { + f32: true, + f16f64: true + }), + Statement::Instruction(ast::Instruction::Bra { .. 
}), + // Rounding prelude + Statement::Label(..), + Statement::SetMode(ModeRegister::Rounding { + f32: ast::RoundingMode::PositiveInf, + f16f64: ast::RoundingMode::NearestEven + }), + Statement::Instruction(ast::Instruction::Bra { .. }), + Statement::Label(..), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Ret { .. }), + ] + )); + let [to_fn0] = calls(method_1); + let [_, dual_prelude, _, _, add] = labels(method_1); + let [post_call, post_prelude_dual, post_prelude_denormal, post_prelude_rounding] = + branches(method_1); + assert_eq!(methods[0].name, to_fn0); + assert_eq!(post_call, dual_prelude); + assert_eq!(post_prelude_dual, add); + assert_eq!(post_prelude_denormal, add); + assert_eq!(post_prelude_rounding, add); + + let method_2 = methods[2].body.as_ref().unwrap(); + assert!(matches!( + &**method_2, + [ + Statement::Label(..), + Statement::Variable(..), + Statement::Variable(..), + Statement::Conditional(..), + Statement::Label(..), + Statement::Conditional(..), + Statement::Label(..), + Statement::Instruction(ast::Instruction::Bra { .. }), + Statement::Label(..), + // Dual prelude + Statement::SetMode(ModeRegister::Denormal { + f32: false, + f16f64: true + }), + Statement::SetMode(ModeRegister::Rounding { + f32: ast::RoundingMode::NegativeInf, + f16f64: ast::RoundingMode::NearestEven + }), + Statement::Instruction(ast::Instruction::Bra { .. }), + // Denormal prelude + Statement::Label(..), + Statement::SetMode(ModeRegister::Denormal { + f32: false, + f16f64: true + }), + Statement::Instruction(ast::Instruction::Bra { .. }), + // Rounding prelude + Statement::Label(..), + Statement::SetMode(ModeRegister::Rounding { + f32: ast::RoundingMode::NegativeInf, + f16f64: ast::RoundingMode::NearestEven + }), + Statement::Instruction(ast::Instruction::Bra { .. }), + Statement::Label(..), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Bra { .. 
}), + Statement::Label(..), + Statement::SetMode(ModeRegister::Denormal { + f32: false, + f16f64: true + }), + Statement::Instruction(ast::Instruction::Bra { .. }), + Statement::Label(..), + Statement::Instruction(ast::Instruction::Add { .. }), + Statement::Instruction(ast::Instruction::Bra { .. }), + Statement::Label(..), + Statement::Instruction(ast::Instruction::Ret { .. }), + ] + )); + let [(if_rm_true, if_rm_false), (if_rz_true, if_rz_false)] = conditionals(method_2); + let [_, conditional2, post_conditional2, prelude_dual, _, _, add1, add2_set_denormal, add2, ret] = + labels(method_2); + let [post_conditional2_jump, post_prelude_dual, post_prelude_denormal, post_prelude_rounding, post_add1, post_add2_set_denormal, post_add2] = + branches(method_2); + assert_eq!(if_rm_true, prelude_dual); + assert_eq!(if_rm_false, conditional2); + assert_eq!(if_rz_true, post_conditional2); + assert_eq!(if_rz_false, add2_set_denormal); + assert_eq!(post_conditional2_jump, prelude_dual); + assert_eq!(post_prelude_dual, add1); + assert_eq!(post_prelude_denormal, add1); + assert_eq!(post_prelude_rounding, add1); + assert_eq!(post_add1, ret); + assert_eq!(post_add2_set_denormal, add2); + assert_eq!(post_add2, ret); +} + +fn branches( + fn_: &Vec, SpirvWord>>, +) -> [SpirvWord; N] { + fn_.iter() + .filter_map(|s| match s { + Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src }, + }) => Some(*src), + _ => None, + }) + .collect::>() + .try_into() + .unwrap() +} + +fn labels( + fn_: &Vec, SpirvWord>>, +) -> [SpirvWord; N] { + fn_.iter() + .filter_map( + |s: &Statement, SpirvWord>| match s { + Statement::Label(label) => Some(*label), + _ => None, + }, + ) + .collect::>() + .try_into() + .unwrap() +} + +fn calls( + fn_: &Vec, SpirvWord>>, +) -> [SpirvWord; N] { + fn_.iter() + .filter_map(|s| match s { + Statement::Instruction(ast::Instruction::Call { + arguments: ast::CallArgs { func, .. }, + .. 
+ }) => Some(*func), + _ => None, + }) + .collect::>() + .try_into() + .unwrap() +} + +fn conditionals( + fn_: &Vec, SpirvWord>>, +) -> [(SpirvWord, SpirvWord); N] { + fn_.iter() + .filter_map(|s| match s { + Statement::Conditional(BrachCondition { + if_true, if_false, .. + }) => Some((*if_true, *if_false)), + _ => None, + }) + .collect::>() + .try_into() + .unwrap() +} diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index f11a381..77d7e60 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -17,12 +17,15 @@ mod expand_operands; mod fix_special_registers2; mod hoist_globals; mod insert_explicit_load_store; +mod instruction_mode_to_global_mode; mod insert_implicit_conversions2; +mod normalize_basic_blocks; mod normalize_identifiers2; mod normalize_predicates2; +mod remove_unreachable_basic_blocks; mod replace_instructions_with_function_calls; -mod resolve_function_pointers; mod replace_known_functions; +mod resolve_function_pointers; static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc"); const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl_"; @@ -43,12 +46,15 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result>, ptx_parser::ParsedOperand>> = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?; + let directives = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?; let directives = expand_operands::run(&mut flat_resolver, directives)?; let directives = deparamize_functions::run(&mut flat_resolver, directives)?; + let directives = normalize_basic_blocks::run(&mut flat_resolver, directives)?; + let directives = remove_unreachable_basic_blocks::run(directives)?; + let directives = instruction_mode_to_global_mode::run(&mut flat_resolver, directives)?; let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?; let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?; let directives = 
replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?; @@ -195,6 +201,20 @@ enum Statement { FunctionPointer(FunctionPointerDetails), VectorRead(VectorRead), VectorWrite(VectorWrite), + SetMode(ModeRegister), +} + +#[derive(Eq, PartialEq, Clone, Copy)] +#[cfg_attr(test, derive(Debug))] +enum ModeRegister { + Denormal { + f32: bool, + f16f64: bool, + }, + Rounding { + f32: ast::RoundingMode, + f16f64: ast::RoundingMode, + }, } impl> Statement, T> { @@ -467,6 +487,7 @@ impl> Statement, T> { let src = visitor.visit_ident(src, None, false, false)?; Statement::FunctionPointer(FunctionPointerDetails { dst, src }) } + Statement::SetMode(mode_register) => Statement::SetMode(mode_register), }) } } @@ -525,7 +546,7 @@ struct FunctionPointerDetails { src: SpirvWord, } -#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)] pub struct SpirvWord(u32); impl From for SpirvWord { @@ -557,22 +578,27 @@ type NormalizedStatement = Statement< ast::ParsedOperand, >; -enum Directive2<'input, Instruction, Operand: ast::Operand> { +enum Directive2 { Variable(ast::LinkingDirective, ast::Variable), - Method(Function2<'input, Instruction, Operand>), + Method(Function2), } -struct Function2<'input, Instruction, Operand: ast::Operand> { - pub func_decl: ast::MethodDeclaration<'input, SpirvWord>, - pub globals: Vec>, +struct Function2 { + pub return_arguments: Vec>, + pub name: Operand::Ident, + pub input_arguments: Vec>, pub body: Option>>, + is_kernel: bool, import_as: Option, tuning: Vec, linkage: ast::LinkingDirective, + flush_to_zero_f32: bool, + flush_to_zero_f16f64: bool, + rounding_mode_f32: ast::RoundingMode, + rounding_mode_f16f64: ast::RoundingMode, } -type NormalizedDirective2<'input> = Directive2< - 'input, +type NormalizedDirective2 = Directive2< ( Option>, ast::Instruction>, @@ -580,8 +606,7 @@ type NormalizedDirective2<'input> = Directive2< ast::ParsedOperand, >; -type 
NormalizedFunction2<'input> = Function2< - 'input, +type NormalizedFunction2 = Function2< ( Option>, ast::Instruction>, @@ -589,17 +614,11 @@ type NormalizedFunction2<'input> = Function2< ast::ParsedOperand, >; -type UnconditionalDirective<'input> = Directive2< - 'input, - ast::Instruction>, - ast::ParsedOperand, ->; +type UnconditionalDirective = + Directive2>, ast::ParsedOperand>; -type UnconditionalFunction<'input> = Function2< - 'input, - ast::Instruction>, - ast::ParsedOperand, ->; +type UnconditionalFunction = + Function2>, ast::ParsedOperand>; struct GlobalStringIdentResolver2<'input> { pub(crate) current_id: SpirvWord, @@ -805,47 +824,45 @@ impl SpecialRegistersMap2 { self.id_to_reg.get(&id).copied() } - fn generate_declarations<'a, 'input>( + fn len() -> usize { + PtxSpecialRegister::iter().len() + } + + fn foreach_declaration<'a, 'input>( resolver: &'a mut GlobalStringIdentResolver2<'input>, - ) -> impl ExactSizeIterator< - Item = ( + mut fn_: impl FnMut( PtxSpecialRegister, - ast::MethodDeclaration<'input, SpirvWord>, + ( + Vec>, + SpirvWord, + Vec>, + ), ), - > + 'a { - PtxSpecialRegister::iter().map(|sreg| { + ) { + for sreg in PtxSpecialRegister::iter() { let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat(); - let name = - ast::MethodName::Func(resolver.register_named(Cow::Owned(external_fn_name), None)); + let name = resolver.register_named(Cow::Owned(external_fn_name), None); let return_type = sreg.get_function_return_type(); let input_type = sreg.get_function_input_type(); - ( - sreg, - ast::MethodDeclaration { - return_arguments: vec![ast::Variable { - align: None, - v_type: return_type.into(), - state_space: ast::StateSpace::Reg, - name: resolver - .register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))), - array_init: Vec::new(), - }], - name: name, - input_arguments: input_type - .into_iter() - .map(|type_| ast::Variable { - align: None, - v_type: type_.into(), - state_space: ast::StateSpace::Reg, - 
name: resolver - .register_unnamed(Some((type_.into(), ast::StateSpace::Reg))), - array_init: Vec::new(), - }) - .collect::>(), - shared_mem: None, - }, - ) - }) + let return_arguments = vec![ast::Variable { + align: None, + v_type: return_type.into(), + state_space: ast::StateSpace::Reg, + name: resolver.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))), + array_init: Vec::new(), + }]; + let input_arguments = input_type + .into_iter() + .map(|type_| ast::Variable { + align: None, + v_type: type_.into(), + state_space: ast::StateSpace::Reg, + name: resolver.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))), + array_init: Vec::new(), + }) + .collect::>(); + fn_(sreg, (return_arguments, name, input_arguments)); + } } } diff --git a/ptx/src/pass/normalize_basic_blocks.rs b/ptx/src/pass/normalize_basic_blocks.rs new file mode 100644 index 0000000..920cf21 --- /dev/null +++ b/ptx/src/pass/normalize_basic_blocks.rs @@ -0,0 +1,134 @@ +use super::*; + +// This pass normalizes ptx modules in two ways that makes mode computation pass +// and code emissions passes much simpler: +// * Inserts label at the start of every function +// This makes control flow graph simpler in mode computation block: we can +// represent kernels as separate nodes with its own separate entry/exit mode +// * Inserts label at the start of every basic block +// * Insert explicit jumps before labels +// * Non-.entry methods get a single `ret;` exit point - this is because mode computation +// logic requires it. Control flow graph constructed by mode computation +// models function calls as jumps into and then from another function. +// If this cfg allowed multiple return basic blocks then there would be cases +// where we want to insert mode setting instruction along the edge between +// `ret;` and bb in the caller. 
This is only possible if there's a single +// edge between from function `ret;` and caller +pub(crate) fn run( + flat_resolver: &mut GlobalStringIdentResolver2<'_>, + mut directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { + for directive in directives.iter_mut() { + let (body_ref, is_kernel) = match directive { + Directive2::Method(Function2 { + body: Some(body), is_kernel, .. + }) => (body, *is_kernel), + _ => continue, + }; + let body = std::mem::replace(body_ref, Vec::new()); + let mut result = Vec::with_capacity(body.len()); + let mut previous_instruction_was_terminator = TerminatorKind::Not; + let mut body_iterator = body.into_iter(); + let mut return_statements = Vec::new(); + match body_iterator.next() { + Some(Statement::Label(_)) => {} + Some(statement) => { + result.push(Statement::Label(flat_resolver.register_unnamed(None))); + result.push(statement); + } + None => {} + } + for statement in body_iterator { + match previous_instruction_was_terminator { + TerminatorKind::Not => match statement { + Statement::Label(label) => { + result.push(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: label }, + })) + } + _ => {} + }, + TerminatorKind::Real => { + if !matches!(statement, Statement::Label(..)) { + result.push(Statement::Label(flat_resolver.register_unnamed(None))); + } + } + TerminatorKind::Fake => match statement { + // If there's a label after a call just reuse it + Statement::Label(label) => { + result.push(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: label }, + })) + } + _ => { + let label = flat_resolver.register_unnamed(None); + result.push(Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: label }, + })); + result.push(Statement::Label(label)); + } + }, + } + match statement { + Statement::RetValue(..) => { + return Err(error_unreachable()); + } + Statement::Instruction(ast::Instruction::Ret { .. 
}) => { + if !is_kernel { + return_statements.push(result.len()); + } + } + _ => {} + } + previous_instruction_was_terminator = is_block_terminator(&statement); + result.push(statement); + } + convert_from_multiple_returns_to_single_return( + flat_resolver, + &mut result, + return_statements, + )?; + *body_ref = result; + } + Ok(directives) +} + +enum TerminatorKind { + Not, + Real, + Fake, +} + +fn convert_from_multiple_returns_to_single_return( + flat_resolver: &mut GlobalStringIdentResolver2<'_>, + result: &mut Vec, SpirvWord>>, + return_statements: Vec, +) -> Result<(), TranslateError> { + Ok(if return_statements.len() > 1 { + let ret_bb = flat_resolver.register_unnamed(None); + result.push(Statement::Label(ret_bb)); + result.push(Statement::Instruction(ast::Instruction::Ret { + data: ast::RetData { uniform: false }, + })); + for ret_index in return_statements { + let statement = result.get_mut(ret_index).ok_or_else(error_unreachable)?; + *statement = Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src: ret_bb }, + }); + } + }) +} + +fn is_block_terminator( + statement: &Statement, SpirvWord>, +) -> TerminatorKind { + match statement { + Statement::Conditional(..) + | Statement::Instruction(ast::Instruction::Bra { .. }) + // Normally call is not a terminator, but we treat it as such because it + // makes the "instruction modes to global modes" pass possible + | Statement::Instruction(ast::Instruction::Ret { .. }) => TerminatorKind::Real, + Statement::Instruction(ast::Instruction::Call { .. 
}) => TerminatorKind::Fake, + _ => TerminatorKind::Not, + } +} diff --git a/ptx/src/pass/normalize_identifiers2.rs b/ptx/src/pass/normalize_identifiers2.rs index 5155886..810ef3e 100644 --- a/ptx/src/pass/normalize_identifiers2.rs +++ b/ptx/src/pass/normalize_identifiers2.rs @@ -4,7 +4,7 @@ use ptx_parser as ast; pub(crate) fn run<'input, 'b>( resolver: &mut ScopedResolver<'input, 'b>, directives: Vec>>, -) -> Result>, TranslateError> { +) -> Result, TranslateError> { resolver.start_scope(); let result = directives .into_iter() @@ -17,7 +17,7 @@ pub(crate) fn run<'input, 'b>( fn run_directive<'input, 'b>( resolver: &mut ScopedResolver<'input, 'b>, directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>, -) -> Result, TranslateError> { +) -> Result { Ok(match directive { ast::Directive::Variable(linking, var) => { NormalizedDirective2::Variable(linking, run_variable(resolver, var)?) @@ -32,15 +32,11 @@ fn run_method<'input, 'b>( resolver: &mut ScopedResolver<'input, 'b>, linkage: ast::LinkingDirective, method: ast::Function<'input, &'input str, ast::Statement>>, -) -> Result, TranslateError> { - let name = match method.func_directive.name { - ast::MethodName::Kernel(name) => ast::MethodName::Kernel(name), - ast::MethodName::Func(text) => { - ast::MethodName::Func(resolver.add_or_get_in_current_scope_untyped(text)?) 
- } - }; +) -> Result { + let is_kernel = method.func_directive.name.is_kernel(); + let name = resolver.add_or_get_in_current_scope_untyped(method.func_directive.name.text())?; resolver.start_scope(); - let func_decl = run_function_decl(resolver, method.func_directive, name)?; + let (return_arguments, input_arguments) = run_function_decl(resolver, method.func_directive)?; let body = method .body .map(|statements| { @@ -51,20 +47,25 @@ fn run_method<'input, 'b>( .transpose()?; resolver.end_scope(); Ok(Function2 { - func_decl, - globals: Vec::new(), + return_arguments, + name, + input_arguments, body, import_as: None, - tuning: method.tuning, linkage, + is_kernel, + tuning: method.tuning, + flush_to_zero_f32: false, + flush_to_zero_f16f64: false, + rounding_mode_f32: ptx_parser::RoundingMode::NearestEven, + rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven, }) } fn run_function_decl<'input, 'b>( resolver: &mut ScopedResolver<'input, 'b>, func_directive: ast::MethodDeclaration<'input, &'input str>, - name: ast::MethodName<'input, SpirvWord>, -) -> Result, TranslateError> { +) -> Result<(Vec>, Vec>), TranslateError> { assert!(func_directive.shared_mem.is_none()); let return_arguments = func_directive .return_arguments @@ -76,12 +77,7 @@ fn run_function_decl<'input, 'b>( .into_iter() .map(|var| run_variable(resolver, var)) .collect::, _>>()?; - Ok(ast::MethodDeclaration { - return_arguments, - name, - input_arguments, - shared_mem: None, - }) + Ok((return_arguments, input_arguments)) } fn run_variable<'input, 'b>( diff --git a/ptx/src/pass/normalize_predicates2.rs b/ptx/src/pass/normalize_predicates2.rs index d91e23c..ae41021 100644 --- a/ptx/src/pass/normalize_predicates2.rs +++ b/ptx/src/pass/normalize_predicates2.rs @@ -3,8 +3,8 @@ use ptx_parser as ast; pub(crate) fn run<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directives: Vec>, -) -> Result>, TranslateError> { + directives: Vec, +) -> Result, TranslateError> { directives .into_iter() 
.map(|directive| run_directive(resolver, directive)) @@ -13,8 +13,8 @@ pub(crate) fn run<'input>( fn run_directive<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directive: NormalizedDirective2<'input>, -) -> Result, TranslateError> { + directive: NormalizedDirective2, +) -> Result { Ok(match directive { Directive2::Variable(linking, var) => Directive2::Variable(linking, var), Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?), @@ -23,8 +23,8 @@ fn run_directive<'input>( fn run_method<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - method: NormalizedFunction2<'input>, -) -> Result, TranslateError> { + method: NormalizedFunction2, +) -> Result { let body = method .body .map(|statements| { @@ -36,12 +36,18 @@ fn run_method<'input>( }) .transpose()?; Ok(Function2 { - func_decl: method.func_decl, - globals: method.globals, body, + return_arguments: method.return_arguments, + name: method.name, + input_arguments: method.input_arguments, import_as: method.import_as, tuning: method.tuning, linkage: method.linkage, + is_kernel: method.is_kernel, + flush_to_zero_f32: method.flush_to_zero_f32, + flush_to_zero_f16f64: method.flush_to_zero_f16f64, + rounding_mode_f32: method.rounding_mode_f32, + rounding_mode_f16f64: method.rounding_mode_f16f64, }) } diff --git a/ptx/src/pass/remove_unreachable_basic_blocks.rs b/ptx/src/pass/remove_unreachable_basic_blocks.rs new file mode 100644 index 0000000..68c4605 --- /dev/null +++ b/ptx/src/pass/remove_unreachable_basic_blocks.rs @@ -0,0 +1,122 @@ +use super::*; +use petgraph::{ + graph::NodeIndex, + visit::{Bfs, VisitMap}, + Graph, +}; +use rustc_hash::FxHashSet; + +pub(crate) fn run( + mut directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { + let mut reachable_funcs = FxHashSet::default(); + for directive in directives.iter_mut() { + match directive { + Directive2::Method(Function2 { + body: Some(body), .. 
+ }) => { + let old_body = std::mem::replace(body, Vec::new()); + let mut cfg = ControlFlowGraph::new(); + let mut old_body_iter = old_body.iter(); + let mut current_bb = match old_body_iter.next() { + Some(Statement::Label(label)) => cfg.add_or_get_node(*label), + _ => return Err(error_unreachable()), + }; + let first_bb = current_bb; + for statement in old_body_iter { + match statement { + Statement::Label(label) => { + current_bb = cfg.add_or_get_node(*label); + } + Statement::Conditional(branch) => { + cfg.add_branch(current_bb, branch.if_true); + cfg.add_branch(current_bb, branch.if_false); + } + Statement::Instruction(ast::Instruction::Bra { + arguments: ast::BraArgs { src }, + }) => { + cfg.add_branch(current_bb, *src); + } + Statement::FunctionPointer(FunctionPointerDetails { + src: _func, .. + }) => { + return Err(error_todo()); + } + Statement::Instruction(ast::Instruction::Call { + arguments: ast::CallArgs { func, .. }, + .. + }) => { + reachable_funcs.insert(*func); + } + _ => {} + } + } + let mut bfs = Bfs::new(&cfg.graph, first_bb); + while let Some(_) = bfs.next(&cfg.graph) {} + let mut visited = true; + *body = try_filter_to_vec(old_body.into_iter(), |statement| { + match statement { + Statement::Label(label) => { + visited = bfs + .discovered + .is_visited(cfg.nodes.get(label).ok_or_else(error_unreachable)?); + } + _ => {} + } + Ok(visited) + })?; + } + _ => {} + } + } + Ok(directives + .into_iter() + .filter(|directive| match directive { + Directive2::Variable(..) => true, + Directive2::Method(Function2 { + name, is_kernel, .. 
+ }) => *is_kernel || reachable_funcs.contains(name), + }) + .collect::>()) +} + +fn try_filter_to_vec( + mut iter: impl ExactSizeIterator, + mut filter: impl FnMut(&T) -> Result, +) -> Result, E> { + iter.try_fold(Vec::with_capacity(iter.len()), |mut vec, item| { + match filter(&item) { + Ok(true) => vec.push(item), + Ok(false) => {} + Err(err) => return Err(err), + } + Ok(vec) + }) +} + +struct ControlFlowGraph { + graph: Graph, + nodes: FxHashMap, +} + +impl ControlFlowGraph { + fn new() -> Self { + Self { + graph: Graph::new(), + nodes: FxHashMap::default(), + } + } + + fn add_or_get_node(&mut self, id: SpirvWord) -> NodeIndex { + *self + .nodes + .entry(id) + .or_insert_with(|| self.graph.add_node(id)) + } + + fn add_branch(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex { + let to = self.add_or_get_node(to); + self.graph.add_edge(from, to, ()); + to + } +} diff --git a/ptx/src/pass/replace_instructions_with_function_calls.rs b/ptx/src/pass/replace_instructions_with_function_calls.rs index 668cc21..0f9311a 100644 --- a/ptx/src/pass/replace_instructions_with_function_calls.rs +++ b/ptx/src/pass/replace_instructions_with_function_calls.rs @@ -2,8 +2,8 @@ use super::*; pub(super) fn run<'input>( resolver: &mut GlobalStringIdentResolver2<'input>, - directives: Vec, SpirvWord>>, -) -> Result, SpirvWord>>, TranslateError> { + directives: Vec, SpirvWord>>, +) -> Result, SpirvWord>>, TranslateError> { let mut fn_declarations = FxHashMap::default(); let remapped_directives = directives .into_iter() @@ -13,17 +13,18 @@ pub(super) fn run<'input>( .into_iter() .map(|(_, (return_arguments, name, input_arguments))| { Directive2::Method(Function2 { - func_decl: ast::MethodDeclaration { - return_arguments, - name: ast::MethodName::Func(name), - input_arguments, - shared_mem: None, - }, - globals: Vec::new(), + return_arguments, + name: name, + input_arguments, body: None, import_as: None, tuning: Vec::new(), linkage: ast::LinkingDirective::EXTERN, + is_kernel: false, 
+ flush_to_zero_f32: false, + flush_to_zero_f16f64: false, + rounding_mode_f32: ptx_parser::RoundingMode::NearestEven, + rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven, }) }) .collect::>(); @@ -41,8 +42,8 @@ fn run_directive<'input>( Vec>, ), >, - directive: Directive2<'input, ast::Instruction, SpirvWord>, -) -> Result, SpirvWord>, TranslateError> { + directive: Directive2, SpirvWord>, +) -> Result, SpirvWord>, TranslateError> { Ok(match directive { var @ Directive2::Variable(..) => var, Directive2::Method(mut method) => { diff --git a/ptx/src/pass/replace_known_functions.rs b/ptx/src/pass/replace_known_functions.rs index 56bb7e6..48f2b45 100644 --- a/ptx/src/pass/replace_known_functions.rs +++ b/ptx/src/pass/replace_known_functions.rs @@ -1,14 +1,15 @@ +use std::borrow::Cow; + use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord}; pub(crate) fn run<'input>( - resolver: &GlobalStringIdentResolver2<'input>, - mut directives: Vec>, -) -> Vec> { + resolver: &mut GlobalStringIdentResolver2<'input>, + mut directives: Vec, +) -> Vec { for directive in directives.iter_mut() { match directive { NormalizedDirective2::Method(func) => { - func.import_as = - replace_with_ptx_impl(resolver, &func.func_decl.name, func.import_as.take()); + replace_with_ptx_impl(resolver, func.name); } _ => {} } @@ -17,22 +18,16 @@ pub(crate) fn run<'input>( } fn replace_with_ptx_impl<'input>( - resolver: &GlobalStringIdentResolver2<'input>, - fn_name: &ptx_parser::MethodName<'input, SpirvWord>, - name: Option, -) -> Option { + resolver: &mut GlobalStringIdentResolver2<'input>, + fn_name: SpirvWord, +) { let known_names = ["__assertfail"]; - match name { - Some(name) if known_names.contains(&&*name) => Some(format!("__zluda_ptx_impl_{}", name)), - Some(name) => Some(name), - None => match fn_name { - ptx_parser::MethodName::Func(name) => match resolver.ident_map.get(name) { - Some(super::IdentEntry { - name: Some(name), .. 
- }) => Some(format!("__zluda_ptx_impl_{}", name)), - _ => None, - }, - ptx_parser::MethodName::Kernel(..) => None, - }, + if let Some(super::IdentEntry { + name: Some(name), .. + }) = resolver.ident_map.get_mut(&fn_name) + { + if known_names.contains(&&**name) { + *name = Cow::Owned(format!("__zluda_ptx_impl_{}", name)); + } } } diff --git a/ptx/src/pass/resolve_function_pointers.rs b/ptx/src/pass/resolve_function_pointers.rs index eb7abb1..81b9f0a 100644 --- a/ptx/src/pass/resolve_function_pointers.rs +++ b/ptx/src/pass/resolve_function_pointers.rs @@ -3,8 +3,8 @@ use ptx_parser as ast; use rustc_hash::FxHashSet; pub(crate) fn run<'input>( - directives: Vec>, -) -> Result>, TranslateError> { + directives: Vec, +) -> Result, TranslateError> { let mut functions = FxHashSet::default(); directives .into_iter() @@ -14,19 +14,13 @@ pub(crate) fn run<'input>( fn run_directive<'input>( functions: &mut FxHashSet, - directive: UnconditionalDirective<'input>, -) -> Result, TranslateError> { + directive: UnconditionalDirective, +) -> Result { Ok(match directive { var @ Directive2::Variable(..) => var, Directive2::Method(method) => { - { - let func_decl = &method.func_decl; - match func_decl.name { - ptx_parser::MethodName::Kernel(_) => {} - ptx_parser::MethodName::Func(name) => { - functions.insert(name); - } - } + if !method.is_kernel { + functions.insert(method.name); } Directive2::Method(run_method(functions, method)?) 
} @@ -35,8 +29,8 @@ fn run_directive<'input>( fn run_method<'input>( functions: &mut FxHashSet, - method: UnconditionalFunction<'input>, -) -> Result, TranslateError> { + method: UnconditionalFunction, +) -> Result { let body = method .body .map(|statements| { @@ -46,14 +40,7 @@ fn run_method<'input>( .collect::, _>>() }) .transpose()?; - Ok(Function2 { - func_decl: method.func_decl, - globals: method.globals, - body, - import_as: method.import_as, - tuning: method.tuning, - linkage: method.linkage, - }) + Ok(Function2 { body, ..method }) } fn run_statement<'input>( diff --git a/ptx/src/test/ll/activemask.ll b/ptx/src/test/ll/activemask.ll index a54bc7b..0da737e 100644 --- a/ptx/src/test/ll/activemask.ll +++ b/ptx/src/test/ll/activemask.ll @@ -1,32 +1,24 @@ declare i32 @__zluda_ptx_impl_activemask() #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { - %"35" = alloca i64, align 8, addrspace(5) - %"36" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 { + %"31" = alloca i64, align 8, addrspace(5) + %"32" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"37" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"37", ptr addrspace(5) %"35", align 4 - %"38" = call i32 @__zluda_ptx_impl_activemask() - store i32 %"38", ptr addrspace(5) %"36", align 4 - %"39" = load i64, ptr addrspace(5) %"35", align 4 - %"40" = load i32, ptr addrspace(5) %"36", align 4 - %"41" = inttoptr i64 %"39" to ptr - store i32 %"40", ptr %"41", align 4 + br label %"28" + +"28": ; preds = %1 + %"33" = load i64, ptr 
addrspace(4) %"30", align 4 + store i64 %"33", ptr addrspace(5) %"31", align 4 + %"34" = call i32 @__zluda_ptx_impl_activemask() + store i32 %"34", ptr addrspace(5) %"32", align 4 + %"35" = load i64, ptr addrspace(5) %"31", align 4 + %"36" = load i32, ptr addrspace(5) %"32", align 4 + %"37" = inttoptr i64 %"35" to ptr + store i32 %"36", ptr %"37", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/add.ll b/ptx/src/test/ll/add.ll index d8807e0..e945f2e 100644 --- a/ptx/src/test/ll/add.ll +++ b/ptx/src/test/ll/add.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - 
store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = add i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"42" = add i64 %"43", 1 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/add_ftz.ll b/ptx/src/test/ll/add_ftz.ll new file mode 100644 index 0000000..1760ec7 --- /dev/null +++ b/ptx/src/test/ll/add_ftz.ll @@ -0,0 +1,52 @@ +define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + 
%"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca float, align 4, addrspace(5) + %"42" = alloca float, align 4, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"36" + +"36": ; preds = %1 + %"45" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"45", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"46", ptr addrspace(5) %"40", align 4 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"61" = inttoptr i64 %"48" to ptr + %"47" = load float, ptr %"61", align 4 + store float %"47", ptr addrspace(5) %"41", align 4 + %"49" = load i64, ptr addrspace(5) %"39", align 4 + %"62" = inttoptr i64 %"49" to ptr + %"33" = getelementptr inbounds i8, ptr %"62", i64 4 + %"50" = load float, ptr %"33", align 4 + store float %"50", ptr addrspace(5) %"42", align 4 + %"52" = load float, ptr addrspace(5) %"41", align 4 + %"53" = load float, ptr addrspace(5) %"42", align 4 + %"51" = fadd float %"52", %"53" + store float %"51", ptr addrspace(5) %"43", align 4 + call void @llvm.amdgcn.s.setreg(i32 6401, i32 3) + %"55" = load float, ptr addrspace(5) %"41", align 4 + %"56" = load float, ptr addrspace(5) %"42", align 4 + %"54" = fadd float %"55", %"56" + store float %"54", ptr addrspace(5) %"44", align 4 + %"57" = load i64, ptr addrspace(5) %"40", align 4 + %"58" = load float, ptr addrspace(5) %"43", align 4 + %"63" = inttoptr i64 %"57" to ptr + store float %"58", ptr %"63", align 4 + %"59" = load i64, ptr addrspace(5) %"40", align 4 + %"64" = inttoptr i64 %"59" to ptr + %"35" = getelementptr inbounds i8, ptr %"64", i64 4 + %"60" = load float, ptr addrspace(5) %"44", align 4 + store float %"60", ptr %"35", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1 + +attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind willreturn } \ No newline at end of file diff --git a/ptx/src/test/ll/add_non_coherent.ll b/ptx/src/test/ll/add_non_coherent.ll index 668031d..00e4092 100644 --- a/ptx/src/test/ll/add_non_coherent.ll +++ b/ptx/src/test/ll/add_non_coherent.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr addrspace(1) - %"44" = load i64, ptr addrspace(1) %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = add i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 
- %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr addrspace(1) - store i64 %"49", ptr addrspace(1) %"51", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr addrspace(1) + %"40" = load i64, ptr addrspace(1) %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"42" = add i64 %"43", 1 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr addrspace(1) + store i64 %"45", ptr addrspace(1) %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/add_tuning.ll b/ptx/src/test/ll/add_tuning.ll index 0ef4636..42d2031 100644 --- a/ptx/src/test/ll/add_tuning.ll +++ b/ptx/src/test/ll/add_tuning.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca 
i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = add i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"42" = add i64 %"43", 1 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" 
"denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/and.ll b/ptx/src/test/ll/and.ll index f13e3a7..84b720c 100644 --- a/ptx/src/test/ll/and.ll +++ b/ptx/src/test/ll/and.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"56" = and i32 %"50", %"51" - store i32 %"56", 
ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"59", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"52" = and i32 %"46", %"47" + store i32 %"52", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, ptr addrspace(5) %"37", align 4 + %"55" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_add.ll b/ptx/src/test/ll/atom_add.ll index b646974..72a9a75 100644 --- a/ptx/src/test/ll/atom_add.ll +++ b/ptx/src/test/ll/atom_add.ll @@ -1,55 +1,46 @@ @shared_mem = external addrspace(3) global [1024 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) - %"45" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"47" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"60" = inttoptr i64 %"49" to ptr - %"48" = load i32, ptr %"60", align 4 - store i32 %"48", ptr addrspace(5) %"44", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"61" = inttoptr i64 %"50" to ptr - %"31" = getelementptr inbounds i8, ptr %"61", i64 4 - %"51" = load i32, ptr %"31", align 4 - store i32 %"51", ptr addrspace(5) %"45", align 4 - %"52" = load i32, ptr addrspace(5) %"44", align 4 - store i32 %"52", ptr addrspace(3) @shared_mem, align 4 - %"54" = load i32, ptr addrspace(5) %"45", align 4 - %2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"54" syncscope("agent-one-as") monotonic, align 4 - store i32 %2, ptr addrspace(5) %"44", align 4 - %"55" = load i32, ptr addrspace(3) @shared_mem, align 4 - store i32 %"55", ptr addrspace(5) %"45", align 4 - %"56" = load i64, ptr addrspace(5) %"43", align 4 - %"57" = load i32, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"56" to ptr - store i32 %"57", ptr %"65", align 4 - %"58" = load i64, ptr 
addrspace(5) %"43", align 4 - %"66" = inttoptr i64 %"58" to ptr - %"33" = getelementptr inbounds i8, ptr %"66", i64 4 - %"59" = load i32, ptr addrspace(5) %"45", align 4 - store i32 %"59", ptr %"33", align 4 + br label %"35" + +"35": ; preds = %1 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"56" = inttoptr i64 %"45" to ptr + %"44" = load i32, ptr %"56", align 4 + store i32 %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"57" = inttoptr i64 %"46" to ptr + %"32" = getelementptr inbounds i8, ptr %"57", i64 4 + %"47" = load i32, ptr %"32", align 4 + store i32 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + store i32 %"48", ptr addrspace(3) @shared_mem, align 4 + %"50" = load i32, ptr addrspace(5) %"41", align 4 + %2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"50" syncscope("agent-one-as") monotonic, align 4 + store i32 %2, ptr addrspace(5) %"40", align 4 + %"51" = load i32, ptr addrspace(3) @shared_mem, align 4 + store i32 %"51", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"39", align 4 + %"53" = load i32, ptr addrspace(5) %"40", align 4 + %"61" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"61", align 4 + %"54" = load i64, ptr addrspace(5) %"39", align 4 + %"62" = inttoptr i64 %"54" to ptr + %"34" = getelementptr inbounds i8, ptr %"62", i64 4 + %"55" = load i32, ptr addrspace(5) %"41", align 4 + store i32 %"55", ptr %"34", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_add_float.ll b/ptx/src/test/ll/atom_add_float.ll index 33265a4..acf9979 100644 --- a/ptx/src/test/ll/atom_add_float.ll +++ b/ptx/src/test/ll/atom_add_float.ll @@ -1,55 +1,46 @@ @shared_mem = external addrspace(3) global [1024 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca float, align 4, addrspace(5) - %"45" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca float, align 4, addrspace(5) + %"41" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"47" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"60" = inttoptr i64 %"49" to ptr - %"48" = load float, ptr %"60", align 4 - store float %"48", ptr addrspace(5) %"44", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"61" = inttoptr i64 %"50" to ptr - %"31" = getelementptr inbounds i8, ptr %"61", i64 4 - %"51" = load float, ptr %"31", align 4 - store float %"51", ptr addrspace(5) %"45", align 4 - %"52" = load float, ptr addrspace(5) %"44", align 4 - store float %"52", ptr addrspace(3) @shared_mem, 
align 4 - %"54" = load float, ptr addrspace(5) %"45", align 4 - %2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"54" syncscope("agent-one-as") monotonic, align 4 - store float %2, ptr addrspace(5) %"44", align 4 - %"55" = load float, ptr addrspace(3) @shared_mem, align 4 - store float %"55", ptr addrspace(5) %"45", align 4 - %"56" = load i64, ptr addrspace(5) %"43", align 4 - %"57" = load float, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"56" to ptr - store float %"57", ptr %"65", align 4 - %"58" = load i64, ptr addrspace(5) %"43", align 4 - %"66" = inttoptr i64 %"58" to ptr - %"33" = getelementptr inbounds i8, ptr %"66", i64 4 - %"59" = load float, ptr addrspace(5) %"45", align 4 - store float %"59", ptr %"33", align 4 + br label %"35" + +"35": ; preds = %1 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"56" = inttoptr i64 %"45" to ptr + %"44" = load float, ptr %"56", align 4 + store float %"44", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"57" = inttoptr i64 %"46" to ptr + %"32" = getelementptr inbounds i8, ptr %"57", i64 4 + %"47" = load float, ptr %"32", align 4 + store float %"47", ptr addrspace(5) %"41", align 4 + %"48" = load float, ptr addrspace(5) %"40", align 4 + store float %"48", ptr addrspace(3) @shared_mem, align 4 + %"50" = load float, ptr addrspace(5) %"41", align 4 + %2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"50" syncscope("agent-one-as") monotonic, align 4 + store float %2, ptr addrspace(5) %"40", align 4 + %"51" = load float, ptr addrspace(3) @shared_mem, align 4 + store float %"51", ptr addrspace(5) %"41", align 4 + %"52" = load i64, ptr addrspace(5) %"39", align 4 + %"53" = load float, ptr addrspace(5) %"40", align 4 + %"61" = inttoptr i64 %"52" to ptr + 
store float %"53", ptr %"61", align 4 + %"54" = load i64, ptr addrspace(5) %"39", align 4 + %"62" = inttoptr i64 %"54" to ptr + %"34" = getelementptr inbounds i8, ptr %"62", i64 4 + %"55" = load float, ptr addrspace(5) %"41", align 4 + store float %"55", ptr %"34", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_cas.ll b/ptx/src/test/ll/atom_cas.ll index 644d0cd..073fb62 100644 --- a/ptx/src/test/ll/atom_cas.ll +++ b/ptx/src/test/ll/atom_cas.ll @@ -1,53 +1,44 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i32, align 4, addrspace(5) - %"47" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"48" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"48", ptr addrspace(5) %"44", align 4 - %"49" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"49", ptr addrspace(5) %"45", align 4 - %"51" = load i64, ptr addrspace(5) %"44", align 4 + br label %"37" + +"37": ; 
preds = %1 + %"44" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"57" = inttoptr i64 %"47" to ptr + %"46" = load i32, ptr %"57", align 4 + store i32 %"46", ptr addrspace(5) %"42", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"58" = inttoptr i64 %"48" to ptr + %"31" = getelementptr inbounds i8, ptr %"58", i64 4 + %"50" = load i32, ptr addrspace(5) %"42", align 4 + %2 = cmpxchg ptr %"31", i32 %"50", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 + %"59" = extractvalue { i32, i1 } %2, 0 + store i32 %"59", ptr addrspace(5) %"42", align 4 + %"51" = load i64, ptr addrspace(5) %"40", align 4 %"61" = inttoptr i64 %"51" to ptr - %"50" = load i32, ptr %"61", align 4 - store i32 %"50", ptr addrspace(5) %"46", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"62" = inttoptr i64 %"52" to ptr - %"30" = getelementptr inbounds i8, ptr %"62", i64 4 - %"54" = load i32, ptr addrspace(5) %"46", align 4 - %2 = cmpxchg ptr %"30", i32 %"54", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 - %"63" = extractvalue { i32, i1 } %2, 0 - store i32 %"63", ptr addrspace(5) %"46", align 4 - %"55" = load i64, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"55" to ptr - %"33" = getelementptr inbounds i8, ptr %"65", i64 4 - %"56" = load i32, ptr %"33", align 4 - store i32 %"56", ptr addrspace(5) %"47", align 4 - %"57" = load i64, ptr addrspace(5) %"45", align 4 - %"58" = load i32, ptr addrspace(5) %"46", align 4 - %"66" = inttoptr i64 %"57" to ptr - store i32 %"58", ptr %"66", align 4 - %"59" = load i64, ptr addrspace(5) %"45", align 4 - %"67" = inttoptr i64 %"59" to ptr - %"35" = getelementptr inbounds i8, ptr %"67", i64 4 - %"60" = load i32, ptr addrspace(5) %"47", align 4 - store i32 %"60", ptr %"35", align 4 + %"34" = 
getelementptr inbounds i8, ptr %"61", i64 4 + %"52" = load i32, ptr %"34", align 4 + store i32 %"52", ptr addrspace(5) %"43", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"62", align 4 + %"55" = load i64, ptr addrspace(5) %"41", align 4 + %"63" = inttoptr i64 %"55" to ptr + %"36" = getelementptr inbounds i8, ptr %"63", i64 4 + %"56" = load i32, ptr addrspace(5) %"43", align 4 + store i32 %"56", ptr %"36", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/atom_inc.ll b/ptx/src/test/ll/atom_inc.ll index 88ba124..b6906f3 100644 --- a/ptx/src/test/ll/atom_inc.ll +++ b/ptx/src/test/ll/atom_inc.ll @@ -1,55 +1,46 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i32, align 4, addrspace(5) - %"47" = alloca i32, align 4, addrspace(5) - %"48" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + 
%"44" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"49" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"49", ptr addrspace(5) %"44", align 4 - %"50" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"50", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"63" = inttoptr i64 %"52" to ptr - %2 = atomicrmw uinc_wrap ptr %"63", i32 101 syncscope("agent-one-as") monotonic, align 4 - store i32 %2, ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"44", align 4 - %"64" = inttoptr i64 %"54" to ptr addrspace(1) - %3 = atomicrmw uinc_wrap ptr addrspace(1) %"64", i32 101 syncscope("agent-one-as") monotonic, align 4 - store i32 %3, ptr addrspace(5) %"47", align 4 - %"56" = load i64, ptr addrspace(5) %"44", align 4 - %"65" = inttoptr i64 %"56" to ptr - %"55" = load i32, ptr %"65", align 4 - store i32 %"55", ptr addrspace(5) %"48", align 4 - %"57" = load i64, ptr addrspace(5) %"45", align 4 - %"58" = load i32, ptr addrspace(5) %"46", align 4 - %"66" = inttoptr i64 %"57" to ptr - store i32 %"58", ptr %"66", align 4 - %"59" = load i64, ptr addrspace(5) %"45", align 4 - %"67" = inttoptr i64 %"59" to ptr - %"33" = getelementptr inbounds i8, ptr %"67", i64 4 - %"60" = load i32, ptr addrspace(5) %"47", align 4 - store i32 %"60", ptr %"33", align 4 - %"61" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"61" to ptr - %"35" = getelementptr inbounds i8, ptr %"68", i64 8 - %"62" = load i32, ptr addrspace(5) %"48", align 4 - store i32 %"62", ptr %"35", align 4 + br label %"37" + +"37": ; preds = %1 + %"45" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"46" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"46", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"59" = inttoptr i64 %"48" to ptr + %2 = atomicrmw uinc_wrap ptr %"59", i32 101 syncscope("agent-one-as") monotonic, align 4 
+ store i32 %2, ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"40", align 4 + %"60" = inttoptr i64 %"50" to ptr addrspace(1) + %3 = atomicrmw uinc_wrap ptr addrspace(1) %"60", i32 101 syncscope("agent-one-as") monotonic, align 4 + store i32 %3, ptr addrspace(5) %"43", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 4 + %"61" = inttoptr i64 %"52" to ptr + %"51" = load i32, ptr %"61", align 4 + store i32 %"51", ptr addrspace(5) %"44", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"62" = inttoptr i64 %"53" to ptr + store i32 %"54", ptr %"62", align 4 + %"55" = load i64, ptr addrspace(5) %"41", align 4 + %"63" = inttoptr i64 %"55" to ptr + %"34" = getelementptr inbounds i8, ptr %"63", i64 4 + %"56" = load i32, ptr addrspace(5) %"43", align 4 + store i32 %"56", ptr %"34", align 4 + %"57" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = inttoptr i64 %"57" to ptr + %"36" = getelementptr inbounds i8, ptr %"64", i64 8 + %"58" = load i32, ptr addrspace(5) %"44", align 4 + store i32 %"58", ptr %"36", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/b64tof64.ll b/ptx/src/test/ll/b64tof64.ll index 2373b64..ced692b 100644 --- a/ptx/src/test/ll/b64tof64.ll +++ b/ptx/src/test/ll/b64tof64.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @b64tof64(ptr 
addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca double, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca double, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load double, ptr addrspace(4) %"35", align 8 - store double %"41", ptr addrspace(5) %"37", align 8 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load double, ptr addrspace(5) %"37", align 8 - %"50" = bitcast double %"44" to i64 - store i64 %"50", ptr addrspace(5) %"38", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"51" = inttoptr i64 %"46" to ptr - %"45" = load i64, ptr %"51", align 4 - store i64 %"45", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"52", align 4 + br label %"30" + +"30": ; preds = %1 + %"37" = load double, ptr addrspace(4) %"31", align 8 + store double %"37", ptr addrspace(5) %"33", align 8 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"35", align 4 + %"40" = load double, ptr addrspace(5) %"33", align 8 + %"46" = bitcast double %"40" to i64 + store i64 %"46", ptr addrspace(5) %"34", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"47" = inttoptr i64 %"42" to ptr + %"41" = load i64, ptr %"47", align 4 + store i64 %"41", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"48" = inttoptr i64 %"43" to ptr + 
store i64 %"44", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bench.ll b/ptx/src/test/ll/bench.ll new file mode 100644 index 0000000..166524f --- /dev/null +++ b/ptx/src/test/ll/bench.ll @@ -0,0 +1,91 @@ +declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 + +declare i32 @__zluda_ptx_impl_sreg_clock() #0 + +declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 + +define amdgpu_kernel void @bench(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 { + %"57" = alloca i64, align 8, addrspace(5) + %"58" = alloca i64, align 8, addrspace(5) + %"59" = alloca float, align 4, addrspace(5) + %"60" = alloca float, align 4, addrspace(5) + %"61" = alloca float, align 4, addrspace(5) + %"62" = alloca float, align 4, addrspace(5) + %"63" = alloca i32, align 4, addrspace(5) + %"64" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"97" + +"97": ; preds = %1 + %"65" = load i64, ptr addrspace(4) %"55", align 4 + store i64 %"65", ptr addrspace(5) %"57", align 4 + %"66" = load i64, ptr addrspace(4) %"56", align 4 + store i64 %"66", ptr addrspace(5) %"58", align 4 + %"68" = load i64, ptr addrspace(5) %"57", align 4 + %"91" = inttoptr i64 %"68" to ptr + %"67" = load float, ptr %"91", align 4 + store float %"67", ptr addrspace(5) %"59", align 4 + %"69" = load i64, ptr addrspace(5) %"57", align 4 + %"92" = inttoptr i64 %"69" to ptr + %"39" = getelementptr inbounds i8, ptr %"92", i64 4 + %"70" = load float, ptr %"39", align 4 + store float %"70", ptr 
addrspace(5) %"60", align 4 + %"71" = load i64, ptr addrspace(5) %"57", align 4 + %"93" = inttoptr i64 %"71" to ptr + %"41" = getelementptr inbounds i8, ptr %"93", i64 8 + %"72" = load float, ptr %"41", align 4 + store float %"72", ptr addrspace(5) %"61", align 4 + %"73" = load i64, ptr addrspace(5) %"57", align 4 + %"94" = inttoptr i64 %"73" to ptr + %"43" = getelementptr inbounds i8, ptr %"94", i64 12 + %"74" = load float, ptr %"43", align 4 + store float %"74", ptr addrspace(5) %"62", align 4 + store i32 0, ptr addrspace(5) %"63", align 4 + br label %"10" + +"10": ; preds = %"21", %"97" + %"77" = load float, ptr addrspace(5) %"59", align 4 + %"78" = load float, ptr addrspace(5) %"60", align 4 + call void asm sideeffect "s_denorm_mode 0", "~{mode}"() + %"76" = fmul float %"77", %"78" + store float %"76", ptr addrspace(5) %"59", align 4 + %"80" = load float, ptr addrspace(5) %"61", align 4 + %"81" = load float, ptr addrspace(5) %"62", align 4 + call void asm sideeffect "s_denorm_mode 11", "~{mode}"() + %"79" = fmul float %"80", %"81" + store float %"79", ptr addrspace(5) %"61", align 4 + %"83" = load i32, ptr addrspace(5) %"63", align 4 + %"82" = add i32 %"83", 1 + store i32 %"82", ptr addrspace(5) %"63", align 4 + %"85" = load i32, ptr addrspace(5) %"63", align 4 + %"84" = icmp eq i32 %"85", 100000000 + store i1 %"84", ptr addrspace(5) %"64", align 1 + %"86" = load i1, ptr addrspace(5) %"64", align 1 + br i1 %"86", label %"11", label %"21" + +"21": ; preds = %"10" + br label %"10" + +"11": ; preds = %"10" + %"87" = load i64, ptr addrspace(5) %"58", align 4 + %"88" = load float, ptr addrspace(5) %"59", align 4 + %"95" = inttoptr i64 %"87" to ptr + store float %"88", ptr %"95", align 4 + %"89" = load i64, ptr addrspace(5) %"58", align 4 + %"96" = inttoptr i64 %"89" to ptr + %"48" = getelementptr inbounds i8, ptr %"96", i64 4 + %"90" = load float, ptr addrspace(5) %"61", align 4 + store float %"90", ptr %"48", align 4 + ret void +} + +attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bfe.ll b/ptx/src/test/ll/bfe.ll index fda252d..8544b99 100644 --- a/ptx/src/test/ll/bfe.ll +++ b/ptx/src/test/ll/bfe.ll @@ -1,54 +1,46 @@ declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) - %"45" = alloca i32, align 4, addrspace(5) - %"46" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"47" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"61" = inttoptr i64 %"50" to ptr - %"49" = load i32, ptr %"61", align 4 - store i32 %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr 
addrspace(5) %"42", align 4 - %"62" = inttoptr i64 %"51" to ptr - %"31" = getelementptr inbounds i8, ptr %"62", i64 4 - %"52" = load i32, ptr %"31", align 4 - store i32 %"52", ptr addrspace(5) %"45", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"63" = inttoptr i64 %"53" to ptr - %"33" = getelementptr inbounds i8, ptr %"63", i64 8 - %"54" = load i32, ptr %"33", align 4 - store i32 %"54", ptr addrspace(5) %"46", align 4 - %"56" = load i32, ptr addrspace(5) %"44", align 4 - %"57" = load i32, ptr addrspace(5) %"45", align 4 - %"58" = load i32, ptr addrspace(5) %"46", align 4 - %"55" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"56", i32 %"57", i32 %"58") - store i32 %"55", ptr addrspace(5) %"44", align 4 - %"59" = load i64, ptr addrspace(5) %"43", align 4 - %"60" = load i32, ptr addrspace(5) %"44", align 4 - %"64" = inttoptr i64 %"59" to ptr - store i32 %"60", ptr %"64", align 4 + br label %"35" + +"35": ; preds = %1 + %"43" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"57" = inttoptr i64 %"46" to ptr + %"45" = load i32, ptr %"57", align 4 + store i32 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"58" = inttoptr i64 %"47" to ptr + %"32" = getelementptr inbounds i8, ptr %"58", i64 4 + %"48" = load i32, ptr %"32", align 4 + store i32 %"48", ptr addrspace(5) %"41", align 4 + %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"59" = inttoptr i64 %"49" to ptr + %"34" = getelementptr inbounds i8, ptr %"59", i64 8 + %"50" = load i32, ptr %"34", align 4 + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"52" = load i32, ptr addrspace(5) %"40", align 4 + %"53" = load i32, ptr addrspace(5) %"41", align 4 + %"54" = load i32, ptr addrspace(5) %"42", align 4 + %"51" = call i32 @__zluda_ptx_impl_bfe_u32(i32 
%"52", i32 %"53", i32 %"54") + store i32 %"51", ptr addrspace(5) %"40", align 4 + %"55" = load i64, ptr addrspace(5) %"39", align 4 + %"56" = load i32, ptr addrspace(5) %"40", align 4 + %"60" = inttoptr i64 %"55" to ptr + store i32 %"56", ptr %"60", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bfi.ll b/ptx/src/test/ll/bfi.ll index ef437c1..43b09f2 100644 --- a/ptx/src/test/ll/bfi.ll +++ b/ptx/src/test/ll/bfi.ll @@ -1,61 +1,53 @@ declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i32, align 4, addrspace(5) - %"48" = alloca i32, align 4, addrspace(5) - %"49" = alloca i32, align 4, addrspace(5) - %"50" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, 
addrspace(5) + %"46" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"51" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"51", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"52", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"54" to ptr - %"53" = load i32, ptr %"68", align 4 - store i32 %"53", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 - %"69" = inttoptr i64 %"55" to ptr - %"32" = getelementptr inbounds i8, ptr %"69", i64 4 - %"56" = load i32, ptr %"32", align 4 - store i32 %"56", ptr addrspace(5) %"48", align 4 - %"57" = load i64, ptr addrspace(5) %"45", align 4 - %"70" = inttoptr i64 %"57" to ptr - %"34" = getelementptr inbounds i8, ptr %"70", i64 8 - %"58" = load i32, ptr %"34", align 4 - store i32 %"58", ptr addrspace(5) %"49", align 4 - %"59" = load i64, ptr addrspace(5) %"45", align 4 - %"71" = inttoptr i64 %"59" to ptr - %"36" = getelementptr inbounds i8, ptr %"71", i64 12 - %"60" = load i32, ptr %"36", align 4 - store i32 %"60", ptr addrspace(5) %"50", align 4 - %"62" = load i32, ptr addrspace(5) %"47", align 4 - %"63" = load i32, ptr addrspace(5) %"48", align 4 - %"64" = load i32, ptr addrspace(5) %"49", align 4 - %"65" = load i32, ptr addrspace(5) %"50", align 4 - %"72" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"62", i32 %"63", i32 %"64", i32 %"65") - store i32 %"72", ptr addrspace(5) %"47", align 4 - %"66" = load i64, ptr addrspace(5) %"46", align 4 - %"67" = load i32, ptr addrspace(5) %"47", align 4 - %"75" = inttoptr i64 %"66" to ptr - store i32 %"67", ptr %"75", align 4 + br label %"38" + +"38": ; preds = %1 + %"47" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = 
inttoptr i64 %"50" to ptr + %"49" = load i32, ptr %"64", align 4 + store i32 %"49", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %"65" = inttoptr i64 %"51" to ptr + %"33" = getelementptr inbounds i8, ptr %"65", i64 4 + %"52" = load i32, ptr %"33", align 4 + store i32 %"52", ptr addrspace(5) %"44", align 4 + %"53" = load i64, ptr addrspace(5) %"41", align 4 + %"66" = inttoptr i64 %"53" to ptr + %"35" = getelementptr inbounds i8, ptr %"66", i64 8 + %"54" = load i32, ptr %"35", align 4 + store i32 %"54", ptr addrspace(5) %"45", align 4 + %"55" = load i64, ptr addrspace(5) %"41", align 4 + %"67" = inttoptr i64 %"55" to ptr + %"37" = getelementptr inbounds i8, ptr %"67", i64 12 + %"56" = load i32, ptr %"37", align 4 + store i32 %"56", ptr addrspace(5) %"46", align 4 + %"58" = load i32, ptr addrspace(5) %"43", align 4 + %"59" = load i32, ptr addrspace(5) %"44", align 4 + %"60" = load i32, ptr addrspace(5) %"45", align 4 + %"61" = load i32, ptr addrspace(5) %"46", align 4 + %"68" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"58", i32 %"59", i32 %"60", i32 %"61") + store i32 %"68", ptr addrspace(5) %"43", align 4 + %"62" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = load i32, ptr addrspace(5) %"43", align 4 + %"71" = inttoptr i64 %"62" to ptr + store i32 %"63", ptr %"71", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/block.ll b/ptx/src/test/ll/block.ll index 523d941..b492a5a 100644 --- a/ptx/src/test/ll/block.ll +++ 
b/ptx/src/test/ll/block.ll @@ -1,43 +1,34 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"50" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"46" = load i64, ptr %"55", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"48" = add i64 %"49", 1 - store i64 %"48", ptr addrspace(5) %"43", align 4 - %"52" = load i64, ptr addrspace(5) %"50", align 4 - %"51" = add i64 %"52", 1 - store i64 %"51", ptr addrspace(5) %"50", align 4 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"56" = inttoptr i64 %"53" to ptr - store i64 %"54", ptr %"56", align 4 + br label %"33" + +"33": ; preds = %1 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 
4 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"42" = load i64, ptr %"51", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"44" = add i64 %"45", 1 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"48" = load i64, ptr addrspace(5) %"46", align 4 + %"47" = add i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"46", align 4 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"50" = load i64, ptr addrspace(5) %"39", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/bra.ll b/ptx/src/test/ll/bra.ll index 0fb9769..3246790 100644 --- a/ptx/src/test/ll/bra.ll +++ b/ptx/src/test/ll/bra.ll @@ -1,51 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, 
addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"47" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = inttoptr i64 %"49" to ptr - %"48" = load i64, ptr %"56", align 4 - store i64 %"48", ptr addrspace(5) %"44", align 4 - br label %"9" + br label %"35" -"9": ; preds = %1 - %"51" = load i64, ptr addrspace(5) %"44", align 4 - %"50" = add i64 %"51", 1 - store i64 %"50", ptr addrspace(5) %"45", align 4 - br label %"11" +"35": ; preds = %1 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"43" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"50", align 4 + store i64 %"44", ptr addrspace(5) %"40", align 4 + br label %"10" -"10": ; No predecessors! 
- %"53" = load i64, ptr addrspace(5) %"44", align 4 - %"52" = add i64 %"53", 2 - store i64 %"52", ptr addrspace(5) %"45", align 4 - br label %"11" +"10": ; preds = %"35" + %"47" = load i64, ptr addrspace(5) %"40", align 4 + %"46" = add i64 %"47", 1 + store i64 %"46", ptr addrspace(5) %"41", align 4 + br label %"12" -"11": ; preds = %"10", %"9" - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 - %"57" = inttoptr i64 %"54" to ptr - store i64 %"55", ptr %"57", align 4 +"12": ; preds = %"10" + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + %"51" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"51", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/brev.ll b/ptx/src/test/ll/brev.ll index 6f10c94..9126fc4 100644 --- a/ptx/src/test/ll/brev.ll +++ b/ptx/src/test/ll/brev.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca 
i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"43" = call i32 @llvm.bitreverse.i32(i32 %"44") - store i32 %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load i32, ptr %"43", align 4 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"39" = call i32 @llvm.bitreverse.i32(i32 %"40") + store i32 %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load i32, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store i32 %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.bitreverse.i32(i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/call.ll b/ptx/src/test/ll/call.ll index c9bb5ce..09b68c9 100644 --- a/ptx/src/test/ll/call.ll +++ b/ptx/src/test/ll/call.ll @@ -1,66 +1,64 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define i64 @__zluda_ptx_impl_incr(i64 %"42") #0 { +define i64 @incr(i64 %"43") #0 { + %"63" = alloca i64, align 8, addrspace(5) + %"64" = alloca i64, align 8, addrspace(5) %"65" = alloca i64, align 8, addrspace(5) %"66" = alloca i64, align 8, addrspace(5) - %"67" = alloca i64, align 8, addrspace(5) - %"68" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - store i64 %"42", ptr addrspace(5) %"67", align 4 - %"69" = load i64, ptr addrspace(5) %"67", align 4 - store i64 %"69", ptr addrspace(5) %"68", align 4 - %"71" = load i64, ptr addrspace(5) %"68", align 4 - %"70" = add i64 %"71", 1 - store i64 %"70", ptr addrspace(5) %"68", align 4 - %"72" = load i64, ptr addrspace(5) %"68", align 4 - store i64 %"72", ptr addrspace(5) %"66", align 4 - %"73" = load i64, ptr addrspace(5) %"66", align 4 - store i64 %"73", ptr addrspace(5) %"65", align 4 - %2 = load i64, ptr addrspace(5) %"65", align 4 + br label %"46" + +"46": ; preds = %1 + store i64 %"43", ptr addrspace(5) %"65", align 4 + %"67" = load i64, ptr addrspace(5) %"65", align 4 + store i64 %"67", ptr addrspace(5) %"66", align 4 + %"69" = load i64, ptr addrspace(5) %"66", align 4 + %"68" = add i64 %"69", 1 + store i64 %"68", ptr addrspace(5) %"66", align 4 + %"70" = load i64, ptr addrspace(5) %"66", align 4 + store i64 %"70", ptr addrspace(5) %"64", align 4 + 
%"71" = load i64, ptr addrspace(5) %"64", align 4 + store i64 %"71", ptr addrspace(5) %"63", align 4 + %2 = load i64, ptr addrspace(5) %"63", align 4 ret i64 %2 } -define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 { +define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #1 { + %"50" = alloca i64, align 8, addrspace(5) + %"51" = alloca i64, align 8, addrspace(5) %"52" = alloca i64, align 8, addrspace(5) - %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i64, align 8, addrspace(5) - %"59" = alloca i64, align 8, addrspace(5) - %"60" = alloca i64, align 8, addrspace(5) + %"57" = alloca i64, align 8, addrspace(5) + %"58" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"55" = load i64, ptr addrspace(4) %"50", align 4 + br label %"44" + +"44": ; preds = %1 + %"53" = load i64, ptr addrspace(4) %"48", align 4 + store i64 %"53", ptr addrspace(5) %"50", align 4 + %"54" = load i64, ptr addrspace(4) %"49", align 4 + store i64 %"54", ptr addrspace(5) %"51", align 4 + %"56" = load i64, ptr addrspace(5) %"50", align 4 + %"72" = inttoptr i64 %"56" to ptr addrspace(1) + %"55" = load i64, ptr addrspace(1) %"72", align 4 store i64 %"55", ptr addrspace(5) %"52", align 4 - %"56" = load i64, ptr addrspace(4) %"51", align 4 - store i64 %"56", ptr addrspace(5) %"53", align 4 - %"58" = load i64, ptr addrspace(5) %"52", align 4 - %"74" = inttoptr i64 %"58" to ptr addrspace(1) - %"57" = load i64, ptr addrspace(1) %"74", align 4 - store i64 %"57", ptr addrspace(5) %"54", align 4 - %"61" = load i64, ptr addrspace(5) %"54", align 4 - store i64 %"61", ptr addrspace(5) %"59", align 4 - %"39" = load i64, ptr addrspace(5) %"59", align 4 - %"40" = call i64 @__zluda_ptx_impl_incr(i64 %"39") - store i64 %"40", ptr addrspace(5) %"60", align 4 - %"62" = load i64, ptr addrspace(5) %"60", align 4 - store i64 %"62", ptr addrspace(5) %"54", align 4 - %"63" = load i64, ptr 
addrspace(5) %"53", align 4 - %"64" = load i64, ptr addrspace(5) %"54", align 4 - %"77" = inttoptr i64 %"63" to ptr addrspace(1) - store i64 %"64", ptr addrspace(1) %"77", align 4 + %"59" = load i64, ptr addrspace(5) %"52", align 4 + store i64 %"59", ptr addrspace(5) %"57", align 4 + %"40" = load i64, ptr addrspace(5) %"57", align 4 + %"41" = call i64 @incr(i64 %"40") + br label %"45" + +"45": ; preds = %"44" + store i64 %"41", ptr addrspace(5) %"58", align 4 + %"60" = load i64, ptr addrspace(5) %"58", align 4 + store i64 %"60", ptr addrspace(5) %"52", align 4 + %"61" = load i64, ptr addrspace(5) %"51", align 4 + %"62" = load i64, ptr addrspace(5) %"52", align 4 + %"75" = inttoptr i64 %"61" to ptr addrspace(1) + store i64 %"62", ptr addrspace(1) %"75", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/call_rnd.ll b/ptx/src/test/ll/call_rnd.ll new file mode 100644 index 0000000..b727812 --- /dev/null +++ b/ptx/src/test/ll/call_rnd.ll @@ -0,0 +1,155 @@ +define float @add_rm(float %"79", float %"80") #0 { + %"128" = alloca float, align 4, addrspace(5) + %"129" = alloca float, align 4, addrspace(5) + %"130" = alloca float, align 4, addrspace(5) + %"131" = alloca float, align 4, addrspace(5) + %"132" = alloca float, align 4, addrspace(5) + %"133" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"89" + +"89": ; preds = %1 + call void @llvm.amdgcn.s.setreg(i32 6145, i32 2) + br label %"87" + +"87": ; preds = %"89" + store float %"79", ptr 
addrspace(5) %"130", align 4 + store float %"80", ptr addrspace(5) %"131", align 4 + %"134" = load float, ptr addrspace(5) %"130", align 4 + store float %"134", ptr addrspace(5) %"132", align 4 + %"135" = load float, ptr addrspace(5) %"131", align 4 + store float %"135", ptr addrspace(5) %"133", align 4 + %"137" = load float, ptr addrspace(5) %"132", align 4 + %"138" = load float, ptr addrspace(5) %"133", align 4 + %"136" = fadd float %"137", %"138" + store float %"136", ptr addrspace(5) %"132", align 4 + %"139" = load float, ptr addrspace(5) %"132", align 4 + store float %"139", ptr addrspace(5) %"129", align 4 + %"140" = load float, ptr addrspace(5) %"129", align 4 + store float %"140", ptr addrspace(5) %"128", align 4 + %2 = load float, ptr addrspace(5) %"128", align 4 + ret float %2 +} + +define float @add_rp(float %"82", float %"83") #0 { + %"141" = alloca float, align 4, addrspace(5) + %"142" = alloca float, align 4, addrspace(5) + %"143" = alloca float, align 4, addrspace(5) + %"144" = alloca float, align 4, addrspace(5) + %"145" = alloca float, align 4, addrspace(5) + %"146" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"88" + +"88": ; preds = %1 + store float %"82", ptr addrspace(5) %"143", align 4 + store float %"83", ptr addrspace(5) %"144", align 4 + %"147" = load float, ptr addrspace(5) %"143", align 4 + store float %"147", ptr addrspace(5) %"145", align 4 + %"148" = load float, ptr addrspace(5) %"144", align 4 + store float %"148", ptr addrspace(5) %"146", align 4 + %"150" = load float, ptr addrspace(5) %"145", align 4 + %"151" = load float, ptr addrspace(5) %"146", align 4 + %"149" = fadd float %"150", %"151" + store float %"149", ptr addrspace(5) %"145", align 4 + %"152" = load float, ptr addrspace(5) %"145", align 4 + store float %"152", ptr addrspace(5) %"142", align 4 + %"153" = load float, ptr addrspace(5) %"142", align 4 + store float %"153", ptr addrspace(5) %"141", align 4 + %2 = load float, ptr 
addrspace(5) %"141", align 4 + ret float %2 +} + +define amdgpu_kernel void @call_rnd(ptr addrspace(4) byref(i64) %"92", ptr addrspace(4) byref(i64) %"93") #1 { + %"94" = alloca i64, align 8, addrspace(5) + %"95" = alloca i64, align 8, addrspace(5) + %"96" = alloca float, align 4, addrspace(5) + %"97" = alloca float, align 4, addrspace(5) + %"98" = alloca float, align 4, addrspace(5) + %"99" = alloca float, align 4, addrspace(5) + %"100" = alloca float, align 4, addrspace(5) + %"101" = alloca float, align 4, addrspace(5) + %"102" = alloca float, align 4, addrspace(5) + %"103" = alloca float, align 4, addrspace(5) + %"104" = alloca float, align 4, addrspace(5) + %"105" = alloca float, align 4, addrspace(5) + %"106" = alloca float, align 4, addrspace(5) + %"107" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"84" + +"84": ; preds = %1 + call void @llvm.amdgcn.s.setreg(i32 6145, i32 1) + %"108" = load i64, ptr addrspace(4) %"92", align 4 + store i64 %"108", ptr addrspace(5) %"94", align 4 + %"109" = load i64, ptr addrspace(4) %"93", align 4 + store i64 %"109", ptr addrspace(5) %"95", align 4 + %"111" = load i64, ptr addrspace(5) %"94", align 4 + %"154" = inttoptr i64 %"111" to ptr + %"110" = load float, ptr %"154", align 4 + store float %"110", ptr addrspace(5) %"96", align 4 + %"112" = load i64, ptr addrspace(5) %"94", align 4 + %"155" = inttoptr i64 %"112" to ptr + %"59" = getelementptr inbounds i8, ptr %"155", i64 4 + %"113" = load float, ptr %"59", align 4 + store float %"113", ptr addrspace(5) %"97", align 4 + %"114" = load i64, ptr addrspace(5) %"94", align 4 + %"156" = inttoptr i64 %"114" to ptr + %"61" = getelementptr inbounds i8, ptr %"156", i64 8 + %"115" = load float, ptr %"61", align 4 + store float %"115", ptr addrspace(5) %"98", align 4 + %"116" = load i64, ptr addrspace(5) %"94", align 4 + %"157" = inttoptr i64 %"116" to ptr + %"63" = getelementptr inbounds i8, ptr %"157", i64 12 + %"117" = load float, ptr %"63", 
align 4 + store float %"117", ptr addrspace(5) %"99", align 4 + %"118" = load float, ptr addrspace(5) %"96", align 4 + store float %"118", ptr addrspace(5) %"102", align 4 + %"119" = load float, ptr addrspace(5) %"97", align 4 + store float %"119", ptr addrspace(5) %"103", align 4 + %"72" = load float, ptr addrspace(5) %"102", align 4 + %"73" = load float, ptr addrspace(5) %"103", align 4 + %"74" = call float @add_rp(float %"72", float %"73") + br label %"85" + +"85": ; preds = %"84" + store float %"74", ptr addrspace(5) %"104", align 4 + %"120" = load float, ptr addrspace(5) %"104", align 4 + store float %"120", ptr addrspace(5) %"100", align 4 + %"121" = load i64, ptr addrspace(5) %"95", align 4 + %"122" = load float, ptr addrspace(5) %"100", align 4 + %"158" = inttoptr i64 %"121" to ptr + store float %"122", ptr %"158", align 4 + %"123" = load float, ptr addrspace(5) %"98", align 4 + store float %"123", ptr addrspace(5) %"105", align 4 + %"124" = load float, ptr addrspace(5) %"99", align 4 + store float %"124", ptr addrspace(5) %"106", align 4 + %"75" = load float, ptr addrspace(5) %"105", align 4 + %"76" = load float, ptr addrspace(5) %"106", align 4 + %"77" = call float @add_rm(float %"75", float %"76") + br label %"86" + +"86": ; preds = %"85" + store float %"77", ptr addrspace(5) %"107", align 4 + %"125" = load float, ptr addrspace(5) %"107", align 4 + store float %"125", ptr addrspace(5) %"101", align 4 + %"126" = load i64, ptr addrspace(5) %"95", align 4 + %"159" = inttoptr i64 %"126" to ptr + %"65" = getelementptr inbounds i8, ptr %"159", i64 4 + %"127" = load float, ptr addrspace(5) %"101", align 4 + store float %"127", ptr %"65", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } 
+attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #2 = { nocallback nofree nosync nounwind willreturn } \ No newline at end of file diff --git a/ptx/src/test/ll/clz.ll b/ptx/src/test/ll/clz.ll index 160a634..e9ff3e5 100644 --- a/ptx/src/test/ll/clz.ll +++ b/ptx/src/test/ll/clz.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = call i32 @llvm.ctlz.i32(i32 %"44", i1 false) - store i32 %"48", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"49", align 4 + br label %"29" + +"29": ; 
preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load i32, ptr %"43", align 4 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"44" = call i32 @llvm.ctlz.i32(i32 %"40", i1 false) + store i32 %"44", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load i32, ptr addrspace(5) %"34", align 4 + %"45" = inttoptr i64 %"41" to ptr + store i32 %"42", ptr %"45", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/const.ll b/ptx/src/test/ll/const.ll index 0fbd7e0..ec65d1f 100644 --- a/ptx/src/test/ll/const.ll +++ b/ptx/src/test/ll/const.ll @@ -1,59 +1,50 @@ @constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @const(ptr 
addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #0 { - %"52" = alloca i64, align 8, addrspace(5) - %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i16, align 2, addrspace(5) - %"55" = alloca i16, align 2, addrspace(5) - %"56" = alloca i16, align 2, addrspace(5) - %"57" = alloca i16, align 2, addrspace(5) +define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { + %"48" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i16, align 2, addrspace(5) + %"51" = alloca i16, align 2, addrspace(5) + %"52" = alloca i16, align 2, addrspace(5) + %"53" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 - %"58" = load i64, ptr addrspace(4) %"50", align 4 - store i64 %"58", ptr addrspace(5) %"52", align 4 - %"59" = load i64, ptr addrspace(4) %"51", align 4 - store i64 %"59", ptr addrspace(5) %"53", align 4 - %"60" = load i16, ptr addrspace(4) @constparams, align 2 - store i16 %"60", ptr addrspace(5) %"54", align 2 - %"61" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 - store i16 %"61", ptr addrspace(5) %"55", align 2 - %"62" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 - store i16 %"62", ptr addrspace(5) %"56", align 2 - %"63" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 - store i16 %"63", ptr addrspace(5) %"57", align 2 - %"64" = load i64, ptr addrspace(5) %"53", align 4 - %"65" = load i16, ptr addrspace(5) %"54", align 2 + br label %"45" + +"45": ; preds = %1 + %"54" = load i64, ptr addrspace(4) %"46", align 4 + store i64 %"54", ptr addrspace(5) %"48", align 4 + %"55" = load i64, ptr addrspace(4) %"47", align 4 + store i64 %"55", ptr addrspace(5) %"49", align 4 + %"56" = load i16, ptr addrspace(4) @constparams, align 2 + store i16 %"56", ptr addrspace(5) %"50", align 2 
+ %"57" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 + store i16 %"57", ptr addrspace(5) %"51", align 2 + %"58" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 + store i16 %"58", ptr addrspace(5) %"52", align 2 + %"59" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 + store i16 %"59", ptr addrspace(5) %"53", align 2 + %"60" = load i64, ptr addrspace(5) %"49", align 4 + %"61" = load i16, ptr addrspace(5) %"50", align 2 + %"72" = inttoptr i64 %"60" to ptr + store i16 %"61", ptr %"72", align 2 + %"62" = load i64, ptr addrspace(5) %"49", align 4 + %"74" = inttoptr i64 %"62" to ptr + %"40" = getelementptr inbounds i8, ptr %"74", i64 2 + %"63" = load i16, ptr addrspace(5) %"51", align 2 + store i16 %"63", ptr %"40", align 2 + %"64" = load i64, ptr addrspace(5) %"49", align 4 %"76" = inttoptr i64 %"64" to ptr - store i16 %"65", ptr %"76", align 2 - %"66" = load i64, ptr addrspace(5) %"53", align 4 + %"42" = getelementptr inbounds i8, ptr %"76", i64 4 + %"65" = load i16, ptr addrspace(5) %"52", align 2 + store i16 %"65", ptr %"42", align 2 + %"66" = load i64, ptr addrspace(5) %"49", align 4 %"78" = inttoptr i64 %"66" to ptr - %"39" = getelementptr inbounds i8, ptr %"78", i64 2 - %"67" = load i16, ptr addrspace(5) %"55", align 2 - store i16 %"67", ptr %"39", align 2 - %"68" = load i64, ptr addrspace(5) %"53", align 4 - %"80" = inttoptr i64 %"68" to ptr - %"41" = getelementptr inbounds i8, ptr %"80", i64 4 - %"69" = load i16, ptr addrspace(5) %"56", align 2 - store i16 %"69", ptr %"41", align 2 - %"70" = load i64, ptr addrspace(5) %"53", align 4 - %"82" = inttoptr i64 %"70" to ptr - %"43" = getelementptr inbounds i8, ptr %"82", i64 6 - %"71" = load i16, ptr addrspace(5) %"57", align 2 - store i16 %"71", ptr %"43", align 2 + %"44" = getelementptr inbounds i8, ptr %"78", i64 6 + %"67" = load i16, ptr 
addrspace(5) %"53", align 2 + store i16 %"67", ptr %"44", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/constant_f32.ll b/ptx/src/test/ll/constant_f32.ll index 60f625f..b1c04a4 100644 --- a/ptx/src/test/ll/constant_f32.ll +++ b/ptx/src/test/ll/constant_f32.ll @@ -1,38 +1,29 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr - %"42" = load float, ptr %"48", align 4 - store float %"42", ptr addrspace(5) %"39", align 4 - %"45" = load float, ptr addrspace(5) %"39", align 4 - %"44" = fmul float %"45", 5.000000e-01 - store float %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, 
ptr addrspace(5) %"38", align 4 - %"47" = load float, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"46" to ptr - store float %"47", ptr %"49", align 4 + br label %"30" + +"30": ; preds = %1 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = inttoptr i64 %"39" to ptr + %"38" = load float, ptr %"44", align 4 + store float %"38", ptr addrspace(5) %"35", align 4 + %"41" = load float, ptr addrspace(5) %"35", align 4 + %"40" = fmul float %"41", 5.000000e-01 + store float %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"43" = load float, ptr addrspace(5) %"35", align 4 + %"45" = inttoptr i64 %"42" to ptr + store float %"43", ptr %"45", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/constant_negative.ll b/ptx/src/test/ll/constant_negative.ll index 201b867..9fec04d 100644 --- a/ptx/src/test/ll/constant_negative.ll +++ b/ptx/src/test/ll/constant_negative.ll @@ -1,38 +1,29 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = 
alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr - %"42" = load i32, ptr %"48", align 4 - store i32 %"42", ptr addrspace(5) %"39", align 4 - %"45" = load i32, ptr addrspace(5) %"39", align 4 - %"44" = mul i32 %"45", -1 - store i32 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"47" = load i32, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"46" to ptr - store i32 %"47", ptr %"49", align 4 + br label %"30" + +"30": ; preds = %1 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = inttoptr i64 %"39" to ptr + %"38" = load i32, ptr %"44", align 4 + store i32 %"38", ptr addrspace(5) %"35", align 4 + %"41" = load i32, ptr addrspace(5) %"35", align 4 + %"40" = mul i32 %"41", -1 + store i32 %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"43" = load i32, ptr addrspace(5) %"35", align 4 + %"45" = inttoptr i64 %"42" to ptr + store i32 %"43", ptr %"45", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cos.ll b/ptx/src/test/ll/cos.ll index 44c0ee0..bd1b5cb 100644 --- a/ptx/src/test/ll/cos.ll +++ b/ptx/src/test/ll/cos.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call afn float @llvm.cos.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + 
store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call afn float @llvm.cos.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.cos.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_f64_f32.ll b/ptx/src/test/ll/cvt_f64_f32.ll index 4d5cf2c..d7360da 100644 --- a/ptx/src/test/ll/cvt_f64_f32.ll +++ b/ptx/src/test/ll/cvt_f64_f32.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca float, align 4, addrspace(5) - %"40" = alloca double, align 
8, addrspace(5) +define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca float, align 4, addrspace(5) + %"36" = alloca double, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr addrspace(1) - %"43" = load float, ptr addrspace(1) %"49", align 4 - store float %"43", ptr addrspace(5) %"39", align 4 - %"46" = load float, ptr addrspace(5) %"39", align 4 - %"45" = fpext float %"46" to double - store double %"45", ptr addrspace(5) %"40", align 8 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load double, ptr addrspace(5) %"40", align 8 - %"50" = inttoptr i64 %"47" to ptr - store double %"48", ptr %"50", align 8 + br label %"30" + +"30": ; preds = %1 + %"37" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"37", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"45" = inttoptr i64 %"40" to ptr addrspace(1) + %"39" = load float, ptr addrspace(1) %"45", align 4 + store float %"39", ptr addrspace(5) %"35", align 4 + %"42" = load float, ptr addrspace(5) %"35", align 4 + %"41" = fpext float %"42" to double + store double %"41", ptr addrspace(5) %"36", align 8 + %"43" = load i64, ptr addrspace(5) %"34", align 4 + %"44" = load double, ptr addrspace(5) %"36", align 8 + %"46" = inttoptr i64 %"43" to ptr + store double %"44", ptr %"46", align 8 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } 
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_rni.ll b/ptx/src/test/ll/cvt_rni.ll index 850b1fb..888997d 100644 --- a/ptx/src/test/ll/cvt_rni.ll +++ b/ptx/src/test/ll/cvt_rni.ll @@ -1,58 +1,49 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca float, align 4, addrspace(5) - %"44" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) + %"40" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"48" to ptr - %"47" = load float, ptr %"59", align 4 - store float %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"60" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"60", i64 4 - %"50" = load float, ptr %"30", align 4 - store float %"50", ptr addrspace(5) %"44", align 4 - %"52" = load float, ptr addrspace(5) %"43", align 4 - %2 = 
call float @llvm.roundeven.f32(float %"52") - %"51" = freeze float %2 - store float %"51", ptr addrspace(5) %"43", align 4 - %"54" = load float, ptr addrspace(5) %"44", align 4 - %3 = call float @llvm.roundeven.f32(float %"54") - %"53" = freeze float %3 - store float %"53", ptr addrspace(5) %"44", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load float, ptr addrspace(5) %"43", align 4 - %"61" = inttoptr i64 %"55" to ptr - store float %"56", ptr %"61", align 4 - %"57" = load i64, ptr addrspace(5) %"42", align 4 - %"62" = inttoptr i64 %"57" to ptr - %"32" = getelementptr inbounds i8, ptr %"62", i64 4 - %"58" = load float, ptr addrspace(5) %"44", align 4 - store float %"58", ptr %"32", align 4 + br label %"34" + +"34": ; preds = %1 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"55" = inttoptr i64 %"44" to ptr + %"43" = load float, ptr %"55", align 4 + store float %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"56" = inttoptr i64 %"45" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"46" = load float, ptr %"31", align 4 + store float %"46", ptr addrspace(5) %"40", align 4 + %"48" = load float, ptr addrspace(5) %"39", align 4 + %2 = call float @llvm.roundeven.f32(float %"48") + %"47" = freeze float %2 + store float %"47", ptr addrspace(5) %"39", align 4 + %"50" = load float, ptr addrspace(5) %"40", align 4 + %3 = call float @llvm.roundeven.f32(float %"50") + %"49" = freeze float %3 + store float %"49", ptr addrspace(5) %"40", align 4 + %"51" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = load float, ptr addrspace(5) %"39", align 4 + %"57" = inttoptr i64 %"51" to ptr + store float %"52", ptr %"57", align 4 + %"53" = load i64, ptr addrspace(5) %"38", align 4 + %"58" = 
inttoptr i64 %"53" to ptr + %"33" = getelementptr inbounds i8, ptr %"58", i64 4 + %"54" = load float, ptr addrspace(5) %"40", align 4 + store float %"54", ptr %"33", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.roundeven.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_rzi.ll b/ptx/src/test/ll/cvt_rzi.ll index 05a2d49..70019e1 100644 --- a/ptx/src/test/ll/cvt_rzi.ll +++ b/ptx/src/test/ll/cvt_rzi.ll @@ -1,58 +1,54 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca float, align 4, addrspace(5) - %"44" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca float, align 4, addrspace(5) + %"40" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr 
addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"48" to ptr - %"47" = load float, ptr %"59", align 4 - store float %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"60" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"60", i64 4 - %"50" = load float, ptr %"30", align 4 - store float %"50", ptr addrspace(5) %"44", align 4 - %"52" = load float, ptr addrspace(5) %"43", align 4 - %2 = call float @llvm.trunc.f32(float %"52") - %"51" = freeze float %2 - store float %"51", ptr addrspace(5) %"43", align 4 - %"54" = load float, ptr addrspace(5) %"44", align 4 - %3 = call float @llvm.trunc.f32(float %"54") - %"53" = freeze float %3 - store float %"53", ptr addrspace(5) %"44", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load float, ptr addrspace(5) %"43", align 4 - %"61" = inttoptr i64 %"55" to ptr - store float %"56", ptr %"61", align 4 - %"57" = load i64, ptr addrspace(5) %"42", align 4 - %"62" = inttoptr i64 %"57" to ptr - %"32" = getelementptr inbounds i8, ptr %"62", i64 4 - %"58" = load float, ptr addrspace(5) %"44", align 4 - store float %"58", ptr %"32", align 4 + br label %"34" + +"34": ; preds = %1 + call void @llvm.amdgcn.s.setreg(i32 6145, i32 3) + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"55" = inttoptr i64 %"44" to ptr + %"43" = load float, ptr %"55", align 4 + store float %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"56" = inttoptr i64 %"45" to ptr + %"31" = getelementptr inbounds i8, ptr %"56", i64 4 + %"46" = load float, ptr %"31", align 4 + store 
float %"46", ptr addrspace(5) %"40", align 4 + %"48" = load float, ptr addrspace(5) %"39", align 4 + %2 = call float @llvm.trunc.f32(float %"48") + %"47" = freeze float %2 + store float %"47", ptr addrspace(5) %"39", align 4 + %"50" = load float, ptr addrspace(5) %"40", align 4 + %3 = call float @llvm.trunc.f32(float %"50") + %"49" = freeze float %3 + store float %"49", ptr addrspace(5) %"40", align 4 + %"51" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = load float, ptr addrspace(5) %"39", align 4 + %"57" = inttoptr i64 %"51" to ptr + store float %"52", ptr %"57", align 4 + %"53" = load i64, ptr addrspace(5) %"38", align 4 + %"58" = inttoptr i64 %"53" to ptr + %"33" = getelementptr inbounds i8, ptr %"58", i64 4 + %"54" = load float, ptr addrspace(5) %"40", align 4 + store float %"54", ptr %"33", align 4 ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.trunc.f32(float) #1 +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.trunc.f32(float) #2 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind willreturn } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_s16_s8.ll b/ptx/src/test/ll/cvt_s16_s8.ll index b36fc88..c199eee 100644 --- a/ptx/src/test/ll/cvt_s16_s8.ll +++ b/ptx/src/test/ll/cvt_s16_s8.ll @@ -1,41 +1,32 @@ -declare i32 
@__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i32, align 4, addrspace(5) + %"36" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr addrspace(1) - %"43" = load i32, ptr addrspace(1) %"49", align 4 - store i32 %"43", ptr addrspace(5) %"40", align 4 - %"46" = load i32, ptr addrspace(5) %"40", align 4 - %2 = trunc i32 %"46" to i8 - %"50" = sext i8 %2 to i16 - %"45" = sext i16 %"50" to i32 - store i32 %"45", ptr addrspace(5) %"39", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i32, ptr addrspace(5) %"39", align 4 - %"52" = inttoptr i64 %"47" to ptr - store i32 %"48", ptr %"52", align 4 + br label %"30" + +"30": ; preds = %1 + %"37" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"37", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"45" = inttoptr 
i64 %"40" to ptr addrspace(1) + %"39" = load i32, ptr addrspace(1) %"45", align 4 + store i32 %"39", ptr addrspace(5) %"36", align 4 + %"42" = load i32, ptr addrspace(5) %"36", align 4 + %2 = trunc i32 %"42" to i8 + %"46" = sext i8 %2 to i16 + %"41" = sext i16 %"46" to i32 + store i32 %"41", ptr addrspace(5) %"35", align 4 + %"43" = load i64, ptr addrspace(5) %"34", align 4 + %"44" = load i32, ptr addrspace(5) %"35", align 4 + %"48" = inttoptr i64 %"43" to ptr + store i32 %"44", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_s32_f32.ll b/ptx/src/test/ll/cvt_s32_f32.ll index 5a8e804..196f067 100644 --- a/ptx/src/test/ll/cvt_s32_f32.ll +++ b/ptx/src/test/ll/cvt_s32_f32.ll @@ -1,64 +1,55 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) 
%"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"60" = inttoptr i64 %"48" to ptr - %"59" = load float, ptr %"60", align 4 - %"47" = bitcast float %"59" to i32 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"61" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"61", i64 4 - %"62" = load float, ptr %"30", align 4 - %"50" = bitcast float %"62" to i32 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"52" = load i32, ptr addrspace(5) %"43", align 4 - %"64" = bitcast i32 %"52" to float - %2 = call float @llvm.ceil.f32(float %"64") + br label %"34" + +"34": ; preds = %1 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"56" = inttoptr i64 %"44" to ptr + %"55" = load float, ptr %"56", align 4 + %"43" = bitcast float %"55" to i32 + store i32 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"57" = inttoptr i64 %"45" to ptr + %"31" = getelementptr inbounds i8, ptr %"57", i64 4 + %"58" = load float, ptr %"31", align 4 + %"46" = bitcast float %"58" to i32 + store i32 %"46", ptr addrspace(5) %"40", align 4 + %"48" = load i32, ptr addrspace(5) %"39", align 4 + %"60" = bitcast i32 %"48" to float + %2 = call float @llvm.ceil.f32(float %"60") %3 = fptosi float %2 to i32 - %"63" = freeze i32 %3 - store i32 %"63", ptr addrspace(5) %"43", align 4 - %"54" = load i32, ptr addrspace(5) %"44", align 4 - %"66" = bitcast i32 %"54" to float - %4 = call float @llvm.ceil.f32(float %"66") + %"59" = freeze i32 %3 + store i32 %"59", ptr addrspace(5) %"39", align 4 + %"50" = load i32, ptr 
addrspace(5) %"40", align 4 + %"62" = bitcast i32 %"50" to float + %4 = call float @llvm.ceil.f32(float %"62") %5 = fptosi float %4 to i32 - %"65" = freeze i32 %5 - store i32 %"65", ptr addrspace(5) %"44", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load i32, ptr addrspace(5) %"43", align 4 - %"67" = inttoptr i64 %"55" to ptr addrspace(1) - store i32 %"56", ptr addrspace(1) %"67", align 4 - %"57" = load i64, ptr addrspace(5) %"42", align 4 - %"69" = inttoptr i64 %"57" to ptr addrspace(1) - %"32" = getelementptr inbounds i8, ptr addrspace(1) %"69", i64 4 - %"58" = load i32, ptr addrspace(5) %"44", align 4 - store i32 %"58", ptr addrspace(1) %"32", align 4 + %"61" = freeze i32 %5 + store i32 %"61", ptr addrspace(5) %"40", align 4 + %"51" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = load i32, ptr addrspace(5) %"39", align 4 + %"63" = inttoptr i64 %"51" to ptr addrspace(1) + store i32 %"52", ptr addrspace(1) %"63", align 4 + %"53" = load i64, ptr addrspace(5) %"38", align 4 + %"65" = inttoptr i64 %"53" to ptr addrspace(1) + %"33" = getelementptr inbounds i8, ptr addrspace(1) %"65", i64 4 + %"54" = load i32, ptr addrspace(5) %"40", align 4 + store i32 %"54", ptr addrspace(1) %"33", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.ceil.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_s64_s32.ll b/ptx/src/test/ll/cvt_s64_s32.ll index 5aa91b1..d1c6c83 100644 
--- a/ptx/src/test/ll/cvt_s64_s32.ll +++ b/ptx/src/test/ll/cvt_s64_s32.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i32, align 4, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"50" = inttoptr i64 %"44" to ptr - %"49" = load i32, ptr %"50", align 4 - store i32 %"49", ptr addrspace(5) %"39", align 4 - %"46" = load i32, ptr addrspace(5) %"39", align 4 - %"45" = sext i32 %"46" to i64 - store i64 %"45", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"51" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"51", align 4 + br label %"30" + +"30": ; preds = %1 + %"37" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"37", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 
4 + %"46" = inttoptr i64 %"40" to ptr + %"45" = load i32, ptr %"46", align 4 + store i32 %"45", ptr addrspace(5) %"35", align 4 + %"42" = load i32, ptr addrspace(5) %"35", align 4 + %"41" = sext i32 %"42" to i64 + store i64 %"41", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"34", align 4 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"47" = inttoptr i64 %"43" to ptr + store i64 %"44", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_sat_s_u.ll b/ptx/src/test/ll/cvt_sat_s_u.ll index 63954f8..68ff04b 100644 --- a/ptx/src/test/ll/cvt_sat_s_u.ll +++ b/ptx/src/test/ll/cvt_sat_s_u.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i32, align 4, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load 
i64, ptr addrspace(4) %"36", align 4 - store i64 %"43", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"53" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"53", align 4 - store i32 %"45", ptr addrspace(5) %"40", align 4 - %"48" = load i32, ptr addrspace(5) %"40", align 4 - %2 = call i32 @llvm.smax.i32(i32 %"48", i32 0) + br label %"31" + +"31": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"39", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"49" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"49", align 4 + store i32 %"41", ptr addrspace(5) %"36", align 4 + %"44" = load i32, ptr addrspace(5) %"36", align 4 + %2 = call i32 @llvm.smax.i32(i32 %"44", i32 0) %3 = call i32 @llvm.umin.i32(i32 %2, i32 -1) - store i32 %3, ptr addrspace(5) %"41", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - store i32 %"50", ptr addrspace(5) %"42", align 4 - %"51" = load i64, ptr addrspace(5) %"39", align 4 - %"52" = load i32, ptr addrspace(5) %"42", align 4 - %"54" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"54", align 4 + store i32 %3, ptr addrspace(5) %"37", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + store i32 %"46", ptr addrspace(5) %"38", align 4 + %"47" = load i64, ptr addrspace(5) %"35", align 4 + %"48" = load i32, ptr addrspace(5) %"38", align 4 + %"50" = inttoptr i64 %"47" to ptr + store i32 %"48", ptr %"50", align 4 ret void } @@ -46,5 +37,5 @@ declare i32 @llvm.smax.i32(i32, i32) #1 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.umin.i32(i32, i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvta.ll b/ptx/src/test/ll/cvta.ll index 495b312..7bbbbfb 100644 --- a/ptx/src/test/ll/cvta.ll +++ b/ptx/src/test/ll/cvta.ll @@ -1,43 +1,34 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %2 = inttoptr i64 %"42" to ptr - %"49" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"49", ptr addrspace(5) %"36", align 8 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %3 = inttoptr i64 %"44" to ptr - %"51" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"51", 
ptr addrspace(5) %"37", align 8 - %"46" = load i64, ptr addrspace(5) %"36", align 4 - %"53" = inttoptr i64 %"46" to ptr addrspace(1) - %"45" = load float, ptr addrspace(1) %"53", align 4 - store float %"45", ptr addrspace(5) %"38", align 4 - %"47" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = load float, ptr addrspace(5) %"38", align 4 - %"54" = inttoptr i64 %"47" to ptr addrspace(1) - store float %"48", ptr addrspace(1) %"54", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %2 = inttoptr i64 %"38" to ptr + %"45" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"45", ptr addrspace(5) %"32", align 8 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %3 = inttoptr i64 %"40" to ptr + %"47" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"47", ptr addrspace(5) %"33", align 8 + %"42" = load i64, ptr addrspace(5) %"32", align 4 + %"49" = inttoptr i64 %"42" to ptr addrspace(1) + %"41" = load float, ptr addrspace(1) %"49", align 4 + store float %"41", ptr addrspace(5) %"34", align 4 + %"43" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = load float, ptr addrspace(5) %"34", align 4 + %"50" = inttoptr i64 %"43" to ptr addrspace(1) + store float %"44", ptr addrspace(1) %"50", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/div_approx.ll b/ptx/src/test/ll/div_approx.ll index cb8cb28..8498e78 100644 --- a/ptx/src/test/ll/div_approx.ll +++ 
b/ptx/src/test/ll/div_approx.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca float, align 4, addrspace(5) - %"42" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca float, align 4, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load float, ptr %"54", align 4 - store float %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load float, ptr %"30", align 4 - store float %"48", ptr addrspace(5) %"42", align 4 - %"50" = load float, ptr addrspace(5) %"41", align 4 - %"51" = load float, ptr addrspace(5) %"42", align 4 - %"49" = fdiv arcp afn float %"50", %"51" - store float %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load float, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store float %"53", ptr %"56", align 4 + br label %"32" 
+ +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"50", align 4 + store float %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load float, ptr %"31", align 4 + store float %"44", ptr addrspace(5) %"38", align 4 + %"46" = load float, ptr addrspace(5) %"37", align 4 + %"47" = load float, ptr addrspace(5) %"38", align 4 + %"45" = fdiv arcp afn float %"46", %"47" + store float %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load float, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store float %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ex2.ll b/ptx/src/test/ll/ex2.ll index 904f238..f883ad4 100644 --- a/ptx/src/test/ll/ex2.ll +++ b/ptx/src/test/ll/ex2.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = 
alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.exp2.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call float @llvm.amdgcn.exp2.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.amdgcn.exp2.f32(float) #1 
-attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/extern_shared.ll b/ptx/src/test/ll/extern_shared.ll index 9b872ec..19f7a7e 100644 --- a/ptx/src/test/ll/extern_shared.ll +++ b/ptx/src/test/ll/extern_shared.ll @@ -1,41 +1,32 @@ @shared_mem = external addrspace(3) global [0 x i32] -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr addrspace(1) - %"42" = load i64, ptr addrspace(1) %"48", align 4 - store i64 %"42", ptr addrspace(5) %"39", 
align 4 - %"44" = load i64, ptr addrspace(5) %"39", align 4 - store i64 %"44", ptr addrspace(3) @shared_mem, align 4 - %"45" = load i64, ptr addrspace(3) @shared_mem, align 4 - store i64 %"45", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"51" = inttoptr i64 %"46" to ptr addrspace(1) - store i64 %"47", ptr addrspace(1) %"51", align 4 + br label %"30" + +"30": ; preds = %1 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = inttoptr i64 %"39" to ptr addrspace(1) + %"38" = load i64, ptr addrspace(1) %"44", align 4 + store i64 %"38", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(5) %"35", align 4 + store i64 %"40", ptr addrspace(3) @shared_mem, align 4 + %"41" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"41", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"47" = inttoptr i64 %"42" to ptr addrspace(1) + store i64 %"43", ptr addrspace(1) %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/extern_shared_call.ll b/ptx/src/test/ll/extern_shared_call.ll index 923523b..54e00ee 100644 --- a/ptx/src/test/ll/extern_shared_call.ll +++ b/ptx/src/test/ll/extern_shared_call.ll @@ -1,57 +1,55 @@ @shared_mem = external addrspace(3) global [0 x i32], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define void @__zluda_ptx_impl_incr_shared_2_global() #0 { - %"38" = alloca i64, align 8, addrspace(5) +define void @incr_shared_2_global() #0 { + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(3) @shared_mem, align 4 - store i64 %"39", ptr addrspace(5) %"38", align 4 - %"41" = load i64, ptr addrspace(5) %"38", align 4 - %"40" = add i64 %"41", 2 - store i64 %"40", ptr addrspace(5) %"38", align 4 - %"42" = load i64, ptr addrspace(5) %"38", align 4 - store i64 %"42", ptr addrspace(3) @shared_mem, align 4 + br label %"33" + +"33": ; preds = %1 + %"37" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"37", ptr addrspace(5) %"36", align 4 + %"39" = load i64, ptr addrspace(5) %"36", align 4 + %"38" = add i64 %"39", 2 + store i64 %"38", ptr addrspace(5) %"36", align 4 + %"40" = load i64, ptr addrspace(5) %"36", align 4 + store i64 %"40", ptr addrspace(3) @shared_mem, align 4 ret void } -define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { +define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"48" = load i64, ptr addrspace(4) %"43", align 4 + br label %"34" + +"34": ; preds = %1 + %"46" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"46", ptr addrspace(5) %"43", align 4 + %"47" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"47", ptr addrspace(5) %"44", align 4 + 
%"49" = load i64, ptr addrspace(5) %"43", align 4 + %"56" = inttoptr i64 %"49" to ptr addrspace(1) + %"48" = load i64, ptr addrspace(1) %"56", align 4 store i64 %"48", ptr addrspace(5) %"45", align 4 - %"49" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"49", ptr addrspace(5) %"46", align 4 - %"51" = load i64, ptr addrspace(5) %"45", align 4 - %"58" = inttoptr i64 %"51" to ptr addrspace(1) - %"50" = load i64, ptr addrspace(1) %"58", align 4 - store i64 %"50", ptr addrspace(5) %"47", align 4 - %"52" = load i64, ptr addrspace(5) %"47", align 4 - store i64 %"52", ptr addrspace(3) @shared_mem, align 4 - call void @__zluda_ptx_impl_incr_shared_2_global() - %"53" = load i64, ptr addrspace(3) @shared_mem, align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(5) %"46", align 4 - %"55" = load i64, ptr addrspace(5) %"47", align 4 - %"61" = inttoptr i64 %"54" to ptr addrspace(1) - store i64 %"55", ptr addrspace(1) %"61", align 4 + %"50" = load i64, ptr addrspace(5) %"45", align 4 + store i64 %"50", ptr addrspace(3) @shared_mem, align 4 + call void @incr_shared_2_global() + br label %"35" + +"35": ; preds = %"34" + %"51" = load i64, ptr addrspace(3) @shared_mem, align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"52" = load i64, ptr addrspace(5) %"44", align 4 + %"53" = load i64, ptr addrspace(5) %"45", align 4 + %"59" = inttoptr i64 %"52" to ptr addrspace(1) + store i64 %"53", ptr addrspace(1) %"59", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file 
diff --git a/ptx/src/test/ll/fma.ll b/ptx/src/test/ll/fma.ll index 4a454ef..184f902 100644 --- a/ptx/src/test/ll/fma.ll +++ b/ptx/src/test/ll/fma.ll @@ -1,56 +1,47 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca float, align 4, addrspace(5) - %"45" = alloca float, align 4, addrspace(5) - %"46" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca float, align 4, addrspace(5) + %"41" = alloca float, align 4, addrspace(5) + %"42" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"47" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"61" = inttoptr i64 %"50" to ptr - %"49" = load float, ptr %"61", align 4 - store float %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"42", align 4 - %"62" = inttoptr i64 %"51" to ptr - %"31" = getelementptr inbounds i8, ptr %"62", i64 4 - %"52" = load float, ptr %"31", align 4 - store float %"52", ptr addrspace(5) %"45", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"63" = inttoptr i64 %"53" to ptr - %"33" = getelementptr inbounds i8, ptr %"63", i64 8 - %"54" = load float, ptr %"33", align 4 - store float %"54", ptr 
addrspace(5) %"46", align 4 - %"56" = load float, ptr addrspace(5) %"44", align 4 - %"57" = load float, ptr addrspace(5) %"45", align 4 - %"58" = load float, ptr addrspace(5) %"46", align 4 - %"55" = call float @llvm.fma.f32(float %"56", float %"57", float %"58") - store float %"55", ptr addrspace(5) %"44", align 4 - %"59" = load i64, ptr addrspace(5) %"43", align 4 - %"60" = load float, ptr addrspace(5) %"44", align 4 - %"64" = inttoptr i64 %"59" to ptr - store float %"60", ptr %"64", align 4 + br label %"35" + +"35": ; preds = %1 + %"43" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"57" = inttoptr i64 %"46" to ptr + %"45" = load float, ptr %"57", align 4 + store float %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"58" = inttoptr i64 %"47" to ptr + %"32" = getelementptr inbounds i8, ptr %"58", i64 4 + %"48" = load float, ptr %"32", align 4 + store float %"48", ptr addrspace(5) %"41", align 4 + %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"59" = inttoptr i64 %"49" to ptr + %"34" = getelementptr inbounds i8, ptr %"59", i64 8 + %"50" = load float, ptr %"34", align 4 + store float %"50", ptr addrspace(5) %"42", align 4 + %"52" = load float, ptr addrspace(5) %"40", align 4 + %"53" = load float, ptr addrspace(5) %"41", align 4 + %"54" = load float, ptr addrspace(5) %"42", align 4 + %"51" = call float @llvm.fma.f32(float %"52", float %"53", float %"54") + store float %"51", ptr addrspace(5) %"40", align 4 + %"55" = load i64, ptr addrspace(5) %"39", align 4 + %"56" = load float, ptr addrspace(5) %"40", align 4 + %"60" = inttoptr i64 %"55" to ptr + store float %"56", ptr %"60", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float 
@llvm.fma.f32(float, float, float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/global_array.ll b/ptx/src/test/ll/global_array.ll index fede5f7..d36c3d1 100644 --- a/ptx/src/test/ll/global_array.ll +++ b/ptx/src/test/ll/global_array.ll @@ -1,36 +1,27 @@ @foobar = addrspace(1) global [4 x i32] [i32 1, i32 0, i32 0, i32 0] -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"47" = inttoptr i64 %"43" to ptr addrspace(1) - %"42" = load i32, ptr addrspace(1) %"47", align 4 - store 
i32 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"45" = load i32, ptr addrspace(5) %"39", align 4 - %"48" = inttoptr i64 %"44" to ptr addrspace(1) - store i32 %"45", ptr addrspace(1) %"48", align 4 + br label %"30" + +"30": ; preds = %1 + store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"43" = inttoptr i64 %"39" to ptr addrspace(1) + %"38" = load i32, ptr addrspace(1) %"43", align 4 + store i32 %"38", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(5) %"34", align 4 + %"41" = load i32, ptr addrspace(5) %"35", align 4 + %"44" = inttoptr i64 %"40" to ptr addrspace(1) + store i32 %"41", ptr addrspace(1) %"44", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ld_st.ll b/ptx/src/test/ll/ld_st.ll index 7c37090..016d5cf 100644 --- a/ptx/src/test/ll/ld_st.ll +++ b/ptx/src/test/ll/ld_st.ll @@ -1,35 +1,26 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ld_st(ptr addrspace(4) 
byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"45" = inttoptr i64 %"42" to ptr - %"41" = load i64, ptr %"45", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"46" = inttoptr i64 %"43" to ptr - store i64 %"44", ptr %"46", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"41" = inttoptr i64 %"38" to ptr + %"37" = load i64, ptr %"41", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"40" = load i64, ptr addrspace(5) %"34", align 4 + %"42" = inttoptr i64 %"39" to ptr + store i64 %"40", ptr %"42", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ld_st_implicit.ll b/ptx/src/test/ll/ld_st_implicit.ll index cb4e08a..e9095e9 100644 --- a/ptx/src/test/ll/ld_st_implicit.ll +++ b/ptx/src/test/ll/ld_st_implicit.ll @@ -1,40 +1,31 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - store i64 81985529216486895, ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"44" to ptr addrspace(1) - %"47" = load float, ptr addrspace(1) %"48", align 4 - %2 = bitcast float %"47" to i32 - %"43" = zext i32 %2 to i64 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"45" to ptr addrspace(1) - %3 = trunc i64 %"46" to i32 - %"50" = bitcast i32 %3 to float - store float %"50", ptr addrspace(1) %"49", align 4 + br label %"30" + +"30": ; preds = %1 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + store i64 81985529216486895, ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = inttoptr i64 %"40" to ptr addrspace(1) + %"43" = 
load float, ptr addrspace(1) %"44", align 4 + %2 = bitcast float %"43" to i32 + %"39" = zext i32 %2 to i64 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = inttoptr i64 %"41" to ptr addrspace(1) + %3 = trunc i64 %"42" to i32 + %"46" = bitcast i32 %3 to float + store float %"46", ptr addrspace(1) %"45", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ld_st_offset.ll b/ptx/src/test/ll/ld_st_offset.ll index 81e0c62..25e68d6 100644 --- a/ptx/src/test/ll/ld_st_offset.ll +++ b/ptx/src/test/ll/ld_st_offset.ll @@ -1,46 +1,37 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - 
%"46" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"46", ptr addrspace(5) %"42", align 4 - %"48" = load i64, ptr addrspace(5) %"41", align 4 - %"55" = inttoptr i64 %"48" to ptr - %"47" = load i32, ptr %"55", align 4 - store i32 %"47", ptr addrspace(5) %"43", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"49" to ptr - %"30" = getelementptr inbounds i8, ptr %"56", i64 4 - %"50" = load i32, ptr %"30", align 4 - store i32 %"50", ptr addrspace(5) %"44", align 4 - %"51" = load i64, ptr addrspace(5) %"42", align 4 - %"52" = load i32, ptr addrspace(5) %"44", align 4 - %"57" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"57", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"58" = inttoptr i64 %"53" to ptr - %"32" = getelementptr inbounds i8, ptr %"58", i64 4 - %"54" = load i32, ptr addrspace(5) %"43", align 4 - store i32 %"54", ptr %"32", align 4 + br label %"34" + +"34": ; preds = %1 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"51" = inttoptr i64 %"44" to ptr + %"43" = load i32, ptr %"51", align 4 + store i32 %"43", ptr addrspace(5) %"39", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"45" to ptr + %"31" = getelementptr inbounds i8, ptr %"52", i64 4 + %"46" = load i32, ptr %"31", align 4 + store i32 %"46", ptr addrspace(5) %"40", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + %"53" = inttoptr i64 %"47" to ptr + store i32 %"48", ptr %"53", align 4 + %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"54" = inttoptr i64 %"49" to ptr + %"33" = getelementptr inbounds i8, ptr %"54", i64 4 + %"50" = load i32, ptr addrspace(5) %"39", align 4 + store i32 %"50", ptr %"33", align 4 ret void } 
-attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/lg2.ll b/ptx/src/test/ll/lg2.ll index 543ae0a..ed7de7a 100644 --- a/ptx/src/test/ll/lg2.ll +++ b/ptx/src/test/ll/lg2.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.log.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" 
to ptr - store float %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call float @llvm.amdgcn.log.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.amdgcn.log.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/local_align.ll b/ptx/src/test/ll/local_align.ll index 08c7971..70dac59 100644 --- a/ptx/src/test/ll/local_align.ll +++ b/ptx/src/test/ll/local_align.ll @@ -1,36 +1,27 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void 
@local_align(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"9" = alloca [8 x i8], align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"10" = alloca [8 x i8], align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = inttoptr i64 %"43" to ptr - %"42" = load i64, ptr %"46", align 4 - store i64 %"42", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"39", align 4 - %"47" = inttoptr i64 %"44" to ptr - store i64 %"45", ptr %"47", align 4 + br label %"30" + +"30": ; preds = %1 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = inttoptr i64 %"39" to ptr + %"38" = load i64, ptr %"42", align 4 + store i64 %"38", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"35", align 4 + %"43" = inttoptr i64 %"40" to ptr + store i64 %"41", ptr %"43", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" 
"denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mad_s32.ll b/ptx/src/test/ll/mad_s32.ll index f6ea9a8..37db9d3 100644 --- a/ptx/src/test/ll/mad_s32.ll +++ b/ptx/src/test/ll/mad_s32.ll @@ -1,64 +1,55 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { - %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca i32, align 4, addrspace(5) - %"50" = alloca i32, align 4, addrspace(5) - %"51" = alloca i32, align 4, addrspace(5) - %"52" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) + %"46" = alloca i32, align 4, addrspace(5) + %"47" = alloca i32, align 4, addrspace(5) + %"48" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"71" = inttoptr i64 %"56" to ptr - %"55" = load i32, ptr %"71", align 4 - store i32 %"55", ptr addrspace(5) %"50", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 - %"72" = inttoptr i64 %"57" to ptr - %"32" = getelementptr inbounds i8, ptr %"72", i64 4 - %"58" = load i32, ptr %"32", align 4 - store i32 %"58", ptr addrspace(5) 
%"51", align 4 - %"59" = load i64, ptr addrspace(5) %"47", align 4 - %"73" = inttoptr i64 %"59" to ptr - %"34" = getelementptr inbounds i8, ptr %"73", i64 8 - %"60" = load i32, ptr %"34", align 4 - store i32 %"60", ptr addrspace(5) %"52", align 4 - %"62" = load i32, ptr addrspace(5) %"50", align 4 - %"63" = load i32, ptr addrspace(5) %"51", align 4 - %"64" = load i32, ptr addrspace(5) %"52", align 4 - %2 = mul i32 %"62", %"63" - %"61" = add i32 %2, %"64" - store i32 %"61", ptr addrspace(5) %"49", align 4 - %"65" = load i64, ptr addrspace(5) %"48", align 4 - %"66" = load i32, ptr addrspace(5) %"49", align 4 - %"74" = inttoptr i64 %"65" to ptr - store i32 %"66", ptr %"74", align 4 - %"67" = load i64, ptr addrspace(5) %"48", align 4 - %"75" = inttoptr i64 %"67" to ptr - %"36" = getelementptr inbounds i8, ptr %"75", i64 4 - %"68" = load i32, ptr addrspace(5) %"49", align 4 - store i32 %"68", ptr %"36", align 4 - %"69" = load i64, ptr addrspace(5) %"48", align 4 - %"76" = inttoptr i64 %"69" to ptr - %"38" = getelementptr inbounds i8, ptr %"76", i64 8 - %"70" = load i32, ptr addrspace(5) %"49", align 4 - store i32 %"70", ptr %"38", align 4 + br label %"40" + +"40": ; preds = %1 + %"49" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"49", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"50", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"67" = inttoptr i64 %"52" to ptr + %"51" = load i32, ptr %"67", align 4 + store i32 %"51", ptr addrspace(5) %"46", align 4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"68" = inttoptr i64 %"53" to ptr + %"33" = getelementptr inbounds i8, ptr %"68", i64 4 + %"54" = load i32, ptr %"33", align 4 + store i32 %"54", ptr addrspace(5) %"47", align 4 + %"55" = load i64, ptr addrspace(5) %"43", align 4 + %"69" = inttoptr i64 %"55" to ptr + %"35" = getelementptr inbounds i8, ptr %"69", i64 8 + %"56" = load i32, ptr %"35", align 4 + store i32 
%"56", ptr addrspace(5) %"48", align 4 + %"58" = load i32, ptr addrspace(5) %"46", align 4 + %"59" = load i32, ptr addrspace(5) %"47", align 4 + %"60" = load i32, ptr addrspace(5) %"48", align 4 + %2 = mul i32 %"58", %"59" + %"57" = add i32 %2, %"60" + store i32 %"57", ptr addrspace(5) %"45", align 4 + %"61" = load i64, ptr addrspace(5) %"44", align 4 + %"62" = load i32, ptr addrspace(5) %"45", align 4 + %"70" = inttoptr i64 %"61" to ptr + store i32 %"62", ptr %"70", align 4 + %"63" = load i64, ptr addrspace(5) %"44", align 4 + %"71" = inttoptr i64 %"63" to ptr + %"37" = getelementptr inbounds i8, ptr %"71", i64 4 + %"64" = load i32, ptr addrspace(5) %"45", align 4 + store i32 %"64", ptr %"37", align 4 + %"65" = load i64, ptr addrspace(5) %"44", align 4 + %"72" = inttoptr i64 %"65" to ptr + %"39" = getelementptr inbounds i8, ptr %"72", i64 8 + %"66" = load i32, ptr addrspace(5) %"45", align 4 + store i32 %"66", ptr %"39", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/malformed_label.ll b/ptx/src/test/ll/malformed_label.ll new file mode 100644 index 0000000..c8bac71 --- /dev/null +++ b/ptx/src/test/ll/malformed_label.ll @@ -0,0 +1,33 @@ +define amdgpu_kernel void @malformed_label(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"32" + +"32": ; preds = %1 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"41" = load i64, ptr addrspace(4) %"35", 
align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + br label %"10" + +"10": ; preds = %"32" + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"48" = inttoptr i64 %"43" to ptr + %"42" = load i64, ptr %"48", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"38", align 4 + %"44" = add i64 %"45", 1 + store i64 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 4 + %"49" = inttoptr i64 %"46" to ptr + store i64 %"47", ptr %"49", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/max.ll b/ptx/src/test/ll/max.ll index e8f58ba..ae1256b 100644 --- a/ptx/src/test/ll/max.ll +++ b/ptx/src/test/ll/max.ll @@ -1,49 +1,40 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = 
load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = call i32 @llvm.smax.i32(i32 %"50", i32 %"51") - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"45" = call i32 @llvm.smax.i32(i32 %"46", i32 %"47") + store i32 %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"52", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind 
speculatable willreturn memory(none) declare i32 @llvm.smax.i32(i32, i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/membar.ll b/ptx/src/test/ll/membar.ll index 2e78f12..7455f34 100644 --- a/ptx/src/test/ll/membar.ll +++ b/ptx/src/test/ll/membar.ll @@ -1,36 +1,27 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"46" = inttoptr i64 %"42" to ptr - %"45" = load i32, ptr %"46", align 4 - store i32 %"45", ptr addrspace(5) %"38", align 4 + br label %"29" + +"29": ; 
preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"42" = inttoptr i64 %"38" to ptr + %"41" = load i32, ptr %"42", align 4 + store i32 %"41", ptr addrspace(5) %"34", align 4 fence seq_cst - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"47" = inttoptr i64 %"43" to ptr - store i32 %"44", ptr %"47", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"43" = inttoptr i64 %"39" to ptr + store i32 %"40", ptr %"43", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/min.ll b/ptx/src/test/ll/min.ll index e868195..bf38592 100644 --- a/ptx/src/test/ll/min.ll +++ b/ptx/src/test/ll/min.ll @@ -1,49 +1,40 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, 
addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = call i32 @llvm.smin.i32(i32 %"50", i32 %"51") - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"45" = call i32 @llvm.smin.i32(i32 %"46", i32 %"47") + 
store i32 %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"52", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.smin.i32(i32, i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/mov.ll b/ptx/src/test/ll/mov.ll index cf6c7ee..5ecb1fb 100644 --- a/ptx/src/test/ll/mov.ll +++ b/ptx/src/test/ll/mov.ll @@ -1,38 +1,29 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 
- store i64 %"41", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load i64, ptr %"49", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"50" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"50", align 4 + br label %"30" + +"30": ; preds = %1 + %"37" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"37", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"45" = inttoptr i64 %"40" to ptr + %"39" = load i64, ptr %"45", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + store i64 %"42", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"34", align 4 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"46" = inttoptr i64 %"43" to ptr + store i64 %"44", ptr %"46", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mov_address.ll b/ptx/src/test/ll/mov_address.ll index 644df01..ea6ce80 100644 --- a/ptx/src/test/ll/mov_address.ll +++ b/ptx/src/test/ll/mov_address.ll @@ -1,24 +1,15 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { - %"9" = alloca [8 x i8], align 1, addrspace(5) - %"35" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { + %"10" = alloca [8 x i8], align 1, addrspace(5) + %"31" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"37" = ptrtoint ptr addrspace(5) %"9" to i64 - store i64 %"37", ptr addrspace(5) %"35", align 4 + br label %"28" + +"28": ; preds = %1 + %"33" = ptrtoint ptr addrspace(5) %"10" to i64 + store i64 %"33", ptr addrspace(5) %"31", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul24.ll b/ptx/src/test/ll/mul24.ll index aae8aa0..f65aa94 100644 --- a/ptx/src/test/ll/mul24.ll +++ b/ptx/src/test/ll/mul24.ll @@ -1,43 +1,34 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) +define 
amdgpu_kernel void @mul24(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i32, align 4, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i32, ptr %"50", align 4 - store i32 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i32, ptr addrspace(5) %"40", align 4 - %"46" = call i32 @llvm.amdgcn.mul.u24(i32 %"47", i32 2) - store i32 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i32 %"49", ptr %"51", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i32, ptr %"46", align 4 + store i32 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i32, ptr addrspace(5) %"36", align 4 + %"42" = call i32 @llvm.amdgcn.mul.u24(i32 %"43", i32 2) + store i32 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i32 %"45", ptr %"47", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.amdgcn.mul.u24(i32, i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_ftz.ll b/ptx/src/test/ll/mul_ftz.ll index ceacd5d..60bfc6f 100644 --- a/ptx/src/test/ll/mul_ftz.ll +++ b/ptx/src/test/ll/mul_ftz.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca float, align 4, addrspace(5) - %"42" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca float, align 4, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load float, ptr %"54", align 4 - store float %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - 
%"48" = load float, ptr %"30", align 4 - store float %"48", ptr addrspace(5) %"42", align 4 - %"50" = load float, ptr addrspace(5) %"41", align 4 - %"51" = load float, ptr addrspace(5) %"42", align 4 - %"49" = fmul float %"50", %"51" - store float %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load float, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store float %"53", ptr %"56", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"50", align 4 + store float %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load float, ptr %"31", align 4 + store float %"44", ptr addrspace(5) %"38", align 4 + %"46" = load float, ptr addrspace(5) %"37", align 4 + %"47" = load float, ptr addrspace(5) %"38", align 4 + %"45" = fmul float %"46", %"47" + store float %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load float, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store float %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_hi.ll b/ptx/src/test/ll/mul_hi.ll index 57ee469..155d766 100644 --- a/ptx/src/test/ll/mul_hi.ll +++ 
b/ptx/src/test/ll/mul_hi.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %2 = zext i64 %"47" to i128 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %2 = zext i64 %"43" to i128 %3 = mul i128 %2, 2 %4 = lshr i128 %3, 64 - %"46" = trunc i128 %4 to i64 - 
store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + %"42" = trunc i128 %4 to i64 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_lo.ll b/ptx/src/test/ll/mul_lo.ll index 15f39e8..b1a96dd 100644 --- a/ptx/src/test/ll/mul_lo.ll +++ b/ptx/src/test/ll/mul_lo.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr 
addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = mul i64 %"47", 2 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"42" = mul i64 %"43", 2 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_non_ftz.ll b/ptx/src/test/ll/mul_non_ftz.ll index ee1da37..afdd691 100644 --- a/ptx/src/test/ll/mul_non_ftz.ll +++ b/ptx/src/test/ll/mul_non_ftz.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca float, align 4, addrspace(5) - %"42" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca float, align 4, addrspace(5) + %"38" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load float, ptr %"54", align 4 - store float %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load float, ptr %"30", align 4 - store float %"48", ptr addrspace(5) %"42", align 4 - %"50" = load float, ptr addrspace(5) %"41", align 4 - %"51" = load float, ptr addrspace(5) %"42", align 4 - %"49" = fmul float %"50", %"51" - store float %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load float, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store float %"53", ptr %"56", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", 
align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load float, ptr %"50", align 4 + store float %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load float, ptr %"31", align 4 + store float %"44", ptr addrspace(5) %"38", align 4 + %"46" = load float, ptr addrspace(5) %"37", align 4 + %"47" = load float, ptr addrspace(5) %"38", align 4 + %"45" = fmul float %"46", %"47" + store float %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load float, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store float %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/mul_wide.ll b/ptx/src/test/ll/mul_wide.ll index 7b815d1..7c37943 100644 --- a/ptx/src/test/ll/mul_wide.ll +++ b/ptx/src/test/ll/mul_wide.ll @@ -1,48 +1,39 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { +define amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" 
= alloca i32, align 4, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"45", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"56" = inttoptr i64 %"48" to ptr addrspace(1) - %"47" = load i32, ptr addrspace(1) %"56", align 4 - store i32 %"47", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"40", align 4 - %"57" = inttoptr i64 %"49" to ptr addrspace(1) - %"31" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 4 - %"50" = load i32, ptr addrspace(1) %"31", align 4 - store i32 %"50", ptr addrspace(5) %"43", align 4 - %"52" = load i32, ptr addrspace(5) %"42", align 4 - %"53" = load i32, ptr addrspace(5) %"43", align 4 - %2 = sext i32 %"52" to i64 - %3 = sext i32 %"53" to i64 - %"51" = mul i64 %2, %3 - store i64 %"51", ptr addrspace(5) %"44", align 4 - %"54" = load i64, ptr addrspace(5) %"41", align 4 - %"55" = load i64, ptr addrspace(5) %"44", align 4 - %"58" = inttoptr i64 %"54" to ptr - store i64 %"55", ptr %"58", align 4 + br label %"33" + +"33": ; preds = %1 + %"41" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"41", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"52" = inttoptr i64 %"44" to ptr addrspace(1) + %"43" = load i32, ptr addrspace(1) %"52", align 4 + store i32 %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"36", align 4 + %"53" = inttoptr i64 %"45" to ptr addrspace(1) + %"32" = getelementptr 
inbounds i8, ptr addrspace(1) %"53", i64 4 + %"46" = load i32, ptr addrspace(1) %"32", align 4 + store i32 %"46", ptr addrspace(5) %"39", align 4 + %"48" = load i32, ptr addrspace(5) %"38", align 4 + %"49" = load i32, ptr addrspace(5) %"39", align 4 + %2 = sext i32 %"48" to i64 + %3 = sext i32 %"49" to i64 + %"47" = mul i64 %2, %3 + store i64 %"47", ptr addrspace(5) %"40", align 4 + %"50" = load i64, ptr addrspace(5) %"37", align 4 + %"51" = load i64, ptr addrspace(5) %"40", align 4 + %"54" = inttoptr i64 %"50" to ptr + store i64 %"51", ptr %"54", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/neg.ll b/ptx/src/test/ll/neg.ll index ebcedc0..d0992e7 100644 --- a/ptx/src/test/ll/neg.ll +++ b/ptx/src/test/ll/neg.ll @@ -1,38 +1,29 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load 
i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"43" = sub i32 0, %"44" - store i32 %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load i32, ptr %"43", align 4 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"39" = sub i32 0, %"40" + store i32 %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load i32, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store i32 %"42", ptr %"44", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/non_scalar_ptr_offset.ll b/ptx/src/test/ll/non_scalar_ptr_offset.ll index 9fabfa6..a86e8ff 100644 --- a/ptx/src/test/ll/non_scalar_ptr_offset.ll +++ b/ptx/src/test/ll/non_scalar_ptr_offset.ll @@ -1,44 +1,35 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"46" = load i64, ptr addrspace(5) %"40", align 4 - %"54" = inttoptr i64 %"46" to ptr addrspace(1) - %"31" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 - %"29" = load <2 x i32>, ptr addrspace(1) %"31", align 8 - %"47" = extractelement <2 x i32> %"29", i8 0 - %"48" = extractelement <2 x i32> %"29", i8 1 - store i32 %"47", ptr addrspace(5) %"42", align 4 - store i32 %"48", ptr addrspace(5) %"43", align 4 - %"50" = load i32, ptr addrspace(5) %"42", align 4 - %"51" = load i32, ptr addrspace(5) %"43", align 4 - %"49" = add i32 %"50", %"51" - store i32 %"49", ptr addrspace(5) %"42", align 4 - %"52" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"55" = inttoptr i64 %"52" to ptr addrspace(1) - store i32 %"53", ptr addrspace(1) %"55", align 4 + br label %"33" + +"33": ; preds = %1 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"41" = load 
i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + %"50" = inttoptr i64 %"42" to ptr addrspace(1) + %"32" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 8 + %"30" = load <2 x i32>, ptr addrspace(1) %"32", align 8 + %"43" = extractelement <2 x i32> %"30", i8 0 + %"44" = extractelement <2 x i32> %"30", i8 1 + store i32 %"43", ptr addrspace(5) %"38", align 4 + store i32 %"44", ptr addrspace(5) %"39", align 4 + %"46" = load i32, ptr addrspace(5) %"38", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"45" = add i32 %"46", %"47" + store i32 %"45", ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = load i32, ptr addrspace(5) %"38", align 4 + %"51" = inttoptr i64 %"48" to ptr addrspace(1) + store i32 %"49", ptr addrspace(1) %"51", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/not.ll b/ptx/src/test/ll/not.ll index 8b078d7..efb1f95 100644 --- a/ptx/src/test/ll/not.ll +++ b/ptx/src/test/ll/not.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) 
+define amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"41" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"41", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(5) %"37", align 4 - %"49" = inttoptr i64 %"44" to ptr - %"43" = load i64, ptr %"49", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"50" = xor i64 %"46", -1 - store i64 %"50", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"38", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"52", align 4 + br label %"30" + +"30": ; preds = %1 + %"37" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"37", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"45" = inttoptr i64 %"40" to ptr + %"39" = load i64, ptr %"45", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"46" = xor i64 %"42", -1 + store i64 %"46", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"34", align 4 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"48" = inttoptr i64 %"43" to ptr + store i64 %"44", ptr %"48", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/ntid.ll b/ptx/src/test/ll/ntid.ll index 2144bc4..87185bc 100644 --- a/ptx/src/test/ll/ntid.ll +++ b/ptx/src/test/ll/ntid.ll @@ -1,42 +1,39 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"53" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"53", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"30" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0) - store i32 %"30", ptr addrspace(5) %"42", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 - %"50" = load i32, ptr addrspace(5) %"42", align 4 - %"48" = add i32 %"49", %"50" - store i32 %"48", ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"40", align 4 - %"52" = load i32, ptr addrspace(5) %"41", align 4 - %"54" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr 
%"54", align 4 + br label %"32" + +"32": ; preds = %1 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"42" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"42", ptr addrspace(5) %"38", align 4 + %"44" = load i64, ptr addrspace(5) %"37", align 4 + %"51" = inttoptr i64 %"44" to ptr + %"43" = load i32, ptr %"51", align 4 + store i32 %"43", ptr addrspace(5) %"39", align 4 + %"31" = call i32 @__zluda_ptx_impl_sreg_ntid(i8 0) + br label %"33" + +"33": ; preds = %"32" + store i32 %"31", ptr addrspace(5) %"40", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = load i32, ptr addrspace(5) %"40", align 4 + %"46" = add i32 %"47", %"48" + store i32 %"46", ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = load i32, ptr addrspace(5) %"39", align 4 + %"52" = inttoptr i64 %"49" to ptr + store i32 %"50", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/or.ll b/ptx/src/test/ll/or.ll index c7190b7..e773120 100644 --- a/ptx/src/test/ll/or.ll +++ b/ptx/src/test/ll/or.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"37", 
ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i64, ptr %"54", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 8 - %"48" = load i64, ptr %"30", align 4 - store i64 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = or i64 %"50", %"51" - store i64 %"56", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = inttoptr i64 %"52" to ptr - store i64 %"53", ptr %"59", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i64, ptr %"50", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr 
i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 8 + %"44" = load i64, ptr %"31", align 4 + store i64 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = or i64 %"46", %"47" + store i64 %"52", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"55" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"55", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/popc.ll b/ptx/src/test/ll/popc.ll index e71acba..0b379c5 100644 --- a/ptx/src/test/ll/popc.ll +++ b/ptx/src/test/ll/popc.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", 
align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load i32, ptr %"47", align 4 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"48" = call i32 @llvm.ctpop.i32(i32 %"44") - store i32 %"48", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load i32, ptr addrspace(5) %"38", align 4 - %"49" = inttoptr i64 %"45" to ptr - store i32 %"46", ptr %"49", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load i32, ptr %"43", align 4 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"44" = call i32 @llvm.ctpop.i32(i32 %"40") + store i32 %"44", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load i32, ptr addrspace(5) %"34", align 4 + %"45" = inttoptr i64 %"41" to ptr + store i32 %"42", ptr %"45", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare i32 @llvm.ctpop.i32(i32) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git 
a/ptx/src/test/ll/pred_not.ll b/ptx/src/test/ll/pred_not.ll index 7046c09..65cc659 100644 --- a/ptx/src/test/ll/pred_not.ll +++ b/ptx/src/test/ll/pred_not.ll @@ -1,66 +1,57 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { +define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i64, align 8, addrspace(5) - %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i1, align 1, addrspace(5) + %"48" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr addrspace(5) %"47", align 4 - %"70" = inttoptr i64 %"56" to ptr - %"55" = load i64, ptr %"70", align 4 - store i64 %"55", ptr addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 - %"71" = inttoptr i64 %"57" to ptr - %"36" = getelementptr inbounds i8, ptr %"71", i64 8 - %"58" = load i64, ptr %"36", align 4 - store i64 %"58", ptr addrspace(5) %"50", align 4 - %"60" = load i64, ptr addrspace(5) %"49", align 4 - %"61" = load i64, ptr addrspace(5) %"50", align 4 - %"59" = icmp ult i64 %"60", %"61" - store i1 %"59", ptr 
addrspace(5) %"52", align 1 - %"63" = load i1, ptr addrspace(5) %"52", align 1 - %"62" = xor i1 %"63", true - store i1 %"62", ptr addrspace(5) %"52", align 1 - %"64" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"64", label %"15", label %"16" + br label %"40" -"15": ; preds = %1 - store i64 1, ptr addrspace(5) %"51", align 4 - br label %"16" +"40": ; preds = %1 + %"49" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"49", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"50", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"66" = inttoptr i64 %"52" to ptr + %"51" = load i64, ptr %"66", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"67" = inttoptr i64 %"53" to ptr + %"37" = getelementptr inbounds i8, ptr %"67", i64 8 + %"54" = load i64, ptr %"37", align 4 + store i64 %"54", ptr addrspace(5) %"46", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"57" = load i64, ptr addrspace(5) %"46", align 4 + %"55" = icmp ult i64 %"56", %"57" + store i1 %"55", ptr addrspace(5) %"48", align 1 + %"59" = load i1, ptr addrspace(5) %"48", align 1 + %"58" = xor i1 %"59", true + store i1 %"58", ptr addrspace(5) %"48", align 1 + %"60" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"60", label %"16", label %"17" -"16": ; preds = %"15", %1 - %"66" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"66", label %"18", label %"17" +"16": ; preds = %"40" + store i64 1, ptr addrspace(5) %"47", align 4 + br label %"17" -"17": ; preds = %"16" - store i64 2, ptr addrspace(5) %"51", align 4 - br label %"18" +"17": ; preds = %"16", %"40" + %"62" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"62", label %"19", label %"18" -"18": ; preds = %"17", %"16" - %"68" = load i64, ptr addrspace(5) %"48", align 4 - %"69" = load i64, ptr addrspace(5) %"51", align 4 - %"72" = inttoptr i64 %"68" to ptr - store i64 %"69", 
ptr %"72", align 4 +"18": ; preds = %"17" + store i64 2, ptr addrspace(5) %"47", align 4 + br label %"19" + +"19": ; preds = %"18", %"17" + %"64" = load i64, ptr addrspace(5) %"44", align 4 + %"65" = load i64, ptr addrspace(5) %"47", align 4 + %"68" = inttoptr i64 %"64" to ptr + store i64 %"65", ptr %"68", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll index dd5b95c..85f144e 100644 --- a/ptx/src/test/ll/prmt.ll +++ b/ptx/src/test/ll/prmt.ll @@ -1,47 +1,38 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = 
inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"42", align 4 - %2 = bitcast i32 %"50" to <4 x i8> - %3 = bitcast i32 %"51" to <4 x i8> - %"56" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> - store <4 x i8> %"56", ptr addrspace(5) %"42", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"59" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"59", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %2 = bitcast i32 %"46" to <4 x i8> + %3 = bitcast i32 %"47" to <4 x i8> + %"52" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> + store <4 x i8> %"52", ptr addrspace(5) %"38", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, ptr addrspace(5) %"38", align 4 + %"55" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"55", align 4 ret void } -attributes #0 = { 
"amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rcp.ll b/ptx/src/test/ll/rcp.ll index c00012a..0995cc0 100644 --- a/ptx/src/test/ll/rcp.ll +++ b/ptx/src/test/ll/rcp.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.rcp.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float 
%"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call float @llvm.amdgcn.rcp.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.amdgcn.rcp.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/reg_local.ll b/ptx/src/test/ll/reg_local.ll index 51fe3e9..a1b6bf2 100644 --- a/ptx/src/test/ll/reg_local.ll +++ b/ptx/src/test/ll/reg_local.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @reg_local(ptr addrspace(4) 
byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { - %"9" = alloca [8 x i8], align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"10" = alloca [8 x i8], align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"41", align 4 - store i64 %"46", ptr addrspace(5) %"43", align 4 - %"47" = load i64, ptr addrspace(4) %"42", align 4 - store i64 %"47", ptr addrspace(5) %"44", align 4 - %"49" = load i64, ptr addrspace(5) %"43", align 4 - %"55" = inttoptr i64 %"49" to ptr addrspace(1) - %"54" = load i64, ptr addrspace(1) %"55", align 4 - store i64 %"54", ptr addrspace(5) %"45", align 4 - %"50" = load i64, ptr addrspace(5) %"45", align 4 - %"30" = add i64 %"50", 1 - %"56" = addrspacecast ptr addrspace(5) %"9" to ptr - store i64 %"30", ptr %"56", align 4 - %"58" = addrspacecast ptr addrspace(5) %"9" to ptr - %"32" = getelementptr inbounds i8, ptr %"58", i64 0 - %"59" = load i64, ptr %"32", align 4 - store i64 %"59", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"60" = inttoptr i64 %"52" to ptr addrspace(1) - %"34" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 0 - %"53" = load i64, ptr addrspace(5) %"45", align 4 - store i64 %"53", ptr addrspace(1) %"34", align 4 + br label %"36" + +"36": ; preds = %1 + %"42" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"42", ptr addrspace(5) %"39", align 4 + %"43" = load i64, ptr addrspace(4) %"38", align 4 + store i64 %"43", ptr addrspace(5) %"40", align 4 + %"45" = load i64, ptr addrspace(5) %"39", align 4 + %"51" = inttoptr i64 %"45" to ptr addrspace(1) + %"50" = load i64, ptr addrspace(1) %"51", 
align 4 + store i64 %"50", ptr addrspace(5) %"41", align 4 + %"46" = load i64, ptr addrspace(5) %"41", align 4 + %"31" = add i64 %"46", 1 + %"52" = addrspacecast ptr addrspace(5) %"10" to ptr + store i64 %"31", ptr %"52", align 4 + %"54" = addrspacecast ptr addrspace(5) %"10" to ptr + %"33" = getelementptr inbounds i8, ptr %"54", i64 0 + %"55" = load i64, ptr %"33", align 4 + store i64 %"55", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"56" = inttoptr i64 %"48" to ptr addrspace(1) + %"35" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 0 + %"49" = load i64, ptr addrspace(5) %"41", align 4 + store i64 %"49", ptr addrspace(1) %"35", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rem.ll b/ptx/src/test/ll/rem.ll index 964021e..dd33785 100644 --- a/ptx/src/test/ll/rem.ll +++ b/ptx/src/test/ll/rem.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" 
= alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = srem i32 %"50", %"51" - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"45" = srem i32 %"46", %"47" + store i32 %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load 
i32, ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/rsqrt.ll b/ptx/src/test/ll/rsqrt.ll index 532a8c8..04ca3e5 100644 --- a/ptx/src/test/ll/rsqrt.ll +++ b/ptx/src/test/ll/rsqrt.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca double, align 8, addrspace(5) +define amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca double, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load double, ptr %"47", align 8 - store double %"41", ptr addrspace(5) %"38", align 8 - %"44" = load double, ptr addrspace(5) %"38", align 8 - %"43" = call double @llvm.amdgcn.rsq.f64(double %"44") - store double %"43", ptr addrspace(5) %"38", 
align 8 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load double, ptr addrspace(5) %"38", align 8 - %"48" = inttoptr i64 %"45" to ptr - store double %"46", ptr %"48", align 8 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load double, ptr %"43", align 8 + store double %"37", ptr addrspace(5) %"34", align 8 + %"40" = load double, ptr addrspace(5) %"34", align 8 + %"39" = call double @llvm.amdgcn.rsq.f64(double %"40") + store double %"39", ptr addrspace(5) %"34", align 8 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load double, ptr addrspace(5) %"34", align 8 + %"44" = inttoptr i64 %"41" to ptr + store double %"42", ptr %"44", align 8 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare double @llvm.amdgcn.rsq.f64(double) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/selp.ll b/ptx/src/test/ll/selp.ll index 580754d..918c4df 100644 --- a/ptx/src/test/ll/selp.ll +++ b/ptx/src/test/ll/selp.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - 
-declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i16, align 2, addrspace(5) - %"43" = alloca i16, align 2, addrspace(5) +define amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i16, align 2, addrspace(5) + %"39" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"46" = load i16, ptr %"55", align 2 - store i16 %"46", ptr addrspace(5) %"42", align 2 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"56" = inttoptr i64 %"48" to ptr - %"30" = getelementptr inbounds i8, ptr %"56", i64 2 - %"49" = load i16, ptr %"30", align 2 - store i16 %"49", ptr addrspace(5) %"43", align 2 - %"51" = load i16, ptr addrspace(5) %"42", align 2 - %"52" = load i16, ptr addrspace(5) %"43", align 2 - %"50" = select i1 false, i16 %"51", i16 %"52" - store i16 %"50", ptr addrspace(5) %"42", align 2 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"54" = load i16, ptr addrspace(5) %"42", align 2 - %"57" = inttoptr i64 %"53" to ptr - store i16 %"54", ptr %"57", align 2 + br label %"33" + +"33": ; preds = %1 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + 
%"51" = inttoptr i64 %"43" to ptr + %"42" = load i16, ptr %"51", align 2 + store i16 %"42", ptr addrspace(5) %"38", align 2 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"52" = inttoptr i64 %"44" to ptr + %"31" = getelementptr inbounds i8, ptr %"52", i64 2 + %"45" = load i16, ptr %"31", align 2 + store i16 %"45", ptr addrspace(5) %"39", align 2 + %"47" = load i16, ptr addrspace(5) %"38", align 2 + %"48" = load i16, ptr addrspace(5) %"39", align 2 + %"46" = select i1 false, i16 %"47", i16 %"48" + store i16 %"46", ptr addrspace(5) %"38", align 2 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"50" = load i16, ptr addrspace(5) %"38", align 2 + %"53" = inttoptr i64 %"49" to ptr + store i16 %"50", ptr %"53", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/selp_true.ll b/ptx/src/test/ll/selp_true.ll index 142c361..a422f89 100644 --- a/ptx/src/test/ll/selp_true.ll +++ b/ptx/src/test/ll/selp_true.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i16, align 2, addrspace(5) - %"43" = alloca i16, align 2, addrspace(5) +define amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"36" = alloca i64, 
align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i16, align 2, addrspace(5) + %"39" = alloca i16, align 2, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"45" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"46" = load i16, ptr %"55", align 2 - store i16 %"46", ptr addrspace(5) %"42", align 2 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %"56" = inttoptr i64 %"48" to ptr - %"30" = getelementptr inbounds i8, ptr %"56", i64 2 - %"49" = load i16, ptr %"30", align 2 - store i16 %"49", ptr addrspace(5) %"43", align 2 - %"51" = load i16, ptr addrspace(5) %"42", align 2 - %"52" = load i16, ptr addrspace(5) %"43", align 2 - %"50" = select i1 true, i16 %"51", i16 %"52" - store i16 %"50", ptr addrspace(5) %"42", align 2 - %"53" = load i64, ptr addrspace(5) %"41", align 4 - %"54" = load i16, ptr addrspace(5) %"42", align 2 - %"57" = inttoptr i64 %"53" to ptr - store i16 %"54", ptr %"57", align 2 + br label %"33" + +"33": ; preds = %1 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"41" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"42" = load i16, ptr %"51", align 2 + store i16 %"42", ptr addrspace(5) %"38", align 2 + %"44" = load i64, ptr addrspace(5) %"36", align 4 + %"52" = inttoptr i64 %"44" to ptr + %"31" = getelementptr inbounds i8, ptr %"52", i64 2 + %"45" = load i16, ptr %"31", align 2 + store i16 %"45", ptr addrspace(5) %"39", align 2 + %"47" = load i16, ptr addrspace(5) %"38", align 2 + %"48" = load i16, ptr addrspace(5) %"39", align 2 + %"46" = select i1 true, i16 %"47", i16 %"48" + store i16 
%"46", ptr addrspace(5) %"38", align 2 + %"49" = load i64, ptr addrspace(5) %"37", align 4 + %"50" = load i16, ptr addrspace(5) %"38", align 2 + %"53" = inttoptr i64 %"49" to ptr + store i16 %"50", ptr %"53", align 2 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp.ll b/ptx/src/test/ll/setp.ll index 6625957..d0617b8 100644 --- a/ptx/src/test/ll/setp.ll +++ b/ptx/src/test/ll/setp.ll @@ -1,63 +1,54 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { +define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i64, align 8, addrspace(5) - %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i1, align 1, addrspace(5) + %"48" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"53", ptr addrspace(5) %"47", align 4 - %"54" = load i64, ptr addrspace(4) %"46", align 4 - store i64 %"54", ptr addrspace(5) %"48", align 4 - %"56" = load i64, ptr 
addrspace(5) %"47", align 4 - %"68" = inttoptr i64 %"56" to ptr - %"55" = load i64, ptr %"68", align 4 - store i64 %"55", ptr addrspace(5) %"49", align 4 - %"57" = load i64, ptr addrspace(5) %"47", align 4 - %"69" = inttoptr i64 %"57" to ptr - %"36" = getelementptr inbounds i8, ptr %"69", i64 8 - %"58" = load i64, ptr %"36", align 4 - store i64 %"58", ptr addrspace(5) %"50", align 4 - %"60" = load i64, ptr addrspace(5) %"49", align 4 - %"61" = load i64, ptr addrspace(5) %"50", align 4 - %"59" = icmp ult i64 %"60", %"61" - store i1 %"59", ptr addrspace(5) %"52", align 1 - %"62" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"62", label %"15", label %"16" + br label %"40" -"15": ; preds = %1 - store i64 1, ptr addrspace(5) %"51", align 4 - br label %"16" +"40": ; preds = %1 + %"49" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"49", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(4) %"42", align 4 + store i64 %"50", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"43", align 4 + %"64" = inttoptr i64 %"52" to ptr + %"51" = load i64, ptr %"64", align 4 + store i64 %"51", ptr addrspace(5) %"45", align 4 + %"53" = load i64, ptr addrspace(5) %"43", align 4 + %"65" = inttoptr i64 %"53" to ptr + %"37" = getelementptr inbounds i8, ptr %"65", i64 8 + %"54" = load i64, ptr %"37", align 4 + store i64 %"54", ptr addrspace(5) %"46", align 4 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %"57" = load i64, ptr addrspace(5) %"46", align 4 + %"55" = icmp ult i64 %"56", %"57" + store i1 %"55", ptr addrspace(5) %"48", align 1 + %"58" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"58", label %"16", label %"17" -"16": ; preds = %"15", %1 - %"64" = load i1, ptr addrspace(5) %"52", align 1 - br i1 %"64", label %"18", label %"17" +"16": ; preds = %"40" + store i64 1, ptr addrspace(5) %"47", align 4 + br label %"17" -"17": ; preds = %"16" - store i64 2, ptr addrspace(5) %"51", align 4 - br label %"18" +"17": ; preds = 
%"16", %"40" + %"60" = load i1, ptr addrspace(5) %"48", align 1 + br i1 %"60", label %"19", label %"18" -"18": ; preds = %"17", %"16" - %"66" = load i64, ptr addrspace(5) %"48", align 4 - %"67" = load i64, ptr addrspace(5) %"51", align 4 - %"70" = inttoptr i64 %"66" to ptr - store i64 %"67", ptr %"70", align 4 +"18": ; preds = %"17" + store i64 2, ptr addrspace(5) %"47", align 4 + br label %"19" + +"19": ; preds = %"18", %"17" + %"62" = load i64, ptr addrspace(5) %"44", align 4 + %"63" = load i64, ptr addrspace(5) %"47", align 4 + %"66" = inttoptr i64 %"62" to ptr + store i64 %"63", ptr %"66", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_gt.ll b/ptx/src/test/ll/setp_gt.ll index 4badce3..c02b59e 100644 --- a/ptx/src/test/ll/setp_gt.ll +++ b/ptx/src/test/ll/setp_gt.ll @@ -1,65 +1,56 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca float, align 4, addrspace(5) - %"48" = alloca float, align 4, addrspace(5) - %"49" = alloca float, align 4, addrspace(5) - %"50" = alloca i1, align 1, addrspace(5) +define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, 
align 8, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) + %"46" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"51" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"51", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"52", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"54" to ptr - %"53" = load float, ptr %"68", align 4 - store float %"53", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 - %"69" = inttoptr i64 %"55" to ptr - %"36" = getelementptr inbounds i8, ptr %"69", i64 4 - %"56" = load float, ptr %"36", align 4 - store float %"56", ptr addrspace(5) %"48", align 4 - %"58" = load float, ptr addrspace(5) %"47", align 4 - %"59" = load float, ptr addrspace(5) %"48", align 4 - %"57" = fcmp ogt float %"58", %"59" - store i1 %"57", ptr addrspace(5) %"50", align 1 - %"60" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"60", label %"15", label %"16" + br label %"38" -"15": ; preds = %1 - %"62" = load float, ptr addrspace(5) %"47", align 4 - store float %"62", ptr addrspace(5) %"49", align 4 - br label %"16" +"38": ; preds = %1 + %"47" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = inttoptr i64 %"50" to ptr + %"49" = load float, ptr %"64", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %"65" = inttoptr i64 %"51" to ptr + %"37" = getelementptr inbounds i8, ptr %"65", i64 4 + %"52" = load float, ptr %"37", align 4 + store float %"52", ptr addrspace(5) %"44", align 4 + %"54" = load float, ptr addrspace(5) %"43", align 
4 + %"55" = load float, ptr addrspace(5) %"44", align 4 + %"53" = fcmp ogt float %"54", %"55" + store i1 %"53", ptr addrspace(5) %"46", align 1 + %"56" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"56", label %"16", label %"17" -"16": ; preds = %"15", %1 - %"63" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"63", label %"18", label %"17" +"16": ; preds = %"38" + %"58" = load float, ptr addrspace(5) %"43", align 4 + store float %"58", ptr addrspace(5) %"45", align 4 + br label %"17" -"17": ; preds = %"16" - %"65" = load float, ptr addrspace(5) %"48", align 4 - store float %"65", ptr addrspace(5) %"49", align 4 - br label %"18" +"17": ; preds = %"16", %"38" + %"59" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"59", label %"19", label %"18" -"18": ; preds = %"17", %"16" - %"66" = load i64, ptr addrspace(5) %"46", align 4 - %"67" = load float, ptr addrspace(5) %"49", align 4 - %"70" = inttoptr i64 %"66" to ptr - store float %"67", ptr %"70", align 4 +"18": ; preds = %"17" + %"61" = load float, ptr addrspace(5) %"44", align 4 + store float %"61", ptr addrspace(5) %"45", align 4 + br label %"19" + +"19": ; preds = %"18", %"17" + %"62" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = load float, ptr addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"62" to ptr + store float %"63", ptr %"66", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_leu.ll b/ptx/src/test/ll/setp_leu.ll index d91e569..5d19314 100644 --- a/ptx/src/test/ll/setp_leu.ll +++ b/ptx/src/test/ll/setp_leu.ll @@ -1,65 +1,56 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { - %"45" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca float, align 4, addrspace(5) - %"48" = alloca float, align 4, addrspace(5) - %"49" = alloca float, align 4, addrspace(5) - %"50" = alloca i1, align 1, addrspace(5) +define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca float, align 4, addrspace(5) + %"44" = alloca float, align 4, addrspace(5) + %"45" = alloca float, align 4, addrspace(5) + %"46" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"51" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"51", ptr addrspace(5) %"45", align 4 - %"52" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"52", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(5) %"45", align 4 - %"68" = inttoptr i64 %"54" to ptr - %"53" = load float, ptr %"68", align 4 - store float %"53", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"45", align 4 - %"69" = inttoptr i64 %"55" to ptr - %"36" = getelementptr inbounds i8, ptr %"69", i64 4 - %"56" = load float, ptr %"36", align 4 - store float %"56", ptr addrspace(5) %"48", align 4 - %"58" = load float, ptr addrspace(5) %"47", align 4 - %"59" = load float, ptr addrspace(5) %"48", align 4 - %"57" = fcmp ule float %"58", %"59" - store i1 %"57", ptr addrspace(5) %"50", align 1 - %"60" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"60", label %"15", label %"16" + br label %"38" -"15": ; preds = %1 - %"62" = load float, ptr addrspace(5) %"47", align 4 - store float 
%"62", ptr addrspace(5) %"49", align 4 - br label %"16" +"38": ; preds = %1 + %"47" = load i64, ptr addrspace(4) %"39", align 4 + store i64 %"47", ptr addrspace(5) %"41", align 4 + %"48" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"48", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 4 + %"64" = inttoptr i64 %"50" to ptr + %"49" = load float, ptr %"64", align 4 + store float %"49", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"41", align 4 + %"65" = inttoptr i64 %"51" to ptr + %"37" = getelementptr inbounds i8, ptr %"65", i64 4 + %"52" = load float, ptr %"37", align 4 + store float %"52", ptr addrspace(5) %"44", align 4 + %"54" = load float, ptr addrspace(5) %"43", align 4 + %"55" = load float, ptr addrspace(5) %"44", align 4 + %"53" = fcmp ule float %"54", %"55" + store i1 %"53", ptr addrspace(5) %"46", align 1 + %"56" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"56", label %"16", label %"17" -"16": ; preds = %"15", %1 - %"63" = load i1, ptr addrspace(5) %"50", align 1 - br i1 %"63", label %"18", label %"17" +"16": ; preds = %"38" + %"58" = load float, ptr addrspace(5) %"43", align 4 + store float %"58", ptr addrspace(5) %"45", align 4 + br label %"17" -"17": ; preds = %"16" - %"65" = load float, ptr addrspace(5) %"48", align 4 - store float %"65", ptr addrspace(5) %"49", align 4 - br label %"18" +"17": ; preds = %"16", %"38" + %"59" = load i1, ptr addrspace(5) %"46", align 1 + br i1 %"59", label %"19", label %"18" -"18": ; preds = %"17", %"16" - %"66" = load i64, ptr addrspace(5) %"46", align 4 - %"67" = load float, ptr addrspace(5) %"49", align 4 - %"70" = inttoptr i64 %"66" to ptr - store float %"67", ptr %"70", align 4 +"18": ; preds = %"17" + %"61" = load float, ptr addrspace(5) %"44", align 4 + store float %"61", ptr addrspace(5) %"45", align 4 + br label %"19" + +"19": ; preds = %"18", %"17" + %"62" = load i64, ptr addrspace(5) %"42", align 4 + %"63" = load float, ptr 
addrspace(5) %"45", align 4 + %"66" = inttoptr i64 %"62" to ptr + store float %"63", ptr %"66", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_nan.ll b/ptx/src/test/ll/setp_nan.ll index 15c0c2a..ca1e98b 100644 --- a/ptx/src/test/ll/setp_nan.ll +++ b/ptx/src/test/ll/setp_nan.ll @@ -1,174 +1,165 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 { - %"89" = alloca i64, align 8, addrspace(5) - %"90" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrspace(4) byref(i64) %"84") #0 { + %"85" = alloca i64, align 8, addrspace(5) + %"86" = alloca i64, align 8, addrspace(5) + %"87" = alloca float, align 4, addrspace(5) + %"88" = alloca float, align 4, addrspace(5) + %"89" = alloca float, align 4, addrspace(5) + %"90" = alloca float, align 4, addrspace(5) %"91" = alloca float, align 4, addrspace(5) %"92" = alloca float, align 4, addrspace(5) %"93" = alloca float, align 4, addrspace(5) %"94" = alloca float, align 4, addrspace(5) - %"95" = alloca float, align 4, addrspace(5) - %"96" = alloca float, align 4, addrspace(5) - %"97" = alloca float, align 4, addrspace(5) - %"98" = alloca float, align 4, addrspace(5) - %"99" = alloca i32, align 4, addrspace(5) - %"100" = alloca i1, align 1, addrspace(5) + %"95" = alloca i32, align 
4, addrspace(5) + %"96" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"101" = load i64, ptr addrspace(4) %"87", align 4 - store i64 %"101", ptr addrspace(5) %"89", align 4 - %"102" = load i64, ptr addrspace(4) %"88", align 4 - store i64 %"102", ptr addrspace(5) %"90", align 4 - %"104" = load i64, ptr addrspace(5) %"89", align 4 - %"155" = inttoptr i64 %"104" to ptr - %"103" = load float, ptr %"155", align 4 - store float %"103", ptr addrspace(5) %"91", align 4 - %"105" = load i64, ptr addrspace(5) %"89", align 4 - %"156" = inttoptr i64 %"105" to ptr - %"54" = getelementptr inbounds i8, ptr %"156", i64 4 - %"106" = load float, ptr %"54", align 4 - store float %"106", ptr addrspace(5) %"92", align 4 - %"107" = load i64, ptr addrspace(5) %"89", align 4 - %"157" = inttoptr i64 %"107" to ptr - %"56" = getelementptr inbounds i8, ptr %"157", i64 8 - %"108" = load float, ptr %"56", align 4 - store float %"108", ptr addrspace(5) %"93", align 4 - %"109" = load i64, ptr addrspace(5) %"89", align 4 - %"158" = inttoptr i64 %"109" to ptr - %"58" = getelementptr inbounds i8, ptr %"158", i64 12 - %"110" = load float, ptr %"58", align 4 - store float %"110", ptr addrspace(5) %"94", align 4 - %"111" = load i64, ptr addrspace(5) %"89", align 4 - %"159" = inttoptr i64 %"111" to ptr - %"60" = getelementptr inbounds i8, ptr %"159", i64 16 - %"112" = load float, ptr %"60", align 4 - store float %"112", ptr addrspace(5) %"95", align 4 - %"113" = load i64, ptr addrspace(5) %"89", align 4 - %"160" = inttoptr i64 %"113" to ptr - %"62" = getelementptr inbounds i8, ptr %"160", i64 20 - %"114" = load float, ptr %"62", align 4 - store float %"114", ptr addrspace(5) %"96", align 4 - %"115" = load i64, ptr addrspace(5) %"89", align 4 - %"161" = inttoptr i64 %"115" to ptr - %"64" = getelementptr inbounds i8, ptr %"161", i64 24 - %"116" = load float, ptr %"64", align 4 - store float %"116", ptr addrspace(5) %"97", align 4 - %"117" = load i64, ptr addrspace(5) %"89", align 4 - 
%"162" = inttoptr i64 %"117" to ptr - %"66" = getelementptr inbounds i8, ptr %"162", i64 28 - %"118" = load float, ptr %"66", align 4 - store float %"118", ptr addrspace(5) %"98", align 4 - %"120" = load float, ptr addrspace(5) %"91", align 4 - %"121" = load float, ptr addrspace(5) %"92", align 4 - %"119" = fcmp uno float %"120", %"121" - store i1 %"119", ptr addrspace(5) %"100", align 1 - %"122" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"122", label %"21", label %"22" + br label %"82" -"21": ; preds = %1 - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"22" +"82": ; preds = %1 + %"97" = load i64, ptr addrspace(4) %"83", align 4 + store i64 %"97", ptr addrspace(5) %"85", align 4 + %"98" = load i64, ptr addrspace(4) %"84", align 4 + store i64 %"98", ptr addrspace(5) %"86", align 4 + %"100" = load i64, ptr addrspace(5) %"85", align 4 + %"151" = inttoptr i64 %"100" to ptr + %"99" = load float, ptr %"151", align 4 + store float %"99", ptr addrspace(5) %"87", align 4 + %"101" = load i64, ptr addrspace(5) %"85", align 4 + %"152" = inttoptr i64 %"101" to ptr + %"55" = getelementptr inbounds i8, ptr %"152", i64 4 + %"102" = load float, ptr %"55", align 4 + store float %"102", ptr addrspace(5) %"88", align 4 + %"103" = load i64, ptr addrspace(5) %"85", align 4 + %"153" = inttoptr i64 %"103" to ptr + %"57" = getelementptr inbounds i8, ptr %"153", i64 8 + %"104" = load float, ptr %"57", align 4 + store float %"104", ptr addrspace(5) %"89", align 4 + %"105" = load i64, ptr addrspace(5) %"85", align 4 + %"154" = inttoptr i64 %"105" to ptr + %"59" = getelementptr inbounds i8, ptr %"154", i64 12 + %"106" = load float, ptr %"59", align 4 + store float %"106", ptr addrspace(5) %"90", align 4 + %"107" = load i64, ptr addrspace(5) %"85", align 4 + %"155" = inttoptr i64 %"107" to ptr + %"61" = getelementptr inbounds i8, ptr %"155", i64 16 + %"108" = load float, ptr %"61", align 4 + store float %"108", ptr addrspace(5) %"91", align 4 + %"109" = load i64, ptr 
addrspace(5) %"85", align 4 + %"156" = inttoptr i64 %"109" to ptr + %"63" = getelementptr inbounds i8, ptr %"156", i64 20 + %"110" = load float, ptr %"63", align 4 + store float %"110", ptr addrspace(5) %"92", align 4 + %"111" = load i64, ptr addrspace(5) %"85", align 4 + %"157" = inttoptr i64 %"111" to ptr + %"65" = getelementptr inbounds i8, ptr %"157", i64 24 + %"112" = load float, ptr %"65", align 4 + store float %"112", ptr addrspace(5) %"93", align 4 + %"113" = load i64, ptr addrspace(5) %"85", align 4 + %"158" = inttoptr i64 %"113" to ptr + %"67" = getelementptr inbounds i8, ptr %"158", i64 28 + %"114" = load float, ptr %"67", align 4 + store float %"114", ptr addrspace(5) %"94", align 4 + %"116" = load float, ptr addrspace(5) %"87", align 4 + %"117" = load float, ptr addrspace(5) %"88", align 4 + %"115" = fcmp uno float %"116", %"117" + store i1 %"115", ptr addrspace(5) %"96", align 1 + %"118" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"118", label %"22", label %"23" -"22": ; preds = %"21", %1 - %"124" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"124", label %"24", label %"23" +"22": ; preds = %"82" + store i32 1, ptr addrspace(5) %"95", align 4 + br label %"23" -"23": ; preds = %"22" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"24" +"23": ; preds = %"22", %"82" + %"120" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"120", label %"25", label %"24" -"24": ; preds = %"23", %"22" - %"126" = load i64, ptr addrspace(5) %"90", align 4 - %"127" = load i32, ptr addrspace(5) %"99", align 4 - %"163" = inttoptr i64 %"126" to ptr - store i32 %"127", ptr %"163", align 4 - %"129" = load float, ptr addrspace(5) %"93", align 4 - %"130" = load float, ptr addrspace(5) %"94", align 4 - %"128" = fcmp uno float %"129", %"130" - store i1 %"128", ptr addrspace(5) %"100", align 1 - %"131" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"131", label %"25", label %"26" +"24": ; preds = %"23" + store i32 0, ptr addrspace(5) %"95", align 4 + 
br label %"25" -"25": ; preds = %"24" - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"26" +"25": ; preds = %"24", %"23" + %"122" = load i64, ptr addrspace(5) %"86", align 4 + %"123" = load i32, ptr addrspace(5) %"95", align 4 + %"159" = inttoptr i64 %"122" to ptr + store i32 %"123", ptr %"159", align 4 + %"125" = load float, ptr addrspace(5) %"89", align 4 + %"126" = load float, ptr addrspace(5) %"90", align 4 + %"124" = fcmp uno float %"125", %"126" + store i1 %"124", ptr addrspace(5) %"96", align 1 + %"127" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"127", label %"26", label %"27" -"26": ; preds = %"25", %"24" - %"133" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"133", label %"28", label %"27" +"26": ; preds = %"25" + store i32 1, ptr addrspace(5) %"95", align 4 + br label %"27" -"27": ; preds = %"26" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"28" +"27": ; preds = %"26", %"25" + %"129" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"129", label %"29", label %"28" -"28": ; preds = %"27", %"26" - %"135" = load i64, ptr addrspace(5) %"90", align 4 - %"164" = inttoptr i64 %"135" to ptr - %"72" = getelementptr inbounds i8, ptr %"164", i64 4 - %"136" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"136", ptr %"72", align 4 - %"138" = load float, ptr addrspace(5) %"95", align 4 - %"139" = load float, ptr addrspace(5) %"96", align 4 - %"137" = fcmp uno float %"138", %"139" - store i1 %"137", ptr addrspace(5) %"100", align 1 - %"140" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"140", label %"29", label %"30" +"28": ; preds = %"27" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"29" -"29": ; preds = %"28" - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"30" +"29": ; preds = %"28", %"27" + %"131" = load i64, ptr addrspace(5) %"86", align 4 + %"160" = inttoptr i64 %"131" to ptr + %"73" = getelementptr inbounds i8, ptr %"160", i64 4 + %"132" = load i32, ptr addrspace(5) %"95", align 4 
+ store i32 %"132", ptr %"73", align 4 + %"134" = load float, ptr addrspace(5) %"91", align 4 + %"135" = load float, ptr addrspace(5) %"92", align 4 + %"133" = fcmp uno float %"134", %"135" + store i1 %"133", ptr addrspace(5) %"96", align 1 + %"136" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"136", label %"30", label %"31" -"30": ; preds = %"29", %"28" - %"142" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"142", label %"32", label %"31" +"30": ; preds = %"29" + store i32 1, ptr addrspace(5) %"95", align 4 + br label %"31" -"31": ; preds = %"30" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"32" +"31": ; preds = %"30", %"29" + %"138" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"138", label %"33", label %"32" -"32": ; preds = %"31", %"30" - %"144" = load i64, ptr addrspace(5) %"90", align 4 - %"165" = inttoptr i64 %"144" to ptr - %"76" = getelementptr inbounds i8, ptr %"165", i64 8 - %"145" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"145", ptr %"76", align 4 - %"147" = load float, ptr addrspace(5) %"97", align 4 - %"148" = load float, ptr addrspace(5) %"98", align 4 - %"146" = fcmp uno float %"147", %"148" - store i1 %"146", ptr addrspace(5) %"100", align 1 - %"149" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"149", label %"33", label %"34" +"32": ; preds = %"31" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"33" -"33": ; preds = %"32" - store i32 1, ptr addrspace(5) %"99", align 4 - br label %"34" +"33": ; preds = %"32", %"31" + %"140" = load i64, ptr addrspace(5) %"86", align 4 + %"161" = inttoptr i64 %"140" to ptr + %"77" = getelementptr inbounds i8, ptr %"161", i64 8 + %"141" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"141", ptr %"77", align 4 + %"143" = load float, ptr addrspace(5) %"93", align 4 + %"144" = load float, ptr addrspace(5) %"94", align 4 + %"142" = fcmp uno float %"143", %"144" + store i1 %"142", ptr addrspace(5) %"96", align 1 + %"145" = load i1, ptr 
addrspace(5) %"96", align 1 + br i1 %"145", label %"34", label %"35" -"34": ; preds = %"33", %"32" - %"151" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"151", label %"36", label %"35" +"34": ; preds = %"33" + store i32 1, ptr addrspace(5) %"95", align 4 + br label %"35" -"35": ; preds = %"34" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"36" +"35": ; preds = %"34", %"33" + %"147" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"147", label %"37", label %"36" -"36": ; preds = %"35", %"34" - %"153" = load i64, ptr addrspace(5) %"90", align 4 - %"166" = inttoptr i64 %"153" to ptr - %"80" = getelementptr inbounds i8, ptr %"166", i64 12 - %"154" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"154", ptr %"80", align 4 +"36": ; preds = %"35" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"37" + +"37": ; preds = %"36", %"35" + %"149" = load i64, ptr addrspace(5) %"86", align 4 + %"162" = inttoptr i64 %"149" to ptr + %"81" = getelementptr inbounds i8, ptr %"162", i64 12 + %"150" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"150", ptr %"81", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp_num.ll b/ptx/src/test/ll/setp_num.ll index c6303dc..4a6d56f 100644 --- a/ptx/src/test/ll/setp_num.ll +++ b/ptx/src/test/ll/setp_num.ll @@ -1,174 +1,165 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @setp_num(ptr addrspace(4) 
byref(i64) %"87", ptr addrspace(4) byref(i64) %"88") #0 { - %"89" = alloca i64, align 8, addrspace(5) - %"90" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrspace(4) byref(i64) %"84") #0 { + %"85" = alloca i64, align 8, addrspace(5) + %"86" = alloca i64, align 8, addrspace(5) + %"87" = alloca float, align 4, addrspace(5) + %"88" = alloca float, align 4, addrspace(5) + %"89" = alloca float, align 4, addrspace(5) + %"90" = alloca float, align 4, addrspace(5) %"91" = alloca float, align 4, addrspace(5) %"92" = alloca float, align 4, addrspace(5) %"93" = alloca float, align 4, addrspace(5) %"94" = alloca float, align 4, addrspace(5) - %"95" = alloca float, align 4, addrspace(5) - %"96" = alloca float, align 4, addrspace(5) - %"97" = alloca float, align 4, addrspace(5) - %"98" = alloca float, align 4, addrspace(5) - %"99" = alloca i32, align 4, addrspace(5) - %"100" = alloca i1, align 1, addrspace(5) + %"95" = alloca i32, align 4, addrspace(5) + %"96" = alloca i1, align 1, addrspace(5) br label %1 1: ; preds = %0 - %"101" = load i64, ptr addrspace(4) %"87", align 4 - store i64 %"101", ptr addrspace(5) %"89", align 4 - %"102" = load i64, ptr addrspace(4) %"88", align 4 - store i64 %"102", ptr addrspace(5) %"90", align 4 - %"104" = load i64, ptr addrspace(5) %"89", align 4 - %"155" = inttoptr i64 %"104" to ptr - %"103" = load float, ptr %"155", align 4 - store float %"103", ptr addrspace(5) %"91", align 4 - %"105" = load i64, ptr addrspace(5) %"89", align 4 - %"156" = inttoptr i64 %"105" to ptr - %"54" = getelementptr inbounds i8, ptr %"156", i64 4 - %"106" = load float, ptr %"54", align 4 - store float %"106", ptr addrspace(5) %"92", align 4 - %"107" = load i64, ptr addrspace(5) %"89", align 4 - %"157" = inttoptr i64 %"107" to ptr - %"56" = getelementptr inbounds i8, ptr %"157", i64 8 - %"108" = load float, ptr %"56", align 4 - store float %"108", ptr addrspace(5) %"93", align 4 - %"109" = load i64, ptr 
addrspace(5) %"89", align 4 - %"158" = inttoptr i64 %"109" to ptr - %"58" = getelementptr inbounds i8, ptr %"158", i64 12 - %"110" = load float, ptr %"58", align 4 - store float %"110", ptr addrspace(5) %"94", align 4 - %"111" = load i64, ptr addrspace(5) %"89", align 4 - %"159" = inttoptr i64 %"111" to ptr - %"60" = getelementptr inbounds i8, ptr %"159", i64 16 - %"112" = load float, ptr %"60", align 4 - store float %"112", ptr addrspace(5) %"95", align 4 - %"113" = load i64, ptr addrspace(5) %"89", align 4 - %"160" = inttoptr i64 %"113" to ptr - %"62" = getelementptr inbounds i8, ptr %"160", i64 20 - %"114" = load float, ptr %"62", align 4 - store float %"114", ptr addrspace(5) %"96", align 4 - %"115" = load i64, ptr addrspace(5) %"89", align 4 - %"161" = inttoptr i64 %"115" to ptr - %"64" = getelementptr inbounds i8, ptr %"161", i64 24 - %"116" = load float, ptr %"64", align 4 - store float %"116", ptr addrspace(5) %"97", align 4 - %"117" = load i64, ptr addrspace(5) %"89", align 4 - %"162" = inttoptr i64 %"117" to ptr - %"66" = getelementptr inbounds i8, ptr %"162", i64 28 - %"118" = load float, ptr %"66", align 4 - store float %"118", ptr addrspace(5) %"98", align 4 - %"120" = load float, ptr addrspace(5) %"91", align 4 - %"121" = load float, ptr addrspace(5) %"92", align 4 - %"119" = fcmp ord float %"120", %"121" - store i1 %"119", ptr addrspace(5) %"100", align 1 - %"122" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"122", label %"21", label %"22" + br label %"82" -"21": ; preds = %1 - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"22" +"82": ; preds = %1 + %"97" = load i64, ptr addrspace(4) %"83", align 4 + store i64 %"97", ptr addrspace(5) %"85", align 4 + %"98" = load i64, ptr addrspace(4) %"84", align 4 + store i64 %"98", ptr addrspace(5) %"86", align 4 + %"100" = load i64, ptr addrspace(5) %"85", align 4 + %"151" = inttoptr i64 %"100" to ptr + %"99" = load float, ptr %"151", align 4 + store float %"99", ptr addrspace(5) %"87", align 4 
+ %"101" = load i64, ptr addrspace(5) %"85", align 4 + %"152" = inttoptr i64 %"101" to ptr + %"55" = getelementptr inbounds i8, ptr %"152", i64 4 + %"102" = load float, ptr %"55", align 4 + store float %"102", ptr addrspace(5) %"88", align 4 + %"103" = load i64, ptr addrspace(5) %"85", align 4 + %"153" = inttoptr i64 %"103" to ptr + %"57" = getelementptr inbounds i8, ptr %"153", i64 8 + %"104" = load float, ptr %"57", align 4 + store float %"104", ptr addrspace(5) %"89", align 4 + %"105" = load i64, ptr addrspace(5) %"85", align 4 + %"154" = inttoptr i64 %"105" to ptr + %"59" = getelementptr inbounds i8, ptr %"154", i64 12 + %"106" = load float, ptr %"59", align 4 + store float %"106", ptr addrspace(5) %"90", align 4 + %"107" = load i64, ptr addrspace(5) %"85", align 4 + %"155" = inttoptr i64 %"107" to ptr + %"61" = getelementptr inbounds i8, ptr %"155", i64 16 + %"108" = load float, ptr %"61", align 4 + store float %"108", ptr addrspace(5) %"91", align 4 + %"109" = load i64, ptr addrspace(5) %"85", align 4 + %"156" = inttoptr i64 %"109" to ptr + %"63" = getelementptr inbounds i8, ptr %"156", i64 20 + %"110" = load float, ptr %"63", align 4 + store float %"110", ptr addrspace(5) %"92", align 4 + %"111" = load i64, ptr addrspace(5) %"85", align 4 + %"157" = inttoptr i64 %"111" to ptr + %"65" = getelementptr inbounds i8, ptr %"157", i64 24 + %"112" = load float, ptr %"65", align 4 + store float %"112", ptr addrspace(5) %"93", align 4 + %"113" = load i64, ptr addrspace(5) %"85", align 4 + %"158" = inttoptr i64 %"113" to ptr + %"67" = getelementptr inbounds i8, ptr %"158", i64 28 + %"114" = load float, ptr %"67", align 4 + store float %"114", ptr addrspace(5) %"94", align 4 + %"116" = load float, ptr addrspace(5) %"87", align 4 + %"117" = load float, ptr addrspace(5) %"88", align 4 + %"115" = fcmp ord float %"116", %"117" + store i1 %"115", ptr addrspace(5) %"96", align 1 + %"118" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"118", label %"22", label %"23" 
-"22": ; preds = %"21", %1 - %"124" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"124", label %"24", label %"23" +"22": ; preds = %"82" + store i32 2, ptr addrspace(5) %"95", align 4 + br label %"23" -"23": ; preds = %"22" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"24" +"23": ; preds = %"22", %"82" + %"120" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"120", label %"25", label %"24" -"24": ; preds = %"23", %"22" - %"126" = load i64, ptr addrspace(5) %"90", align 4 - %"127" = load i32, ptr addrspace(5) %"99", align 4 - %"163" = inttoptr i64 %"126" to ptr - store i32 %"127", ptr %"163", align 4 - %"129" = load float, ptr addrspace(5) %"93", align 4 - %"130" = load float, ptr addrspace(5) %"94", align 4 - %"128" = fcmp ord float %"129", %"130" - store i1 %"128", ptr addrspace(5) %"100", align 1 - %"131" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"131", label %"25", label %"26" +"24": ; preds = %"23" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"25" -"25": ; preds = %"24" - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"26" +"25": ; preds = %"24", %"23" + %"122" = load i64, ptr addrspace(5) %"86", align 4 + %"123" = load i32, ptr addrspace(5) %"95", align 4 + %"159" = inttoptr i64 %"122" to ptr + store i32 %"123", ptr %"159", align 4 + %"125" = load float, ptr addrspace(5) %"89", align 4 + %"126" = load float, ptr addrspace(5) %"90", align 4 + %"124" = fcmp ord float %"125", %"126" + store i1 %"124", ptr addrspace(5) %"96", align 1 + %"127" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"127", label %"26", label %"27" -"26": ; preds = %"25", %"24" - %"133" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"133", label %"28", label %"27" +"26": ; preds = %"25" + store i32 2, ptr addrspace(5) %"95", align 4 + br label %"27" -"27": ; preds = %"26" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"28" +"27": ; preds = %"26", %"25" + %"129" = load i1, ptr addrspace(5) %"96", align 1 + br i1 
%"129", label %"29", label %"28" -"28": ; preds = %"27", %"26" - %"135" = load i64, ptr addrspace(5) %"90", align 4 - %"164" = inttoptr i64 %"135" to ptr - %"72" = getelementptr inbounds i8, ptr %"164", i64 4 - %"136" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"136", ptr %"72", align 4 - %"138" = load float, ptr addrspace(5) %"95", align 4 - %"139" = load float, ptr addrspace(5) %"96", align 4 - %"137" = fcmp ord float %"138", %"139" - store i1 %"137", ptr addrspace(5) %"100", align 1 - %"140" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"140", label %"29", label %"30" +"28": ; preds = %"27" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"29" -"29": ; preds = %"28" - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"30" +"29": ; preds = %"28", %"27" + %"131" = load i64, ptr addrspace(5) %"86", align 4 + %"160" = inttoptr i64 %"131" to ptr + %"73" = getelementptr inbounds i8, ptr %"160", i64 4 + %"132" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"132", ptr %"73", align 4 + %"134" = load float, ptr addrspace(5) %"91", align 4 + %"135" = load float, ptr addrspace(5) %"92", align 4 + %"133" = fcmp ord float %"134", %"135" + store i1 %"133", ptr addrspace(5) %"96", align 1 + %"136" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"136", label %"30", label %"31" -"30": ; preds = %"29", %"28" - %"142" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"142", label %"32", label %"31" +"30": ; preds = %"29" + store i32 2, ptr addrspace(5) %"95", align 4 + br label %"31" -"31": ; preds = %"30" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"32" +"31": ; preds = %"30", %"29" + %"138" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"138", label %"33", label %"32" -"32": ; preds = %"31", %"30" - %"144" = load i64, ptr addrspace(5) %"90", align 4 - %"165" = inttoptr i64 %"144" to ptr - %"76" = getelementptr inbounds i8, ptr %"165", i64 8 - %"145" = load i32, ptr addrspace(5) %"99", align 4 - store i32 
%"145", ptr %"76", align 4 - %"147" = load float, ptr addrspace(5) %"97", align 4 - %"148" = load float, ptr addrspace(5) %"98", align 4 - %"146" = fcmp ord float %"147", %"148" - store i1 %"146", ptr addrspace(5) %"100", align 1 - %"149" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"149", label %"33", label %"34" +"32": ; preds = %"31" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"33" -"33": ; preds = %"32" - store i32 2, ptr addrspace(5) %"99", align 4 - br label %"34" +"33": ; preds = %"32", %"31" + %"140" = load i64, ptr addrspace(5) %"86", align 4 + %"161" = inttoptr i64 %"140" to ptr + %"77" = getelementptr inbounds i8, ptr %"161", i64 8 + %"141" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"141", ptr %"77", align 4 + %"143" = load float, ptr addrspace(5) %"93", align 4 + %"144" = load float, ptr addrspace(5) %"94", align 4 + %"142" = fcmp ord float %"143", %"144" + store i1 %"142", ptr addrspace(5) %"96", align 1 + %"145" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"145", label %"34", label %"35" -"34": ; preds = %"33", %"32" - %"151" = load i1, ptr addrspace(5) %"100", align 1 - br i1 %"151", label %"36", label %"35" +"34": ; preds = %"33" + store i32 2, ptr addrspace(5) %"95", align 4 + br label %"35" -"35": ; preds = %"34" - store i32 0, ptr addrspace(5) %"99", align 4 - br label %"36" +"35": ; preds = %"34", %"33" + %"147" = load i1, ptr addrspace(5) %"96", align 1 + br i1 %"147", label %"37", label %"36" -"36": ; preds = %"35", %"34" - %"153" = load i64, ptr addrspace(5) %"90", align 4 - %"166" = inttoptr i64 %"153" to ptr - %"80" = getelementptr inbounds i8, ptr %"166", i64 12 - %"154" = load i32, ptr addrspace(5) %"99", align 4 - store i32 %"154", ptr %"80", align 4 +"36": ; preds = %"35" + store i32 0, ptr addrspace(5) %"95", align 4 + br label %"37" + +"37": ; preds = %"36", %"35" + %"149" = load i64, ptr addrspace(5) %"86", align 4 + %"162" = inttoptr i64 %"149" to ptr + %"81" = getelementptr inbounds i8, 
ptr %"162", i64 12 + %"150" = load i32, ptr addrspace(5) %"95", align 4 + store i32 %"150", ptr %"81", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_ptr_32.ll b/ptx/src/test/ll/shared_ptr_32.ll index ecba0ca..5a6f55f 100644 --- a/ptx/src/test/ll/shared_ptr_32.ll +++ b/ptx/src/test/ll/shared_ptr_32.ll @@ -1,49 +1,40 @@ @shared_mem1 = external addrspace(3) global [128 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { +define amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i32, align 4, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"46" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(4) %"40", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 - store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"43", align 4 - %"50" = load i64, ptr 
addrspace(5) %"41", align 4 - %"58" = inttoptr i64 %"50" to ptr addrspace(1) - %"49" = load i64, ptr addrspace(1) %"58", align 4 - store i64 %"49", ptr addrspace(5) %"44", align 4 - %"51" = load i32, ptr addrspace(5) %"43", align 4 - %"52" = load i64, ptr addrspace(5) %"44", align 4 - %"59" = inttoptr i32 %"51" to ptr addrspace(3) - store i64 %"52", ptr addrspace(3) %"59", align 4 - %"53" = load i32, ptr addrspace(5) %"43", align 4 - %"60" = inttoptr i32 %"53" to ptr addrspace(3) - %"32" = getelementptr inbounds i8, ptr addrspace(3) %"60", i64 0 - %"54" = load i64, ptr addrspace(3) %"32", align 4 - store i64 %"54", ptr addrspace(5) %"45", align 4 - %"55" = load i64, ptr addrspace(5) %"42", align 4 - %"56" = load i64, ptr addrspace(5) %"45", align 4 - %"61" = inttoptr i64 %"55" to ptr addrspace(1) - store i64 %"56", ptr addrspace(1) %"61", align 4 + br label %"34" + +"34": ; preds = %1 + %"42" = load i64, ptr addrspace(4) %"35", align 4 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + store i32 ptrtoint (ptr addrspace(3) @shared_mem1 to i32), ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"37", align 4 + %"54" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load i64, ptr addrspace(1) %"54", align 4 + store i64 %"45", ptr addrspace(5) %"40", align 4 + %"47" = load i32, ptr addrspace(5) %"39", align 4 + %"48" = load i64, ptr addrspace(5) %"40", align 4 + %"55" = inttoptr i32 %"47" to ptr addrspace(3) + store i64 %"48", ptr addrspace(3) %"55", align 4 + %"49" = load i32, ptr addrspace(5) %"39", align 4 + %"56" = inttoptr i32 %"49" to ptr addrspace(3) + %"33" = getelementptr inbounds i8, ptr addrspace(3) %"56", i64 0 + %"50" = load i64, ptr addrspace(3) %"33", align 4 + store i64 %"50", ptr addrspace(5) %"41", align 4 + %"51" = load i64, ptr addrspace(5) %"38", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 4 + %"57" = inttoptr 
i64 %"51" to ptr addrspace(1) + store i64 %"52", ptr addrspace(1) %"57", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_ptr_take_address.ll b/ptx/src/test/ll/shared_ptr_take_address.ll index a5a250d..b075ccb 100644 --- a/ptx/src/test/ll/shared_ptr_take_address.ll +++ b/ptx/src/test/ll/shared_ptr_take_address.ll @@ -1,48 +1,39 @@ @shared_mem = external addrspace(3) global [0 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { +define amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"44" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"44", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"45", ptr addrspace(5) %"40", align 4 - store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) 
%"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"56" = inttoptr i64 %"48" to ptr addrspace(1) - %"47" = load i64, ptr addrspace(1) %"56", align 4 - store i64 %"47", ptr addrspace(5) %"42", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"50" = load i64, ptr addrspace(5) %"42", align 4 - %"57" = inttoptr i64 %"49" to ptr addrspace(3) - store i64 %"50", ptr addrspace(3) %"57", align 4 - %"52" = load i64, ptr addrspace(5) %"41", align 4 - %"58" = inttoptr i64 %"52" to ptr addrspace(3) - %"51" = load i64, ptr addrspace(3) %"58", align 4 - store i64 %"51", ptr addrspace(5) %"43", align 4 - %"53" = load i64, ptr addrspace(5) %"40", align 4 - %"54" = load i64, ptr addrspace(5) %"43", align 4 - %"59" = inttoptr i64 %"53" to ptr addrspace(1) - store i64 %"54", ptr addrspace(1) %"59", align 4 + br label %"32" + +"32": ; preds = %1 + %"40" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"40", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"41", ptr addrspace(5) %"36", align 4 + store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"52" = inttoptr i64 %"44" to ptr addrspace(1) + %"43" = load i64, ptr addrspace(1) %"52", align 4 + store i64 %"43", ptr addrspace(5) %"38", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %"53" = inttoptr i64 %"45" to ptr addrspace(3) + store i64 %"46", ptr addrspace(3) %"53", align 4 + %"48" = load i64, ptr addrspace(5) %"37", align 4 + %"54" = inttoptr i64 %"48" to ptr addrspace(3) + %"47" = load i64, ptr addrspace(3) %"54", align 4 + store i64 %"47", ptr addrspace(5) %"39", align 4 + %"49" = load i64, ptr addrspace(5) %"36", align 4 + %"50" = load i64, ptr addrspace(5) %"39", align 4 + %"55" = inttoptr i64 %"49" to ptr addrspace(1) + store i64 %"50", ptr addrspace(1) %"55", align 4 ret void } 
-attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_unify_extern.ll b/ptx/src/test/ll/shared_unify_extern.ll index 68309bf..4020f92 100644 --- a/ptx/src/test/ll/shared_unify_extern.ll +++ b/ptx/src/test/ll/shared_unify_extern.ll @@ -1,25 +1,16 @@ @shared_ex = external addrspace(3) global [0 x i32] @shared_mod = external addrspace(3) global [4 x i32] -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define i64 @__zluda_ptx_impl_add() #0 { +define i64 @add() #0 { %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"41" + +"41": ; preds = %1 %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 store i64 %"49", ptr addrspace(5) %"47", align 4 %"50" = load i64, ptr addrspace(3) @shared_ex, align 4 @@ -32,19 +23,25 @@ define i64 @__zluda_ptx_impl_add() #0 { ret i64 %2 } -define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15") #0 { +define i64 @set_shared_temp1(i64 %"15") #0 { %"54" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"42" + +"42": ; preds = %1 store i64 %"15", ptr addrspace(3) @shared_ex, align 4 - %"55" = call i64 @__zluda_ptx_impl_add() + %"55" = call i64 @add() store i64 %"55", ptr addrspace(5) %"54", align 4 + br label %"43" + +"43": ; preds = %"42" %2 = load i64, ptr addrspace(5) %"54", align 4 ret i64 %2 } -define amdgpu_kernel void 
@shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #0 { +define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56", ptr addrspace(4) byref(i64) %"57") #1 { %"58" = alloca i64, align 8, addrspace(5) %"59" = alloca i64, align 8, addrspace(5) %"60" = alloca i64, align 8, addrspace(5) @@ -52,6 +49,9 @@ define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56" br label %1 1: ; preds = %0 + br label %"44" + +"44": ; preds = %1 %"62" = load i64, ptr addrspace(4) %"56", align 4 store i64 %"62", ptr addrspace(5) %"58", align 4 %"63" = load i64, ptr addrspace(4) %"57", align 4 @@ -62,14 +62,17 @@ define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56" store i64 %"64", ptr addrspace(5) %"60", align 4 %"66" = load i64, ptr addrspace(5) %"58", align 4 %"79" = inttoptr i64 %"66" to ptr addrspace(1) - %"39" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8 - %"67" = load i64, ptr addrspace(1) %"39", align 4 + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"79", i64 8 + %"67" = load i64, ptr addrspace(1) %"40", align 4 store i64 %"67", ptr addrspace(5) %"61", align 4 %"68" = load i64, ptr addrspace(5) %"61", align 4 store i64 %"68", ptr addrspace(3) @shared_mod, align 4 %"70" = load i64, ptr addrspace(5) %"60", align 4 - %"81" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"70") + %"81" = call i64 @set_shared_temp1(i64 %"70") store i64 %"81", ptr addrspace(5) %"61", align 4 + br label %"45" + +"45": ; preds = %"44" %"71" = load i64, ptr addrspace(5) %"59", align 4 %"72" = load i64, ptr addrspace(5) %"61", align 4 %"83" = inttoptr i64 %"71" to ptr @@ -77,4 +80,5 @@ define amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"56" ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" 
"denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_unify_local.ll b/ptx/src/test/ll/shared_unify_local.ll index 56a5bbb..ef4b605 100644 --- a/ptx/src/test/ll/shared_unify_local.ll +++ b/ptx/src/test/ll/shared_unify_local.ll @@ -1,24 +1,15 @@ @shared_ex = external addrspace(3) global [0 x i32] @shared_mod = external addrspace(3) global i64, align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define i64 @__zluda_ptx_impl_add(i64 %"10") #0 { +define i64 @add(i64 %"10") #0 { %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"42" + +"42": ; preds = %1 store i64 %"10", ptr addrspace(3) @shared_mod, align 4 %"49" = load i64, ptr addrspace(3) @shared_mod, align 4 store i64 %"49", ptr addrspace(5) %"48", align 4 @@ -30,19 +21,25 @@ define i64 @__zluda_ptx_impl_add(i64 %"10") #0 { ret i64 %2 } -define i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"15", i64 %"16") #0 { +define i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 { %"52" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 + br label %"43" + +"43": ; preds = %1 store i64 %"15", ptr addrspace(3) @shared_ex, align 4 - %"53" = call i64 @__zluda_ptx_impl_add(i64 %"16") + %"53" = call i64 @add(i64 %"16") store i64 %"53", ptr addrspace(5) %"52", align 4 + br label %"44" + +"44": ; preds = %"43" %2 = load i64, ptr addrspace(5) %"52", align 4 ret i64 %2 } -define amdgpu_kernel void 
@shared_unify_local(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { +define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #1 { %"56" = alloca i64, align 8, addrspace(5) %"57" = alloca i64, align 8, addrspace(5) %"58" = alloca i64, align 8, addrspace(5) @@ -50,6 +47,9 @@ define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", br label %1 1: ; preds = %0 + br label %"45" + +"45": ; preds = %1 %"60" = load i64, ptr addrspace(4) %"54", align 4 store i64 %"60", ptr addrspace(5) %"56", align 4 %"61" = load i64, ptr addrspace(4) %"55", align 4 @@ -60,13 +60,16 @@ define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", store i64 %"62", ptr addrspace(5) %"58", align 4 %"64" = load i64, ptr addrspace(5) %"56", align 4 %"76" = inttoptr i64 %"64" to ptr addrspace(1) - %"40" = getelementptr inbounds i8, ptr addrspace(1) %"76", i64 8 - %"65" = load i64, ptr addrspace(1) %"40", align 4 + %"41" = getelementptr inbounds i8, ptr addrspace(1) %"76", i64 8 + %"65" = load i64, ptr addrspace(1) %"41", align 4 store i64 %"65", ptr addrspace(5) %"59", align 4 %"67" = load i64, ptr addrspace(5) %"58", align 4 %"68" = load i64, ptr addrspace(5) %"59", align 4 - %"77" = call i64 @__zluda_ptx_impl_set_shared_temp1(i64 %"67", i64 %"68") + %"77" = call i64 @set_shared_temp1(i64 %"67", i64 %"68") store i64 %"77", ptr addrspace(5) %"59", align 4 + br label %"46" + +"46": ; preds = %"45" %"69" = load i64, ptr addrspace(5) %"57", align 4 %"70" = load i64, ptr addrspace(5) %"59", align 4 %"79" = inttoptr i64 %"69" to ptr @@ -74,4 +77,5 @@ define amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"54", ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shared_variable.ll b/ptx/src/test/ll/shared_variable.ll index f71fcc8..821ac7e 100644 --- a/ptx/src/test/ll/shared_variable.ll +++ b/ptx/src/test/ll/shared_variable.ll @@ -1,42 +1,33 @@ @shared_mem1 = external addrspace(3) global [128 x i8], align 4 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr addrspace(1) - %"44" = load i64, ptr addrspace(1) %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"40", align 4 - store i64 %"46", ptr addrspace(3) @shared_mem1, align 4 
- %"47" = load i64, ptr addrspace(3) @shared_mem1, align 4 - store i64 %"47", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = inttoptr i64 %"48" to ptr addrspace(1) - store i64 %"49", ptr addrspace(1) %"53", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr addrspace(1) + %"40" = load i64, ptr addrspace(1) %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"36", align 4 + store i64 %"42", ptr addrspace(3) @shared_mem1, align 4 + %"43" = load i64, ptr addrspace(3) @shared_mem1, align 4 + store i64 %"43", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr addrspace(1) + store i64 %"45", ptr addrspace(1) %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shl.ll b/ptx/src/test/ll/shl.ll index 1b0d8bf..d1e8022 100644 --- a/ptx/src/test/ll/shl.ll +++ b/ptx/src/test/ll/shl.ll @@ -1,40 +1,31 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 
- -define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %2 = shl i64 %"47", 2 - %"51" = select i1 false, i64 0, i64 %2 - store i64 %"51", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"53", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %2 = shl i64 %"43", 2 + %"47" = select i1 false, i64 0, i64 %2 + store i64 %"47", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr 
addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/shr.ll b/ptx/src/test/ll/shr.ll index 6b2cecd..bbb8f9c 100644 --- a/ptx/src/test/ll/shr.ll +++ b/ptx/src/test/ll/shr.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @shr(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"41" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"48" = inttoptr i64 %"43" to ptr - %"42" = load i32, ptr %"48", align 4 - store i32 %"42", ptr addrspace(5) %"39", align 4 - %"45" = load i32, ptr addrspace(5) %"39", align 4 - %2 = ashr i32 %"45", 1 - %"44" = select i1 false, i32 0, i32 %2 - store i32 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr 
addrspace(5) %"38", align 4 - %"47" = load i32, ptr addrspace(5) %"39", align 4 - %"49" = inttoptr i64 %"46" to ptr - store i32 %"47", ptr %"49", align 4 + br label %"30" + +"30": ; preds = %1 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"44" = inttoptr i64 %"39" to ptr + %"38" = load i32, ptr %"44", align 4 + store i32 %"38", ptr addrspace(5) %"35", align 4 + %"41" = load i32, ptr addrspace(5) %"35", align 4 + %2 = ashr i32 %"41", 1 + %"40" = select i1 false, i32 0, i32 %2 + store i32 %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"43" = load i32, ptr addrspace(5) %"35", align 4 + %"45" = inttoptr i64 %"42" to ptr + store i32 %"43", ptr %"45", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sign_extend.ll b/ptx/src/test/ll/sign_extend.ll index 0a29187..1d8ed20 100644 --- a/ptx/src/test/ll/sign_extend.ll +++ b/ptx/src/test/ll/sign_extend.ll @@ -1,36 +1,27 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca i32, 
align 4, addrspace(5) +define amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"46" = inttoptr i64 %"42" to ptr - %"45" = load i16, ptr %"46", align 2 - %"41" = sext i16 %"45" to i32 - store i32 %"41", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(5) %"37", align 4 - %"44" = load i32, ptr addrspace(5) %"38", align 4 - %"47" = inttoptr i64 %"43" to ptr - store i32 %"44", ptr %"47", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"42" = inttoptr i64 %"38" to ptr + %"41" = load i16, ptr %"42", align 2 + %"37" = sext i16 %"41" to i32 + store i32 %"37", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"40" = load i32, ptr addrspace(5) %"34", align 4 + %"43" = inttoptr i64 %"39" to ptr + store i32 %"40", ptr %"43", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/sin.ll b/ptx/src/test/ll/sin.ll index 656dbad..922256b 100644 --- a/ptx/src/test/ll/sin.ll +++ 
b/ptx/src/test/ll/sin.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call afn float @llvm.sin.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) 
%"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call afn float @llvm.sin.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.sin.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/sqrt.ll b/ptx/src/test/ll/sqrt.ll index fe56dfe..2497375 100644 --- a/ptx/src/test/ll/sqrt.ll +++ b/ptx/src/test/ll/sqrt.ll @@ -1,42 +1,33 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { - %"36" = alloca i64, align 8, addrspace(5) - %"37" = alloca i64, align 8, addrspace(5) - %"38" = alloca float, align 4, addrspace(5) +define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { + %"32" = alloca i64, align 8, addrspace(5) + %"33" = alloca i64, align 8, addrspace(5) + %"34" = alloca float, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"39" = 
load i64, ptr addrspace(4) %"34", align 4 - store i64 %"39", ptr addrspace(5) %"36", align 4 - %"40" = load i64, ptr addrspace(4) %"35", align 4 - store i64 %"40", ptr addrspace(5) %"37", align 4 - %"42" = load i64, ptr addrspace(5) %"36", align 4 - %"47" = inttoptr i64 %"42" to ptr - %"41" = load float, ptr %"47", align 4 - store float %"41", ptr addrspace(5) %"38", align 4 - %"44" = load float, ptr addrspace(5) %"38", align 4 - %"43" = call float @llvm.amdgcn.sqrt.f32(float %"44") - store float %"43", ptr addrspace(5) %"38", align 4 - %"45" = load i64, ptr addrspace(5) %"37", align 4 - %"46" = load float, ptr addrspace(5) %"38", align 4 - %"48" = inttoptr i64 %"45" to ptr - store float %"46", ptr %"48", align 4 + br label %"29" + +"29": ; preds = %1 + %"35" = load i64, ptr addrspace(4) %"30", align 4 + store i64 %"35", ptr addrspace(5) %"32", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 4 + store i64 %"36", ptr addrspace(5) %"33", align 4 + %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"43" = inttoptr i64 %"38" to ptr + %"37" = load float, ptr %"43", align 4 + store float %"37", ptr addrspace(5) %"34", align 4 + %"40" = load float, ptr addrspace(5) %"34", align 4 + %"39" = call float @llvm.amdgcn.sqrt.f32(float %"40") + store float %"39", ptr addrspace(5) %"34", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"42" = load float, ptr addrspace(5) %"34", align 4 + %"44" = inttoptr i64 %"41" to ptr + store float %"42", ptr %"44", align 4 ret void } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.amdgcn.sqrt.f32(float) #1 -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" 
"uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid.ll b/ptx/src/test/ll/stateful_ld_st_ntid.ll index cbdb89a..c100da6 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid.ll @@ -1,58 +1,55 @@ declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { - %"40" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { + %"38" = alloca i64, align 8, addrspace(5) + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i32, align 4, addrspace(5) %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"64" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"64", ptr addrspace(5) %"40", align 4 - %"65" = load i64, ptr addrspace(4) %"39", align 4 - store i64 %"65", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"40", align 4 - %2 = inttoptr i64 %"48" to ptr - %"47" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"47", ptr addrspace(5) %"40", align 8 - %"50" = load i64, ptr addrspace(5) %"41", align 4 - %3 = inttoptr i64 %"50" to ptr - %"49" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"49", ptr addrspace(5) %"41", align 8 - %"31" = call i32 
@__zluda_ptx_impl_sreg_tid(i8 0) - store i32 %"31", ptr addrspace(5) %"42", align 4 - %"53" = load i32, ptr addrspace(5) %"42", align 4 - %"52" = zext i32 %"53" to i64 - store i64 %"52", ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"40", align 4 - %"56" = load i64, ptr addrspace(5) %"43", align 4 - %"66" = add i64 %"55", %"56" - store i64 %"66", ptr addrspace(5) %"40", align 4 - %"58" = load i64, ptr addrspace(5) %"41", align 4 - %"59" = load i64, ptr addrspace(5) %"43", align 4 - %"68" = add i64 %"58", %"59" - store i64 %"68", ptr addrspace(5) %"41", align 4 - %"61" = load i64, ptr addrspace(5) %"40", align 4 - %"70" = inttoptr i64 %"61" to ptr addrspace(1) - %"60" = load i64, ptr addrspace(1) %"70", align 4 - store i64 %"60", ptr addrspace(5) %"44", align 4 - %"62" = load i64, ptr addrspace(5) %"41", align 4 - %"63" = load i64, ptr addrspace(5) %"44", align 4 - %"71" = inttoptr i64 %"62" to ptr addrspace(1) - store i64 %"63", ptr addrspace(1) %"71", align 4 + br label %"33" + +"33": ; preds = %1 + %"62" = load i64, ptr addrspace(4) %"36", align 4 + store i64 %"62", ptr addrspace(5) %"38", align 4 + %"63" = load i64, ptr addrspace(4) %"37", align 4 + store i64 %"63", ptr addrspace(5) %"39", align 4 + %"46" = load i64, ptr addrspace(5) %"38", align 4 + %2 = inttoptr i64 %"46" to ptr + %"45" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"45", ptr addrspace(5) %"38", align 8 + %"48" = load i64, ptr addrspace(5) %"39", align 4 + %3 = inttoptr i64 %"48" to ptr + %"47" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"47", ptr addrspace(5) %"39", align 8 + %"32" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + br label %"34" + +"34": ; preds = %"33" + store i32 %"32", ptr addrspace(5) %"40", align 4 + %"51" = load i32, ptr addrspace(5) %"40", align 4 + %"50" = zext i32 %"51" to i64 + store i64 %"50", ptr addrspace(5) %"41", align 4 + %"53" = load i64, ptr addrspace(5) %"38", align 4 + %"54" = load i64, ptr 
addrspace(5) %"41", align 4 + %"64" = add i64 %"53", %"54" + store i64 %"64", ptr addrspace(5) %"38", align 4 + %"56" = load i64, ptr addrspace(5) %"39", align 4 + %"57" = load i64, ptr addrspace(5) %"41", align 4 + %"66" = add i64 %"56", %"57" + store i64 %"66", ptr addrspace(5) %"39", align 4 + %"59" = load i64, ptr addrspace(5) %"38", align 4 + %"68" = inttoptr i64 %"59" to ptr addrspace(1) + %"58" = load i64, ptr addrspace(1) %"68", align 4 + store i64 %"58", ptr addrspace(5) %"42", align 4 + %"60" = load i64, ptr addrspace(5) %"39", align 4 + %"61" = load i64, ptr addrspace(5) %"42", align 4 + %"69" = inttoptr i64 %"60" to ptr addrspace(1) + store i64 %"61", ptr addrspace(1) %"69", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll index 1ac5a5f..c1a59c6 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll @@ -1,62 +1,59 @@ declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { +define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"40", 
ptr addrspace(4) byref(i64) %"41") #1 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) %"44" = alloca i64, align 8, addrspace(5) %"45" = alloca i64, align 8, addrspace(5) %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) + %"48" = alloca i32, align 4, addrspace(5) %"49" = alloca i64, align 8, addrspace(5) - %"50" = alloca i32, align 4, addrspace(5) - %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"72" = load i64, ptr addrspace(4) %"42", align 4 + br label %"37" + +"37": ; preds = %1 + %"70" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"70", ptr addrspace(5) %"42", align 4 + %"71" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"71", ptr addrspace(5) %"45", align 4 + %"54" = load i64, ptr addrspace(5) %"42", align 4 + %2 = inttoptr i64 %"54" to ptr + %"53" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"53", ptr addrspace(5) %"43", align 8 + %"56" = load i64, ptr addrspace(5) %"45", align 4 + %3 = inttoptr i64 %"56" to ptr + %"55" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"55", ptr addrspace(5) %"46", align 8 + %"36" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + br label %"38" + +"38": ; preds = %"37" + store i32 %"36", ptr addrspace(5) %"48", align 4 + %"59" = load i32, ptr addrspace(5) %"48", align 4 + %"58" = zext i32 %"59" to i64 + store i64 %"58", ptr addrspace(5) %"49", align 4 + %"61" = load i64, ptr addrspace(5) %"43", align 4 + %"62" = load i64, ptr addrspace(5) %"49", align 4 + %"72" = add i64 %"61", %"62" store i64 %"72", ptr addrspace(5) %"44", align 4 - %"73" = load i64, ptr addrspace(4) %"43", align 4 - store i64 %"73", ptr addrspace(5) %"47", align 4 - %"56" = load i64, ptr addrspace(5) %"44", align 4 - %2 = inttoptr i64 %"56" to ptr - %"55" = 
addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"55", ptr addrspace(5) %"45", align 8 - %"58" = load i64, ptr addrspace(5) %"47", align 4 - %3 = inttoptr i64 %"58" to ptr - %"57" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"57", ptr addrspace(5) %"48", align 8 - %"35" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) - store i32 %"35", ptr addrspace(5) %"50", align 4 - %"61" = load i32, ptr addrspace(5) %"50", align 4 - %"60" = zext i32 %"61" to i64 - store i64 %"60", ptr addrspace(5) %"51", align 4 - %"63" = load i64, ptr addrspace(5) %"45", align 4 - %"64" = load i64, ptr addrspace(5) %"51", align 4 - %"74" = add i64 %"63", %"64" - store i64 %"74", ptr addrspace(5) %"46", align 4 - %"66" = load i64, ptr addrspace(5) %"48", align 4 - %"67" = load i64, ptr addrspace(5) %"51", align 4 - %"76" = add i64 %"66", %"67" - store i64 %"76", ptr addrspace(5) %"49", align 4 - %"69" = load i64, ptr addrspace(5) %"46", align 4 - %"78" = inttoptr i64 %"69" to ptr addrspace(1) - %"68" = load i64, ptr addrspace(1) %"78", align 4 - store i64 %"68", ptr addrspace(5) %"52", align 4 - %"70" = load i64, ptr addrspace(5) %"49", align 4 - %"71" = load i64, ptr addrspace(5) %"52", align 4 - %"79" = inttoptr i64 %"70" to ptr addrspace(1) - store i64 %"71", ptr addrspace(1) %"79", align 4 + %"64" = load i64, ptr addrspace(5) %"46", align 4 + %"65" = load i64, ptr addrspace(5) %"49", align 4 + %"74" = add i64 %"64", %"65" + store i64 %"74", ptr addrspace(5) %"47", align 4 + %"67" = load i64, ptr addrspace(5) %"44", align 4 + %"76" = inttoptr i64 %"67" to ptr addrspace(1) + %"66" = load i64, ptr addrspace(1) %"76", align 4 + store i64 %"66", ptr addrspace(5) %"50", align 4 + %"68" = load i64, ptr addrspace(5) %"47", align 4 + %"69" = load i64, ptr addrspace(5) %"50", align 4 + %"77" = inttoptr i64 %"68" to ptr addrspace(1) + store i64 %"69", ptr addrspace(1) %"77", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" 
"no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll index 8a07146..dd54c84 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll @@ -1,64 +1,61 @@ declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { +define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) %"49" = alloca i64, align 8, addrspace(5) %"50" = alloca i64, align 8, addrspace(5) %"51" = alloca i64, align 8, addrspace(5) - %"52" = alloca i64, align 8, addrspace(5) + %"52" = alloca i32, align 4, addrspace(5) %"53" = alloca i64, align 8, addrspace(5) - %"54" = alloca i32, align 4, addrspace(5) - %"55" = alloca i64, align 8, addrspace(5) - %"56" = alloca i64, align 8, addrspace(5) + %"54" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"76" = load i64, ptr addrspace(4) %"46", align 4 + br label %"41" + +"41": ; preds = %1 + %"74" = load i64, ptr addrspace(4) %"44", align 4 + store 
i64 %"74", ptr addrspace(5) %"46", align 4 + %"75" = load i64, ptr addrspace(4) %"45", align 4 + store i64 %"75", ptr addrspace(5) %"49", align 4 + %"58" = load i64, ptr addrspace(5) %"46", align 4 + %2 = inttoptr i64 %"58" to ptr + %"57" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"57", ptr addrspace(5) %"47", align 8 + %"60" = load i64, ptr addrspace(5) %"49", align 4 + %3 = inttoptr i64 %"60" to ptr + %"59" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"59", ptr addrspace(5) %"50", align 8 + %"36" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) + br label %"42" + +"42": ; preds = %"41" + store i32 %"36", ptr addrspace(5) %"52", align 4 + %"63" = load i32, ptr addrspace(5) %"52", align 4 + %"62" = zext i32 %"63" to i64 + store i64 %"62", ptr addrspace(5) %"53", align 4 + %"65" = load i64, ptr addrspace(5) %"47", align 4 + %"66" = load i64, ptr addrspace(5) %"53", align 4 + %"76" = sub i64 %"65", %"66" store i64 %"76", ptr addrspace(5) %"48", align 4 - %"77" = load i64, ptr addrspace(4) %"47", align 4 - store i64 %"77", ptr addrspace(5) %"51", align 4 - %"60" = load i64, ptr addrspace(5) %"48", align 4 - %2 = inttoptr i64 %"60" to ptr - %"59" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"59", ptr addrspace(5) %"49", align 8 - %"62" = load i64, ptr addrspace(5) %"51", align 4 - %3 = inttoptr i64 %"62" to ptr - %"61" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"61", ptr addrspace(5) %"52", align 8 - %"35" = call i32 @__zluda_ptx_impl_sreg_tid(i8 0) - store i32 %"35", ptr addrspace(5) %"54", align 4 - %"65" = load i32, ptr addrspace(5) %"54", align 4 - %"64" = zext i32 %"65" to i64 - store i64 %"64", ptr addrspace(5) %"55", align 4 - %"67" = load i64, ptr addrspace(5) %"49", align 4 - %"68" = load i64, ptr addrspace(5) %"55", align 4 - %"78" = sub i64 %"67", %"68" - store i64 %"78", ptr addrspace(5) %"50", align 4 - %"70" = load i64, ptr addrspace(5) %"52", align 4 - %"71" = 
load i64, ptr addrspace(5) %"55", align 4 - %"81" = sub i64 %"70", %"71" - store i64 %"81", ptr addrspace(5) %"53", align 4 - %"72" = load i64, ptr addrspace(5) %"50", align 4 - %"84" = inttoptr i64 %"72" to ptr addrspace(1) - %"37" = getelementptr inbounds i8, ptr addrspace(1) %"84", i64 0 - %"73" = load i64, ptr addrspace(1) %"37", align 4 - store i64 %"73", ptr addrspace(5) %"56", align 4 - %"74" = load i64, ptr addrspace(5) %"53", align 4 - %"85" = inttoptr i64 %"74" to ptr addrspace(1) - %"39" = getelementptr inbounds i8, ptr addrspace(1) %"85", i64 0 - %"75" = load i64, ptr addrspace(5) %"56", align 4 - store i64 %"75", ptr addrspace(1) %"39", align 4 + %"68" = load i64, ptr addrspace(5) %"50", align 4 + %"69" = load i64, ptr addrspace(5) %"53", align 4 + %"79" = sub i64 %"68", %"69" + store i64 %"79", ptr addrspace(5) %"51", align 4 + %"70" = load i64, ptr addrspace(5) %"48", align 4 + %"82" = inttoptr i64 %"70" to ptr addrspace(1) + %"38" = getelementptr inbounds i8, ptr addrspace(1) %"82", i64 0 + %"71" = load i64, ptr addrspace(1) %"38", align 4 + store i64 %"71", ptr addrspace(5) %"54", align 4 + %"72" = load i64, ptr addrspace(5) %"51", align 4 + %"83" = inttoptr i64 %"72" to ptr addrspace(1) + %"40" = getelementptr inbounds i8, ptr addrspace(1) %"83", i64 0 + %"73" = load i64, ptr addrspace(5) %"54", align 4 + store i64 %"73", ptr addrspace(1) %"40", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_ld_st_simple.ll 
b/ptx/src/test/ll/stateful_ld_st_simple.ll index 09d064b..f945ee2 100644 --- a/ptx/src/test/ll/stateful_ld_st_simple.ll +++ b/ptx/src/test/ll/stateful_ld_st_simple.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { +define amdgpu_kernel void @stateful_ld_st_simple(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"43", ptr addrspace(5) %"38", align 4 - %"44" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"44", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 4 - %2 = inttoptr i64 %"46" to ptr - %"53" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"53", ptr addrspace(5) %"40", align 8 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %3 = inttoptr i64 %"48" to ptr - %"55" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"55", ptr addrspace(5) %"41", align 8 - %"50" = load i64, ptr addrspace(5) %"40", align 4 - %"57" = inttoptr i64 %"50" to ptr addrspace(1) - %"49" = load i64, ptr addrspace(1) %"57", align 4 - store i64 %"49", ptr addrspace(5) %"42", align 4 - %"51" = load i64, ptr 
addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"42", align 4 - %"58" = inttoptr i64 %"51" to ptr addrspace(1) - store i64 %"52", ptr addrspace(1) %"58", align 4 + br label %"31" + +"31": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"39", ptr addrspace(5) %"34", align 4 + %"40" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"40", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 4 + %2 = inttoptr i64 %"42" to ptr + %"49" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"49", ptr addrspace(5) %"36", align 8 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %3 = inttoptr i64 %"44" to ptr + %"51" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"51", ptr addrspace(5) %"37", align 8 + %"46" = load i64, ptr addrspace(5) %"36", align 4 + %"53" = inttoptr i64 %"46" to ptr addrspace(1) + %"45" = load i64, ptr addrspace(1) %"53", align 4 + store i64 %"45", ptr addrspace(5) %"38", align 4 + %"47" = load i64, ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"38", align 4 + %"54" = inttoptr i64 %"47" to ptr addrspace(1) + store i64 %"48", ptr addrspace(1) %"54", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/stateful_neg_offset.ll b/ptx/src/test/ll/stateful_neg_offset.ll index 38abb0d..d51943d 100644 --- a/ptx/src/test/ll/stateful_neg_offset.ll +++ b/ptx/src/test/ll/stateful_neg_offset.ll @@ -1,54 +1,45 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 
@__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { +define amdgpu_kernel void @stateful_neg_offset(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) + %"38" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5) %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"44" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"45" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"45", ptr addrspace(5) %"39", align 4 - %"46" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"46", ptr addrspace(5) %"40", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %2 = inttoptr i64 %"48" to ptr - %"61" = addrspacecast ptr %2 to ptr addrspace(1) - store ptr addrspace(1) %"61", ptr addrspace(5) %"41", align 8 - %"50" = load i64, ptr addrspace(5) %"40", align 4 - %3 = inttoptr i64 %"50" to ptr - %"63" = addrspacecast ptr %3 to ptr addrspace(1) - store ptr addrspace(1) %"63", ptr addrspace(5) %"42", align 8 - %"52" = load i64, ptr addrspace(5) %"41", align 4 - %"53" = load i64, ptr addrspace(5) %"42", align 4 - %"51" = add i64 %"52", %"53" - store i64 %"51", ptr addrspace(5) %"43", align 4 - %"55" = load i64, ptr addrspace(5) %"41", align 4 - %"56" = load i64, ptr addrspace(5) %"42", align 4 - %"54" = sub i64 %"55", %"56" - store i64 %"54", ptr addrspace(5) %"43", align 4 - %"58" = load i64, ptr addrspace(5) %"41", align 4 - %"65" = inttoptr i64 %"58" to ptr addrspace(1) - %"57" = load i64, ptr addrspace(1) %"65", align 4 - store i64 
%"57", ptr addrspace(5) %"44", align 4 - %"59" = load i64, ptr addrspace(5) %"42", align 4 - %"60" = load i64, ptr addrspace(5) %"44", align 4 - %"66" = inttoptr i64 %"59" to ptr addrspace(1) - store i64 %"60", ptr addrspace(1) %"66", align 4 + br label %"32" + +"32": ; preds = %1 + %"41" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"41", ptr addrspace(5) %"35", align 4 + %"42" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"42", ptr addrspace(5) %"36", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %2 = inttoptr i64 %"44" to ptr + %"57" = addrspacecast ptr %2 to ptr addrspace(1) + store ptr addrspace(1) %"57", ptr addrspace(5) %"37", align 8 + %"46" = load i64, ptr addrspace(5) %"36", align 4 + %3 = inttoptr i64 %"46" to ptr + %"59" = addrspacecast ptr %3 to ptr addrspace(1) + store ptr addrspace(1) %"59", ptr addrspace(5) %"38", align 8 + %"48" = load i64, ptr addrspace(5) %"37", align 4 + %"49" = load i64, ptr addrspace(5) %"38", align 4 + %"47" = add i64 %"48", %"49" + store i64 %"47", ptr addrspace(5) %"39", align 4 + %"51" = load i64, ptr addrspace(5) %"37", align 4 + %"52" = load i64, ptr addrspace(5) %"38", align 4 + %"50" = sub i64 %"51", %"52" + store i64 %"50", ptr addrspace(5) %"39", align 4 + %"54" = load i64, ptr addrspace(5) %"37", align 4 + %"61" = inttoptr i64 %"54" to ptr addrspace(1) + %"53" = load i64, ptr addrspace(1) %"61", align 4 + store i64 %"53", ptr addrspace(5) %"40", align 4 + %"55" = load i64, ptr addrspace(5) %"38", align 4 + %"56" = load i64, ptr addrspace(5) %"40", align 4 + %"62" = inttoptr i64 %"55" to ptr addrspace(1) + store i64 %"56", ptr addrspace(1) %"62", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at 
end of file diff --git a/ptx/src/test/ll/sub.ll b/ptx/src/test/ll/sub.ll index 31b5801..eafd223 100644 --- a/ptx/src/test/ll/sub.ll +++ b/ptx/src/test/ll/sub.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load i64, ptr %"50", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"47" = load i64, ptr addrspace(5) %"40", align 4 - %"46" = sub i64 %"47", 1 - store i64 %"46", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i64, ptr addrspace(5) %"41", align 4 - %"51" = inttoptr i64 %"48" to ptr - store i64 %"49", ptr %"51", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr 
addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load i64, ptr %"46", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"36", align 4 + %"42" = sub i64 %"43", 1 + store i64 %"42", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i64, ptr addrspace(5) %"37", align 4 + %"47" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"47", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector.ll b/ptx/src/test/ll/vector.ll index e909c7a..95cb569 100644 --- a/ptx/src/test/ll/vector.ll +++ b/ptx/src/test/ll/vector.ll @@ -1,79 +1,77 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define <2 x i32> @__zluda_ptx_impl_impl(<2 x i32> %"9") #0 { - %"49" = alloca <2 x i32>, align 8, addrspace(5) - %"50" = alloca <2 x i32>, align 8, addrspace(5) - %"51" = alloca i32, align 4, addrspace(5) - %"52" = alloca i32, align 4, addrspace(5) +define <2 x i32> @impl(<2 x i32> %"9") #0 { + %"47" = alloca <2 x i32>, align 8, addrspace(5) + %"48" = alloca <2 x i32>, align 8, addrspace(5) + %"49" = alloca i32, align 4, addrspace(5) + %"50" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"37" = extractelement <2 x i32> %"9", i8 0 - store i32 %"37", ptr addrspace(5) %"51", align 4 - %"38" = extractelement <2 x i32> %"9", i8 1 - 
store i32 %"38", ptr addrspace(5) %"52", align 4 - %"56" = load i32, ptr addrspace(5) %"51", align 4 - %"57" = load i32, ptr addrspace(5) %"52", align 4 - %"55" = add i32 %"56", %"57" - store i32 %"55", ptr addrspace(5) %"52", align 4 - %"58" = load i32, ptr addrspace(5) %"52", align 4 - %"60" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"59" = insertelement <2 x i32> %"60", i32 %"58", i8 0 - store <2 x i32> %"59", ptr addrspace(5) %"50", align 8 - %"61" = load i32, ptr addrspace(5) %"52", align 4 - %"63" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"62" = insertelement <2 x i32> %"63", i32 %"61", i8 1 - store <2 x i32> %"62", ptr addrspace(5) %"50", align 8 - %"64" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"42" = extractelement <2 x i32> %"64", i8 1 - %"66" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - %"65" = insertelement <2 x i32> %"66", i32 %"42", i8 0 - store <2 x i32> %"65", ptr addrspace(5) %"50", align 8 - %"68" = load <2 x i32>, ptr addrspace(5) %"50", align 8 - store <2 x i32> %"68", ptr addrspace(5) %"49", align 8 - %2 = load <2 x i32>, ptr addrspace(5) %"49", align 8 + br label %"44" + +"44": ; preds = %1 + %"38" = extractelement <2 x i32> %"9", i8 0 + store i32 %"38", ptr addrspace(5) %"49", align 4 + %"39" = extractelement <2 x i32> %"9", i8 1 + store i32 %"39", ptr addrspace(5) %"50", align 4 + %"54" = load i32, ptr addrspace(5) %"49", align 4 + %"55" = load i32, ptr addrspace(5) %"50", align 4 + %"53" = add i32 %"54", %"55" + store i32 %"53", ptr addrspace(5) %"50", align 4 + %"56" = load i32, ptr addrspace(5) %"50", align 4 + %"58" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"57" = insertelement <2 x i32> %"58", i32 %"56", i8 0 + store <2 x i32> %"57", ptr addrspace(5) %"48", align 8 + %"59" = load i32, ptr addrspace(5) %"50", align 4 + %"61" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"60" = insertelement <2 x i32> %"61", i32 %"59", i8 1 + store <2 x i32> %"60", ptr addrspace(5) %"48", align 8 + 
%"62" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"43" = extractelement <2 x i32> %"62", i8 1 + %"64" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + %"63" = insertelement <2 x i32> %"64", i32 %"43", i8 0 + store <2 x i32> %"63", ptr addrspace(5) %"48", align 8 + %"66" = load <2 x i32>, ptr addrspace(5) %"48", align 8 + store <2 x i32> %"66", ptr addrspace(5) %"47", align 8 + %2 = load <2 x i32>, ptr addrspace(5) %"47", align 8 ret <2 x i32> %2 } -define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"69", ptr addrspace(4) byref(i64) %"70") #0 { - %"71" = alloca i64, align 8, addrspace(5) - %"72" = alloca i64, align 8, addrspace(5) - %"73" = alloca <2 x i32>, align 8, addrspace(5) - %"74" = alloca i32, align 4, addrspace(5) - %"75" = alloca i32, align 4, addrspace(5) - %"76" = alloca i64, align 8, addrspace(5) +define amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"67", ptr addrspace(4) byref(i64) %"68") #1 { + %"69" = alloca i64, align 8, addrspace(5) + %"70" = alloca i64, align 8, addrspace(5) + %"71" = alloca <2 x i32>, align 8, addrspace(5) + %"72" = alloca i32, align 4, addrspace(5) + %"73" = alloca i32, align 4, addrspace(5) + %"74" = alloca i64, align 8, addrspace(5) br label %1 1: ; preds = %0 - %"77" = load i64, ptr addrspace(4) %"69", align 4 - store i64 %"77", ptr addrspace(5) %"71", align 4 - %"78" = load i64, ptr addrspace(4) %"70", align 4 - store i64 %"78", ptr addrspace(5) %"72", align 4 - %"80" = load i64, ptr addrspace(5) %"71", align 4 - %"87" = inttoptr i64 %"80" to ptr - %"79" = load <2 x i32>, ptr %"87", align 8 - store <2 x i32> %"79", ptr addrspace(5) %"73", align 8 - %"82" = load <2 x i32>, ptr addrspace(5) %"73", align 8 - %"81" = call <2 x i32> @__zluda_ptx_impl_impl(<2 x i32> %"82") - store <2 x i32> %"81", ptr addrspace(5) %"73", align 8 - %"84" = load <2 x i32>, ptr addrspace(5) %"73", align 8 - %"88" = bitcast <2 x i32> %"84" to i64 - store i64 %"88", ptr addrspace(5) %"76", align 4 - %"85" = load i64, 
ptr addrspace(5) %"72", align 4 - %"86" = load <2 x i32>, ptr addrspace(5) %"73", align 8 - %"89" = inttoptr i64 %"85" to ptr - store <2 x i32> %"86", ptr %"89", align 8 + br label %"45" + +"45": ; preds = %1 + %"75" = load i64, ptr addrspace(4) %"67", align 4 + store i64 %"75", ptr addrspace(5) %"69", align 4 + %"76" = load i64, ptr addrspace(4) %"68", align 4 + store i64 %"76", ptr addrspace(5) %"70", align 4 + %"78" = load i64, ptr addrspace(5) %"69", align 4 + %"85" = inttoptr i64 %"78" to ptr + %"77" = load <2 x i32>, ptr %"85", align 8 + store <2 x i32> %"77", ptr addrspace(5) %"71", align 8 + %"80" = load <2 x i32>, ptr addrspace(5) %"71", align 8 + %"79" = call <2 x i32> @impl(<2 x i32> %"80") + store <2 x i32> %"79", ptr addrspace(5) %"71", align 8 + br label %"46" + +"46": ; preds = %"45" + %"82" = load <2 x i32>, ptr addrspace(5) %"71", align 8 + %"86" = bitcast <2 x i32> %"82" to i64 + store i64 %"86", ptr addrspace(5) %"74", align 4 + %"83" = load i64, ptr addrspace(5) %"70", align 4 + %"84" = load <2 x i32>, ptr addrspace(5) %"71", align 8 + %"87" = inttoptr i64 %"83" to ptr + store <2 x i32> %"84", ptr %"87", align 8 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector4.ll b/ptx/src/test/ll/vector4.ll index 1b8ce24..cf32621 100644 --- a/ptx/src/test/ll/vector4.ll +++ b/ptx/src/test/ll/vector4.ll @@ -1,39 +1,30 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - 
-declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca <4 x i32>, align 16, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { + %"34" = alloca i64, align 8, addrspace(5) + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca <4 x i32>, align 16, addrspace(5) + %"37" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"42" = load i64, ptr addrspace(4) %"36", align 4 - store i64 %"42", ptr addrspace(5) %"38", align 4 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"45" = load i64, ptr addrspace(5) %"38", align 4 - %"50" = inttoptr i64 %"45" to ptr - %"44" = load <4 x i32>, ptr %"50", align 16 - store <4 x i32> %"44", ptr addrspace(5) %"40", align 16 - %"46" = load <4 x i32>, ptr addrspace(5) %"40", align 16 - %"29" = extractelement <4 x i32> %"46", i8 3 - store i32 %"29", ptr addrspace(5) %"41", align 4 - %"48" = load i64, ptr addrspace(5) %"39", align 4 - %"49" = load i32, ptr addrspace(5) %"41", align 4 - %"53" = inttoptr i64 %"48" to ptr - store i32 %"49", ptr %"53", align 4 + br label %"31" + +"31": ; preds = %1 + %"38" = load i64, ptr addrspace(4) %"32", align 4 + store i64 %"38", ptr addrspace(5) %"34", align 4 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"41" = load i64, ptr addrspace(5) %"34", align 4 + %"46" = inttoptr i64 %"41" to ptr + %"40" = load <4 x i32>, ptr %"46", align 16 + store <4 x i32> %"40", ptr addrspace(5) %"36", align 16 + %"42" = load <4 x i32>, ptr addrspace(5) %"36", align 
16 + %"30" = extractelement <4 x i32> %"42", i8 3 + store i32 %"30", ptr addrspace(5) %"37", align 4 + %"44" = load i64, ptr addrspace(5) %"35", align 4 + %"45" = load i32, ptr addrspace(5) %"37", align 4 + %"49" = inttoptr i64 %"44" to ptr + store i32 %"45", ptr %"49", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/vector_extract.ll b/ptx/src/test/ll/vector_extract.ll index a106da8..9c615ca 100644 --- a/ptx/src/test/ll/vector_extract.ll +++ b/ptx/src/test/ll/vector_extract.ll @@ -1,95 +1,86 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) - %"48" = alloca i16, align 2, addrspace(5) - %"49" = alloca i16, align 2, addrspace(5) - %"50" = alloca i16, align 2, addrspace(5) - %"51" = alloca i16, align 2, addrspace(5) - %"52" = alloca <4 x i16>, align 8, addrspace(5) +define amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) + %"44" = alloca i16, align 2, addrspace(5) + %"45" = alloca i16, align 2, addrspace(5) + %"46" = alloca i16, align 2, addrspace(5) + %"47" = alloca i16, align 2, addrspace(5) + %"48" = alloca <4 x i16>, align 8, 
addrspace(5) br label %1 1: ; preds = %0 - %"53" = load i64, ptr addrspace(4) %"44", align 4 - store i64 %"53", ptr addrspace(5) %"46", align 4 - %"54" = load i64, ptr addrspace(4) %"45", align 4 - store i64 %"54", ptr addrspace(5) %"47", align 4 - %"55" = load i64, ptr addrspace(5) %"46", align 4 - %"83" = inttoptr i64 %"55" to ptr addrspace(1) - %"32" = load <4 x i8>, ptr addrspace(1) %"83", align 4 - %"84" = extractelement <4 x i8> %"32", i8 0 - %"85" = extractelement <4 x i8> %"32", i8 1 - %"86" = extractelement <4 x i8> %"32", i8 2 - %"87" = extractelement <4 x i8> %"32", i8 3 - %"56" = zext i8 %"84" to i16 - %"57" = zext i8 %"85" to i16 - %"58" = zext i8 %"86" to i16 - %"59" = zext i8 %"87" to i16 - store i16 %"56", ptr addrspace(5) %"48", align 2 - store i16 %"57", ptr addrspace(5) %"49", align 2 - store i16 %"58", ptr addrspace(5) %"50", align 2 - store i16 %"59", ptr addrspace(5) %"51", align 2 - %"60" = load i16, ptr addrspace(5) %"49", align 2 - %"61" = load i16, ptr addrspace(5) %"50", align 2 - %"62" = load i16, ptr addrspace(5) %"51", align 2 - %"63" = load i16, ptr addrspace(5) %"48", align 2 - %2 = insertelement <4 x i16> undef, i16 %"60", i8 0 - %3 = insertelement <4 x i16> %2, i16 %"61", i8 1 - %4 = insertelement <4 x i16> %3, i16 %"62", i8 2 - %"33" = insertelement <4 x i16> %4, i16 %"63", i8 3 - store <4 x i16> %"33", ptr addrspace(5) %"52", align 8 - %"65" = load <4 x i16>, ptr addrspace(5) %"52", align 8 - %"66" = extractelement <4 x i16> %"65", i8 0 - %"67" = extractelement <4 x i16> %"65", i8 1 - %"68" = extractelement <4 x i16> %"65", i8 2 - %"69" = extractelement <4 x i16> %"65", i8 3 - store i16 %"66", ptr addrspace(5) %"50", align 2 - store i16 %"67", ptr addrspace(5) %"51", align 2 - store i16 %"68", ptr addrspace(5) %"48", align 2 - store i16 %"69", ptr addrspace(5) %"49", align 2 - %"70" = load i16, ptr addrspace(5) %"50", align 2 - %"71" = load i16, ptr addrspace(5) %"51", align 2 - %"72" = load i16, ptr addrspace(5) %"48", align 2 - 
%"73" = load i16, ptr addrspace(5) %"49", align 2 - %5 = insertelement <4 x i16> undef, i16 %"70", i8 0 - %6 = insertelement <4 x i16> %5, i16 %"71", i8 1 - %7 = insertelement <4 x i16> %6, i16 %"72", i8 2 - %"36" = insertelement <4 x i16> %7, i16 %"73", i8 3 - %"74" = extractelement <4 x i16> %"36", i8 0 - %"75" = extractelement <4 x i16> %"36", i8 1 - %"76" = extractelement <4 x i16> %"36", i8 2 - %"77" = extractelement <4 x i16> %"36", i8 3 - store i16 %"74", ptr addrspace(5) %"51", align 2 - store i16 %"75", ptr addrspace(5) %"48", align 2 - store i16 %"76", ptr addrspace(5) %"49", align 2 - store i16 %"77", ptr addrspace(5) %"50", align 2 - %"78" = load i16, ptr addrspace(5) %"48", align 2 - %"79" = load i16, ptr addrspace(5) %"49", align 2 - %"80" = load i16, ptr addrspace(5) %"50", align 2 - %"81" = load i16, ptr addrspace(5) %"51", align 2 - %"88" = trunc i16 %"78" to i8 - %"89" = trunc i16 %"79" to i8 - %"90" = trunc i16 %"80" to i8 - %"91" = trunc i16 %"81" to i8 - %8 = insertelement <4 x i8> undef, i8 %"88", i8 0 - %9 = insertelement <4 x i8> %8, i8 %"89", i8 1 - %10 = insertelement <4 x i8> %9, i8 %"90", i8 2 - %"37" = insertelement <4 x i8> %10, i8 %"91", i8 3 - %"82" = load i64, ptr addrspace(5) %"47", align 4 - %"92" = inttoptr i64 %"82" to ptr addrspace(1) - store <4 x i8> %"37", ptr addrspace(1) %"92", align 4 + br label %"39" + +"39": ; preds = %1 + %"49" = load i64, ptr addrspace(4) %"40", align 4 + store i64 %"49", ptr addrspace(5) %"42", align 4 + %"50" = load i64, ptr addrspace(4) %"41", align 4 + store i64 %"50", ptr addrspace(5) %"43", align 4 + %"51" = load i64, ptr addrspace(5) %"42", align 4 + %"79" = inttoptr i64 %"51" to ptr addrspace(1) + %"33" = load <4 x i8>, ptr addrspace(1) %"79", align 4 + %"80" = extractelement <4 x i8> %"33", i8 0 + %"81" = extractelement <4 x i8> %"33", i8 1 + %"82" = extractelement <4 x i8> %"33", i8 2 + %"83" = extractelement <4 x i8> %"33", i8 3 + %"52" = zext i8 %"80" to i16 + %"53" = zext i8 %"81" to i16 + 
%"54" = zext i8 %"82" to i16 + %"55" = zext i8 %"83" to i16 + store i16 %"52", ptr addrspace(5) %"44", align 2 + store i16 %"53", ptr addrspace(5) %"45", align 2 + store i16 %"54", ptr addrspace(5) %"46", align 2 + store i16 %"55", ptr addrspace(5) %"47", align 2 + %"56" = load i16, ptr addrspace(5) %"45", align 2 + %"57" = load i16, ptr addrspace(5) %"46", align 2 + %"58" = load i16, ptr addrspace(5) %"47", align 2 + %"59" = load i16, ptr addrspace(5) %"44", align 2 + %2 = insertelement <4 x i16> undef, i16 %"56", i8 0 + %3 = insertelement <4 x i16> %2, i16 %"57", i8 1 + %4 = insertelement <4 x i16> %3, i16 %"58", i8 2 + %"34" = insertelement <4 x i16> %4, i16 %"59", i8 3 + store <4 x i16> %"34", ptr addrspace(5) %"48", align 8 + %"61" = load <4 x i16>, ptr addrspace(5) %"48", align 8 + %"62" = extractelement <4 x i16> %"61", i8 0 + %"63" = extractelement <4 x i16> %"61", i8 1 + %"64" = extractelement <4 x i16> %"61", i8 2 + %"65" = extractelement <4 x i16> %"61", i8 3 + store i16 %"62", ptr addrspace(5) %"46", align 2 + store i16 %"63", ptr addrspace(5) %"47", align 2 + store i16 %"64", ptr addrspace(5) %"44", align 2 + store i16 %"65", ptr addrspace(5) %"45", align 2 + %"66" = load i16, ptr addrspace(5) %"46", align 2 + %"67" = load i16, ptr addrspace(5) %"47", align 2 + %"68" = load i16, ptr addrspace(5) %"44", align 2 + %"69" = load i16, ptr addrspace(5) %"45", align 2 + %5 = insertelement <4 x i16> undef, i16 %"66", i8 0 + %6 = insertelement <4 x i16> %5, i16 %"67", i8 1 + %7 = insertelement <4 x i16> %6, i16 %"68", i8 2 + %"37" = insertelement <4 x i16> %7, i16 %"69", i8 3 + %"70" = extractelement <4 x i16> %"37", i8 0 + %"71" = extractelement <4 x i16> %"37", i8 1 + %"72" = extractelement <4 x i16> %"37", i8 2 + %"73" = extractelement <4 x i16> %"37", i8 3 + store i16 %"70", ptr addrspace(5) %"47", align 2 + store i16 %"71", ptr addrspace(5) %"44", align 2 + store i16 %"72", ptr addrspace(5) %"45", align 2 + store i16 %"73", ptr addrspace(5) %"46", align 2 
+ %"74" = load i16, ptr addrspace(5) %"44", align 2 + %"75" = load i16, ptr addrspace(5) %"45", align 2 + %"76" = load i16, ptr addrspace(5) %"46", align 2 + %"77" = load i16, ptr addrspace(5) %"47", align 2 + %"84" = trunc i16 %"74" to i8 + %"85" = trunc i16 %"75" to i8 + %"86" = trunc i16 %"76" to i8 + %"87" = trunc i16 %"77" to i8 + %8 = insertelement <4 x i8> undef, i8 %"84", i8 0 + %9 = insertelement <4 x i8> %8, i8 %"85", i8 1 + %10 = insertelement <4 x i8> %9, i8 %"86", i8 2 + %"38" = insertelement <4 x i8> %10, i8 %"87", i8 3 + %"78" = load i64, ptr addrspace(5) %"43", align 4 + %"88" = inttoptr i64 %"78" to ptr addrspace(1) + store <4 x i8> %"38", ptr addrspace(1) %"88", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/xor.ll b/ptx/src/test/ll/xor.ll index 859decb..6f9633d 100644 --- a/ptx/src/test/ll/xor.ll +++ b/ptx/src/test/ll/xor.ll @@ -1,45 +1,36 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0 - -declare i32 @__zluda_ptx_impl_sreg_clock() #0 - -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 - -define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { - %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i64, align 8, addrspace(5) - %"41" = alloca i32, align 4, addrspace(5) - %"42" = alloca i32, align 4, addrspace(5) +define amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 { + %"35" = alloca i64, align 8, addrspace(5) + %"36" = alloca i64, align 8, addrspace(5) + %"37" = 
alloca i32, align 4, addrspace(5) + %"38" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - %"43" = load i64, ptr addrspace(4) %"37", align 4 - store i64 %"43", ptr addrspace(5) %"39", align 4 - %"44" = load i64, ptr addrspace(4) %"38", align 4 - store i64 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"39", align 4 - %"54" = inttoptr i64 %"46" to ptr - %"45" = load i32, ptr %"54", align 4 - store i32 %"45", ptr addrspace(5) %"41", align 4 - %"47" = load i64, ptr addrspace(5) %"39", align 4 - %"55" = inttoptr i64 %"47" to ptr - %"30" = getelementptr inbounds i8, ptr %"55", i64 4 - %"48" = load i32, ptr %"30", align 4 - store i32 %"48", ptr addrspace(5) %"42", align 4 - %"50" = load i32, ptr addrspace(5) %"41", align 4 - %"51" = load i32, ptr addrspace(5) %"42", align 4 - %"49" = xor i32 %"50", %"51" - store i32 %"49", ptr addrspace(5) %"41", align 4 - %"52" = load i64, ptr addrspace(5) %"40", align 4 - %"53" = load i32, ptr addrspace(5) %"41", align 4 - %"56" = inttoptr i64 %"52" to ptr - store i32 %"53", ptr %"56", align 4 + br label %"32" + +"32": ; preds = %1 + %"39" = load i64, ptr addrspace(4) %"33", align 4 + store i64 %"39", ptr addrspace(5) %"35", align 4 + %"40" = load i64, ptr addrspace(4) %"34", align 4 + store i64 %"40", ptr addrspace(5) %"36", align 4 + %"42" = load i64, ptr addrspace(5) %"35", align 4 + %"50" = inttoptr i64 %"42" to ptr + %"41" = load i32, ptr %"50", align 4 + store i32 %"41", ptr addrspace(5) %"37", align 4 + %"43" = load i64, ptr addrspace(5) %"35", align 4 + %"51" = inttoptr i64 %"43" to ptr + %"31" = getelementptr inbounds i8, ptr %"51", i64 4 + %"44" = load i32, ptr %"31", align 4 + store i32 %"44", ptr addrspace(5) %"38", align 4 + %"46" = load i32, ptr addrspace(5) %"37", align 4 + %"47" = load i32, ptr addrspace(5) %"38", align 4 + %"45" = xor i32 %"46", %"47" + store i32 %"45", ptr addrspace(5) %"37", align 4 + %"48" = load i64, ptr addrspace(5) %"36", align 4 + %"49" = load i32, 
ptr addrspace(5) %"37", align 4 + %"52" = inttoptr i64 %"48" to ptr + store i32 %"49", ptr %"52", align 4 ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/spirv_run/add_ftz.ptx b/ptx/src/test/spirv_run/add_ftz.ptx new file mode 100644 index 0000000..6909e96 --- /dev/null +++ b/ptx/src/test/spirv_run/add_ftz.ptx @@ -0,0 +1,24 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry add_ftz( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .f32 temp<4>; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.f32 temp0, [in_addr]; + ld.f32 temp1, [in_addr+4]; + add.ftz.f32 temp2, temp0, temp1; + add.f32 temp3, temp0, temp1; + st.f32 [out_addr], temp2; + st.f32 [out_addr+4], temp3; + ret; +} diff --git a/ptx/src/test/spirv_run/call_rnd.ptx b/ptx/src/test/spirv_run/call_rnd.ptx new file mode 100644 index 0000000..440c99f --- /dev/null +++ b/ptx/src/test/spirv_run/call_rnd.ptx @@ -0,0 +1,72 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.func (.param.f32 output) add_rm (.param.f32 x, .param.f32 y); +.func (.param.f32 output) add_rp (.param.f32 x, .param.f32 y); + +.visible .entry call_rnd( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .f32 x_1; + .reg .f32 y_1; + .reg .f32 x_2; + .reg .f32 y_2; + .reg .f32 z_1; + .reg .f32 z_2; + .param .f32 x_1_p; + .param .f32 y_1_p; + .param .f32 result_1; + .param .f32 x_2_p; + .param .f32 y_2_p; + .param .f32 result_2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.f32 x_1, [in_addr]; + ld.f32 y_1, [in_addr+4]; + ld.f32 x_2, [in_addr+8]; + ld.f32 y_2, 
[in_addr+12]; + st.param.f32 [x_1_p], x_1; + st.param.f32 [y_1_p], y_1; + call (result_1), add_rp, (x_1_p, y_1_p); + ld.param.f32 z_1, [result_1]; + //add.rp.f32 z_1, x_1, y_1; + st.f32 [out_addr], z_1; + st.param.f32 [x_2_p], x_2; + st.param.f32 [y_2_p], y_2; + call (result_2), add_rm, (x_2_p, y_2_p); + ld.param.f32 z_2, [result_2]; + //add.rm.f32 z_2, x_2, y_2; + st.f32 [out_addr+4], z_2; + ret; +} + +.func (.param.f32 output) add_rm (.param.f32 x, .param.f32 y) { + .reg .f32 x_value; + .reg .f32 y_value; + + ld.param.f32 x_value, [x]; + ld.param.f32 y_value, [y]; + add.rm.f32 x_value, x_value, y_value; + st.param.f32 [output], x_value; + + ret; +} + +.func (.param.f32 output) add_rp (.param.f32 x, .param.f32 y) { + .reg .f32 x_value; + .reg .f32 y_value; + + ld.param.f32 x_value, [x]; + ld.param.f32 y_value, [y]; + add.rp.f32 x_value, x_value, y_value; + st.param.f32 [output], x_value; + + ret; +} diff --git a/ptx/src/test/spirv_run/malformed_label.ptx b/ptx/src/test/spirv_run/malformed_label.ptx new file mode 100644 index 0000000..cb41a7c --- /dev/null +++ b/ptx/src/test/spirv_run/malformed_label.ptx @@ -0,0 +1,27 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry malformed_label( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + bra BB0; +// this basic block does not start with a label + ld.u64 temp, [out_addr]; + +BB0: + ld.u64 temp, [in_addr]; + add.u64 temp2, temp, 1; + st.u64 [out_addr], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index a57f71e..cafa480 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -1,5 +1,6 @@ use crate::pass; use hip_runtime_sys::hipError_t; +use pretty_assertions; use std::env; use std::error; use std::ffi::{CStr, CString}; @@ -10,7 +11,6 @@ use std::mem; use std::path::Path; use 
std::ptr; use std::str; -use pretty_assertions; macro_rules! test_ptx { ($fn_name:ident, $input:expr, $output:expr) => { @@ -195,6 +195,26 @@ test_ptx!(activemask, [0u32], [1u32]); test_ptx!(membar, [152731u32], [152731u32]); test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]); test_ptx!(shared_unify_local, [16752u64, 714u64], [17466u64]); +// This test currently fails for reasons outside of ZLUDA's control. +// One of the LLVM passes does not understand that setreg instruction changes +// global floating point state and assumes that both floating point +// additions are the exact same expressions and optimizes second addition away. +test_ptx!( + add_ftz, + [f32::from_bits(0x800000), f32::from_bits(0x007FFFFF)], + [0x800000u32, 0xFFFFFF] +); +test_ptx!(malformed_label, [2u64], [3u64]); +test_ptx!( + call_rnd, + [ + 1.0f32, + f32::from_bits(0x33800000), + 1.0f32, + f32::from_bits(0x33800000) + ], + [1.0000001, 1.0f32] +); test_ptx!(assertfail); test_ptx!(func_ptr); @@ -238,12 +258,10 @@ fn test_hip_assert< Ok(()) } -fn test_llvm_assert< - 'a, ->( +fn test_llvm_assert<'a>( name: &str, ptx_text: &'a str, - expected_ll: &str + expected_ll: &str, ) -> Result<(), Box> { let ast = ptx_parser::parse_module_checked(ptx_text).unwrap(); let llvm_ir = pass::to_llvm_module(ast).unwrap(); @@ -258,7 +276,7 @@ fn test_llvm_assert< let mut output_file = File::create(output_file).unwrap(); output_file.write_all(actual_ll.as_bytes()).unwrap(); } - let comparison = pretty_assertions::StrComparison::new(actual_ll, expected_ll); + let comparison = pretty_assertions::StrComparison::new(expected_ll, actual_ll); panic!("assertion failed: `(left == right)`\n\n{}", comparison); } Ok(()) diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs index 3c0db16..55b950a 100644 --- a/ptx_parser/src/ast.rs +++ b/ptx_parser/src/ast.rs @@ -1028,9 +1028,16 @@ pub struct ArithInteger { #[derive(Copy, Clone)] pub struct ArithFloat { pub type_: ScalarType, - pub rounding: Option, + pub 
rounding: RoundingMode, pub flush_to_zero: Option, pub saturate: bool, + // From PTX documentation: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-add + // Note that an add instruction with an explicit rounding modifier is treated conservatively by + // the code optimizer. An add instruction with no rounding modifier defaults to + // round-to-nearest-even and may be optimized aggressively by the code optimizer. In particular, + // mul/add sequences with no rounding modifiers may be optimized to use fused-multiply-add + // instructions on the target device. + pub is_fusable: bool, } #[derive(Copy, Clone, PartialEq, Eq)] @@ -1042,7 +1049,7 @@ pub enum LdStQualifier { Release(MemScope), } -#[derive(PartialEq, Eq, Copy, Clone)] +#[derive(PartialEq, Eq, Copy, Clone, Debug)] pub enum RoundingMode { NearestEven, Zero, @@ -1456,6 +1463,7 @@ pub struct CvtDetails { pub mode: CvtMode, } +#[derive(Clone, Copy)] pub enum CvtMode { // int from int ZeroExtend, @@ -1474,7 +1482,7 @@ pub enum CvtMode { flush_to_zero: Option, }, FPRound { - integer_rounding: Option, + integer_rounding: RoundingMode, flush_to_zero: Option, }, // int from float @@ -1528,7 +1536,7 @@ impl CvtDetails { flush_to_zero, }, Ordering::Equal => CvtMode::FPRound { - integer_rounding: rounding, + integer_rounding: rounding.unwrap_or(RoundingMode::NearestEven), flush_to_zero, }, Ordering::Greater => { diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 12f8a4b..da46a8c 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -1909,9 +1909,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f32, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: rnd.is_none() } ), arguments: AddArgs { @@ -1924,9 +1925,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f64, 
- rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: rnd.is_none() } ), arguments: AddArgs { @@ -1943,9 +1945,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f16, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: rnd.is_none() } ), arguments: AddArgs { @@ -1958,9 +1961,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f16x2, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: rnd.is_none() } ), arguments: AddArgs { @@ -1973,9 +1977,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: bf16, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: rnd.is_none() } ), arguments: AddArgs { @@ -1988,9 +1993,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: bf16x2, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: rnd.is_none() } ), arguments: AddArgs { @@ -2035,9 +2041,10 @@ derive_parser!( data: ast::MulDetails::Float ( ast::ArithFloat { type_: f32, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), saturate: sat, + is_fusable: rnd.is_none() } ), arguments: MulArgs { dst: d, src1: a, src2: b } @@ -2048,9 +2055,10 @@ derive_parser!( data: ast::MulDetails::Float ( ast::ArithFloat { type_: f64, - rounding: rnd.map(Into::into), + rounding: 
rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, saturate: false, + is_fusable: rnd.is_none() } ), arguments: MulArgs { dst: d, src1: a, src2: b } @@ -2064,9 +2072,10 @@ derive_parser!( data: ast::MulDetails::Float ( ast::ArithFloat { type_: f16, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), saturate: sat, + is_fusable: rnd.is_none() } ), arguments: MulArgs { dst: d, src1: a, src2: b } @@ -2077,9 +2086,10 @@ derive_parser!( data: ast::MulDetails::Float ( ast::ArithFloat { type_: f16x2, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), saturate: sat, + is_fusable: rnd.is_none() } ), arguments: MulArgs { dst: d, src1: a, src2: b } @@ -2090,9 +2100,10 @@ derive_parser!( data: ast::MulDetails::Float ( ast::ArithFloat { type_: bf16, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, saturate: false, + is_fusable: rnd.is_none() } ), arguments: MulArgs { dst: d, src1: a, src2: b } @@ -2103,9 +2114,10 @@ derive_parser!( data: ast::MulDetails::Float ( ast::ArithFloat { type_: bf16x2, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, saturate: false, + is_fusable: rnd.is_none() } ), arguments: MulArgs { dst: d, src1: a, src2: b } @@ -2389,9 +2401,10 @@ derive_parser!( data: ast::MadDetails::Float( ast::ArithFloat { type_: f32, - rounding: None, + rounding: ast::RoundingMode::NearestEven, flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: false } ), arguments: MadArgs { dst: d, src1: a, src2: b, src3: c } @@ -2402,9 +2415,10 @@ derive_parser!( data: ast::MadDetails::Float( ast::ArithFloat { type_: f32, - rounding: Some(rnd.into()), + rounding: rnd.into(), flush_to_zero: Some(ftz), - saturate: sat 
+ saturate: sat, + is_fusable: false } ), arguments: MadArgs { dst: d, src1: a, src2: b, src3: c } @@ -2415,9 +2429,10 @@ derive_parser!( data: ast::MadDetails::Float( ast::ArithFloat { type_: f64, - rounding: Some(rnd.into()), + rounding: rnd.into(), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: false } ), arguments: MadArgs { dst: d, src1: a, src2: b, src3: c } @@ -2432,9 +2447,10 @@ derive_parser!( ast::Instruction::Fma { data: ast::ArithFloat { type_: f32, - rounding: Some(rnd.into()), + rounding: rnd.into(), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: false }, arguments: FmaArgs { dst: d, src1: a, src2: b, src3: c } } @@ -2443,9 +2459,10 @@ derive_parser!( ast::Instruction::Fma { data: ast::ArithFloat { type_: f64, - rounding: Some(rnd.into()), + rounding: rnd.into(), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: false }, arguments: FmaArgs { dst: d, src1: a, src2: b, src3: c } } @@ -2457,9 +2474,10 @@ derive_parser!( ast::Instruction::Fma { data: ast::ArithFloat { type_: f16, - rounding: Some(rnd.into()), + rounding: rnd.into(), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: false }, arguments: FmaArgs { dst: d, src1: a, src2: b, src3: c } } @@ -2507,9 +2525,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f32, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: rnd.is_none() } ), arguments: SubArgs { dst: d, src1: a, src2: b } @@ -2520,9 +2539,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f64, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: rnd.is_none() } ), arguments: SubArgs { dst: d, src1: a, src2: b } @@ -2536,9 +2556,10 @@ 
derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f16, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: rnd.is_none() } ), arguments: SubArgs { dst: d, src1: a, src2: b } @@ -2549,9 +2570,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: f16x2, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: Some(ftz), - saturate: sat + saturate: sat, + is_fusable: rnd.is_none() } ), arguments: SubArgs { dst: d, src1: a, src2: b } @@ -2562,9 +2584,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: bf16, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: rnd.is_none() } ), arguments: SubArgs { dst: d, src1: a, src2: b } @@ -2575,9 +2598,10 @@ derive_parser!( data: ast::ArithDetails::Float( ast::ArithFloat { type_: bf16x2, - rounding: rnd.map(Into::into), + rounding: rnd.map(Into::into).unwrap_or(ast::RoundingMode::NearestEven), flush_to_zero: None, - saturate: false + saturate: false, + is_fusable: rnd.is_none() } ), arguments: SubArgs { dst: d, src1: a, src2: b } @@ -2880,7 +2904,7 @@ derive_parser!( rsqrt.approx.f64 d, a => { ast::Instruction::Rsqrt { data: ast::TypeFtz { - flush_to_zero: None, + flush_to_zero: Some(false), type_: f64 }, arguments: RsqrtArgs { dst: d, src: a } @@ -2889,7 +2913,7 @@ derive_parser!( rsqrt.approx.ftz.f64 d, a => { ast::Instruction::Rsqrt { data: ast::TypeFtz { - flush_to_zero: None, + flush_to_zero: Some(true), type_: f64 }, arguments: RsqrtArgs { dst: d, src: a }