diff --git a/comgr/src/lib.rs b/comgr/src/lib.rs index f5e4e30..9c5671b 100644 --- a/comgr/src/lib.rs +++ b/comgr/src/lib.rs @@ -175,22 +175,35 @@ impl Data { } } +#[derive(Clone, Copy, Debug)] +struct Symbol(u64); + +impl Symbol { + fn comgr2(&self) -> comgr2::amd_comgr_symbol_t { + comgr2::amd_comgr_symbol_s { handle: self.0 } + } + + fn comgr3(&self) -> comgr3::amd_comgr_symbol_t { + comgr3::amd_comgr_symbol_s { handle: self.0 } + } +} + pub fn compile_bitcode( comgr: &Comgr, gcn_arch: &str, main_buffer: &[u8], - attributes_buffer: &[u8], ptx_impl: &[u8], + attributes_buffer: &[u8], compiler_hook: Option<&dyn Fn(&Vec, String)>, ) -> Result, Error> { let bitcode_data_set = DataSet::new(comgr)?; let main_bitcode_data = Data::new(comgr, DataKind::Bc, c"zluda.bc", main_buffer)?; bitcode_data_set.add(&main_bitcode_data)?; + let stdlib_bitcode_data = Data::new(comgr, DataKind::Bc, c"ptx_impl.bc", ptx_impl)?; + bitcode_data_set.add(&stdlib_bitcode_data)?; let attributes_bitcode_data = Data::new(comgr, DataKind::Bc, c"attributes.bc", attributes_buffer)?; bitcode_data_set.add(&attributes_bitcode_data)?; - let stdlib_bitcode_data = Data::new(comgr, DataKind::Bc, c"ptx_impl.bc", ptx_impl)?; - bitcode_data_set.add(&stdlib_bitcode_data)?; let linking_info = ActionInfo::new(comgr)?; let linked_data_set = comgr.do_action(ActionKind::LinkBcToBc, &linking_info, &bitcode_data_set)?; @@ -214,6 +227,9 @@ pub fn compile_bitcode( c"-Xclang", c"-fdenormal-fp-math=dynamic", c"-O3", + // To consider + //c"-mllvm", + //c"-amdgpu-internalize-symbols", c"-mno-wavefrontsize64", c"-mcumode", // Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr @@ -263,6 +279,22 @@ pub fn compile_bitcode( executable } +pub fn get_symbols(comgr: &Comgr, elf: &[u8]) -> Result, Error> { + let elf = Data::new(comgr, DataKind::Executable, c"elf.o", elf)?; + let mut symbols = Vec::new(); + comgr.iterate_symbols(&elf, &mut |symbol| { + let type_ = unsafe { comgr.symbol_get_info::(symbol, SymbolInfo::Type)? }; + let name_length = unsafe { comgr.symbol_get_info::(symbol, SymbolInfo::NameLength)? }; + let mut name = + unsafe { comgr.symbol_get_buffer(symbol, SymbolInfo::Name, name_length as usize + 1)? }; + name.pop(); + let name = String::from_utf8(name).map_err(|_| Error::UNKNOWN)?; + symbols.push((type_, name)); + Ok(()) + })?; + Ok(symbols) +} + pub fn get_clang_version(comgr: &Comgr) -> Result { let version_string_set = DataSet::new(comgr)?; let version_string = Data::new( @@ -310,6 +342,8 @@ pub enum Comgr { V3(amd_comgr_sys::comgr3::Comgr3), } +type SymbolIterator = dyn FnMut(Symbol) -> Result<(), Error>; + impl Comgr { pub fn new() -> Result { unsafe { libloading::Library::new(os::COMGR3) } @@ -351,16 +385,78 @@ impl Comgr { (major, minor) >= (2, 9) } - fn do_action( - &self, + fn do_action<'a>( + &'a self, kind: ActionKind, action: &ActionInfo, data_set: &DataSet, - ) -> Result { + ) -> Result, Error> { let result = DataSet::new(self)?; call_dispatch!(self => amd_comgr_do_action(kind, action, data_set, result)); Ok(result) } + + fn iterate_symbols<'this>( + &'this self, + data: &Data, + mut fn_: &mut (dyn FnMut(Symbol) -> Result<(), Error> + 'this), + ) -> Result<(), Error> { + let thin_pointer = &mut fn_; + match self { + Comgr::V2(comgr) => { + unsafe { + comgr.amd_comgr_iterate_symbols( + data.comgr2(), + Some(Self::iterate_callback_v2), + mem::transmute(thin_pointer), + ) + }?; + } + Comgr::V3(comgr) => { + unsafe { + comgr.amd_comgr_iterate_symbols( + data.comgr3(), + Some(Self::iterate_callback_v3), + mem::transmute(thin_pointer), + ) + }?; + } + } + Ok(()) + } + + extern "C" fn iterate_callback_v3( + symbol: comgr3::amd_comgr_symbol_t, + user_data: *mut ::std::os::raw::c_void, + ) -> Result<(), comgr3::amd_comgr_status_s> { + let user_data = unsafe { mem::transmute::<_, &mut &mut SymbolIterator>(user_data) }; + (*user_data)(Symbol(symbol.handle)).map_err(Into::into) + } + + extern "C" fn iterate_callback_v2( + symbol: comgr2::amd_comgr_symbol_t, + user_data: *mut ::std::os::raw::c_void, + ) -> Result<(), comgr2::amd_comgr_status_s> { + let user_data = unsafe { mem::transmute::<_, &mut &mut SymbolIterator>(user_data) }; + (*user_data)(Symbol(symbol.handle)).map_err(Into::into) + } + + unsafe fn symbol_get_info(&self, symbol: Symbol, attribute: SymbolInfo) -> Result { + let mut value = unsafe { mem::zeroed::() }; + call_dispatch!(self => amd_comgr_symbol_get_info(symbol, attribute, { ptr::from_mut(&mut value).cast() })); + Ok(value) + } + + unsafe fn symbol_get_buffer( + &self, + symbol: Symbol, + attribute: SymbolInfo, + length: usize, + ) -> Result, Error> { + let mut value = vec![0u8; length]; + call_dispatch!(self => amd_comgr_symbol_get_info(symbol, attribute, { value.as_mut_ptr().cast() })); + Ok(value) + } } #[derive(Debug, thiserror::Error)] @@ -388,12 +484,24 @@ impl From for Error { } } +impl Into for Error { + fn into(self) -> comgr2::amd_comgr_status_s { + comgr2::amd_comgr_status_s(self.0) + } +} + impl From for Error { fn from(status: comgr3::amd_comgr_status_s) -> Self { Error(status.0) } } +impl Into for Error { + fn into(self) -> comgr3::amd_comgr_status_s { + comgr3::amd_comgr_status_s(self.0) + } +} + macro_rules! impl_into { ($self_type:ident, $to_type:ident, [$($from:ident => $to:ident),+]) => { #[derive(Copy, Clone)] @@ -405,6 +513,7 @@ macro_rules! impl_into { } impl $self_type { + #[allow(dead_code)] fn comgr2(self) -> comgr2::$to_type { match self { $( @@ -413,6 +522,7 @@ macro_rules! impl_into { } } + #[allow(dead_code)] fn comgr3(self) -> comgr3::$to_type { match self { $( @@ -435,6 +545,31 @@ impl_into!( ] ); +impl_into!( + SymbolType, + amd_comgr_symbol_type_t, + [ + Unknown => AMD_COMGR_SYMBOL_TYPE_UNKNOWN, + NoType => AMD_COMGR_SYMBOL_TYPE_NOTYPE, + Object => AMD_COMGR_SYMBOL_TYPE_OBJECT, + Func => AMD_COMGR_SYMBOL_TYPE_FUNC, + Section => AMD_COMGR_SYMBOL_TYPE_SECTION, + File => AMD_COMGR_SYMBOL_TYPE_FILE, + Common => AMD_COMGR_SYMBOL_TYPE_COMMON, + AmdgpuHsaKernel => AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL + ] +); + +impl_into!( + SymbolInfo, + amd_comgr_symbol_info_t, + [ + NameLength => AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, + Name => AMD_COMGR_SYMBOL_INFO_NAME, + Type => AMD_COMGR_SYMBOL_INFO_TYPE + ] +); + impl_into!( DataKind, amd_comgr_data_kind_t, diff --git a/dark_api/src/fatbin.rs b/dark_api/src/fatbin.rs index 7fb4493..8d7868d 100644 --- a/dark_api/src/fatbin.rs +++ b/dark_api/src/fatbin.rs @@ -106,7 +106,7 @@ impl<'a> FatbinSubmodule<'a> { FatbinSubmodule { header } } - pub fn get_files(&self) -> FatbinFileIterator { + pub fn get_files(&self) -> FatbinFileIterator<'a> { unsafe { FatbinFileIterator::new(self.header) } } } @@ -200,7 +200,7 @@ impl<'a> FatbinFileIterator<'a> { Self { file_buffer } } - pub unsafe fn next(&mut self) -> Result, ParseError> { + pub unsafe fn next(&mut self) -> Result>, ParseError> { if self.file_buffer.len() < std::mem::size_of::() { return Ok(None); } diff --git a/dark_api/src/lib.rs b/dark_api/src/lib.rs index 53c6073..e2ad634 100644 --- a/dark_api/src/lib.rs +++ b/dark_api/src/lib.rs @@ -124,7 +124,7 @@ macro_rules! dark_api { result } - pub fn get(&self, key: &cuda_types::cuda::CUuuid) -> Option { + pub fn get(&self, key: &cuda_types::cuda::CUuuid) -> Option> { match key { $( &Self::$name => { diff --git a/ext/amd_comgr-sys/README b/ext/amd_comgr-sys/README index a3e8e79..1ea190c 100644 --- a/ext/amd_comgr-sys/README +++ b/ext/amd_comgr-sys/README @@ -1,4 +1,9 @@ # On ROCm 6.3 and lower -bindgen --rust-target 1.77 /opt/rocm-6.3.4/include/amd_comgr/amd_comgr.h -o src/comgr2.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr2 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version +wget https://repo.radeon.com/rocm/apt/6.3.4/pool/main/c/comgr6.3.4/comgr6.3.4_2.8.0.60304-76~24.04_amd64.deb +apt install ./rocm-core6.3.4_6.3.4.60304-76~24.04_amd64.deb +wget https://repo.radeon.com/rocm/apt/6.3.4/pool/main/r/rocm-core6.3.4/rocm-core6.3.4_6.3.4.60304-76~24.04_amd64.deb +apt install ./comgr6.3.4_2.8.0.60304-76~24.04_amd64.deb +cargo install bindgen-cli +bindgen --rust-target 1.77 /opt/rocm-6.3.4/include/amd_comgr/amd_comgr.h -o src/comgr2.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr2 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version --allowlist-function amd_comgr_iterate_symbols --allowlist-function amd_comgr_symbol_get_info --allowlist-type amd_comgr_symbol_type_t # On ROCm 6.4 and higher -bindgen --rust-target 1.77 /opt/rocm/include/amd_comgr/amd_comgr.h -o src/comgr3.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr3 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version +bindgen --rust-target 1.77 /opt/rocm/include/amd_comgr/amd_comgr.h -o src/comgr3.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr3 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version --allowlist-function amd_comgr_iterate_symbols --allowlist-function amd_comgr_symbol_get_info --allowlist-type amd_comgr_symbol_type_t diff --git a/ext/amd_comgr-sys/src/comgr2.rs b/ext/amd_comgr-sys/src/comgr2.rs index 9811644..44a4268 100644 --- a/ext/amd_comgr-sys/src/comgr2.rs +++ b/ext/amd_comgr-sys/src/comgr2.rs @@ -1,4 +1,4 @@ -/* automatically generated by rust-bindgen 0.71.1 */ +/* automatically generated by rust-bindgen 0.72.0 */ pub const AMD_COMGR_INTERFACE_VERSION_MAJOR: u32 = 2; pub const AMD_COMGR_INTERFACE_VERSION_MINOR: u32 = 8; @@ -6,13 +6,9 @@ impl amd_comgr_status_s { #[doc = " A generic error has occurred."] pub const AMD_COMGR_STATUS_ERROR: amd_comgr_status_s = amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(1) }); -} -impl amd_comgr_status_s { #[doc = " One of the actual arguments does not meet a precondition stated\n in the documentation of the corresponding formal argument. This\n includes both invalid Action types, and invalid arguments to\n valid Action types."] pub const AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT: amd_comgr_status_s = amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(2) }); -} -impl amd_comgr_status_s { #[doc = " Failed to allocate the necessary resources."] pub const AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES: amd_comgr_status_s = amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(3) }); @@ -25,28 +21,16 @@ type amd_comgr_status_t = Result<(), self::amd_comgr_status_s>; impl amd_comgr_language_s { #[doc = " No high level language."] pub const AMD_COMGR_LANGUAGE_NONE: amd_comgr_language_s = amd_comgr_language_s(0); -} -impl amd_comgr_language_s { #[doc = " OpenCL 1.2."] pub const AMD_COMGR_LANGUAGE_OPENCL_1_2: amd_comgr_language_s = amd_comgr_language_s(1); -} -impl amd_comgr_language_s { #[doc = " OpenCL 2.0."] pub const AMD_COMGR_LANGUAGE_OPENCL_2_0: amd_comgr_language_s = amd_comgr_language_s(2); -} -impl amd_comgr_language_s { #[doc = " AMD Hetrogeneous C++ (HC)."] pub const AMD_COMGR_LANGUAGE_HC: amd_comgr_language_s = amd_comgr_language_s(3); -} -impl amd_comgr_language_s { #[doc = " HIP."] pub const AMD_COMGR_LANGUAGE_HIP: amd_comgr_language_s = amd_comgr_language_s(4); -} -impl amd_comgr_language_s { #[doc = " LLVM IR, either textual (.ll) or bitcode (.bc) format."] pub const AMD_COMGR_LANGUAGE_LLVM_IR: amd_comgr_language_s = amd_comgr_language_s(5); -} -impl amd_comgr_language_s { #[doc = " Marker for last valid language."] pub const AMD_COMGR_LANGUAGE_LAST: amd_comgr_language_s = amd_comgr_language_s(5); } @@ -59,65 +43,35 @@ pub use self::amd_comgr_language_s as amd_comgr_language_t; impl amd_comgr_data_kind_s { #[doc = " No data is available."] pub const AMD_COMGR_DATA_KIND_UNDEF: amd_comgr_data_kind_s = amd_comgr_data_kind_s(0); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a textual main source."] pub const AMD_COMGR_DATA_KIND_SOURCE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(1); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a textual source that is included in the main source\n or other include source."] pub const AMD_COMGR_DATA_KIND_INCLUDE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(2); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a precompiled-header source that is included in the main\n source or other include source."] pub const AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER: amd_comgr_data_kind_s = amd_comgr_data_kind_s(3); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a diagnostic output."] pub const AMD_COMGR_DATA_KIND_DIAGNOSTIC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(4); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a textual log output."] pub const AMD_COMGR_DATA_KIND_LOG: amd_comgr_data_kind_s = amd_comgr_data_kind_s(5); -} -impl amd_comgr_data_kind_s { #[doc = " The data is compiler LLVM IR bit code for a specific isa."] pub const AMD_COMGR_DATA_KIND_BC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(6); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a relocatable machine code object for a specific isa."] pub const AMD_COMGR_DATA_KIND_RELOCATABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(7); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an executable machine code object for a specific\n isa. An executable is the kind of code object that can be loaded\n and executed."] pub const AMD_COMGR_DATA_KIND_EXECUTABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(8); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a block of bytes."] pub const AMD_COMGR_DATA_KIND_BYTES: amd_comgr_data_kind_s = amd_comgr_data_kind_s(9); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a fat binary (clang-offload-bundler output)."] pub const AMD_COMGR_DATA_KIND_FATBIN: amd_comgr_data_kind_s = amd_comgr_data_kind_s(16); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an archive."] pub const AMD_COMGR_DATA_KIND_AR: amd_comgr_data_kind_s = amd_comgr_data_kind_s(17); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a bitcode bundle."] pub const AMD_COMGR_DATA_KIND_BC_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(18); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an archive bundle."] pub const AMD_COMGR_DATA_KIND_AR_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(19); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an object file bundle."] pub const AMD_COMGR_DATA_KIND_OBJ_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(20); -} -impl amd_comgr_data_kind_s { #[doc = " Marker for last valid data kind."] pub const AMD_COMGR_DATA_KIND_LAST: amd_comgr_data_kind_s = amd_comgr_data_kind_s(20); } @@ -151,100 +105,70 @@ pub struct amd_comgr_action_info_s { } #[doc = " @brief A handle to an action information object.\n\n An action information object holds all the necessary information,\n excluding the input data objects, required to perform an action."] pub type amd_comgr_action_info_t = amd_comgr_action_info_s; +#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."] +#[repr(C)] +#[derive(Copy, Clone)] +pub struct amd_comgr_symbol_s { + pub handle: u64, +} +#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."] +pub type amd_comgr_symbol_t = amd_comgr_symbol_s; impl amd_comgr_action_kind_s { #[doc = " Preprocess each source data object in @p input in order. For each\n successful preprocessor invocation, add a source data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the working\n directory path in @p info. Preprocess the source for the language in @p\n info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any preprocessing fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR: amd_comgr_action_kind_s = amd_comgr_action_kind_s(0); -} -impl amd_comgr_action_kind_s { #[doc = " Copy all existing data objects in @p input to @p output, then add the\n device-specific and language-specific precompiled headers required for\n compilation.\n\n Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2\n and @p AMD_COMGR_LANGUAGE_OPENCL_2_0.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language\n is not set in @p info, or the language is not supported."] pub const AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS: amd_comgr_action_kind_s = amd_comgr_action_kind_s(1); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(2); -} -impl amd_comgr_action_kind_s { #[doc = " Copy all existing data objects in @p input to @p output, then add the\n device-specific and language-specific bitcode libraries required for\n compilation.\n\n Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2,\n @p AMD_COMGR_LANGUAGE_OPENCL_2_0, and @p AMD_COMGR_LANGUAGE_HIP.\n\n The options in @p info should be set to a set of language-specific flags.\n For OpenCL and HIP these include:\n\n correctly_rounded_sqrt\n daz_opt\n finite_only\n unsafe_math\n wavefrontsize64\n\n For example, to enable daz_opt and unsafe_math, the options should be set\n as:\n\n const char *options[] = {\"daz_opt, \"unsafe_math\"};\n size_t optionsCount = sizeof(options) / sizeof(options[0]);\n amd_comgr_action_info_set_option_list(info, options, optionsCount);\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language\n is not set in @p info, the language is not supported, an unknown\n language-specific flag is supplied, or a language-specific flag is\n repeated.\n\n @deprecated since 1.7\n @warning This action, followed by @c AMD_COMGR_ACTION_LINK_BC_TO_BC, may\n result in subtle bugs due to incorrect linking of the device libraries.\n The @c AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC action can\n be used as a workaround which ensures the link occurs correctly."] pub const AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES: amd_comgr_action_kind_s = amd_comgr_action_kind_s(3); -} -impl amd_comgr_action_kind_s { #[doc = " Link a collection of bitcodes, bundled bitcodes, and bundled bitcode\n archives in @p into a single composite (unbundled) bitcode @p.\n Any device library bc data object must be explicitly added to @p input if\n needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if IsaName is not set in @p info and does not match the isa name\n of all bc data objects in @p input, or if the Name field is not set for\n any DataObject in the input set."] pub const AMD_COMGR_ACTION_LINK_BC_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(4); -} -impl amd_comgr_action_kind_s { #[doc = " Optimize each bc data object in @p input and create an optimized bc data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if the optimization fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."] pub const AMD_COMGR_ACTION_OPTIMIZE_BC_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(5); -} -impl amd_comgr_action_kind_s { #[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add a relocatable data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."] pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(6); -} -impl amd_comgr_action_kind_s { #[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add an assembly source data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."] pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_ASSEMBLY: amd_comgr_action_kind_s = amd_comgr_action_kind_s(7); -} -impl amd_comgr_action_kind_s { #[doc = " Link each relocatable data object in @p input together and add\n the linked relocatable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(8); -} -impl amd_comgr_action_kind_s { #[doc = " Link each relocatable data object in @p input together and add\n the linked executable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(9); -} -impl amd_comgr_action_kind_s { #[doc = " Assemble each source data object in @p input in order into machine code.\n For each successful assembly add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects in\n @p input. Resolve any include relative path names using the working\n directory path in @p info. Produce relocatable for isa name in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any assembly fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name is not set in\n @p info."] pub const AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(10); -} -impl amd_comgr_action_kind_s { #[doc = " Disassemble each relocatable data object in @p input in\n order. For each successful disassembly add a source data object to\n @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_DISASSEMBLE_RELOCATABLE_TO_SOURCE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(11); -} -impl amd_comgr_action_kind_s { #[doc = " Disassemble each executable data object in @p input in order. For\n each successful disassembly add a source data object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_DISASSEMBLE_EXECUTABLE_TO_SOURCE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(12); -} -impl amd_comgr_action_kind_s { #[doc = " Disassemble each bytes data object in @p input in order. For each\n successful disassembly add a source data object to @p\n result. Only simple assembly language commands are generate that\n corresponf to raw bytes are supported, not any directives that\n control the code object layout, or symbolic branch targets or\n names.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info"] pub const AMD_COMGR_ACTION_DISASSEMBLE_BYTES_TO_SOURCE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(13); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a fat binary to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce fat binary for isa name in @p\n info. Compile the source for the language in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info.\n\n @deprecated since 2.5\n @see in-process compilation via AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, etc.\n insteaad"] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_FATBIN: amd_comgr_action_kind_s = amd_comgr_action_kind_s(14); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(15); -} -impl amd_comgr_action_kind_s { #[doc = " Compile a single source data object in @p input in order. For each\n successful compilation add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce relocatable for hip name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation. Currently only supports HIP language.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(16); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input and create a single executabele\n in @p result. Resolve any include source names using the names of include\n data objects in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce executable for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(17); -} -impl amd_comgr_action_kind_s { #[doc = " Unbundle each source data object in @p input. These objects can be\n bitcode bundles, or an archive containing bitcode bundles. For each\n successful unbundling, add a bc object or archive object to @p result,\n depending on the corresponding input.\n\n Return @p AMD_COMGR_STATUS_ERROR if any unbundling\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_UNBUNDLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(18); -} -impl amd_comgr_action_kind_s { #[doc = " Marker for last valid action kind."] pub const AMD_COMGR_ACTION_LAST: amd_comgr_action_kind_s = amd_comgr_action_kind_s(18); } @@ -254,6 +178,55 @@ impl amd_comgr_action_kind_s { pub struct amd_comgr_action_kind_s(pub ::std::os::raw::c_uint); #[doc = " @brief The kinds of actions that can be performed."] pub use self::amd_comgr_action_kind_s as amd_comgr_action_kind_t; +impl amd_comgr_symbol_type_s { + #[doc = " The symbol's type is unknown.\n\n The user should not infer any specific type for symbols which return\n `AMD_COMGR_SYMBOL_TYPE_UNKNOWN`, and these symbols may return different\n types in future releases."] + pub const AMD_COMGR_SYMBOL_TYPE_UNKNOWN: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(-1); + #[doc = " The symbol's type is not specified."] + pub const AMD_COMGR_SYMBOL_TYPE_NOTYPE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(0); + #[doc = " The symbol is associated with a data object, such as a variable, an array,\n and so on."] + pub const AMD_COMGR_SYMBOL_TYPE_OBJECT: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(1); + #[doc = " The symbol is associated with a function or other executable code."] + pub const AMD_COMGR_SYMBOL_TYPE_FUNC: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(2); + #[doc = " The symbol is associated with a section. Symbol table entries of this type\n exist primarily for relocation."] + pub const AMD_COMGR_SYMBOL_TYPE_SECTION: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(3); + #[doc = " Conventionally, the symbol's name gives the name of the source file\n associated with the object file."] + pub const AMD_COMGR_SYMBOL_TYPE_FILE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(4); + #[doc = " The symbol labels an uninitialized common block."] + pub const AMD_COMGR_SYMBOL_TYPE_COMMON: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(5); + #[doc = " The symbol is associated with an AMDGPU Code Object V2 kernel function."] + pub const AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL: amd_comgr_symbol_type_s = + amd_comgr_symbol_type_s(10); +} +#[repr(transparent)] +#[doc = " @brief Machine code object symbol type."] +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub struct amd_comgr_symbol_type_s(pub ::std::os::raw::c_int); +#[doc = " @brief Machine code object symbol type."] +pub use self::amd_comgr_symbol_type_s as amd_comgr_symbol_type_t; +impl amd_comgr_symbol_info_s { + #[doc = " The length of the symbol name in bytes. Does not include the NUL\n terminator. The type of this attribute is uint64_t."] + pub const AMD_COMGR_SYMBOL_INFO_NAME_LENGTH: amd_comgr_symbol_info_s = + amd_comgr_symbol_info_s(0); + #[doc = " The name of the symbol. The type of this attribute is character array with\n the length equal to the value of the @p AMD_COMGR_SYMBOL_INFO_NAME_LENGTH\n attribute plus 1 for a NUL terminator."] + pub const AMD_COMGR_SYMBOL_INFO_NAME: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(1); + #[doc = " The kind of the symbol. The type of this attribute is @p\n amd_comgr_symbol_type_t."] + pub const AMD_COMGR_SYMBOL_INFO_TYPE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(2); + #[doc = " Size of the variable. The value of this attribute is undefined if the\n symbol is not a variable. The type of this attribute is uint64_t."] + pub const AMD_COMGR_SYMBOL_INFO_SIZE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(3); + #[doc = " Indicates whether the symbol is undefined. The type of this attribute is\n bool."] + pub const AMD_COMGR_SYMBOL_INFO_IS_UNDEFINED: amd_comgr_symbol_info_s = + amd_comgr_symbol_info_s(4); + #[doc = " The value of the symbol. The type of this attribute is uint64_t."] + pub const AMD_COMGR_SYMBOL_INFO_VALUE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5); + #[doc = " Marker for last valid symbol info."] + pub const AMD_COMGR_SYMBOL_INFO_LAST: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5); +} +#[repr(transparent)] +#[doc = " @brief Machine code object symbol attributes."] +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub struct amd_comgr_symbol_info_s(pub ::std::os::raw::c_uint); +#[doc = " @brief Machine code object symbol attributes."] +pub use self::amd_comgr_symbol_info_s as amd_comgr_symbol_info_t; pub struct Comgr2 { __library: ::libloading::Library, pub amd_comgr_get_version: @@ -351,6 +324,27 @@ pub struct Comgr2 { ) -> amd_comgr_status_t, ::libloading::Error, >, + pub amd_comgr_iterate_symbols: Result< + unsafe extern "C" fn( + data: amd_comgr_data_t, + callback: ::std::option::Option< + unsafe extern "C" fn( + symbol: amd_comgr_symbol_t, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + >, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + ::libloading::Error, + >, + pub amd_comgr_symbol_get_info: Result< + unsafe extern "C" fn( + symbol: amd_comgr_symbol_t, + attribute: amd_comgr_symbol_info_t, + value: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + ::libloading::Error, + >, } impl Comgr2 { pub unsafe fn new

(path: P) -> Result @@ -396,6 +390,12 @@ impl Comgr2 { .get(b"amd_comgr_action_info_set_option_list\0") .map(|sym| *sym); let amd_comgr_do_action = __library.get(b"amd_comgr_do_action\0").map(|sym| *sym); + let amd_comgr_iterate_symbols = __library + .get(b"amd_comgr_iterate_symbols\0") + .map(|sym| *sym); + let amd_comgr_symbol_get_info = __library + .get(b"amd_comgr_symbol_get_info\0") + .map(|sym| *sym); Ok(Comgr2 { __library, amd_comgr_get_version, @@ -413,6 +413,8 @@ impl Comgr2 { amd_comgr_action_info_set_language, amd_comgr_action_info_set_option_list, amd_comgr_do_action, + amd_comgr_iterate_symbols, + amd_comgr_symbol_get_info, }) } #[doc = " @brief Get the version of the code object manager interface\n supported.\n\n An interface is backwards compatible with an implementation with an\n equal major version, and a greater than or equal minor version.\n\n @param[out] major Major version number.\n\n @param[out] minor Minor version number."] @@ -593,4 +595,35 @@ impl Comgr2 { .as_ref() .expect("Expected function, got error."))(kind, info, input, result) } + #[must_use] + #[doc = " @brief Iterate over the symbols of a machine code object.\n\n For a AMD_COMGR_DATA_KIND_RELOCATABLE the symbols in the ELF symtab section\n are iterated. For a AMD_COMGR_DATA_KIND_EXECUTABLE the symbols in the ELF\n dynsymtab are iterated.\n\n @param[in] data The data object to query.\n\n @param[in] callback The function to call for each symbol in the machine code\n data object. The symbol handle is passed in @p symbol and @p user_data is\n passed as @p user_data. If the function returns with a status other than @p\n AMD_COMGR_STATUS_SUCCESS then iteration is stopped.\n\n @param[in] user_data The value to pass to each invocation of @p\n callback. Allows context to be passed into the call back function.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR An error was\n reported by @p callback.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data is an invalid data\n object, or not of kind @p AMD_COMGR_DATA_KIND_RELOCATABLE or\n AMD_COMGR_DATA_KIND_EXECUTABLE. @p callback is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to iterate the data object as out of resources."] + pub unsafe fn amd_comgr_iterate_symbols( + &self, + data: amd_comgr_data_t, + callback: ::std::option::Option< + unsafe extern "C" fn( + symbol: amd_comgr_symbol_t, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + >, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t { + (self + .amd_comgr_iterate_symbols + .as_ref() + .expect("Expected function, got error."))(data, callback, user_data) + } + #[must_use] + #[doc = " @brief Query information about a machine code object symbol.\n\n @param[in] symbol The symbol to query.\n\n @param[in] attribute Attribute to query.\n\n @param[out] value Pointer to an application-allocated buffer where to store\n the value of the attribute. If the buffer passed by the application is not\n large enough to hold the value of attribute, the behavior is undefined. The\n type of value returned is specified by @p amd_comgr_symbol_info_t.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR The @p symbol does not have the requested @p\n attribute.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p symbol is an invalid\n symbol. @p attribute is an invalid value. @p value is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to query symbol as out of resources."] + pub unsafe fn amd_comgr_symbol_get_info( + &self, + symbol: amd_comgr_symbol_t, + attribute: amd_comgr_symbol_info_t, + value: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t { + (self + .amd_comgr_symbol_get_info + .as_ref() + .expect("Expected function, got error."))(symbol, attribute, value) + } } diff --git a/ext/amd_comgr-sys/src/comgr3.rs b/ext/amd_comgr-sys/src/comgr3.rs index c0c1474..b20a7ab 100644 --- a/ext/amd_comgr-sys/src/comgr3.rs +++ b/ext/amd_comgr-sys/src/comgr3.rs @@ -1,4 +1,4 @@ -/* automatically generated by rust-bindgen 0.71.1 */ +/* automatically generated by rust-bindgen 0.72.0 */ pub const AMD_COMGR_INTERFACE_VERSION_MAJOR: u32 = 3; pub const AMD_COMGR_INTERFACE_VERSION_MINOR: u32 = 0; @@ -6,13 +6,9 @@ impl amd_comgr_status_s { #[doc = " A generic error has occurred."] pub const AMD_COMGR_STATUS_ERROR: amd_comgr_status_s = amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(1) }); -} -impl amd_comgr_status_s { #[doc = " One of the actual arguments does not meet a precondition stated\n in the documentation of the corresponding formal argument. This\n includes both invalid Action types, and invalid arguments to\n valid Action types."] pub const AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT: amd_comgr_status_s = amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(2) }); -} -impl amd_comgr_status_s { #[doc = " Failed to allocate the necessary resources."] pub const AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES: amd_comgr_status_s = amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(3) }); @@ -25,24 +21,14 @@ type amd_comgr_status_t = Result<(), self::amd_comgr_status_s>; impl amd_comgr_language_s { #[doc = " No high level language."] pub const AMD_COMGR_LANGUAGE_NONE: amd_comgr_language_s = amd_comgr_language_s(0); -} -impl amd_comgr_language_s { #[doc = " OpenCL 1.2."] pub const AMD_COMGR_LANGUAGE_OPENCL_1_2: amd_comgr_language_s = amd_comgr_language_s(1); -} -impl amd_comgr_language_s { #[doc = " OpenCL 2.0."] pub const AMD_COMGR_LANGUAGE_OPENCL_2_0: amd_comgr_language_s = amd_comgr_language_s(2); -} -impl amd_comgr_language_s { #[doc = " HIP."] pub const AMD_COMGR_LANGUAGE_HIP: amd_comgr_language_s = amd_comgr_language_s(3); -} -impl amd_comgr_language_s { #[doc = " LLVM IR, either textual (.ll) or bitcode (.bc) format."] pub const AMD_COMGR_LANGUAGE_LLVM_IR: amd_comgr_language_s = amd_comgr_language_s(4); -} -impl amd_comgr_language_s { #[doc = " Marker for last valid language."] pub const AMD_COMGR_LANGUAGE_LAST: amd_comgr_language_s = amd_comgr_language_s(4); } @@ -55,69 +41,37 @@ pub use self::amd_comgr_language_s as amd_comgr_language_t; impl amd_comgr_data_kind_s { #[doc = " No data is available."] pub const AMD_COMGR_DATA_KIND_UNDEF: amd_comgr_data_kind_s = amd_comgr_data_kind_s(0); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a textual main source."] pub const AMD_COMGR_DATA_KIND_SOURCE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(1); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a textual source that is included in the main source\n or other include source."] pub const AMD_COMGR_DATA_KIND_INCLUDE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(2); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a precompiled-header source that is included in the main\n source or other include source."] pub const AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER: amd_comgr_data_kind_s = amd_comgr_data_kind_s(3); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a diagnostic output."] pub const AMD_COMGR_DATA_KIND_DIAGNOSTIC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(4); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a textual log output."] pub const AMD_COMGR_DATA_KIND_LOG: amd_comgr_data_kind_s = amd_comgr_data_kind_s(5); -} -impl amd_comgr_data_kind_s { #[doc = " The data is compiler LLVM IR bit code for a specific isa."] pub const AMD_COMGR_DATA_KIND_BC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(6); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a relocatable machine code object for a specific isa."] pub const AMD_COMGR_DATA_KIND_RELOCATABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(7); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an executable machine code object for a specific\n isa. An executable is the kind of code object that can be loaded\n and executed."] pub const AMD_COMGR_DATA_KIND_EXECUTABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(8); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a block of bytes."] pub const AMD_COMGR_DATA_KIND_BYTES: amd_comgr_data_kind_s = amd_comgr_data_kind_s(9); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a fat binary (clang-offload-bundler output)."] pub const AMD_COMGR_DATA_KIND_FATBIN: amd_comgr_data_kind_s = amd_comgr_data_kind_s(16); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an archive."] pub const AMD_COMGR_DATA_KIND_AR: amd_comgr_data_kind_s = amd_comgr_data_kind_s(17); -} -impl amd_comgr_data_kind_s { #[doc = " The data is a bitcode bundle."] pub const AMD_COMGR_DATA_KIND_BC_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(18); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an archive bundle."] pub const AMD_COMGR_DATA_KIND_AR_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(19); -} -impl amd_comgr_data_kind_s { #[doc = " The data is an object file bundle."] pub const AMD_COMGR_DATA_KIND_OBJ_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(20); -} -impl amd_comgr_data_kind_s { #[doc = " The data is SPIR-V IR"] pub const AMD_COMGR_DATA_KIND_SPIRV: amd_comgr_data_kind_s = amd_comgr_data_kind_s(21); -} -impl amd_comgr_data_kind_s { #[doc = " Marker for last valid data kind."] pub const AMD_COMGR_DATA_KIND_LAST: amd_comgr_data_kind_s = amd_comgr_data_kind_s(21); } @@ -151,90 +105,64 @@ pub struct amd_comgr_action_info_s { } #[doc = " @brief A handle to an action information object.\n\n An action information object holds all the necessary information,\n excluding the input data objects, required to perform an action."] pub type amd_comgr_action_info_t = amd_comgr_action_info_s; +#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."] +#[repr(C)] +#[derive(Copy, Clone)] +pub struct amd_comgr_symbol_s { + pub handle: u64, +} +#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."] +pub type amd_comgr_symbol_t = amd_comgr_symbol_s; impl amd_comgr_action_kind_s { #[doc = " Preprocess each source data object in @p input in order. For each\n successful preprocessor invocation, add a source data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the working\n directory path in @p info. Preprocess the source for the language in @p\n info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any preprocessing fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR: amd_comgr_action_kind_s = amd_comgr_action_kind_s(0); -} -impl amd_comgr_action_kind_s { #[doc = " Copy all existing data objects in @p input to @p output, then add the\n device-specific and language-specific precompiled headers required for\n compilation.\n\n Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2\n and @p AMD_COMGR_LANGUAGE_OPENCL_2_0.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language\n is not set in @p info, or the language is not supported."] pub const AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS: amd_comgr_action_kind_s = amd_comgr_action_kind_s(1); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(2); -} -impl amd_comgr_action_kind_s { #[doc = " Link a collection of bitcodes, bundled bitcodes, and bundled bitcode\n archives in @p into a single composite (unbundled) bitcode @p.\n Any device library bc data object must be explicitly added to @p input if\n needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if IsaName is not set in @p info and does not match the isa name\n of all bc data objects in @p input, or if the Name field is not set for\n any DataObject in the input set."] pub const AMD_COMGR_ACTION_LINK_BC_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(3); -} -impl amd_comgr_action_kind_s { #[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add a relocatable data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."] pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(4); -} -impl amd_comgr_action_kind_s { #[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add an assembly source data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."] pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_ASSEMBLY: amd_comgr_action_kind_s = amd_comgr_action_kind_s(5); -} -impl amd_comgr_action_kind_s { #[doc = " Link each relocatable data object in @p input together and add\n the linked relocatable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(6); -} -impl amd_comgr_action_kind_s { #[doc = " Link each relocatable data object in @p input together and add\n the linked executable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(7); -} -impl amd_comgr_action_kind_s { #[doc = " Assemble each source data object in @p input in order into machine code.\n For each successful assembly add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects in\n @p input. Resolve any include relative path names using the working\n directory path in @p info. Produce relocatable for isa name in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any assembly fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name is not set in\n @p info."] pub const AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(8); -} -impl amd_comgr_action_kind_s { #[doc = " Disassemble each relocatable data object in @p input in\n order. For each successful disassembly add a source data object to\n @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_DISASSEMBLE_RELOCATABLE_TO_SOURCE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(9); -} -impl amd_comgr_action_kind_s { #[doc = " Disassemble each executable data object in @p input in order. For\n each successful disassembly add a source data object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."] pub const AMD_COMGR_ACTION_DISASSEMBLE_EXECUTABLE_TO_SOURCE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(10); -} -impl amd_comgr_action_kind_s { #[doc = " Disassemble each bytes data object in @p input in order. For each\n successful disassembly add a source data object to @p\n result. Only simple assembly language commands are generate that\n corresponf to raw bytes are supported, not any directives that\n control the code object layout, or symbolic branch targets or\n names.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info"] pub const AMD_COMGR_ACTION_DISASSEMBLE_BYTES_TO_SOURCE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(11); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(12); -} -impl amd_comgr_action_kind_s { #[doc = " Compile a single source data object in @p input in order. For each\n successful compilation add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce relocatable for hip name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation. Currently only supports HIP language.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(13); -} -impl amd_comgr_action_kind_s { #[doc = " Compile each source data object in @p input and create a single executabele\n in @p result. Resolve any include source names using the names of include\n data objects in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce executable for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(14); -} -impl amd_comgr_action_kind_s { #[doc = " Unbundle each source data object in @p input. These objects can be\n bitcode bundles, or an archive containing bitcode bundles. For each\n successful unbundling, add a bc object or archive object to @p result,\n depending on the corresponding input.\n\n Return @p AMD_COMGR_STATUS_ERROR if any unbundling\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."] pub const AMD_COMGR_ACTION_UNBUNDLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(15); -} -impl amd_comgr_action_kind_s { #[doc = " Translate each source SPIR-V object in @p input into LLVM IR Bitcode.\n For each successful translation, add a bc object to @p result *\n\n Return @p AMD_COMGR_STATUS_ERROR if any translation fails\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if any input is not SPIR-V."] pub const AMD_COMGR_ACTION_TRANSLATE_SPIRV_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(19); -} -impl amd_comgr_action_kind_s { #[doc = " Marker for last valid action kind."] pub const AMD_COMGR_ACTION_LAST: amd_comgr_action_kind_s = amd_comgr_action_kind_s(19); } @@ -244,6 +172,55 @@ impl amd_comgr_action_kind_s { pub struct amd_comgr_action_kind_s(pub ::std::os::raw::c_uint); #[doc = " @brief The kinds of actions that can be performed."] pub use self::amd_comgr_action_kind_s as amd_comgr_action_kind_t; +impl amd_comgr_symbol_type_s { + #[doc = " The symbol's type is unknown.\n\n The user should not infer any specific type for symbols which return\n `AMD_COMGR_SYMBOL_TYPE_UNKNOWN`, and these symbols may return different\n types in future releases."] + pub const AMD_COMGR_SYMBOL_TYPE_UNKNOWN: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(-1); + #[doc = " The symbol's type is not specified."] + pub const AMD_COMGR_SYMBOL_TYPE_NOTYPE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(0); + #[doc = " The symbol is associated with a data object, such as a variable, an array,\n and so on."] + pub const AMD_COMGR_SYMBOL_TYPE_OBJECT: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(1); + #[doc = " The symbol is associated with a function or other executable code."] + pub const AMD_COMGR_SYMBOL_TYPE_FUNC: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(2); + #[doc = " The symbol is associated with a section. Symbol table entries of this type\n exist primarily for relocation."] + pub const AMD_COMGR_SYMBOL_TYPE_SECTION: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(3); + #[doc = " Conventionally, the symbol's name gives the name of the source file\n associated with the object file."] + pub const AMD_COMGR_SYMBOL_TYPE_FILE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(4); + #[doc = " The symbol labels an uninitialized common block."] + pub const AMD_COMGR_SYMBOL_TYPE_COMMON: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(5); + #[doc = " The symbol is associated with an AMDGPU Code Object V2 kernel function."] + pub const AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL: amd_comgr_symbol_type_s = + amd_comgr_symbol_type_s(10); +} +#[repr(transparent)] +#[doc = " @brief Machine code object symbol type."] +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub struct amd_comgr_symbol_type_s(pub ::std::os::raw::c_int); +#[doc = " @brief Machine code object symbol type."] +pub use self::amd_comgr_symbol_type_s as amd_comgr_symbol_type_t; +impl amd_comgr_symbol_info_s { + #[doc = " The length of the symbol name in bytes. Does not include the NUL\n terminator. The type of this attribute is uint64_t."] + pub const AMD_COMGR_SYMBOL_INFO_NAME_LENGTH: amd_comgr_symbol_info_s = + amd_comgr_symbol_info_s(0); + #[doc = " The name of the symbol. The type of this attribute is character array with\n the length equal to the value of the @p AMD_COMGR_SYMBOL_INFO_NAME_LENGTH\n attribute plus 1 for a NUL terminator."] + pub const AMD_COMGR_SYMBOL_INFO_NAME: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(1); + #[doc = " The kind of the symbol. The type of this attribute is @p\n amd_comgr_symbol_type_t."] + pub const AMD_COMGR_SYMBOL_INFO_TYPE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(2); + #[doc = " Size of the variable. The value of this attribute is undefined if the\n symbol is not a variable. The type of this attribute is uint64_t."] + pub const AMD_COMGR_SYMBOL_INFO_SIZE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(3); + #[doc = " Indicates whether the symbol is undefined. The type of this attribute is\n bool."] + pub const AMD_COMGR_SYMBOL_INFO_IS_UNDEFINED: amd_comgr_symbol_info_s = + amd_comgr_symbol_info_s(4); + #[doc = " The value of the symbol. The type of this attribute is uint64_t."] + pub const AMD_COMGR_SYMBOL_INFO_VALUE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5); + #[doc = " Marker for last valid symbol info."] + pub const AMD_COMGR_SYMBOL_INFO_LAST: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5); +} +#[repr(transparent)] +#[doc = " @brief Machine code object symbol attributes."] +#[derive(Copy, Clone, Hash, PartialEq, Eq)] +pub struct amd_comgr_symbol_info_s(pub ::std::os::raw::c_uint); +#[doc = " @brief Machine code object symbol attributes."] +pub use self::amd_comgr_symbol_info_s as amd_comgr_symbol_info_t; pub struct Comgr3 { __library: ::libloading::Library, pub amd_comgr_get_version: @@ -341,6 +318,27 @@ pub struct Comgr3 { ) -> amd_comgr_status_t, ::libloading::Error, >, + pub amd_comgr_iterate_symbols: Result< + unsafe extern "C" fn( + data: amd_comgr_data_t, + callback: ::std::option::Option< + unsafe extern "C" fn( + symbol: amd_comgr_symbol_t, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + >, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + ::libloading::Error, + >, + pub amd_comgr_symbol_get_info: Result< + unsafe extern "C" fn( + symbol: amd_comgr_symbol_t, + attribute: amd_comgr_symbol_info_t, + value: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + ::libloading::Error, + >, } impl Comgr3 { pub unsafe fn new

(path: P) -> Result @@ -386,6 +384,12 @@ impl Comgr3 { .get(b"amd_comgr_action_info_set_option_list\0") .map(|sym| *sym); let amd_comgr_do_action = __library.get(b"amd_comgr_do_action\0").map(|sym| *sym); + let amd_comgr_iterate_symbols = __library + .get(b"amd_comgr_iterate_symbols\0") + .map(|sym| *sym); + let amd_comgr_symbol_get_info = __library + .get(b"amd_comgr_symbol_get_info\0") + .map(|sym| *sym); Ok(Comgr3 { __library, amd_comgr_get_version, @@ -403,6 +407,8 @@ impl Comgr3 { amd_comgr_action_info_set_language, amd_comgr_action_info_set_option_list, amd_comgr_do_action, + amd_comgr_iterate_symbols, + amd_comgr_symbol_get_info, }) } #[doc = " @brief Get the version of the code object manager interface\n supported.\n\n An interface is backwards compatible with an implementation with an\n equal major version, and a greater than or equal minor version.\n\n @param[out] major Major version number.\n\n @param[out] minor Minor version number."] @@ -583,4 +589,35 @@ impl Comgr3 { .as_ref() .expect("Expected function, got error."))(kind, info, input, result) } + #[must_use] + #[doc = " @brief Iterate over the symbols of a machine code object.\n\n For a AMD_COMGR_DATA_KIND_RELOCATABLE the symbols in the ELF symtab section\n are iterated. For a AMD_COMGR_DATA_KIND_EXECUTABLE the symbols in the ELF\n dynsymtab are iterated.\n\n @param[in] data The data object to query.\n\n @param[in] callback The function to call for each symbol in the machine code\n data object. The symbol handle is passed in @p symbol and @p user_data is\n passed as @p user_data. If the function returns with a status other than @p\n AMD_COMGR_STATUS_SUCCESS then iteration is stopped.\n\n @param[in] user_data The value to pass to each invocation of @p\n callback. Allows context to be passed into the call back function.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR An error was\n reported by @p callback.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data is an invalid data\n object, or not of kind @p AMD_COMGR_DATA_KIND_RELOCATABLE or\n AMD_COMGR_DATA_KIND_EXECUTABLE. @p callback is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to iterate the data object as out of resources."] + pub unsafe fn amd_comgr_iterate_symbols( + &self, + data: amd_comgr_data_t, + callback: ::std::option::Option< + unsafe extern "C" fn( + symbol: amd_comgr_symbol_t, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t, + >, + user_data: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t { + (self + .amd_comgr_iterate_symbols + .as_ref() + .expect("Expected function, got error."))(data, callback, user_data) + } + #[must_use] + #[doc = " @brief Query information about a machine code object symbol.\n\n @param[in] symbol The symbol to query.\n\n @param[in] attribute Attribute to query.\n\n @param[out] value Pointer to an application-allocated buffer where to store\n the value of the attribute. If the buffer passed by the application is not\n large enough to hold the value of attribute, the behavior is undefined. The\n type of value returned is specified by @p amd_comgr_symbol_info_t.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR The @p symbol does not have the requested @p\n attribute.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p symbol is an invalid\n symbol. @p attribute is an invalid value. @p value is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to query symbol as out of resources."] + pub unsafe fn amd_comgr_symbol_get_info( + &self, + symbol: amd_comgr_symbol_t, + attribute: amd_comgr_symbol_info_t, + value: *mut ::std::os::raw::c_void, + ) -> amd_comgr_status_t { + (self + .amd_comgr_symbol_get_info + .as_ref() + .expect("Expected function, got error."))(symbol, attribute, value) + } } diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 9cabbcc..853b9e1 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 38a26bb..9de6f61 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -1,6 +1,6 @@ // Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17` // `fdenormal-fp-math=dynamic` is required to make functions eligible for inlining -// /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc +// /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | sed '/__hip_cuid/d' | sed 's/external protected/external hidden/g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc #include #include @@ -8,10 +8,12 @@ #include #include +#define CONSTANT_SPACE __attribute__((address_space(4))) + #define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME #define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME #define DECLARE_ATTR(TYPE, NAME) \ - extern const TYPE ATTR(NAME) \ + extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \ __device__ extern "C" diff --git a/ptx/src/pass/insert_post_saturation.rs b/ptx/src/pass/insert_post_saturation.rs index 94d18e4..c46149f 100644 --- a/ptx/src/pass/insert_post_saturation.rs +++ b/ptx/src/pass/insert_post_saturation.rs @@ -74,6 +74,7 @@ fn run_instruction<'input>( | ast::Instruction::And { .. } | ast::Instruction::Atom { .. } | ast::Instruction::AtomCas { .. } + | ast::Instruction::BarWarp { .. } | ast::Instruction::Bar { .. } | ast::Instruction::BarRed { .. } | ast::Instruction::Bfe { .. } @@ -120,6 +121,7 @@ fn run_instruction<'input>( } | ast::Instruction::Cvta { .. } | ast::Instruction::Div { .. } + | ast::Instruction::Dp4a { .. } | ast::Instruction::Ex2 { .. } | ast::Instruction::Fma { data: ast::ArithFloat { diff --git a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs index ba98a23..1981473 100644 --- a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs +++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs @@ -1829,6 +1829,7 @@ fn get_modes(inst: &ast::Instruction) -> InstructionModes { | ast::Instruction::Bra { .. } | ast::Instruction::Clz { .. } | ast::Instruction::Brev { .. } + | ast::Instruction::Dp4a { .. } | ast::Instruction::Popc { .. } | ast::Instruction::Xor { .. } | ast::Instruction::Rem { .. } @@ -1844,6 +1845,7 @@ fn get_modes(inst: &ast::Instruction) -> InstructionModes { | ast::Instruction::Shl { .. } | ast::Instruction::Selp { .. } | ast::Instruction::Ret { .. } + | ast::Instruction::BarWarp { .. } | ast::Instruction::Bar { .. } | ast::Instruction::BarRed { .. } | ast::Instruction::Cvta { .. } diff --git a/ptx/src/pass/llvm/attributes.rs b/ptx/src/pass/llvm/attributes.rs index 92ee20f..6413236 100644 --- a/ptx/src/pass/llvm/attributes.rs +++ b/ptx/src/pass/llvm/attributes.rs @@ -33,9 +33,11 @@ fn emit_attribute( module.get(), attribute_type, name.as_ptr(), - get_state_space(ast::StateSpace::Global)?, + get_state_space(ast::StateSpace::Const)?, ) }; + unsafe { LLVMSetLinkage(global, llvm_zluda::LLVMLinkage::LLVMExternalLinkage) }; + unsafe { LLVMSetVisibility(global, llvm_zluda::LLVMVisibility::LLVMHiddenVisibility) }; unsafe { LLVMSetInitializer(global, LLVMConstInt(attribute_type, attribute as u64, 0)) }; unsafe { LLVMSetGlobalConstant(global, 1) }; Ok(()) diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 90e12e6..f6b8ca0 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -170,6 +170,11 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> { unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) }; } } + if !method.is_kernel { + unsafe { + LLVMSetVisibility(fn_, llvm_zluda::LLVMVisibility::LLVMHiddenVisibility); + } + } let call_conv = if method.is_kernel { Self::kernel_call_convention() } else { @@ -474,6 +479,7 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Mov { data: _, arguments } => self.emit_mov(arguments), ast::Instruction::Ld { data, arguments } => self.emit_ld(data, arguments), ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments), + ast::Instruction::Dp4a { data, arguments } => self.emit_dp4a(data, arguments), ast::Instruction::St { data, arguments } => self.emit_st(data, arguments), ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments), ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments), @@ -515,12 +521,13 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Popc { data, arguments } => self.emit_popc(data, arguments), ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments), ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments), + ast::Instruction::BarWarp { .. } => self.emit_bar_warp(), ast::Instruction::PrmtSlow { .. } => { Err(error_todo_msg("PrmtSlow is not implemented yet")) } ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments), ast::Instruction::Membar { data } => self.emit_membar(data), - ast::Instruction::Trap {} => Err(error_todo_msg("Trap is not implemented yet")), + ast::Instruction::Trap {} => self.emit_trap(), ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments), ast::Instruction::CpAsync { data, arguments } => self.emit_cp_async(data, arguments), ast::Instruction::CpAsyncCommitGroup {} => Ok(()), // nop @@ -2198,6 +2205,11 @@ impl<'a> MethodEmitContext<'a> { Ok(()) } + fn emit_bar_warp(&mut self) -> Result<(), TranslateError> { + self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?; + Ok(()) + } + fn emit_popc( &mut self, type_: ptx_parser::ScalarType, @@ -2764,6 +2776,50 @@ impl<'a> MethodEmitContext<'a> { Ok(()) } + fn emit_dp4a( + &mut self, + data: ast::Dp4aDetails, + arguments: ast::Dp4aArgs, + ) -> Result<(), TranslateError> { + let intrinsic = match (data.atype, data.btype) { + (ast::ScalarType::U32, ast::ScalarType::U32) => c"llvm.amdgcn.udot4", + (ast::ScalarType::S32, ast::ScalarType::S32) => c"llvm.amdgcn.sdot4", + (ast::ScalarType::U32, ast::ScalarType::S32) + | (ast::ScalarType::S32, ast::ScalarType::U32) => { + return Err(error_todo_msg("dp4a with mixed types is not yet supported")) + } + _ => return Err(error_unreachable()), + }; + let pred = get_scalar_type(self.context, ast::ScalarType::Pred); + let zero = unsafe { LLVMConstInt(pred, 0, 0) }; + self.emit_intrinsic( + intrinsic, + Some(arguments.dst), + Some(&data.ctype().into()), + vec![ + ( + self.resolver.value(arguments.src1)?, + get_scalar_type(self.context, data.ctype()), + ), + ( + self.resolver.value(arguments.src2)?, + get_scalar_type(self.context, data.ctype()), + ), + ( + self.resolver.value(arguments.src3)?, + get_scalar_type(self.context, data.ctype()), + ), + (zero, pred), + ], + )?; + Ok(()) + } + + fn emit_trap(&mut self) -> Result<(), TranslateError> { + self.emit_intrinsic(c"llvm.trap", None, None, vec![])?; + Ok(()) + } + /* // Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding` // Should be available in LLVM 19 diff --git a/ptx/src/pass/replace_instructions_with_functions.rs b/ptx/src/pass/replace_instructions_with_functions.rs index 0f6a36c..edcaaa1 100644 --- a/ptx/src/pass/replace_instructions_with_functions.rs +++ b/ptx/src/pass/replace_instructions_with_functions.rs @@ -269,6 +269,7 @@ fn run_instruction<'input>( data: ast::RcpData { kind: ast::RcpKind::Compliant(ast::RoundingMode::NearestEven), + type_: ast::ScalarType::F32, flush_to_zero: Some(true), .. }, @@ -281,6 +282,7 @@ fn run_instruction<'input>( data: ast::RcpData { kind: ast::RcpKind::Compliant(ast::RoundingMode::NearestEven), + type_: ast::ScalarType::F32, .. }, .. @@ -288,29 +290,6 @@ fn run_instruction<'input>( let name = "sqrt_rn_f32"; to_call(resolver, fn_declarations, name.into(), i)? } - i @ ptx_parser::Instruction::Div { - data: - ast::DivDetails::Float(ast::DivFloatDetails { - kind: ast::DivFloatKind::Rounding(_), - flush_to_zero: Some(true), - .. - }), - .. - } => { - let name = "div_rn_ftz_f32"; - to_call(resolver, fn_declarations, name.into(), i)? - } - i @ ptx_parser::Instruction::Div { - data: - ast::DivDetails::Float(ast::DivFloatDetails { - kind: ast::DivFloatKind::Rounding(_), - .. - }), - .. - } => { - let name = "div_rn_f32"; - to_call(resolver, fn_declarations, name.into(), i)? - } i @ ptx_parser::Instruction::Bfi { data, .. } => { let name = ["bfi_", scalar_to_ptx_name(data)].concat(); to_call(resolver, fn_declarations, name.into(), i)? diff --git a/ptx/src/test/ll/_attributes.ll b/ptx/src/test/ll/_attributes.ll index bd06a06..f2bf604 100644 --- a/ptx/src/test/ll/_attributes.ll +++ b/ptx/src/test/ll/_attributes.ll @@ -1 +1 @@ -@__ZLUDA_PTX_IMPL_ATTRIBUTE_CLOCK_RATE = addrspace(1) constant i32 2124000 \ No newline at end of file +@__ZLUDA_PTX_IMPL_ATTRIBUTE_CLOCK_RATE = hidden addrspace(4) constant i32 2124000 \ No newline at end of file diff --git a/ptx/src/test/ll/abs.ll b/ptx/src/test/ll/abs.ll index 026c854..2b726b5 100644 --- a/ptx/src/test/ll/abs.ll +++ b/ptx/src/test/ll/abs.ll @@ -9,18 +9,18 @@ define amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"31", ptr addrspace( br label %"30" "30": ; preds = %1 - %"37" = load i64, ptr addrspace(4) %"31", align 4 - store i64 %"37", ptr addrspace(5) %"33", align 4 - %"38" = load i64, ptr addrspace(4) %"32", align 4 - store i64 %"38", ptr addrspace(5) %"34", align 4 - %"40" = load i64, ptr addrspace(5) %"33", align 4 + %"37" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"37", ptr addrspace(5) %"33", align 8 + %"38" = load i64, ptr addrspace(4) %"32", align 8 + store i64 %"38", ptr addrspace(5) %"34", align 8 + %"40" = load i64, ptr addrspace(5) %"33", align 8 %"45" = inttoptr i64 %"40" to ptr %"39" = load i32, ptr %"45", align 4 store i32 %"39", ptr addrspace(5) %"35", align 4 %"42" = load i32, ptr addrspace(5) %"35", align 4 - %"41" = call i32 @llvm.abs.i32(i32 %"42", i1 false) - store i32 %"41", ptr addrspace(5) %"36", align 4 - %"43" = load i64, ptr addrspace(5) %"34", align 4 + %2 = call i32 @llvm.abs.i32(i32 %"42", i1 false) + store i32 %2, ptr addrspace(5) %"36", align 4 + %"43" = load i64, ptr addrspace(5) %"34", align 8 %"44" = load i32, ptr addrspace(5) %"36", align 4 %"46" = inttoptr i64 %"43" to ptr store i32 %"44", ptr %"46", align 4 diff --git a/ptx/src/test/ll/activemask.ll b/ptx/src/test/ll/activemask.ll index 89c8a97..b4bf84b 100644 --- a/ptx/src/test/ll/activemask.ll +++ b/ptx/src/test/ll/activemask.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_activemask() #0 +declare hidden i32 @__zluda_ptx_impl_activemask() #0 define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 { %"31" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/assertfail.ll b/ptx/src/test/ll/assertfail.ll index 2dfc81c..5061a56 100644 --- a/ptx/src/test/ll/assertfail.ll +++ b/ptx/src/test/ll/assertfail.ll @@ -1,4 +1,4 @@ -declare void @__zluda_ptx_impl___assertfail(i64, i64, i32, i64, i64) #0 +declare hidden void @__zluda_ptx_impl___assertfail(i64, i64, i32, i64, i64) #0 define amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"86", ptr addrspace(4) byref(i64) %"87") #1 { %"88" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/bar_red_and_pred.ll b/ptx/src/test/ll/bar_red_and_pred.ll index d2ce83a..512d0ad 100644 --- a/ptx/src/test/ll/bar_red_and_pred.ll +++ b/ptx/src/test/ll/bar_red_and_pred.ll @@ -1,8 +1,8 @@ -declare i1 @__zluda_ptx_impl_bar_red_and_pred(i32, i1, i1) #0 +declare hidden i1 @__zluda_ptx_impl_bar_red_and_pred(i32, i1, i1) #0 -declare i1 @__zluda_ptx_impl_bar_red_or_pred(i32, i1, i1) #0 +declare hidden i1 @__zluda_ptx_impl_bar_red_or_pred(i32, i1, i1) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @bar_red_and_pred(ptr addrspace(4) byref(i64) %"73", ptr addrspace(4) byref(i64) %"74") #1 { %"75" = alloca i64, align 8, addrspace(5) @@ -29,8 +29,8 @@ define amdgpu_kernel void @bar_red_and_pred(ptr addrspace(4) byref(i64) %"73", p %"84" = urem i32 %"85", 2 store i32 %"84", ptr addrspace(5) %"78", align 4 %"87" = load i32, ptr addrspace(5) %"78", align 4 - %"86" = icmp eq i32 %"87", 0 - store i1 %"86", ptr addrspace(5) %"80", align 1 + %2 = icmp eq i32 %"87", 0 + store i1 %2, ptr addrspace(5) %"80", align 1 store i32 0, ptr addrspace(5) %"81", align 4 %"90" = load i1, ptr addrspace(5) %"80", align 1 %"89" = call i1 @__zluda_ptx_impl_bar_red_and_pred(i32 1, i1 %"90", i1 false) diff --git a/ptx/src/test/ll/bfe.ll b/ptx/src/test/ll/bfe.ll index d5393fd..1214b12 100644 --- a/ptx/src/test/ll/bfe.ll +++ b/ptx/src/test/ll/bfe.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0 +declare hidden i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0 define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/bfi.ll b/ptx/src/test/ll/bfi.ll index 2938df8..b9a1d65 100644 --- a/ptx/src/test/ll/bfi.ll +++ b/ptx/src/test/ll/bfi.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0 +declare hidden i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0 define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { %"41" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/call.ll b/ptx/src/test/ll/call.ll index 094b2b8..1c2117a 100644 --- a/ptx/src/test/ll/call.ll +++ b/ptx/src/test/ll/call.ll @@ -1,4 +1,4 @@ -define i64 @incr(i64 %"43") #0 { +define hidden i64 @incr(i64 %"43") #0 { %"63" = alloca i64, align 8, addrspace(5) %"64" = alloca i64, align 8, addrspace(5) %"65" = alloca i64, align 8, addrspace(5) @@ -61,4 +61,4 @@ define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace } attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/call_rnd.ll b/ptx/src/test/ll/call_rnd.ll index 22a3d40..22a4048 100644 --- a/ptx/src/test/ll/call_rnd.ll +++ b/ptx/src/test/ll/call_rnd.ll @@ -1,4 +1,4 @@ -define float @add_rm(float %"79", float %"80") #0 { +define hidden float @add_rm(float %"79", float %"80") #0 { %"128" = alloca float, align 4, addrspace(5) %"129" = alloca float, align 4, addrspace(5) %"130" = alloca float, align 4, addrspace(5) @@ -33,7 +33,7 @@ define float @add_rm(float %"79", float %"80") #0 { ret float %2 } -define float @add_rp(float %"82", float %"83") #0 { +define hidden float @add_rp(float %"82", float %"83") #0 { %"141" = alloca float, align 4, addrspace(5) %"142" = alloca float, align 4, addrspace(5) %"143" = alloca float, align 4, addrspace(5) diff --git a/ptx/src/test/ll/cp_async.ll b/ptx/src/test/ll/cp_async.ll index a9b87da..2e33a09 100644 --- a/ptx/src/test/ll/cp_async.ll +++ b/ptx/src/test/ll/cp_async.ll @@ -14,10 +14,10 @@ define amdgpu_kernel void @cp_async(ptr addrspace(4) byref(i64) %"48", ptr addrs br label %"47" "47": ; preds = %1 - %"56" = load i64, ptr addrspace(4) %"48", align 4 - store i64 %"56", ptr addrspace(5) %"50", align 4 - %"57" = load i64, ptr addrspace(4) %"49", align 4 - store i64 %"57", ptr addrspace(5) %"51", align 4 + %"56" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"56", ptr addrspace(5) %"50", align 8 + %"57" = load i64, ptr addrspace(4) %"49", align 8 + store i64 %"57", ptr addrspace(5) %"51", align 8 %2 = load i96, ptr addrspace(1) @from, align 128 %3 = zext i96 %2 to i128 store i128 %3, ptr addrspace(3) @to, align 4 @@ -29,21 +29,21 @@ define amdgpu_kernel void @cp_async(ptr addrspace(4) byref(i64) %"48", ptr addrs store i32 %"60", ptr addrspace(5) %"54", align 4 %"61" = load i32, ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @to to ptr), i64 12), align 4 store i32 %"61", ptr addrspace(5) %"55", align 4 - %"62" = load i64, ptr addrspace(5) %"51", align 4 + %"62" = load i64, ptr addrspace(5) %"51", align 8 %"63" = load i32, ptr addrspace(5) %"52", align 4 %"76" = inttoptr i64 %"62" to ptr store i32 %"63", ptr %"76", align 4 - %"64" = load i64, ptr addrspace(5) %"51", align 4 + %"64" = load i64, ptr addrspace(5) %"51", align 8 %"77" = inttoptr i64 %"64" to ptr %"42" = getelementptr inbounds i8, ptr %"77", i64 4 %"65" = load i32, ptr addrspace(5) %"53", align 4 store i32 %"65", ptr %"42", align 4 - %"66" = load i64, ptr addrspace(5) %"51", align 4 + %"66" = load i64, ptr addrspace(5) %"51", align 8 %"78" = inttoptr i64 %"66" to ptr %"44" = getelementptr inbounds i8, ptr %"78", i64 8 %"67" = load i32, ptr addrspace(5) %"54", align 4 store i32 %"67", ptr %"44", align 4 - %"68" = load i64, ptr addrspace(5) %"51", align 4 + %"68" = load i64, ptr addrspace(5) %"51", align 8 %"79" = inttoptr i64 %"68" to ptr %"46" = getelementptr inbounds i8, ptr %"79", i64 12 %"69" = load i32, ptr addrspace(5) %"55", align 4 diff --git a/ptx/src/test/ll/cvt_f64_f32.ll b/ptx/src/test/ll/cvt_f64_f32.ll index 84e0d4d..043964f 100644 --- a/ptx/src/test/ll/cvt_f64_f32.ll +++ b/ptx/src/test/ll/cvt_f64_f32.ll @@ -27,4 +27,4 @@ define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr ad ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_rni.ll b/ptx/src/test/ll/cvt_rni.ll index b5f5af8..d51e113 100644 --- a/ptx/src/test/ll/cvt_rni.ll +++ b/ptx/src/test/ll/cvt_rni.ll @@ -24,12 +24,10 @@ define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrsp store float %"46", ptr addrspace(5) %"40", align 4 %"48" = load float, ptr addrspace(5) %"39", align 4 %2 = call float @llvm.roundeven.f32(float %"48") - %"47" = freeze float %2 - store float %"47", ptr addrspace(5) %"39", align 4 + store float %2, ptr addrspace(5) %"39", align 4 %"50" = load float, ptr addrspace(5) %"40", align 4 %3 = call float @llvm.roundeven.f32(float %"50") - %"49" = freeze float %3 - store float %"49", ptr addrspace(5) %"40", align 4 + store float %3, ptr addrspace(5) %"40", align 4 %"51" = load i64, ptr addrspace(5) %"38", align 8 %"52" = load float, ptr addrspace(5) %"39", align 4 %"57" = inttoptr i64 %"51" to ptr @@ -46,4 +44,4 @@ define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrsp declare float @llvm.roundeven.f32(float) #1 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_rzi.ll b/ptx/src/test/ll/cvt_rzi.ll index a5c996a..cde0133 100644 --- a/ptx/src/test/ll/cvt_rzi.ll +++ b/ptx/src/test/ll/cvt_rzi.ll @@ -9,7 +9,6 @@ define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrsp br label %"34" "34": ; preds = %1 - call void @llvm.amdgcn.s.setreg(i32 6145, i32 3) %"41" = load i64, ptr addrspace(4) %"35", align 8 store i64 %"41", ptr addrspace(5) %"37", align 8 %"42" = load i64, ptr addrspace(4) %"36", align 8 @@ -25,12 +24,10 @@ define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrsp store float %"46", ptr addrspace(5) %"40", align 4 %"48" = load float, ptr addrspace(5) %"39", align 4 %2 = call float @llvm.trunc.f32(float %"48") - %"47" = freeze float %2 - store float %"47", ptr addrspace(5) %"39", align 4 + store float %2, ptr addrspace(5) %"39", align 4 %"50" = load float, ptr addrspace(5) %"40", align 4 %3 = call float @llvm.trunc.f32(float %"50") - %"49" = freeze float %3 - store float %"49", ptr addrspace(5) %"40", align 4 + store float %3, ptr addrspace(5) %"40", align 4 %"51" = load i64, ptr addrspace(5) %"38", align 8 %"52" = load float, ptr addrspace(5) %"39", align 4 %"57" = inttoptr i64 %"51" to ptr @@ -43,12 +40,8 @@ define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrsp ret void } -; Function Attrs: nocallback nofree nosync nounwind willreturn -declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1 - ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.trunc.f32(float) #2 +declare float @llvm.trunc.f32(float) #1 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind willreturn } -attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/cvt_sat_s_u.ll b/ptx/src/test/ll/cvt_sat_s_u.ll index 6b31407..89abbd3 100644 --- a/ptx/src/test/ll/cvt_sat_s_u.ll +++ b/ptx/src/test/ll/cvt_sat_s_u.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr ad store i32 %"41", ptr addrspace(5) %"36", align 4 %"44" = load i32, ptr addrspace(5) %"36", align 4 %2 = call i32 @llvm.smax.i32(i32 %"44", i32 0) - %3 = call i32 @llvm.umin.i32(i32 %2, i32 -1) + %3 = call i32 @llvm.smin.i32(i32 %2, i32 2147483647) store i32 %3, ptr addrspace(5) %"37", align 4 %"46" = load i32, ptr addrspace(5) %"37", align 4 store i32 %"46", ptr addrspace(5) %"38", align 4 @@ -35,7 +35,7 @@ define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr ad declare i32 @llvm.smax.i32(i32, i32) #1 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.umin.i32(i32, i32) #1 +declare i32 @llvm.smin.i32(i32, i32) #1 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/div_ftz.ll b/ptx/src/test/ll/div_ftz.ll index 6898edb..15af75c 100644 --- a/ptx/src/test/ll/div_ftz.ll +++ b/ptx/src/test/ll/div_ftz.ll @@ -1,8 +1,8 @@ %struct.f32.f32.f32.i8 = type { float, float, float, i8 } -declare %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0 +declare hidden %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0 -declare float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0 +declare hidden float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0 define amdgpu_kernel void @div_ftz(ptr addrspace(4) byref(i64) %"63", ptr addrspace(4) byref(i64) %"64") #1 { %"65" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/div_noftz.ll b/ptx/src/test/ll/div_noftz.ll index 46be55f..2e7a3fe 100644 --- a/ptx/src/test/ll/div_noftz.ll +++ b/ptx/src/test/ll/div_noftz.ll @@ -1,8 +1,8 @@ %struct.f32.f32.f32.i8 = type { float, float, float, i8 } -declare %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0 +declare hidden %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0 -declare float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0 +declare hidden float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0 define amdgpu_kernel void @div_noftz(ptr addrspace(4) byref(i64) %"62", ptr addrspace(4) byref(i64) %"63") #1 { %"64" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/dp4a.ll b/ptx/src/test/ll/dp4a.ll new file mode 100644 index 0000000..9879e0d --- /dev/null +++ b/ptx/src/test/ll/dp4a.ll @@ -0,0 +1,48 @@ +define amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { + %"39" = alloca i64, align 8, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"36" + +"36": ; preds = %1 + %"45" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"45", ptr addrspace(5) %"39", align 8 + %"46" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"46", ptr addrspace(5) %"40", align 8 + %"48" = load i64, ptr addrspace(5) %"39", align 8 + %"59" = inttoptr i64 %"48" to ptr + %"47" = load i32, ptr %"59", align 4 + store i32 %"47", ptr addrspace(5) %"41", align 4 + %"49" = load i64, ptr addrspace(5) %"39", align 8 + %"60" = inttoptr i64 %"49" to ptr + %"33" = getelementptr inbounds i8, ptr %"60", i64 4 + %"50" = load i32, ptr %"33", align 4 + store i32 %"50", ptr addrspace(5) %"42", align 4 + %"51" = load i64, ptr addrspace(5) %"39", align 8 + %"61" = inttoptr i64 %"51" to ptr + %"35" = getelementptr inbounds i8, ptr %"61", i64 8 + %"52" = load i32, ptr %"35", align 4 + store i32 %"52", ptr addrspace(5) %"43", align 4 + %"54" = load i32, ptr addrspace(5) %"41", align 4 + %"55" = load i32, ptr addrspace(5) %"42", align 4 + %"56" = load i32, ptr addrspace(5) %"43", align 4 + %"53" = call i32 @llvm.amdgcn.sdot4(i32 %"54", i32 %"55", i32 %"56", i1 false) + store i32 %"53", ptr addrspace(5) %"44", align 4 + %"57" = load i64, ptr addrspace(5) %"40", align 8 + %"58" = load i32, ptr addrspace(5) %"44", align 4 + %"65" = inttoptr i64 %"57" to ptr + store i32 %"58", ptr %"65", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.amdgcn.sdot4(i32, i32, i32, i1 immarg) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file diff --git a/ptx/src/test/ll/ex2.ll b/ptx/src/test/ll/ex2.ll index 1d51761..e0eec03 100644 --- a/ptx/src/test/ll/ex2.ll +++ b/ptx/src/test/ll/ex2.ll @@ -1,4 +1,6 @@ -define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { +declare hidden float @__zluda_ptx_impl_ex2_approx_f32(float) #0 + +define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 { %"32" = alloca i64, align 8, addrspace(5) %"33" = alloca i64, align 8, addrspace(5) %"34" = alloca float, align 4, addrspace(5) @@ -17,7 +19,7 @@ define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace( %"37" = load float, ptr %"43", align 4 store float %"37", ptr addrspace(5) %"34", align 4 %"40" = load float, ptr addrspace(5) %"34", align 4 - %"39" = call float @llvm.amdgcn.exp2.f32(float %"40") + %"39" = call float @__zluda_ptx_impl_ex2_approx_f32(float %"40") store float %"39", ptr addrspace(5) %"34", align 4 %"41" = load i64, ptr addrspace(5) %"33", align 8 %"42" = load float, ptr addrspace(5) %"34", align 4 @@ -26,8 +28,5 @@ define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace( ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.exp2.f32(float) #1 - -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/extern_func.ll b/ptx/src/test/ll/extern_func.ll index 24fe9ba..d013396 100644 --- a/ptx/src/test/ll/extern_func.ll +++ b/ptx/src/test/ll/extern_func.ll @@ -1,4 +1,4 @@ -declare [16 x i8] @foobar(i64) #0 +declare hidden [16 x i8] @foobar(i64) #0 define amdgpu_kernel void @extern_func(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { %"46" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/extern_shared_call.ll b/ptx/src/test/ll/extern_shared_call.ll index 22cf174..28eadfe 100644 --- a/ptx/src/test/ll/extern_shared_call.ll +++ b/ptx/src/test/ll/extern_shared_call.ll @@ -1,6 +1,6 @@ @shared_mem = external addrspace(3) global [0 x i32], align 4 -define void @incr_shared_2_global() #0 { +define hidden void @incr_shared_2_global() #0 { %"36" = alloca i64, align 8, addrspace(5) br label %1 diff --git a/ptx/src/test/ll/lanemask_lt.ll b/ptx/src/test/ll/lanemask_lt.ll index 2c7090e..6ce12b8 100644 --- a/ptx/src/test/ll/lanemask_lt.ll +++ b/ptx/src/test/ll/lanemask_lt.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 +declare hidden i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0 define amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/lg2.ll b/ptx/src/test/ll/lg2.ll index ae0c03d..94f6881 100644 --- a/ptx/src/test/ll/lg2.ll +++ b/ptx/src/test/ll/lg2.ll @@ -1,4 +1,6 @@ -define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { +declare hidden float @__zluda_ptx_impl_lg2_approx_f32(float) #0 + +define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 { %"32" = alloca i64, align 8, addrspace(5) %"33" = alloca i64, align 8, addrspace(5) %"34" = alloca float, align 4, addrspace(5) @@ -17,7 +19,7 @@ define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace( %"37" = load float, ptr %"43", align 4 store float %"37", ptr addrspace(5) %"34", align 4 %"40" = load float, ptr addrspace(5) %"34", align 4 - %"39" = call float @llvm.amdgcn.log.f32(float %"40") + %"39" = call float @__zluda_ptx_impl_lg2_approx_f32(float %"40") store float %"39", ptr addrspace(5) %"34", align 4 %"41" = load i64, ptr addrspace(5) %"33", align 8 %"42" = load float, ptr addrspace(5) %"34", align 4 @@ -26,8 +28,5 @@ define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace( ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.log.f32(float) #1 - -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/multiple_return.ll b/ptx/src/test/ll/multiple_return.ll index 42b98b8..d8aed86 100644 --- a/ptx/src/test/ll/multiple_return.ll +++ b/ptx/src/test/ll/multiple_return.ll @@ -1,4 +1,6 @@ -define [2 x i32] @do_something(i32 %"10") #0 { +%struct.i32.i1 = type { i32, i1 } + +define hidden %struct.i32.i1 @do_something(i32 %"10") #0 { %"46" = alloca i32, align 4, addrspace(5) %"47" = alloca i1, align 1, addrspace(5) br label %1 @@ -12,10 +14,9 @@ define [2 x i32] @do_something(i32 %"10") #0 { store i1 true, ptr addrspace(5) %"47", align 1 %2 = load i32, ptr addrspace(5) %"46", align 4 %3 = load i1, ptr addrspace(5) %"47", align 1 - %4 = insertvalue [2 x i32] poison, i32 %2, 0 - %5 = zext i1 %3 to i32 - %6 = insertvalue [2 x i32] %4, i32 %5, 1 - ret [2 x i32] %6 + %4 = insertvalue %struct.i32.i1 undef, i32 %2, 0 + %5 = insertvalue %struct.i32.i1 %4, i1 %3, 1 + ret %struct.i32.i1 %5 } define amdgpu_kernel void @multiple_return(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #1 { @@ -39,10 +40,9 @@ define amdgpu_kernel void @multiple_return(ptr addrspace(4) byref(i64) %"50", pt %"59" = load i32, ptr %"68", align 4 store i32 %"59", ptr addrspace(5) %"54", align 4 %"63" = load i32, ptr addrspace(5) %"54", align 4 - %2 = call [2 x i32] @do_something(i32 %"63") - %"61" = extractvalue [2 x i32] %2, 0 - %3 = extractvalue [2 x i32] %2, 1 - %"62" = trunc i32 %3 to i1 + %2 = call %struct.i32.i1 @do_something(i32 %"63") + %"61" = extractvalue %struct.i32.i1 %2, 0 + %"62" = extractvalue %struct.i32.i1 %2, 1 store i32 %"61", ptr addrspace(5) %"55", align 4 store i1 %"62", ptr addrspace(5) %"56", align 1 br label %"45" diff --git a/ptx/src/test/ll/nanosleep.ll b/ptx/src/test/ll/nanosleep.ll index d567302..5aaedfc 100644 --- a/ptx/src/test/ll/nanosleep.ll +++ b/ptx/src/test/ll/nanosleep.ll @@ -1,4 +1,4 @@ -declare void @__zluda_ptx_impl_nanosleep_u32(i32) #0 +declare hidden void @__zluda_ptx_impl_nanosleep_u32(i32) #0 define amdgpu_kernel void @nanosleep(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { br label %1 diff --git a/ptx/src/test/ll/ntid.ll b/ptx/src/test/ll/ntid.ll index f4a7d5c..de64e8e 100644 --- a/ptx/src/test/ll/ntid.ll +++ b/ptx/src/test/ll/ntid.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_ntid(i8) #0 define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { %"37" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/pred_not.ll b/ptx/src/test/ll/pred_not.ll index 94c64f9..f905570 100644 --- a/ptx/src/test/ll/pred_not.ll +++ b/ptx/src/test/ll/pred_not.ll @@ -26,8 +26,8 @@ define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrs store i64 %"54", ptr addrspace(5) %"46", align 8 %"56" = load i64, ptr addrspace(5) %"45", align 8 %"57" = load i64, ptr addrspace(5) %"46", align 8 - %"55" = icmp ult i64 %"56", %"57" - store i1 %"55", ptr addrspace(5) %"48", align 1 + %2 = icmp ult i64 %"56", %"57" + store i1 %2, ptr addrspace(5) %"48", align 1 %"59" = load i1, ptr addrspace(5) %"48", align 1 %"58" = xor i1 %"59", true store i1 %"58", ptr addrspace(5) %"48", align 1 diff --git a/ptx/src/test/ll/rcp.ll b/ptx/src/test/ll/rcp.ll index cdc6ffc..48f9e5b 100644 --- a/ptx/src/test/ll/rcp.ll +++ b/ptx/src/test/ll/rcp.ll @@ -1,4 +1,6 @@ -define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { +declare hidden float @__zluda_ptx_impl_rcp_approx_f32(float) #0 + +define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 { %"32" = alloca i64, align 8, addrspace(5) %"33" = alloca i64, align 8, addrspace(5) %"34" = alloca float, align 4, addrspace(5) @@ -17,7 +19,7 @@ define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace( %"37" = load float, ptr %"43", align 4 store float %"37", ptr addrspace(5) %"34", align 4 %"40" = load float, ptr addrspace(5) %"34", align 4 - %"39" = call float @llvm.amdgcn.rcp.f32(float %"40") + %"39" = call float @__zluda_ptx_impl_rcp_approx_f32(float %"40") store float %"39", ptr addrspace(5) %"34", align 4 %"41" = load i64, ptr addrspace(5) %"33", align 8 %"42" = load float, ptr addrspace(5) %"34", align 4 @@ -26,8 +28,5 @@ define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace( ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.rcp.f32(float) #1 - -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } \ No newline at end of file +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/setp.ll b/ptx/src/test/ll/setp.ll index 0ed02d7..b100797 100644 --- a/ptx/src/test/ll/setp.ll +++ b/ptx/src/test/ll/setp.ll @@ -26,8 +26,8 @@ define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"41", ptr addrspace store i64 %"54", ptr addrspace(5) %"46", align 8 %"56" = load i64, ptr addrspace(5) %"45", align 8 %"57" = load i64, ptr addrspace(5) %"46", align 8 - %"55" = icmp ult i64 %"56", %"57" - store i1 %"55", ptr addrspace(5) %"48", align 1 + %2 = icmp ult i64 %"56", %"57" + store i1 %2, ptr addrspace(5) %"48", align 1 %"58" = load i1, ptr addrspace(5) %"48", align 1 br i1 %"58", label %"16", label %"17" diff --git a/ptx/src/test/ll/setp_gt.ll b/ptx/src/test/ll/setp_gt.ll index 7b18301..d5f42d2 100644 --- a/ptx/src/test/ll/setp_gt.ll +++ b/ptx/src/test/ll/setp_gt.ll @@ -26,8 +26,8 @@ define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"39", ptr addrsp store float %"52", ptr addrspace(5) %"44", align 4 %"54" = load float, ptr addrspace(5) %"43", align 4 %"55" = load float, ptr addrspace(5) %"44", align 4 - %"53" = fcmp ogt float %"54", %"55" - store i1 %"53", ptr addrspace(5) %"46", align 1 + %2 = fcmp ogt float %"54", %"55" + store i1 %2, ptr addrspace(5) %"46", align 1 %"56" = load i1, ptr addrspace(5) %"46", align 1 br i1 %"56", label %"16", label %"17" diff --git a/ptx/src/test/ll/setp_leu.ll b/ptx/src/test/ll/setp_leu.ll index 21f8d26..6165784 100644 --- a/ptx/src/test/ll/setp_leu.ll +++ b/ptx/src/test/ll/setp_leu.ll @@ -26,8 +26,8 @@ define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"39", ptr addrs store float %"52", ptr addrspace(5) %"44", align 4 %"54" = load float, ptr addrspace(5) %"43", align 4 %"55" = load float, ptr addrspace(5) %"44", align 4 - %"53" = fcmp ule float %"54", %"55" - store i1 %"53", ptr addrspace(5) %"46", align 1 + %2 = fcmp ule float %"54", %"55" + store i1 %2, ptr addrspace(5) %"46", align 1 %"56" = load i1, ptr addrspace(5) %"46", align 1 br i1 %"56", label %"16", label %"17" diff --git a/ptx/src/test/ll/setp_nan.ll b/ptx/src/test/ll/setp_nan.ll index 6910977..84d88ea 100644 --- a/ptx/src/test/ll/setp_nan.ll +++ b/ptx/src/test/ll/setp_nan.ll @@ -62,8 +62,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs store float %"114", ptr addrspace(5) %"94", align 4 %"116" = load float, ptr addrspace(5) %"87", align 4 %"117" = load float, ptr addrspace(5) %"88", align 4 - %"115" = fcmp uno float %"116", %"117" - store i1 %"115", ptr addrspace(5) %"96", align 1 + %2 = fcmp uno float %"116", %"117" + store i1 %2, ptr addrspace(5) %"96", align 1 %"118" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"118", label %"22", label %"23" @@ -86,8 +86,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs store i32 %"123", ptr %"159", align 4 %"125" = load float, ptr addrspace(5) %"89", align 4 %"126" = load float, ptr addrspace(5) %"90", align 4 - %"124" = fcmp uno float %"125", %"126" - store i1 %"124", ptr addrspace(5) %"96", align 1 + %3 = fcmp uno float %"125", %"126" + store i1 %3, ptr addrspace(5) %"96", align 1 %"127" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"127", label %"26", label %"27" @@ -111,8 +111,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs store i32 %"132", ptr %"73", align 4 %"134" = load float, ptr addrspace(5) %"91", align 4 %"135" = load float, ptr addrspace(5) %"92", align 4 - %"133" = fcmp uno float %"134", %"135" - store i1 %"133", ptr addrspace(5) %"96", align 1 + %4 = fcmp uno float %"134", %"135" + store i1 %4, ptr addrspace(5) %"96", align 1 %"136" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"136", label %"30", label %"31" @@ -136,8 +136,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs store i32 %"141", ptr %"77", align 4 %"143" = load float, ptr addrspace(5) %"93", align 4 %"144" = load float, ptr addrspace(5) %"94", align 4 - %"142" = fcmp uno float %"143", %"144" - store i1 %"142", ptr addrspace(5) %"96", align 1 + %5 = fcmp uno float %"143", %"144" + store i1 %5, ptr addrspace(5) %"96", align 1 %"145" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"145", label %"34", label %"35" diff --git a/ptx/src/test/ll/setp_num.ll b/ptx/src/test/ll/setp_num.ll index 834ade6..532ffac 100644 --- a/ptx/src/test/ll/setp_num.ll +++ b/ptx/src/test/ll/setp_num.ll @@ -62,8 +62,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs store float %"114", ptr addrspace(5) %"94", align 4 %"116" = load float, ptr addrspace(5) %"87", align 4 %"117" = load float, ptr addrspace(5) %"88", align 4 - %"115" = fcmp ord float %"116", %"117" - store i1 %"115", ptr addrspace(5) %"96", align 1 + %2 = fcmp ord float %"116", %"117" + store i1 %2, ptr addrspace(5) %"96", align 1 %"118" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"118", label %"22", label %"23" @@ -86,8 +86,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs store i32 %"123", ptr %"159", align 4 %"125" = load float, ptr addrspace(5) %"89", align 4 %"126" = load float, ptr addrspace(5) %"90", align 4 - %"124" = fcmp ord float %"125", %"126" - store i1 %"124", ptr addrspace(5) %"96", align 1 + %3 = fcmp ord float %"125", %"126" + store i1 %3, ptr addrspace(5) %"96", align 1 %"127" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"127", label %"26", label %"27" @@ -111,8 +111,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs store i32 %"132", ptr %"73", align 4 %"134" = load float, ptr addrspace(5) %"91", align 4 %"135" = load float, ptr addrspace(5) %"92", align 4 - %"133" = fcmp ord float %"134", %"135" - store i1 %"133", ptr addrspace(5) %"96", align 1 + %4 = fcmp ord float %"134", %"135" + store i1 %4, ptr addrspace(5) %"96", align 1 %"136" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"136", label %"30", label %"31" @@ -136,8 +136,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs store i32 %"141", ptr %"77", align 4 %"143" = load float, ptr addrspace(5) %"93", align 4 %"144" = load float, ptr addrspace(5) %"94", align 4 - %"142" = fcmp ord float %"143", %"144" - store i1 %"142", ptr addrspace(5) %"96", align 1 + %5 = fcmp ord float %"143", %"144" + store i1 %5, ptr addrspace(5) %"96", align 1 %"145" = load i1, ptr addrspace(5) %"96", align 1 br i1 %"145", label %"34", label %"35" diff --git a/ptx/src/test/ll/shared_unify_extern.ll b/ptx/src/test/ll/shared_unify_extern.ll index 1af840e..c9c4f18 100644 --- a/ptx/src/test/ll/shared_unify_extern.ll +++ b/ptx/src/test/ll/shared_unify_extern.ll @@ -1,7 +1,7 @@ @shared_ex = external addrspace(3) global [0 x i32] @shared_mod = external addrspace(3) global [4 x i32] -define i64 @add() #0 { +define hidden i64 @add() #0 { %"46" = alloca i64, align 8, addrspace(5) %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) @@ -23,7 +23,7 @@ define i64 @add() #0 { ret i64 %2 } -define i64 @set_shared_temp1(i64 %"15") #0 { +define hidden i64 @set_shared_temp1(i64 %"15") #0 { %"54" = alloca i64, align 8, addrspace(5) br label %1 diff --git a/ptx/src/test/ll/shared_unify_local.ll b/ptx/src/test/ll/shared_unify_local.ll index f211134..c3cff90 100644 --- a/ptx/src/test/ll/shared_unify_local.ll +++ b/ptx/src/test/ll/shared_unify_local.ll @@ -1,7 +1,7 @@ @shared_ex = external addrspace(3) global [0 x i32] @shared_mod = external addrspace(3) global i64, align 4 -define i64 @add(i64 %"10") #0 { +define hidden i64 @add(i64 %"10") #0 { %"47" = alloca i64, align 8, addrspace(5) %"48" = alloca i64, align 8, addrspace(5) br label %1 @@ -21,7 +21,7 @@ define i64 @add(i64 %"10") #0 { ret i64 %2 } -define i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 { +define hidden i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 { %"52" = alloca i64, align 8, addrspace(5) br label %1 diff --git a/ptx/src/test/ll/shfl_sync_bfly_b32_pred.ll b/ptx/src/test/ll/shfl_sync_bfly_b32_pred.ll index 45d21e4..c0639ab 100644 --- a/ptx/src/test/ll/shfl_sync_bfly_b32_pred.ll +++ b/ptx/src/test/ll/shfl_sync_bfly_b32_pred.ll @@ -1,6 +1,6 @@ -declare [2 x i32] @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32, i32, i32, i32) #0 +declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @shfl_sync_bfly_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) @@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_bfly_b32_pred(ptr addrspace(4) byref(i64) % "40": ; preds = %"39" store i32 %"33", ptr addrspace(5) %"45", align 4 %"52" = load i32, ptr addrspace(5) %"45", align 4 - %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32 %"52", i32 3, i32 31, i32 -1) - %"65" = extractvalue [2 x i32] %2, 0 - %3 = extractvalue [2 x i32] %2, 1 - %"51" = trunc i32 %3 to i1 + %"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32 %"52", i32 3, i32 31, i32 -1) + %"65" = extractelement <2 x i32> %"68", i8 0 + %"69" = extractelement <2 x i32> %"68", i8 1 + %"51" = trunc i32 %"69" to i1 store i32 %"65", ptr addrspace(5) %"46", align 4 store i1 %"51", ptr addrspace(5) %"47", align 1 %"53" = load i1, ptr addrspace(5) %"47", align 1 diff --git a/ptx/src/test/ll/shfl_sync_down_b32_pred.ll b/ptx/src/test/ll/shfl_sync_down_b32_pred.ll index 2e1ce34..f03be78 100644 --- a/ptx/src/test/ll/shfl_sync_down_b32_pred.ll +++ b/ptx/src/test/ll/shfl_sync_down_b32_pred.ll @@ -1,6 +1,6 @@ -declare [2 x i32] @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32, i32, i32, i32) #0 +declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @shfl_sync_down_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) @@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_down_b32_pred(ptr addrspace(4) byref(i64) % "40": ; preds = %"39" store i32 %"33", ptr addrspace(5) %"45", align 4 %"52" = load i32, ptr addrspace(5) %"45", align 4 - %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32 %"52", i32 3, i32 31, i32 -1) - %"65" = extractvalue [2 x i32] %2, 0 - %3 = extractvalue [2 x i32] %2, 1 - %"51" = trunc i32 %3 to i1 + %"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32 %"52", i32 3, i32 31, i32 -1) + %"65" = extractelement <2 x i32> %"68", i8 0 + %"69" = extractelement <2 x i32> %"68", i8 1 + %"51" = trunc i32 %"69" to i1 store i32 %"65", ptr addrspace(5) %"46", align 4 store i1 %"51", ptr addrspace(5) %"47", align 1 %"53" = load i1, ptr addrspace(5) %"47", align 1 diff --git a/ptx/src/test/ll/shfl_sync_idx_b32_pred.ll b/ptx/src/test/ll/shfl_sync_idx_b32_pred.ll index 6c58633..515dd7e 100644 --- a/ptx/src/test/ll/shfl_sync_idx_b32_pred.ll +++ b/ptx/src/test/ll/shfl_sync_idx_b32_pred.ll @@ -1,6 +1,6 @@ -declare [2 x i32] @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32, i32, i32, i32) #0 +declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @shfl_sync_idx_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) @@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_idx_b32_pred(ptr addrspace(4) byref(i64) %" "40": ; preds = %"39" store i32 %"33", ptr addrspace(5) %"45", align 4 %"52" = load i32, ptr addrspace(5) %"45", align 4 - %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32 %"52", i32 12, i32 31, i32 -1) - %"65" = extractvalue [2 x i32] %2, 0 - %3 = extractvalue [2 x i32] %2, 1 - %"51" = trunc i32 %3 to i1 + %"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32 %"52", i32 12, i32 31, i32 -1) + %"65" = extractelement <2 x i32> %"68", i8 0 + %"69" = extractelement <2 x i32> %"68", i8 1 + %"51" = trunc i32 %"69" to i1 store i32 %"65", ptr addrspace(5) %"46", align 4 store i1 %"51", ptr addrspace(5) %"47", align 1 %"53" = load i1, ptr addrspace(5) %"47", align 1 diff --git a/ptx/src/test/ll/shfl_sync_mode_b32.ll b/ptx/src/test/ll/shfl_sync_mode_b32.ll index 913c4aa..40e247e 100644 --- a/ptx/src/test/ll/shfl_sync_mode_b32.ll +++ b/ptx/src/test/ll/shfl_sync_mode_b32.ll @@ -1,12 +1,12 @@ -declare i32 @__zluda_ptx_impl_shfl_sync_down_b32(i32, i32, i32, i32) #0 +declare hidden i32 @__zluda_ptx_impl_shfl_sync_down_b32(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_shfl_sync_up_b32(i32, i32, i32, i32) #0 +declare hidden i32 @__zluda_ptx_impl_shfl_sync_up_b32(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_shfl_sync_bfly_b32(i32, i32, i32, i32) #0 +declare hidden i32 @__zluda_ptx_impl_shfl_sync_bfly_b32(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_shfl_sync_idx_b32(i32, i32, i32, i32) #0 +declare hidden i32 @__zluda_ptx_impl_shfl_sync_idx_b32(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @shfl_sync_mode_b32(ptr addrspace(4) byref(i64) %"48") #1 { %"49" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/shfl_sync_up_b32_pred.ll b/ptx/src/test/ll/shfl_sync_up_b32_pred.ll index a75f4bf..b9ba5c3 100644 --- a/ptx/src/test/ll/shfl_sync_up_b32_pred.ll +++ b/ptx/src/test/ll/shfl_sync_up_b32_pred.ll @@ -1,6 +1,6 @@ -declare [2 x i32] @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32, i32, i32, i32) #0 +declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32, i32, i32, i32) #0 -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @shfl_sync_up_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 { %"43" = alloca i64, align 8, addrspace(5) @@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_up_b32_pred(ptr addrspace(4) byref(i64) %"4 "40": ; preds = %"39" store i32 %"33", ptr addrspace(5) %"45", align 4 %"52" = load i32, ptr addrspace(5) %"45", align 4 - %2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32 %"52", i32 3, i32 0, i32 -1) - %"65" = extractvalue [2 x i32] %2, 0 - %3 = extractvalue [2 x i32] %2, 1 - %"51" = trunc i32 %3 to i1 + %"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32 %"52", i32 3, i32 0, i32 -1) + %"65" = extractelement <2 x i32> %"68", i8 0 + %"69" = extractelement <2 x i32> %"68", i8 1 + %"51" = trunc i32 %"69" to i1 store i32 %"65", ptr addrspace(5) %"46", align 4 store i1 %"51", ptr addrspace(5) %"47", align 1 %"53" = load i1, ptr addrspace(5) %"47", align 1 diff --git a/ptx/src/test/ll/shr_oob.ll b/ptx/src/test/ll/shr_oob.ll index cfe2532..e136352 100644 --- a/ptx/src/test/ll/shr_oob.ll +++ b/ptx/src/test/ll/shr_oob.ll @@ -8,11 +8,11 @@ define amdgpu_kernel void @shr_oob(ptr addrspace(4) byref(i64) %"31", ptr addrsp br label %"30" "30": ; preds = %1 - %"36" = load i64, ptr addrspace(4) %"31", align 4 - store i64 %"36", ptr addrspace(5) %"33", align 4 - %"37" = load i64, ptr addrspace(4) %"32", align 4 - store i64 %"37", ptr addrspace(5) %"34", align 4 - %"39" = load i64, ptr addrspace(5) %"33", align 4 + %"36" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"36", ptr addrspace(5) %"33", align 8 + %"37" = load i64, ptr addrspace(4) %"32", align 8 + store i64 %"37", ptr addrspace(5) %"34", align 8 + %"39" = load i64, ptr addrspace(5) %"33", align 8 %"44" = inttoptr i64 %"39" to ptr %"38" = load i16, ptr %"44", align 2 store i16 %"38", ptr addrspace(5) %"35", align 2 @@ -21,7 +21,7 @@ define amdgpu_kernel void @shr_oob(ptr addrspace(4) byref(i64) %"31", ptr addrsp %3 = ashr i16 %"41", 16 %"40" = select i1 true, i16 %2, i16 %3 store i16 %"40", ptr addrspace(5) %"35", align 2 - %"42" = load i64, ptr addrspace(5) %"34", align 4 + %"42" = load i64, ptr addrspace(5) %"34", align 8 %"43" = load i16, ptr addrspace(5) %"35", align 2 %"45" = inttoptr i64 %"42" to ptr store i16 %"43", ptr %"45", align 2 diff --git a/ptx/src/test/ll/sqrt.ll b/ptx/src/test/ll/sqrt.ll index e8ec284..724af8a 100644 --- a/ptx/src/test/ll/sqrt.ll +++ b/ptx/src/test/ll/sqrt.ll @@ -1,4 +1,4 @@ -declare float @__zluda_ptx_impl_sqrt_approx_f32(float) #0 +declare hidden float @__zluda_ptx_impl_sqrt_approx_f32(float) #0 define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 { %"32" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/sqrt_rn_ftz.ll b/ptx/src/test/ll/sqrt_rn_ftz.ll index 5881807..66b63b1 100644 --- a/ptx/src/test/ll/sqrt_rn_ftz.ll +++ b/ptx/src/test/ll/sqrt_rn_ftz.ll @@ -1,4 +1,4 @@ -declare float @__zluda_ptx_impl_sqrt_rn_ftz_f32(float) #0 +declare hidden float @__zluda_ptx_impl_sqrt_rn_ftz_f32(float) #0 define amdgpu_kernel void @sqrt_rn_ftz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 { %"32" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/stateful_ld_st_ntid.ll b/ptx/src/test/ll/stateful_ld_st_ntid.ll index 1f9d754..40ebf38 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 { %"38" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll index eb5c4c1..2e71969 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_chain.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"42" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll index e85ad52..c89d918 100644 --- a/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll +++ b/ptx/src/test/ll/stateful_ld_st_ntid_sub.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 { %"46" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/tanh.ll b/ptx/src/test/ll/tanh.ll index 71f8af5..f7abba4 100644 --- a/ptx/src/test/ll/tanh.ll +++ b/ptx/src/test/ll/tanh.ll @@ -8,18 +8,18 @@ define amdgpu_kernel void @tanh(ptr addrspace(4) byref(i64) %"30", ptr addrspace br label %"29" "29": ; preds = %1 - %"35" = load i64, ptr addrspace(4) %"30", align 4 - store i64 %"35", ptr addrspace(5) %"32", align 4 - %"36" = load i64, ptr addrspace(4) %"31", align 4 - store i64 %"36", ptr addrspace(5) %"33", align 4 - %"38" = load i64, ptr addrspace(5) %"32", align 4 + %"35" = load i64, ptr addrspace(4) %"30", align 8 + store i64 %"35", ptr addrspace(5) %"32", align 8 + %"36" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"36", ptr addrspace(5) %"33", align 8 + %"38" = load i64, ptr addrspace(5) %"32", align 8 %"43" = inttoptr i64 %"38" to ptr %"37" = load float, ptr %"43", align 4 store float %"37", ptr addrspace(5) %"34", align 4 %"40" = load float, ptr addrspace(5) %"34", align 4 %"39" = call afn float @__ocml_tanh_f32(float %"40") store float %"39", ptr addrspace(5) %"34", align 4 - %"41" = load i64, ptr addrspace(5) %"33", align 4 + %"41" = load i64, ptr addrspace(5) %"33", align 8 %"42" = load float, ptr addrspace(5) %"34", align 4 %"44" = inttoptr i64 %"41" to ptr store float %"42", ptr %"44", align 4 diff --git a/ptx/src/test/ll/tid.ll b/ptx/src/test/ll/tid.ll index ece8736..19ebb3d 100644 --- a/ptx/src/test/ll/tid.ll +++ b/ptx/src/test/ll/tid.ll @@ -1,4 +1,4 @@ -declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0 +declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0 define amdgpu_kernel void @tid(ptr addrspace(4) byref(i64) %"34") #1 { %"35" = alloca i64, align 8, addrspace(5) diff --git a/ptx/src/test/ll/trap.ll b/ptx/src/test/ll/trap.ll new file mode 100644 index 0000000..29ae1e2 --- /dev/null +++ b/ptx/src/test/ll/trap.ll @@ -0,0 +1,16 @@ +define amdgpu_kernel void @trap(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { + br label %1 + +1: ; preds = %0 + br label %"26" + +"26": ; preds = %1 + call void @llvm.trap() + ret void +} + +; Function Attrs: cold noreturn nounwind +declare void @llvm.trap() #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { cold noreturn nounwind } \ No newline at end of file diff --git a/ptx/src/test/ll/vector.ll b/ptx/src/test/ll/vector.ll index e2b5a0c..0738f26 100644 --- a/ptx/src/test/ll/vector.ll +++ b/ptx/src/test/ll/vector.ll @@ -1,4 +1,4 @@ -define <2 x i32> @impl(<2 x i32> %"9") #0 { +define hidden <2 x i32> @impl(<2 x i32> %"9") #0 { %"47" = alloca <2 x i32>, align 8, addrspace(5) %"48" = alloca <2 x i32>, align 8, addrspace(5) %"49" = alloca i32, align 4, addrspace(5) diff --git a/ptx/src/test/spirv_run/dp4a.ptx b/ptx/src/test/spirv_run/dp4a.ptx new file mode 100644 index 0000000..b992160 --- /dev/null +++ b/ptx/src/test/spirv_run/dp4a.ptx @@ -0,0 +1,26 @@ +.version 6.5 +.target sm_70 +.address_size 64 + +.visible .entry dp4a( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 src1; + .reg .b32 src2; + .reg .b32 src3; + .reg .b32 dst; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.b32 src1, [in_addr]; + ld.b32 src2, [in_addr+4]; + ld.b32 src3, [in_addr+8]; + dp4a.s32.s32 dst, src1, src2, src3; + st.b32 [out_addr], dst; + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index ed760ed..ca412be 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -317,12 +317,18 @@ test_ptx!( [0x12345678u32, 0x9abcdef0u32, 44], [0xef012345u32] ); +test_ptx!( + dp4a, + [0x8e2da590u32, 0xedeaee14, 0x248a9f70], + [613065134u32] +); test_ptx!(assertfail); // TODO: not yet supported //test_ptx!(func_ptr); test_ptx!(lanemask_lt); test_ptx!(extern_func); +test_ptx!(trap); test_ptx_warp!( tid, @@ -640,11 +646,15 @@ fn run_hip + Copy + Debug, Output: From + Copy + Debug + Def .to_str() .unwrap(), &*module.llvm_ir.write_bitcode_to_memory(), - &*module.attributes_ir.write_bitcode_to_memory(), module.linked_bitcode(), + &*module.attributes_ir.write_bitcode_to_memory(), None, ) .unwrap(); + // TODO: Re-enable when we are able to privatize function-scoped + // globals and constants + // let fns = comgr::get_symbols(&comgr, &elf_module).unwrap(); + // verify_symbols(fns); let mut module = unsafe { mem::zeroed() }; unsafe { hipModuleLoadData(&mut module, elf_module.as_ptr() as _) }.unwrap(); let mut kernel = unsafe { mem::zeroed() }; @@ -704,3 +714,30 @@ fn run_hip + Copy + Debug, Output: From + Copy + Debug + Def } Ok(result) } + +// TODO: Re-enable when we are able to privatize function-scoped +// globals and constants +/* +fn verify_symbols(mut symbols: Vec<(u32, String)>) { + symbols.sort(); + if symbols.len() != 2 { + panic!("Expected exactly two symbols, found: {:?}", symbols); + } + assert_eq!( + symbols[0].0, 1, + "Wrong symbols exported from binary: {:?}", + symbols + ); + assert_eq!( + symbols[1].0, 2, + "Wrong symbols exported from binary: {:?}", + symbols + ); + assert_eq!( + symbols[0].1, + format!("{}.kd", symbols[1].1), + "Wrong symbols exported from binary: {:?}", + symbols + ); +} + */ diff --git a/ptx/src/test/spirv_run/trap.ptx b/ptx/src/test/spirv_run/trap.ptx new file mode 100644 index 0000000..63c6587 --- /dev/null +++ b/ptx/src/test/spirv_run/trap.ptx @@ -0,0 +1,11 @@ +.version 6.5 +.target sm_30 +.address_size 64 +.visible .entry trap( + .param .u64 input, + .param .u64 output +) +{ + trap; + ret; +} \ No newline at end of file diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs index ed5eb9d..108c2d3 100644 --- a/ptx_parser/src/ast.rs +++ b/ptx_parser/src/ast.rs @@ -89,6 +89,13 @@ ptx_parser_macros::generate_instruction_type!( src3: T, } }, + BarWarp { + type: Type::Scalar(ScalarType::U32), + data: (), + arguments: { + src: T, + } + }, Bar { type: Type::Scalar(ScalarType::U32), data: BarData, @@ -245,6 +252,27 @@ ptx_parser_macros::generate_instruction_type!( src2: T, } }, + Dp4a { + data: Dp4aDetails, + arguments: { + dst: { + repr: T, + type: { Type::Scalar(ScalarType::B32) }, + }, + src1: { + repr: T, + type: { Type::Scalar(data.atype) }, + }, + src2: { + repr: T, + type: { Type::Scalar(data.btype) }, + }, + src3: { + repr: T, + type: { Type::Scalar(data.ctype()) }, + }, + } + }, Ex2 { type: Type::Scalar(ScalarType::F32), data: TypeFtz, @@ -2059,6 +2087,21 @@ pub enum DivDetails { Float(DivFloatDetails), } +#[derive(Copy, Clone)] +pub struct Dp4aDetails { + pub atype: ScalarType, + pub btype: ScalarType, +} + +impl Dp4aDetails { + pub fn ctype(self) -> ScalarType { + match (self.atype, self.btype) { + (ScalarType::U32, ScalarType::U32) => ScalarType::U32, + _ => ScalarType::S32, + } + } +} + impl DivDetails { pub fn type_(&self) -> ScalarType { match self { diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index e545f49..56331a6 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -3150,6 +3150,13 @@ derive_parser!( } .op: Reduction = { .and, .or }; + bar.warp.sync membermask => { + ast::Instruction::BarWarp { + data: (), + arguments: BarWarpArgs { src: membermask } + } + } + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache_policy} => { if level_cache_hint || cache_policy.is_some() { @@ -3689,6 +3696,25 @@ derive_parser!( .dir: ShiftDirection = { .l, .r }; .mode: FunnelShiftMode = { .clamp, .wrap }; + + trap => { + Instruction::Trap {} + } + + // https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-dp4a + + dp4a.atype.btype d, a, b, c => { + Instruction::Dp4a { + data: Dp4aDetails { + atype, + btype + }, + arguments: Dp4aArgs { dst: d, src1: a, src2: b, src3: c } + } + } + + .atype: ScalarType = { .u32, .s32 }; + .btype: ScalarType = { .u32, .s32 }; ); #[cfg(test)] diff --git a/ptx_parser_macros_impl/src/parser.rs b/ptx_parser_macros_impl/src/parser.rs index b9a5d2f..691b86d 100644 --- a/ptx_parser_macros_impl/src/parser.rs +++ b/ptx_parser_macros_impl/src/parser.rs @@ -82,7 +82,7 @@ pub struct OpcodeDecl(pub Instruction, pub Arguments); impl OpcodeDecl { fn peek(input: syn::parse::ParseStream) -> bool { - Instruction::peek(input) && !input.peek2(Token![=]) + Instruction::peek(input) } } @@ -234,7 +234,13 @@ pub struct Instruction { } impl Instruction { fn peek(input: syn::parse::ParseStream) -> bool { - input.peek(Ident) + if !input.peek(Ident) { + return false; + } + if input.peek2(Token![=]) && input.peek3(syn::token::Brace) { + return false; + } + true } } @@ -910,4 +916,24 @@ mod tests { }; syn::parse2::(input).unwrap(); } + + #[test] + fn instruction_no_options() { + let input = quote! { + trap => { + Instruction::Trap {} + } + foo.bar => { + todo!() + } + ScalarType = { .f32 }; + }; + let args = syn::parse2::(input).unwrap(); + assert_eq!(args.1.len(), 1); + let (ref opcode1, _) = args.0 .0[0]; + assert_eq!(opcode1.0.name.to_string(), "trap"); + let (ref opcode2, _) = args.0 .0[1]; + assert_eq!(opcode2.0.name.to_string(), "foo"); + assert_eq!(args.1.len(), 1); + } } diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 0c2c5b2..f8db917 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -145,8 +145,8 @@ fn compile_from_ptx_and_cache( comgr, gcn_arch, &*llvm_module.llvm_ir.write_bitcode_to_memory(), - &*llvm_module.attributes_ir.write_bitcode_to_memory(), llvm_module.linked_bitcode(), + &*llvm_module.attributes_ir.write_bitcode_to_memory(), None, ) .map_err(|_| CUerror::UNKNOWN)?; diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index 964c09d..13e4d2a 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1,6 +1,7 @@ use ::dark_api::fatbin::FatbinFileIterator; use ::dark_api::FnFfi; use cuda_types::cuda::*; +use cuda_types::dark_api::FatbinHeader; use dark_api::DarkApiState2; use log::{CudaFunctionName, ErrorEntry}; use parking_lot::ReentrantMutex; @@ -368,7 +369,7 @@ impl DarkApiTrace { FatbinFileIterator::new( fatbin_header .as_ref() - .ok_or(ErrorEntry::NullPointer("get_module_from_cubin_ext2_post"))?, + .ok_or(ErrorEntry::NullPointer("FatbinHeader"))?, ), ) }); @@ -1232,9 +1233,9 @@ impl Settings { fn parse_compute_capability(env_string: &str) -> Option<(u32, u32)> { let regex = Regex::new(r"(\d+)\.(\d+)").unwrap(); let captures = regex.captures(&env_string)?; - let major = captures.get(0)?; + let major = captures.get(1)?; let major = str::parse::(major.as_str()).ok()?; - let minor = captures.get(1)?; + let minor = captures.get(2)?; let minor = str::parse::(minor.as_str()).ok()?; Some((major, minor)) } @@ -1353,13 +1354,27 @@ pub(crate) fn cuModuleGetFunction_Post( #[allow(non_snake_case)] pub(crate) fn cuDeviceGetAttribute_Post( - _pi: *mut ::std::os::raw::c_int, - _attrib: CUdevice_attribute, + pi: *mut ::std::os::raw::c_int, + attrib: CUdevice_attribute, _dev: CUdevice, - _state: &mut trace::StateTracker, + state: &mut trace::StateTracker, _fn_logger: &mut FnCallLog, _result: CUresult, ) { + if attrib == CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR { + if let Some((major_override, _)) = state.override_cc { + unsafe { + *pi = major_override as i32; + }; + } + } + if attrib == CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR { + if let Some((_, minor_override)) = state.override_cc { + unsafe { + *pi = minor_override as i32; + }; + } + } } #[allow(non_snake_case)] @@ -1381,12 +1396,25 @@ pub(crate) fn cuDeviceComputeCapability_Post( #[allow(non_snake_case)] pub(crate) fn cuModuleLoadFatBinary_Post( - _module: *mut CUmodule, - _fatCubin: *const ::std::os::raw::c_void, - _state: &mut trace::StateTracker, - _fn_logger: &mut FnCallLog, + module: *mut CUmodule, + fatbin_header: *const ::std::os::raw::c_void, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, _result: CUresult, ) { + fn_logger.try_(|fn_logger| unsafe { + trace::record_submodules( + *module, + fn_logger, + state, + FatbinFileIterator::new( + fatbin_header + .cast::() + .as_ref() + .ok_or(ErrorEntry::NullPointer("FatbinHeader"))?, + ), + ) + }); } #[allow(non_snake_case)]