Add more instructions, tighten generated assembly (#475)
Some checks are pending
ZLUDA / Build (Linux) (push) Waiting to run
ZLUDA / Build (Windows) (push) Waiting to run
ZLUDA / Build AMD GPU unit tests (push) Waiting to run
ZLUDA / Run AMD GPU unit tests (push) Blocked by required conditions

This commit is contained in:
Andrzej Janik 2025-08-18 20:12:57 +02:00 committed by GitHub
commit 547536de4a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
70 changed files with 885 additions and 383 deletions

View file

@ -175,22 +175,35 @@ impl Data {
}
}
#[derive(Clone, Copy, Debug)]
struct Symbol(u64);
impl Symbol {
fn comgr2(&self) -> comgr2::amd_comgr_symbol_t {
comgr2::amd_comgr_symbol_s { handle: self.0 }
}
fn comgr3(&self) -> comgr3::amd_comgr_symbol_t {
comgr3::amd_comgr_symbol_s { handle: self.0 }
}
}
pub fn compile_bitcode(
comgr: &Comgr,
gcn_arch: &str,
main_buffer: &[u8],
attributes_buffer: &[u8],
ptx_impl: &[u8],
attributes_buffer: &[u8],
compiler_hook: Option<&dyn Fn(&Vec<u8>, String)>,
) -> Result<Vec<u8>, Error> {
let bitcode_data_set = DataSet::new(comgr)?;
let main_bitcode_data = Data::new(comgr, DataKind::Bc, c"zluda.bc", main_buffer)?;
bitcode_data_set.add(&main_bitcode_data)?;
let stdlib_bitcode_data = Data::new(comgr, DataKind::Bc, c"ptx_impl.bc", ptx_impl)?;
bitcode_data_set.add(&stdlib_bitcode_data)?;
let attributes_bitcode_data =
Data::new(comgr, DataKind::Bc, c"attributes.bc", attributes_buffer)?;
bitcode_data_set.add(&attributes_bitcode_data)?;
let stdlib_bitcode_data = Data::new(comgr, DataKind::Bc, c"ptx_impl.bc", ptx_impl)?;
bitcode_data_set.add(&stdlib_bitcode_data)?;
let linking_info = ActionInfo::new(comgr)?;
let linked_data_set =
comgr.do_action(ActionKind::LinkBcToBc, &linking_info, &bitcode_data_set)?;
@ -214,6 +227,9 @@ pub fn compile_bitcode(
c"-Xclang",
c"-fdenormal-fp-math=dynamic",
c"-O3",
// To consider
//c"-mllvm",
//c"-amdgpu-internalize-symbols",
c"-mno-wavefrontsize64",
c"-mcumode",
// Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr
@ -263,6 +279,22 @@ pub fn compile_bitcode(
executable
}
pub fn get_symbols(comgr: &Comgr, elf: &[u8]) -> Result<Vec<(u32, String)>, Error> {
let elf = Data::new(comgr, DataKind::Executable, c"elf.o", elf)?;
let mut symbols = Vec::new();
comgr.iterate_symbols(&elf, &mut |symbol| {
let type_ = unsafe { comgr.symbol_get_info::<u32>(symbol, SymbolInfo::Type)? };
let name_length = unsafe { comgr.symbol_get_info::<u64>(symbol, SymbolInfo::NameLength)? };
let mut name =
unsafe { comgr.symbol_get_buffer(symbol, SymbolInfo::Name, name_length as usize + 1)? };
name.pop();
let name = String::from_utf8(name).map_err(|_| Error::UNKNOWN)?;
symbols.push((type_, name));
Ok(())
})?;
Ok(symbols)
}
pub fn get_clang_version(comgr: &Comgr) -> Result<String, Error> {
let version_string_set = DataSet::new(comgr)?;
let version_string = Data::new(
@ -310,6 +342,8 @@ pub enum Comgr {
V3(amd_comgr_sys::comgr3::Comgr3),
}
type SymbolIterator = dyn FnMut(Symbol) -> Result<(), Error>;
impl Comgr {
pub fn new() -> Result<Self, Error> {
unsafe { libloading::Library::new(os::COMGR3) }
@ -351,16 +385,78 @@ impl Comgr {
(major, minor) >= (2, 9)
}
fn do_action(
&self,
fn do_action<'a>(
&'a self,
kind: ActionKind,
action: &ActionInfo,
data_set: &DataSet,
) -> Result<DataSet, Error> {
) -> Result<DataSet<'a>, Error> {
let result = DataSet::new(self)?;
call_dispatch!(self => amd_comgr_do_action(kind, action, data_set, result));
Ok(result)
}
fn iterate_symbols<'this>(
&'this self,
data: &Data,
mut fn_: &mut (dyn FnMut(Symbol) -> Result<(), Error> + 'this),
) -> Result<(), Error> {
let thin_pointer = &mut fn_;
match self {
Comgr::V2(comgr) => {
unsafe {
comgr.amd_comgr_iterate_symbols(
data.comgr2(),
Some(Self::iterate_callback_v2),
mem::transmute(thin_pointer),
)
}?;
}
Comgr::V3(comgr) => {
unsafe {
comgr.amd_comgr_iterate_symbols(
data.comgr3(),
Some(Self::iterate_callback_v3),
mem::transmute(thin_pointer),
)
}?;
}
}
Ok(())
}
extern "C" fn iterate_callback_v3(
symbol: comgr3::amd_comgr_symbol_t,
user_data: *mut ::std::os::raw::c_void,
) -> Result<(), comgr3::amd_comgr_status_s> {
let user_data = unsafe { mem::transmute::<_, &mut &mut SymbolIterator>(user_data) };
(*user_data)(Symbol(symbol.handle)).map_err(Into::into)
}
extern "C" fn iterate_callback_v2(
symbol: comgr2::amd_comgr_symbol_t,
user_data: *mut ::std::os::raw::c_void,
) -> Result<(), comgr2::amd_comgr_status_s> {
let user_data = unsafe { mem::transmute::<_, &mut &mut SymbolIterator>(user_data) };
(*user_data)(Symbol(symbol.handle)).map_err(Into::into)
}
unsafe fn symbol_get_info<T>(&self, symbol: Symbol, attribute: SymbolInfo) -> Result<T, Error> {
let mut value = unsafe { mem::zeroed::<T>() };
call_dispatch!(self => amd_comgr_symbol_get_info(symbol, attribute, { ptr::from_mut(&mut value).cast() }));
Ok(value)
}
unsafe fn symbol_get_buffer(
&self,
symbol: Symbol,
attribute: SymbolInfo,
length: usize,
) -> Result<Vec<u8>, Error> {
let mut value = vec![0u8; length];
call_dispatch!(self => amd_comgr_symbol_get_info(symbol, attribute, { value.as_mut_ptr().cast() }));
Ok(value)
}
}
#[derive(Debug, thiserror::Error)]
@ -388,12 +484,24 @@ impl From<comgr2::amd_comgr_status_s> for Error {
}
}
impl Into<comgr2::amd_comgr_status_s> for Error {
fn into(self) -> comgr2::amd_comgr_status_s {
comgr2::amd_comgr_status_s(self.0)
}
}
impl From<comgr3::amd_comgr_status_s> for Error {
fn from(status: comgr3::amd_comgr_status_s) -> Self {
Error(status.0)
}
}
impl Into<comgr3::amd_comgr_status_s> for Error {
fn into(self) -> comgr3::amd_comgr_status_s {
comgr3::amd_comgr_status_s(self.0)
}
}
macro_rules! impl_into {
($self_type:ident, $to_type:ident, [$($from:ident => $to:ident),+]) => {
#[derive(Copy, Clone)]
@ -405,6 +513,7 @@ macro_rules! impl_into {
}
impl $self_type {
#[allow(dead_code)]
fn comgr2(self) -> comgr2::$to_type {
match self {
$(
@ -413,6 +522,7 @@ macro_rules! impl_into {
}
}
#[allow(dead_code)]
fn comgr3(self) -> comgr3::$to_type {
match self {
$(
@ -435,6 +545,31 @@ impl_into!(
]
);
impl_into!(
SymbolType,
amd_comgr_symbol_type_t,
[
Unknown => AMD_COMGR_SYMBOL_TYPE_UNKNOWN,
NoType => AMD_COMGR_SYMBOL_TYPE_NOTYPE,
Object => AMD_COMGR_SYMBOL_TYPE_OBJECT,
Func => AMD_COMGR_SYMBOL_TYPE_FUNC,
Section => AMD_COMGR_SYMBOL_TYPE_SECTION,
File => AMD_COMGR_SYMBOL_TYPE_FILE,
Common => AMD_COMGR_SYMBOL_TYPE_COMMON,
AmdgpuHsaKernel => AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL
]
);
impl_into!(
SymbolInfo,
amd_comgr_symbol_info_t,
[
NameLength => AMD_COMGR_SYMBOL_INFO_NAME_LENGTH,
Name => AMD_COMGR_SYMBOL_INFO_NAME,
Type => AMD_COMGR_SYMBOL_INFO_TYPE
]
);
impl_into!(
DataKind,
amd_comgr_data_kind_t,

View file

@ -106,7 +106,7 @@ impl<'a> FatbinSubmodule<'a> {
FatbinSubmodule { header }
}
pub fn get_files(&self) -> FatbinFileIterator {
pub fn get_files(&self) -> FatbinFileIterator<'a> {
unsafe { FatbinFileIterator::new(self.header) }
}
}
@ -200,7 +200,7 @@ impl<'a> FatbinFileIterator<'a> {
Self { file_buffer }
}
pub unsafe fn next(&mut self) -> Result<Option<FatbinFile>, ParseError> {
pub unsafe fn next(&mut self) -> Result<Option<FatbinFile<'a>>, ParseError> {
if self.file_buffer.len() < std::mem::size_of::<FatbinFileHeader>() {
return Ok(None);
}

View file

@ -124,7 +124,7 @@ macro_rules! dark_api {
result
}
pub fn get(&self, key: &cuda_types::cuda::CUuuid) -> Option<crate::DarkApiTable> {
pub fn get(&self, key: &cuda_types::cuda::CUuuid) -> Option<crate::DarkApiTable<'_>> {
match key {
$(
&Self::$name => {

View file

@ -1,4 +1,9 @@
# On ROCm 6.3 and lower
bindgen --rust-target 1.77 /opt/rocm-6.3.4/include/amd_comgr/amd_comgr.h -o src/comgr2.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr2 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version
wget https://repo.radeon.com/rocm/apt/6.3.4/pool/main/c/comgr6.3.4/comgr6.3.4_2.8.0.60304-76~24.04_amd64.deb
apt install ./rocm-core6.3.4_6.3.4.60304-76~24.04_amd64.deb
wget https://repo.radeon.com/rocm/apt/6.3.4/pool/main/r/rocm-core6.3.4/rocm-core6.3.4_6.3.4.60304-76~24.04_amd64.deb
apt install ./comgr6.3.4_2.8.0.60304-76~24.04_amd64.deb
cargo install bindgen-cli
bindgen --rust-target 1.77 /opt/rocm-6.3.4/include/amd_comgr/amd_comgr.h -o src/comgr2.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr2 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version --allowlist-function amd_comgr_iterate_symbols --allowlist-function amd_comgr_symbol_get_info --allowlist-type amd_comgr_symbol_type_t
# On ROCm 6.4 and higher
bindgen --rust-target 1.77 /opt/rocm/include/amd_comgr/amd_comgr.h -o src/comgr3.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr3 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version
bindgen --rust-target 1.77 /opt/rocm/include/amd_comgr/amd_comgr.h -o src/comgr3.rs --no-layout-tests --default-enum-style=newtype --no-derive-debug --must-use-type amd_comgr_status_t --allowlist-var "^AMD_COMGR.*$" --dynamic-loading Comgr3 --allowlist-function amd_comgr_do_action --allowlist-function amd_comgr_action_data_get_data --allowlist-function amd_comgr_action_info_set_isa_name --allowlist-function amd_comgr_action_info_set_language --allowlist-function amd_comgr_create_action_info --allowlist-function amd_comgr_create_data --allowlist-function amd_comgr_create_data_set --allowlist-function amd_comgr_data_set_add --allowlist-function amd_comgr_destroy_action_info --allowlist-function amd_comgr_destroy_data_set --allowlist-function amd_comgr_get_data --allowlist-function amd_comgr_set_data --allowlist-function amd_comgr_set_data_name --allowlist-function amd_comgr_action_info_set_option_list --allowlist-function amd_comgr_get_version --allowlist-function amd_comgr_iterate_symbols --allowlist-function amd_comgr_symbol_get_info --allowlist-type amd_comgr_symbol_type_t

View file

@ -1,4 +1,4 @@
/* automatically generated by rust-bindgen 0.71.1 */
/* automatically generated by rust-bindgen 0.72.0 */
pub const AMD_COMGR_INTERFACE_VERSION_MAJOR: u32 = 2;
pub const AMD_COMGR_INTERFACE_VERSION_MINOR: u32 = 8;
@ -6,13 +6,9 @@ impl amd_comgr_status_s {
#[doc = " A generic error has occurred."]
pub const AMD_COMGR_STATUS_ERROR: amd_comgr_status_s =
amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(1) });
}
impl amd_comgr_status_s {
#[doc = " One of the actual arguments does not meet a precondition stated\n in the documentation of the corresponding formal argument. This\n includes both invalid Action types, and invalid arguments to\n valid Action types."]
pub const AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT: amd_comgr_status_s =
amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(2) });
}
impl amd_comgr_status_s {
#[doc = " Failed to allocate the necessary resources."]
pub const AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES: amd_comgr_status_s =
amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(3) });
@ -25,28 +21,16 @@ type amd_comgr_status_t = Result<(), self::amd_comgr_status_s>;
impl amd_comgr_language_s {
#[doc = " No high level language."]
pub const AMD_COMGR_LANGUAGE_NONE: amd_comgr_language_s = amd_comgr_language_s(0);
}
impl amd_comgr_language_s {
#[doc = " OpenCL 1.2."]
pub const AMD_COMGR_LANGUAGE_OPENCL_1_2: amd_comgr_language_s = amd_comgr_language_s(1);
}
impl amd_comgr_language_s {
#[doc = " OpenCL 2.0."]
pub const AMD_COMGR_LANGUAGE_OPENCL_2_0: amd_comgr_language_s = amd_comgr_language_s(2);
}
impl amd_comgr_language_s {
#[doc = " AMD Hetrogeneous C++ (HC)."]
pub const AMD_COMGR_LANGUAGE_HC: amd_comgr_language_s = amd_comgr_language_s(3);
}
impl amd_comgr_language_s {
#[doc = " HIP."]
pub const AMD_COMGR_LANGUAGE_HIP: amd_comgr_language_s = amd_comgr_language_s(4);
}
impl amd_comgr_language_s {
#[doc = " LLVM IR, either textual (.ll) or bitcode (.bc) format."]
pub const AMD_COMGR_LANGUAGE_LLVM_IR: amd_comgr_language_s = amd_comgr_language_s(5);
}
impl amd_comgr_language_s {
#[doc = " Marker for last valid language."]
pub const AMD_COMGR_LANGUAGE_LAST: amd_comgr_language_s = amd_comgr_language_s(5);
}
@ -59,65 +43,35 @@ pub use self::amd_comgr_language_s as amd_comgr_language_t;
impl amd_comgr_data_kind_s {
#[doc = " No data is available."]
pub const AMD_COMGR_DATA_KIND_UNDEF: amd_comgr_data_kind_s = amd_comgr_data_kind_s(0);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a textual main source."]
pub const AMD_COMGR_DATA_KIND_SOURCE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(1);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a textual source that is included in the main source\n or other include source."]
pub const AMD_COMGR_DATA_KIND_INCLUDE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(2);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a precompiled-header source that is included in the main\n source or other include source."]
pub const AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER: amd_comgr_data_kind_s =
amd_comgr_data_kind_s(3);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a diagnostic output."]
pub const AMD_COMGR_DATA_KIND_DIAGNOSTIC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(4);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a textual log output."]
pub const AMD_COMGR_DATA_KIND_LOG: amd_comgr_data_kind_s = amd_comgr_data_kind_s(5);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is compiler LLVM IR bit code for a specific isa."]
pub const AMD_COMGR_DATA_KIND_BC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(6);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a relocatable machine code object for a specific isa."]
pub const AMD_COMGR_DATA_KIND_RELOCATABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(7);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an executable machine code object for a specific\n isa. An executable is the kind of code object that can be loaded\n and executed."]
pub const AMD_COMGR_DATA_KIND_EXECUTABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(8);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a block of bytes."]
pub const AMD_COMGR_DATA_KIND_BYTES: amd_comgr_data_kind_s = amd_comgr_data_kind_s(9);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a fat binary (clang-offload-bundler output)."]
pub const AMD_COMGR_DATA_KIND_FATBIN: amd_comgr_data_kind_s = amd_comgr_data_kind_s(16);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an archive."]
pub const AMD_COMGR_DATA_KIND_AR: amd_comgr_data_kind_s = amd_comgr_data_kind_s(17);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a bitcode bundle."]
pub const AMD_COMGR_DATA_KIND_BC_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(18);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an archive bundle."]
pub const AMD_COMGR_DATA_KIND_AR_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(19);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an object file bundle."]
pub const AMD_COMGR_DATA_KIND_OBJ_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(20);
}
impl amd_comgr_data_kind_s {
#[doc = " Marker for last valid data kind."]
pub const AMD_COMGR_DATA_KIND_LAST: amd_comgr_data_kind_s = amd_comgr_data_kind_s(20);
}
@ -151,100 +105,70 @@ pub struct amd_comgr_action_info_s {
}
#[doc = " @brief A handle to an action information object.\n\n An action information object holds all the necessary information,\n excluding the input data objects, required to perform an action."]
pub type amd_comgr_action_info_t = amd_comgr_action_info_s;
#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."]
#[repr(C)]
#[derive(Copy, Clone)]
pub struct amd_comgr_symbol_s {
pub handle: u64,
}
#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."]
pub type amd_comgr_symbol_t = amd_comgr_symbol_s;
impl amd_comgr_action_kind_s {
#[doc = " Preprocess each source data object in @p input in order. For each\n successful preprocessor invocation, add a source data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the working\n directory path in @p info. Preprocess the source for the language in @p\n info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any preprocessing fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(0);
}
impl amd_comgr_action_kind_s {
#[doc = " Copy all existing data objects in @p input to @p output, then add the\n device-specific and language-specific precompiled headers required for\n compilation.\n\n Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2\n and @p AMD_COMGR_LANGUAGE_OPENCL_2_0.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language\n is not set in @p info, or the language is not supported."]
pub const AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(1);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(2);
}
impl amd_comgr_action_kind_s {
#[doc = " Copy all existing data objects in @p input to @p output, then add the\n device-specific and language-specific bitcode libraries required for\n compilation.\n\n Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2,\n @p AMD_COMGR_LANGUAGE_OPENCL_2_0, and @p AMD_COMGR_LANGUAGE_HIP.\n\n The options in @p info should be set to a set of language-specific flags.\n For OpenCL and HIP these include:\n\n correctly_rounded_sqrt\n daz_opt\n finite_only\n unsafe_math\n wavefrontsize64\n\n For example, to enable daz_opt and unsafe_math, the options should be set\n as:\n\n const char *options[] = {\"daz_opt, \"unsafe_math\"};\n size_t optionsCount = sizeof(options) / sizeof(options[0]);\n amd_comgr_action_info_set_option_list(info, options, optionsCount);\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language\n is not set in @p info, the language is not supported, an unknown\n language-specific flag is supplied, or a language-specific flag is\n repeated.\n\n @deprecated since 1.7\n @warning This action, followed by @c AMD_COMGR_ACTION_LINK_BC_TO_BC, may\n result in subtle bugs due to incorrect linking of the device libraries.\n The @c AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC action can\n be used as a workaround which ensures the link occurs correctly."]
pub const AMD_COMGR_ACTION_ADD_DEVICE_LIBRARIES: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(3);
}
impl amd_comgr_action_kind_s {
#[doc = " Link a collection of bitcodes, bundled bitcodes, and bundled bitcode\n archives in @p into a single composite (unbundled) bitcode @p.\n Any device library bc data object must be explicitly added to @p input if\n needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if IsaName is not set in @p info and does not match the isa name\n of all bc data objects in @p input, or if the Name field is not set for\n any DataObject in the input set."]
pub const AMD_COMGR_ACTION_LINK_BC_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(4);
}
impl amd_comgr_action_kind_s {
#[doc = " Optimize each bc data object in @p input and create an optimized bc data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if the optimization fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."]
pub const AMD_COMGR_ACTION_OPTIMIZE_BC_TO_BC: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(5);
}
impl amd_comgr_action_kind_s {
#[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add a relocatable data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."]
pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(6);
}
impl amd_comgr_action_kind_s {
#[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add an assembly source data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."]
pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_ASSEMBLY: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(7);
}
impl amd_comgr_action_kind_s {
#[doc = " Link each relocatable data object in @p input together and add\n the linked relocatable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(8);
}
impl amd_comgr_action_kind_s {
#[doc = " Link each relocatable data object in @p input together and add\n the linked executable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(9);
}
impl amd_comgr_action_kind_s {
#[doc = " Assemble each source data object in @p input in order into machine code.\n For each successful assembly add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects in\n @p input. Resolve any include relative path names using the working\n directory path in @p info. Produce relocatable for isa name in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any assembly fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name is not set in\n @p info."]
pub const AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(10);
}
impl amd_comgr_action_kind_s {
#[doc = " Disassemble each relocatable data object in @p input in\n order. For each successful disassembly add a source data object to\n @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_DISASSEMBLE_RELOCATABLE_TO_SOURCE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(11);
}
impl amd_comgr_action_kind_s {
#[doc = " Disassemble each executable data object in @p input in order. For\n each successful disassembly add a source data object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_DISASSEMBLE_EXECUTABLE_TO_SOURCE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(12);
}
impl amd_comgr_action_kind_s {
#[doc = " Disassemble each bytes data object in @p input in order. For each\n successful disassembly add a source data object to @p\n result. Only simple assembly language commands are generate that\n corresponf to raw bytes are supported, not any directives that\n control the code object layout, or symbolic branch targets or\n names.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info"]
pub const AMD_COMGR_ACTION_DISASSEMBLE_BYTES_TO_SOURCE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(13);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a fat binary to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce fat binary for isa name in @p\n info. Compile the source for the language in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info.\n\n @deprecated since 2.5\n @see in-process compilation via AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC, etc.\n insteaad"]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_FATBIN: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(14);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(15);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile a single source data object in @p input in order. For each\n successful compilation add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce relocatable for hip name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation. Currently only supports HIP language.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(16);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input and create a single executabele\n in @p result. Resolve any include source names using the names of include\n data objects in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce executable for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(17);
}
impl amd_comgr_action_kind_s {
#[doc = " Unbundle each source data object in @p input. These objects can be\n bitcode bundles, or an archive containing bitcode bundles. For each\n successful unbundling, add a bc object or archive object to @p result,\n depending on the corresponding input.\n\n Return @p AMD_COMGR_STATUS_ERROR if any unbundling\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_UNBUNDLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(18);
}
impl amd_comgr_action_kind_s {
#[doc = " Marker for last valid action kind."]
pub const AMD_COMGR_ACTION_LAST: amd_comgr_action_kind_s = amd_comgr_action_kind_s(18);
}
@ -254,6 +178,55 @@ impl amd_comgr_action_kind_s {
pub struct amd_comgr_action_kind_s(pub ::std::os::raw::c_uint);
#[doc = " @brief The kinds of actions that can be performed."]
pub use self::amd_comgr_action_kind_s as amd_comgr_action_kind_t;
impl amd_comgr_symbol_type_s {
#[doc = " The symbol's type is unknown.\n\n The user should not infer any specific type for symbols which return\n `AMD_COMGR_SYMBOL_TYPE_UNKNOWN`, and these symbols may return different\n types in future releases."]
pub const AMD_COMGR_SYMBOL_TYPE_UNKNOWN: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(-1);
#[doc = " The symbol's type is not specified."]
pub const AMD_COMGR_SYMBOL_TYPE_NOTYPE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(0);
#[doc = " The symbol is associated with a data object, such as a variable, an array,\n and so on."]
pub const AMD_COMGR_SYMBOL_TYPE_OBJECT: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(1);
#[doc = " The symbol is associated with a function or other executable code."]
pub const AMD_COMGR_SYMBOL_TYPE_FUNC: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(2);
#[doc = " The symbol is associated with a section. Symbol table entries of this type\n exist primarily for relocation."]
pub const AMD_COMGR_SYMBOL_TYPE_SECTION: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(3);
#[doc = " Conventionally, the symbol's name gives the name of the source file\n associated with the object file."]
pub const AMD_COMGR_SYMBOL_TYPE_FILE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(4);
#[doc = " The symbol labels an uninitialized common block."]
pub const AMD_COMGR_SYMBOL_TYPE_COMMON: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(5);
#[doc = " The symbol is associated with an AMDGPU Code Object V2 kernel function."]
pub const AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL: amd_comgr_symbol_type_s =
amd_comgr_symbol_type_s(10);
}
#[repr(transparent)]
#[doc = " @brief Machine code object symbol type."]
#[derive(Copy, Clone, Hash, PartialEq, Eq)]
pub struct amd_comgr_symbol_type_s(pub ::std::os::raw::c_int);
#[doc = " @brief Machine code object symbol type."]
pub use self::amd_comgr_symbol_type_s as amd_comgr_symbol_type_t;
impl amd_comgr_symbol_info_s {
#[doc = " The length of the symbol name in bytes. Does not include the NUL\n terminator. The type of this attribute is uint64_t."]
pub const AMD_COMGR_SYMBOL_INFO_NAME_LENGTH: amd_comgr_symbol_info_s =
amd_comgr_symbol_info_s(0);
#[doc = " The name of the symbol. The type of this attribute is character array with\n the length equal to the value of the @p AMD_COMGR_SYMBOL_INFO_NAME_LENGTH\n attribute plus 1 for a NUL terminator."]
pub const AMD_COMGR_SYMBOL_INFO_NAME: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(1);
#[doc = " The kind of the symbol. The type of this attribute is @p\n amd_comgr_symbol_type_t."]
pub const AMD_COMGR_SYMBOL_INFO_TYPE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(2);
#[doc = " Size of the variable. The value of this attribute is undefined if the\n symbol is not a variable. The type of this attribute is uint64_t."]
pub const AMD_COMGR_SYMBOL_INFO_SIZE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(3);
#[doc = " Indicates whether the symbol is undefined. The type of this attribute is\n bool."]
pub const AMD_COMGR_SYMBOL_INFO_IS_UNDEFINED: amd_comgr_symbol_info_s =
amd_comgr_symbol_info_s(4);
#[doc = " The value of the symbol. The type of this attribute is uint64_t."]
pub const AMD_COMGR_SYMBOL_INFO_VALUE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5);
#[doc = " Marker for last valid symbol info."]
pub const AMD_COMGR_SYMBOL_INFO_LAST: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5);
}
#[repr(transparent)]
#[doc = " @brief Machine code object symbol attributes."]
#[derive(Copy, Clone, Hash, PartialEq, Eq)]
pub struct amd_comgr_symbol_info_s(pub ::std::os::raw::c_uint);
#[doc = " @brief Machine code object symbol attributes."]
pub use self::amd_comgr_symbol_info_s as amd_comgr_symbol_info_t;
pub struct Comgr2 {
__library: ::libloading::Library,
pub amd_comgr_get_version:
@ -351,6 +324,27 @@ pub struct Comgr2 {
) -> amd_comgr_status_t,
::libloading::Error,
>,
pub amd_comgr_iterate_symbols: Result<
unsafe extern "C" fn(
data: amd_comgr_data_t,
callback: ::std::option::Option<
unsafe extern "C" fn(
symbol: amd_comgr_symbol_t,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
>,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
::libloading::Error,
>,
pub amd_comgr_symbol_get_info: Result<
unsafe extern "C" fn(
symbol: amd_comgr_symbol_t,
attribute: amd_comgr_symbol_info_t,
value: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
::libloading::Error,
>,
}
impl Comgr2 {
pub unsafe fn new<P>(path: P) -> Result<Self, ::libloading::Error>
@ -396,6 +390,12 @@ impl Comgr2 {
.get(b"amd_comgr_action_info_set_option_list\0")
.map(|sym| *sym);
let amd_comgr_do_action = __library.get(b"amd_comgr_do_action\0").map(|sym| *sym);
let amd_comgr_iterate_symbols = __library
.get(b"amd_comgr_iterate_symbols\0")
.map(|sym| *sym);
let amd_comgr_symbol_get_info = __library
.get(b"amd_comgr_symbol_get_info\0")
.map(|sym| *sym);
Ok(Comgr2 {
__library,
amd_comgr_get_version,
@ -413,6 +413,8 @@ impl Comgr2 {
amd_comgr_action_info_set_language,
amd_comgr_action_info_set_option_list,
amd_comgr_do_action,
amd_comgr_iterate_symbols,
amd_comgr_symbol_get_info,
})
}
#[doc = " @brief Get the version of the code object manager interface\n supported.\n\n An interface is backwards compatible with an implementation with an\n equal major version, and a greater than or equal minor version.\n\n @param[out] major Major version number.\n\n @param[out] minor Minor version number."]
@ -593,4 +595,35 @@ impl Comgr2 {
.as_ref()
.expect("Expected function, got error."))(kind, info, input, result)
}
#[must_use]
#[doc = " @brief Iterate over the symbols of a machine code object.\n\n For a AMD_COMGR_DATA_KIND_RELOCATABLE the symbols in the ELF symtab section\n are iterated. For a AMD_COMGR_DATA_KIND_EXECUTABLE the symbols in the ELF\n dynsymtab are iterated.\n\n @param[in] data The data object to query.\n\n @param[in] callback The function to call for each symbol in the machine code\n data object. The symbol handle is passed in @p symbol and @p user_data is\n passed as @p user_data. If the function returns with a status other than @p\n AMD_COMGR_STATUS_SUCCESS then iteration is stopped.\n\n @param[in] user_data The value to pass to each invocation of @p\n callback. Allows context to be passed into the call back function.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR An error was\n reported by @p callback.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data is an invalid data\n object, or not of kind @p AMD_COMGR_DATA_KIND_RELOCATABLE or\n AMD_COMGR_DATA_KIND_EXECUTABLE. @p callback is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to iterate the data object as out of resources."]
pub unsafe fn amd_comgr_iterate_symbols(
&self,
data: amd_comgr_data_t,
callback: ::std::option::Option<
unsafe extern "C" fn(
symbol: amd_comgr_symbol_t,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
>,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t {
(self
.amd_comgr_iterate_symbols
.as_ref()
.expect("Expected function, got error."))(data, callback, user_data)
}
#[must_use]
#[doc = " @brief Query information about a machine code object symbol.\n\n @param[in] symbol The symbol to query.\n\n @param[in] attribute Attribute to query.\n\n @param[out] value Pointer to an application-allocated buffer where to store\n the value of the attribute. If the buffer passed by the application is not\n large enough to hold the value of attribute, the behavior is undefined. The\n type of value returned is specified by @p amd_comgr_symbol_info_t.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR The @p symbol does not have the requested @p\n attribute.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p symbol is an invalid\n symbol. @p attribute is an invalid value. @p value is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to query symbol as out of resources."]
pub unsafe fn amd_comgr_symbol_get_info(
&self,
symbol: amd_comgr_symbol_t,
attribute: amd_comgr_symbol_info_t,
value: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t {
(self
.amd_comgr_symbol_get_info
.as_ref()
.expect("Expected function, got error."))(symbol, attribute, value)
}
}

View file

@ -1,4 +1,4 @@
/* automatically generated by rust-bindgen 0.71.1 */
/* automatically generated by rust-bindgen 0.72.0 */
pub const AMD_COMGR_INTERFACE_VERSION_MAJOR: u32 = 3;
pub const AMD_COMGR_INTERFACE_VERSION_MINOR: u32 = 0;
@ -6,13 +6,9 @@ impl amd_comgr_status_s {
#[doc = " A generic error has occurred."]
pub const AMD_COMGR_STATUS_ERROR: amd_comgr_status_s =
amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(1) });
}
impl amd_comgr_status_s {
#[doc = " One of the actual arguments does not meet a precondition stated\n in the documentation of the corresponding formal argument. This\n includes both invalid Action types, and invalid arguments to\n valid Action types."]
pub const AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT: amd_comgr_status_s =
amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(2) });
}
impl amd_comgr_status_s {
#[doc = " Failed to allocate the necessary resources."]
pub const AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES: amd_comgr_status_s =
amd_comgr_status_s(unsafe { ::std::num::NonZeroU32::new_unchecked(3) });
@ -25,24 +21,14 @@ type amd_comgr_status_t = Result<(), self::amd_comgr_status_s>;
impl amd_comgr_language_s {
#[doc = " No high level language."]
pub const AMD_COMGR_LANGUAGE_NONE: amd_comgr_language_s = amd_comgr_language_s(0);
}
impl amd_comgr_language_s {
#[doc = " OpenCL 1.2."]
pub const AMD_COMGR_LANGUAGE_OPENCL_1_2: amd_comgr_language_s = amd_comgr_language_s(1);
}
impl amd_comgr_language_s {
#[doc = " OpenCL 2.0."]
pub const AMD_COMGR_LANGUAGE_OPENCL_2_0: amd_comgr_language_s = amd_comgr_language_s(2);
}
impl amd_comgr_language_s {
#[doc = " HIP."]
pub const AMD_COMGR_LANGUAGE_HIP: amd_comgr_language_s = amd_comgr_language_s(3);
}
impl amd_comgr_language_s {
#[doc = " LLVM IR, either textual (.ll) or bitcode (.bc) format."]
pub const AMD_COMGR_LANGUAGE_LLVM_IR: amd_comgr_language_s = amd_comgr_language_s(4);
}
impl amd_comgr_language_s {
#[doc = " Marker for last valid language."]
pub const AMD_COMGR_LANGUAGE_LAST: amd_comgr_language_s = amd_comgr_language_s(4);
}
@ -55,69 +41,37 @@ pub use self::amd_comgr_language_s as amd_comgr_language_t;
impl amd_comgr_data_kind_s {
#[doc = " No data is available."]
pub const AMD_COMGR_DATA_KIND_UNDEF: amd_comgr_data_kind_s = amd_comgr_data_kind_s(0);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a textual main source."]
pub const AMD_COMGR_DATA_KIND_SOURCE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(1);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a textual source that is included in the main source\n or other include source."]
pub const AMD_COMGR_DATA_KIND_INCLUDE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(2);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a precompiled-header source that is included in the main\n source or other include source."]
pub const AMD_COMGR_DATA_KIND_PRECOMPILED_HEADER: amd_comgr_data_kind_s =
amd_comgr_data_kind_s(3);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a diagnostic output."]
pub const AMD_COMGR_DATA_KIND_DIAGNOSTIC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(4);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a textual log output."]
pub const AMD_COMGR_DATA_KIND_LOG: amd_comgr_data_kind_s = amd_comgr_data_kind_s(5);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is compiler LLVM IR bit code for a specific isa."]
pub const AMD_COMGR_DATA_KIND_BC: amd_comgr_data_kind_s = amd_comgr_data_kind_s(6);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a relocatable machine code object for a specific isa."]
pub const AMD_COMGR_DATA_KIND_RELOCATABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(7);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an executable machine code object for a specific\n isa. An executable is the kind of code object that can be loaded\n and executed."]
pub const AMD_COMGR_DATA_KIND_EXECUTABLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(8);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a block of bytes."]
pub const AMD_COMGR_DATA_KIND_BYTES: amd_comgr_data_kind_s = amd_comgr_data_kind_s(9);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a fat binary (clang-offload-bundler output)."]
pub const AMD_COMGR_DATA_KIND_FATBIN: amd_comgr_data_kind_s = amd_comgr_data_kind_s(16);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an archive."]
pub const AMD_COMGR_DATA_KIND_AR: amd_comgr_data_kind_s = amd_comgr_data_kind_s(17);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is a bitcode bundle."]
pub const AMD_COMGR_DATA_KIND_BC_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(18);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an archive bundle."]
pub const AMD_COMGR_DATA_KIND_AR_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(19);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is an object file bundle."]
pub const AMD_COMGR_DATA_KIND_OBJ_BUNDLE: amd_comgr_data_kind_s = amd_comgr_data_kind_s(20);
}
impl amd_comgr_data_kind_s {
#[doc = " The data is SPIR-V IR"]
pub const AMD_COMGR_DATA_KIND_SPIRV: amd_comgr_data_kind_s = amd_comgr_data_kind_s(21);
}
impl amd_comgr_data_kind_s {
#[doc = " Marker for last valid data kind."]
pub const AMD_COMGR_DATA_KIND_LAST: amd_comgr_data_kind_s = amd_comgr_data_kind_s(21);
}
@ -151,90 +105,64 @@ pub struct amd_comgr_action_info_s {
}
#[doc = " @brief A handle to an action information object.\n\n An action information object holds all the necessary information,\n excluding the input data objects, required to perform an action."]
pub type amd_comgr_action_info_t = amd_comgr_action_info_s;
#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."]
#[repr(C)]
#[derive(Copy, Clone)]
pub struct amd_comgr_symbol_s {
pub handle: u64,
}
#[doc = " @brief A handle to a machine code object symbol.\n\n A symbol handle is used to obtain the properties of symbols of a machine code\n object. A symbol handle is invalidated when the data object containing the\n symbol is destroyed."]
pub type amd_comgr_symbol_t = amd_comgr_symbol_s;
impl amd_comgr_action_kind_s {
#[doc = " Preprocess each source data object in @p input in order. For each\n successful preprocessor invocation, add a source data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the working\n directory path in @p info. Preprocess the source for the language in @p\n info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any preprocessing fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_SOURCE_TO_PREPROCESSOR: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(0);
}
impl amd_comgr_action_kind_s {
#[doc = " Copy all existing data objects in @p input to @p output, then add the\n device-specific and language-specific precompiled headers required for\n compilation.\n\n Currently the only supported languages are @p AMD_COMGR_LANGUAGE_OPENCL_1_2\n and @p AMD_COMGR_LANGUAGE_OPENCL_2_0.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name or language\n is not set in @p info, or the language is not supported."]
pub const AMD_COMGR_ACTION_ADD_PRECOMPILED_HEADERS: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(1);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(2);
}
impl amd_comgr_action_kind_s {
#[doc = " Link a collection of bitcodes, bundled bitcodes, and bundled bitcode\n archives in @p into a single composite (unbundled) bitcode @p.\n Any device library bc data object must be explicitly added to @p input if\n needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if IsaName is not set in @p info and does not match the isa name\n of all bc data objects in @p input, or if the Name field is not set for\n any DataObject in the input set."]
pub const AMD_COMGR_ACTION_LINK_BC_TO_BC: amd_comgr_action_kind_s = amd_comgr_action_kind_s(3);
}
impl amd_comgr_action_kind_s {
#[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add a relocatable data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."]
pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(4);
}
impl amd_comgr_action_kind_s {
#[doc = " Perform code generation for each bc data object in @p input in\n order. For each successful code generation add an assembly source data\n object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any code\n generation fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all bc data objects in @p input."]
pub const AMD_COMGR_ACTION_CODEGEN_BC_TO_ASSEMBLY: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(5);
}
impl amd_comgr_action_kind_s {
#[doc = " Link each relocatable data object in @p input together and add\n the linked relocatable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(6);
}
impl amd_comgr_action_kind_s {
#[doc = " Link each relocatable data object in @p input together and add\n the linked executable data object to @p result. Any device\n library relocatable data object must be explicitly added to @p\n input if needed.\n\n Return @p AMD_COMGR_STATUS_ERROR if the link fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(7);
}
impl amd_comgr_action_kind_s {
#[doc = " Assemble each source data object in @p input in order into machine code.\n For each successful assembly add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects in\n @p input. Resolve any include relative path names using the working\n directory path in @p info. Produce relocatable for isa name in @p info.\n\n Return @p AMD_COMGR_STATUS_ERROR if any assembly fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT if isa name is not set in\n @p info."]
pub const AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(8);
}
impl amd_comgr_action_kind_s {
#[doc = " Disassemble each relocatable data object in @p input in\n order. For each successful disassembly add a source data object to\n @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_DISASSEMBLE_RELOCATABLE_TO_SOURCE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(9);
}
impl amd_comgr_action_kind_s {
#[doc = " Disassemble each executable data object in @p input in order. For\n each successful disassembly add a source data object to @p result.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info and does not match the isa name\n of all relocatable data objects in @p input."]
pub const AMD_COMGR_ACTION_DISASSEMBLE_EXECUTABLE_TO_SOURCE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(10);
}
impl amd_comgr_action_kind_s {
#[doc = " Disassemble each bytes data object in @p input in order. For each\n successful disassembly add a source data object to @p\n result. Only simple assembly language commands are generate that\n corresponf to raw bytes are supported, not any directives that\n control the code object layout, or symbolic branch targets or\n names.\n\n Return @p AMD_COMGR_STATUS_ERROR if any disassembly\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name is not set in @p info"]
pub const AMD_COMGR_ACTION_DISASSEMBLE_BYTES_TO_SOURCE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(11);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input in order. For each\n successful compilation add a bc data object to @p result. Resolve\n any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce bc for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(12);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile a single source data object in @p input in order. For each\n successful compilation add a relocatable data object to @p result.\n Resolve any include source names using the names of include data objects\n in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce relocatable for hip name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation. Currently only supports HIP language.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(13);
}
impl amd_comgr_action_kind_s {
#[doc = " Compile each source data object in @p input and create a single executabele\n in @p result. Resolve any include source names using the names of include\n data objects in @p input. Resolve any include relative path names using the\n working directory path in @p info. Produce executable for isa name in @p\n info. Compile the source for the language in @p info. Link against\n the device-specific and language-specific bitcode device libraries\n required for compilation.\n\n Return @p AMD_COMGR_STATUS_ERROR if any compilation\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(14);
}
impl amd_comgr_action_kind_s {
#[doc = " Unbundle each source data object in @p input. These objects can be\n bitcode bundles, or an archive containing bitcode bundles. For each\n successful unbundling, add a bc object or archive object to @p result,\n depending on the corresponding input.\n\n Return @p AMD_COMGR_STATUS_ERROR if any unbundling\n fails.\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if isa name or language is not set in @p info."]
pub const AMD_COMGR_ACTION_UNBUNDLE: amd_comgr_action_kind_s = amd_comgr_action_kind_s(15);
}
impl amd_comgr_action_kind_s {
#[doc = " Translate each source SPIR-V object in @p input into LLVM IR Bitcode.\n For each successful translation, add a bc object to @p result *\n\n Return @p AMD_COMGR_STATUS_ERROR if any translation fails\n\n Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT\n if any input is not SPIR-V."]
pub const AMD_COMGR_ACTION_TRANSLATE_SPIRV_TO_BC: amd_comgr_action_kind_s =
amd_comgr_action_kind_s(19);
}
impl amd_comgr_action_kind_s {
#[doc = " Marker for last valid action kind."]
pub const AMD_COMGR_ACTION_LAST: amd_comgr_action_kind_s = amd_comgr_action_kind_s(19);
}
@ -244,6 +172,55 @@ impl amd_comgr_action_kind_s {
pub struct amd_comgr_action_kind_s(pub ::std::os::raw::c_uint);
#[doc = " @brief The kinds of actions that can be performed."]
pub use self::amd_comgr_action_kind_s as amd_comgr_action_kind_t;
impl amd_comgr_symbol_type_s {
#[doc = " The symbol's type is unknown.\n\n The user should not infer any specific type for symbols which return\n `AMD_COMGR_SYMBOL_TYPE_UNKNOWN`, and these symbols may return different\n types in future releases."]
pub const AMD_COMGR_SYMBOL_TYPE_UNKNOWN: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(-1);
#[doc = " The symbol's type is not specified."]
pub const AMD_COMGR_SYMBOL_TYPE_NOTYPE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(0);
#[doc = " The symbol is associated with a data object, such as a variable, an array,\n and so on."]
pub const AMD_COMGR_SYMBOL_TYPE_OBJECT: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(1);
#[doc = " The symbol is associated with a function or other executable code."]
pub const AMD_COMGR_SYMBOL_TYPE_FUNC: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(2);
#[doc = " The symbol is associated with a section. Symbol table entries of this type\n exist primarily for relocation."]
pub const AMD_COMGR_SYMBOL_TYPE_SECTION: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(3);
#[doc = " Conventionally, the symbol's name gives the name of the source file\n associated with the object file."]
pub const AMD_COMGR_SYMBOL_TYPE_FILE: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(4);
#[doc = " The symbol labels an uninitialized common block."]
pub const AMD_COMGR_SYMBOL_TYPE_COMMON: amd_comgr_symbol_type_s = amd_comgr_symbol_type_s(5);
#[doc = " The symbol is associated with an AMDGPU Code Object V2 kernel function."]
pub const AMD_COMGR_SYMBOL_TYPE_AMDGPU_HSA_KERNEL: amd_comgr_symbol_type_s =
amd_comgr_symbol_type_s(10);
}
#[repr(transparent)]
#[doc = " @brief Machine code object symbol type."]
#[derive(Copy, Clone, Hash, PartialEq, Eq)]
pub struct amd_comgr_symbol_type_s(pub ::std::os::raw::c_int);
#[doc = " @brief Machine code object symbol type."]
pub use self::amd_comgr_symbol_type_s as amd_comgr_symbol_type_t;
impl amd_comgr_symbol_info_s {
#[doc = " The length of the symbol name in bytes. Does not include the NUL\n terminator. The type of this attribute is uint64_t."]
pub const AMD_COMGR_SYMBOL_INFO_NAME_LENGTH: amd_comgr_symbol_info_s =
amd_comgr_symbol_info_s(0);
#[doc = " The name of the symbol. The type of this attribute is character array with\n the length equal to the value of the @p AMD_COMGR_SYMBOL_INFO_NAME_LENGTH\n attribute plus 1 for a NUL terminator."]
pub const AMD_COMGR_SYMBOL_INFO_NAME: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(1);
#[doc = " The kind of the symbol. The type of this attribute is @p\n amd_comgr_symbol_type_t."]
pub const AMD_COMGR_SYMBOL_INFO_TYPE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(2);
#[doc = " Size of the variable. The value of this attribute is undefined if the\n symbol is not a variable. The type of this attribute is uint64_t."]
pub const AMD_COMGR_SYMBOL_INFO_SIZE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(3);
#[doc = " Indicates whether the symbol is undefined. The type of this attribute is\n bool."]
pub const AMD_COMGR_SYMBOL_INFO_IS_UNDEFINED: amd_comgr_symbol_info_s =
amd_comgr_symbol_info_s(4);
#[doc = " The value of the symbol. The type of this attribute is uint64_t."]
pub const AMD_COMGR_SYMBOL_INFO_VALUE: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5);
#[doc = " Marker for last valid symbol info."]
pub const AMD_COMGR_SYMBOL_INFO_LAST: amd_comgr_symbol_info_s = amd_comgr_symbol_info_s(5);
}
#[repr(transparent)]
#[doc = " @brief Machine code object symbol attributes."]
#[derive(Copy, Clone, Hash, PartialEq, Eq)]
pub struct amd_comgr_symbol_info_s(pub ::std::os::raw::c_uint);
#[doc = " @brief Machine code object symbol attributes."]
pub use self::amd_comgr_symbol_info_s as amd_comgr_symbol_info_t;
pub struct Comgr3 {
__library: ::libloading::Library,
pub amd_comgr_get_version:
@ -341,6 +318,27 @@ pub struct Comgr3 {
) -> amd_comgr_status_t,
::libloading::Error,
>,
pub amd_comgr_iterate_symbols: Result<
unsafe extern "C" fn(
data: amd_comgr_data_t,
callback: ::std::option::Option<
unsafe extern "C" fn(
symbol: amd_comgr_symbol_t,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
>,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
::libloading::Error,
>,
pub amd_comgr_symbol_get_info: Result<
unsafe extern "C" fn(
symbol: amd_comgr_symbol_t,
attribute: amd_comgr_symbol_info_t,
value: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
::libloading::Error,
>,
}
impl Comgr3 {
pub unsafe fn new<P>(path: P) -> Result<Self, ::libloading::Error>
@ -386,6 +384,12 @@ impl Comgr3 {
.get(b"amd_comgr_action_info_set_option_list\0")
.map(|sym| *sym);
let amd_comgr_do_action = __library.get(b"amd_comgr_do_action\0").map(|sym| *sym);
let amd_comgr_iterate_symbols = __library
.get(b"amd_comgr_iterate_symbols\0")
.map(|sym| *sym);
let amd_comgr_symbol_get_info = __library
.get(b"amd_comgr_symbol_get_info\0")
.map(|sym| *sym);
Ok(Comgr3 {
__library,
amd_comgr_get_version,
@ -403,6 +407,8 @@ impl Comgr3 {
amd_comgr_action_info_set_language,
amd_comgr_action_info_set_option_list,
amd_comgr_do_action,
amd_comgr_iterate_symbols,
amd_comgr_symbol_get_info,
})
}
#[doc = " @brief Get the version of the code object manager interface\n supported.\n\n An interface is backwards compatible with an implementation with an\n equal major version, and a greater than or equal minor version.\n\n @param[out] major Major version number.\n\n @param[out] minor Minor version number."]
@ -583,4 +589,35 @@ impl Comgr3 {
.as_ref()
.expect("Expected function, got error."))(kind, info, input, result)
}
#[must_use]
#[doc = " @brief Iterate over the symbols of a machine code object.\n\n For a AMD_COMGR_DATA_KIND_RELOCATABLE the symbols in the ELF symtab section\n are iterated. For a AMD_COMGR_DATA_KIND_EXECUTABLE the symbols in the ELF\n dynsymtab are iterated.\n\n @param[in] data The data object to query.\n\n @param[in] callback The function to call for each symbol in the machine code\n data object. The symbol handle is passed in @p symbol and @p user_data is\n passed as @p user_data. If the function returns with a status other than @p\n AMD_COMGR_STATUS_SUCCESS then iteration is stopped.\n\n @param[in] user_data The value to pass to each invocation of @p\n callback. Allows context to be passed into the call back function.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR An error was\n reported by @p callback.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p data is an invalid data\n object, or not of kind @p AMD_COMGR_DATA_KIND_RELOCATABLE or\n AMD_COMGR_DATA_KIND_EXECUTABLE. @p callback is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to iterate the data object as out of resources."]
pub unsafe fn amd_comgr_iterate_symbols(
&self,
data: amd_comgr_data_t,
callback: ::std::option::Option<
unsafe extern "C" fn(
symbol: amd_comgr_symbol_t,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t,
>,
user_data: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t {
(self
.amd_comgr_iterate_symbols
.as_ref()
.expect("Expected function, got error."))(data, callback, user_data)
}
#[must_use]
#[doc = " @brief Query information about a machine code object symbol.\n\n @param[in] symbol The symbol to query.\n\n @param[in] attribute Attribute to query.\n\n @param[out] value Pointer to an application-allocated buffer where to store\n the value of the attribute. If the buffer passed by the application is not\n large enough to hold the value of attribute, the behavior is undefined. The\n type of value returned is specified by @p amd_comgr_symbol_info_t.\n\n @retval ::AMD_COMGR_STATUS_SUCCESS The function has\n been executed successfully.\n\n @retval ::AMD_COMGR_STATUS_ERROR The @p symbol does not have the requested @p\n attribute.\n\n @retval ::AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT @p symbol is an invalid\n symbol. @p attribute is an invalid value. @p value is NULL.\n\n @retval ::AMD_COMGR_STATUS_ERROR_OUT_OF_RESOURCES\n Unable to query symbol as out of resources."]
pub unsafe fn amd_comgr_symbol_get_info(
&self,
symbol: amd_comgr_symbol_t,
attribute: amd_comgr_symbol_info_t,
value: *mut ::std::os::raw::c_void,
) -> amd_comgr_status_t {
(self
.amd_comgr_symbol_get_info
.as_ref()
.expect("Expected function, got error."))(symbol, attribute, value)
}
}

Binary file not shown.

View file

@ -1,6 +1,6 @@
// Every time this file changes it must te rebuilt, you need `rocm-llvm-dev` and `llvm-17`
// `fdenormal-fp-math=dynamic` is required to make functions eligible for inlining
// /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
// /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | sed '/__hip_cuid/d' | sed 's/external protected/external hidden/g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
#include <cstddef>
#include <cstdint>
@ -8,10 +8,12 @@
#include <cmath>
#include <hip/amd_detail/amd_device_functions.h>
#define CONSTANT_SPACE __attribute__((address_space(4)))
#define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
#define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME
#define DECLARE_ATTR(TYPE, NAME) \
extern const TYPE ATTR(NAME) \
extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \
__device__
extern "C"

View file

@ -74,6 +74,7 @@ fn run_instruction<'input>(
| ast::Instruction::And { .. }
| ast::Instruction::Atom { .. }
| ast::Instruction::AtomCas { .. }
| ast::Instruction::BarWarp { .. }
| ast::Instruction::Bar { .. }
| ast::Instruction::BarRed { .. }
| ast::Instruction::Bfe { .. }
@ -120,6 +121,7 @@ fn run_instruction<'input>(
}
| ast::Instruction::Cvta { .. }
| ast::Instruction::Div { .. }
| ast::Instruction::Dp4a { .. }
| ast::Instruction::Ex2 { .. }
| ast::Instruction::Fma {
data: ast::ArithFloat {

View file

@ -1829,6 +1829,7 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
| ast::Instruction::Bra { .. }
| ast::Instruction::Clz { .. }
| ast::Instruction::Brev { .. }
| ast::Instruction::Dp4a { .. }
| ast::Instruction::Popc { .. }
| ast::Instruction::Xor { .. }
| ast::Instruction::Rem { .. }
@ -1844,6 +1845,7 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
| ast::Instruction::Shl { .. }
| ast::Instruction::Selp { .. }
| ast::Instruction::Ret { .. }
| ast::Instruction::BarWarp { .. }
| ast::Instruction::Bar { .. }
| ast::Instruction::BarRed { .. }
| ast::Instruction::Cvta { .. }

View file

@ -33,9 +33,11 @@ fn emit_attribute(
module.get(),
attribute_type,
name.as_ptr(),
get_state_space(ast::StateSpace::Global)?,
get_state_space(ast::StateSpace::Const)?,
)
};
unsafe { LLVMSetLinkage(global, llvm_zluda::LLVMLinkage::LLVMExternalLinkage) };
unsafe { LLVMSetVisibility(global, llvm_zluda::LLVMVisibility::LLVMHiddenVisibility) };
unsafe { LLVMSetInitializer(global, LLVMConstInt(attribute_type, attribute as u64, 0)) };
unsafe { LLVMSetGlobalConstant(global, 1) };
Ok(())

View file

@ -170,6 +170,11 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
}
}
if !method.is_kernel {
unsafe {
LLVMSetVisibility(fn_, llvm_zluda::LLVMVisibility::LLVMHiddenVisibility);
}
}
let call_conv = if method.is_kernel {
Self::kernel_call_convention()
} else {
@ -474,6 +479,7 @@ impl<'a> MethodEmitContext<'a> {
ast::Instruction::Mov { data: _, arguments } => self.emit_mov(arguments),
ast::Instruction::Ld { data, arguments } => self.emit_ld(data, arguments),
ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments),
ast::Instruction::Dp4a { data, arguments } => self.emit_dp4a(data, arguments),
ast::Instruction::St { data, arguments } => self.emit_st(data, arguments),
ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments),
ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments),
@ -515,12 +521,13 @@ impl<'a> MethodEmitContext<'a> {
ast::Instruction::Popc { data, arguments } => self.emit_popc(data, arguments),
ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments),
ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments),
ast::Instruction::BarWarp { .. } => self.emit_bar_warp(),
ast::Instruction::PrmtSlow { .. } => {
Err(error_todo_msg("PrmtSlow is not implemented yet"))
}
ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments),
ast::Instruction::Membar { data } => self.emit_membar(data),
ast::Instruction::Trap {} => Err(error_todo_msg("Trap is not implemented yet")),
ast::Instruction::Trap {} => self.emit_trap(),
ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments),
ast::Instruction::CpAsync { data, arguments } => self.emit_cp_async(data, arguments),
ast::Instruction::CpAsyncCommitGroup {} => Ok(()), // nop
@ -2198,6 +2205,11 @@ impl<'a> MethodEmitContext<'a> {
Ok(())
}
fn emit_bar_warp(&mut self) -> Result<(), TranslateError> {
self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?;
Ok(())
}
fn emit_popc(
&mut self,
type_: ptx_parser::ScalarType,
@ -2764,6 +2776,50 @@ impl<'a> MethodEmitContext<'a> {
Ok(())
}
fn emit_dp4a(
&mut self,
data: ast::Dp4aDetails,
arguments: ast::Dp4aArgs<SpirvWord>,
) -> Result<(), TranslateError> {
let intrinsic = match (data.atype, data.btype) {
(ast::ScalarType::U32, ast::ScalarType::U32) => c"llvm.amdgcn.udot4",
(ast::ScalarType::S32, ast::ScalarType::S32) => c"llvm.amdgcn.sdot4",
(ast::ScalarType::U32, ast::ScalarType::S32)
| (ast::ScalarType::S32, ast::ScalarType::U32) => {
return Err(error_todo_msg("dp4a with mixed types is not yet supported"))
}
_ => return Err(error_unreachable()),
};
let pred = get_scalar_type(self.context, ast::ScalarType::Pred);
let zero = unsafe { LLVMConstInt(pred, 0, 0) };
self.emit_intrinsic(
intrinsic,
Some(arguments.dst),
Some(&data.ctype().into()),
vec![
(
self.resolver.value(arguments.src1)?,
get_scalar_type(self.context, data.ctype()),
),
(
self.resolver.value(arguments.src2)?,
get_scalar_type(self.context, data.ctype()),
),
(
self.resolver.value(arguments.src3)?,
get_scalar_type(self.context, data.ctype()),
),
(zero, pred),
],
)?;
Ok(())
}
fn emit_trap(&mut self) -> Result<(), TranslateError> {
self.emit_intrinsic(c"llvm.trap", None, None, vec![])?;
Ok(())
}
/*
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
// Should be available in LLVM 19

View file

@ -269,6 +269,7 @@ fn run_instruction<'input>(
data:
ast::RcpData {
kind: ast::RcpKind::Compliant(ast::RoundingMode::NearestEven),
type_: ast::ScalarType::F32,
flush_to_zero: Some(true),
..
},
@ -281,6 +282,7 @@ fn run_instruction<'input>(
data:
ast::RcpData {
kind: ast::RcpKind::Compliant(ast::RoundingMode::NearestEven),
type_: ast::ScalarType::F32,
..
},
..
@ -288,29 +290,6 @@ fn run_instruction<'input>(
let name = "sqrt_rn_f32";
to_call(resolver, fn_declarations, name.into(), i)?
}
i @ ptx_parser::Instruction::Div {
data:
ast::DivDetails::Float(ast::DivFloatDetails {
kind: ast::DivFloatKind::Rounding(_),
flush_to_zero: Some(true),
..
}),
..
} => {
let name = "div_rn_ftz_f32";
to_call(resolver, fn_declarations, name.into(), i)?
}
i @ ptx_parser::Instruction::Div {
data:
ast::DivDetails::Float(ast::DivFloatDetails {
kind: ast::DivFloatKind::Rounding(_),
..
}),
..
} => {
let name = "div_rn_f32";
to_call(resolver, fn_declarations, name.into(), i)?
}
i @ ptx_parser::Instruction::Bfi { data, .. } => {
let name = ["bfi_", scalar_to_ptx_name(data)].concat();
to_call(resolver, fn_declarations, name.into(), i)?

View file

@ -1 +1 @@
@__ZLUDA_PTX_IMPL_ATTRIBUTE_CLOCK_RATE = addrspace(1) constant i32 2124000
@__ZLUDA_PTX_IMPL_ATTRIBUTE_CLOCK_RATE = hidden addrspace(4) constant i32 2124000

View file

@ -9,18 +9,18 @@ define amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"31", ptr addrspace(
br label %"30"
"30": ; preds = %1
%"37" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"37", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"38", ptr addrspace(5) %"34", align 4
%"40" = load i64, ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"31", align 8
store i64 %"37", ptr addrspace(5) %"33", align 8
%"38" = load i64, ptr addrspace(4) %"32", align 8
store i64 %"38", ptr addrspace(5) %"34", align 8
%"40" = load i64, ptr addrspace(5) %"33", align 8
%"45" = inttoptr i64 %"40" to ptr
%"39" = load i32, ptr %"45", align 4
store i32 %"39", ptr addrspace(5) %"35", align 4
%"42" = load i32, ptr addrspace(5) %"35", align 4
%"41" = call i32 @llvm.abs.i32(i32 %"42", i1 false)
store i32 %"41", ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 4
%2 = call i32 @llvm.abs.i32(i32 %"42", i1 false)
store i32 %2, ptr addrspace(5) %"36", align 4
%"43" = load i64, ptr addrspace(5) %"34", align 8
%"44" = load i32, ptr addrspace(5) %"36", align 4
%"46" = inttoptr i64 %"43" to ptr
store i32 %"44", ptr %"46", align 4

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_activemask() #0
declare hidden i32 @__zluda_ptx_impl_activemask() #0
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
%"31" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
declare void @__zluda_ptx_impl___assertfail(i64, i64, i32, i64, i64) #0
declare hidden void @__zluda_ptx_impl___assertfail(i64, i64, i32, i64, i64) #0
define amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"86", ptr addrspace(4) byref(i64) %"87") #1 {
%"88" = alloca i64, align 8, addrspace(5)

View file

@ -1,8 +1,8 @@
declare i1 @__zluda_ptx_impl_bar_red_and_pred(i32, i1, i1) #0
declare hidden i1 @__zluda_ptx_impl_bar_red_and_pred(i32, i1, i1) #0
declare i1 @__zluda_ptx_impl_bar_red_or_pred(i32, i1, i1) #0
declare hidden i1 @__zluda_ptx_impl_bar_red_or_pred(i32, i1, i1) #0
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @bar_red_and_pred(ptr addrspace(4) byref(i64) %"73", ptr addrspace(4) byref(i64) %"74") #1 {
%"75" = alloca i64, align 8, addrspace(5)
@ -29,8 +29,8 @@ define amdgpu_kernel void @bar_red_and_pred(ptr addrspace(4) byref(i64) %"73", p
%"84" = urem i32 %"85", 2
store i32 %"84", ptr addrspace(5) %"78", align 4
%"87" = load i32, ptr addrspace(5) %"78", align 4
%"86" = icmp eq i32 %"87", 0
store i1 %"86", ptr addrspace(5) %"80", align 1
%2 = icmp eq i32 %"87", 0
store i1 %2, ptr addrspace(5) %"80", align 1
store i32 0, ptr addrspace(5) %"81", align 4
%"90" = load i1, ptr addrspace(5) %"80", align 1
%"89" = call i1 @__zluda_ptx_impl_bar_red_and_pred(i32 1, i1 %"90", i1 false)

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
declare hidden i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
%"38" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
declare hidden i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
%"41" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
define i64 @incr(i64 %"43") #0 {
define hidden i64 @incr(i64 %"43") #0 {
%"63" = alloca i64, align 8, addrspace(5)
%"64" = alloca i64, align 8, addrspace(5)
%"65" = alloca i64, align 8, addrspace(5)
@ -61,4 +61,4 @@ define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View file

@ -1,4 +1,4 @@
define float @add_rm(float %"79", float %"80") #0 {
define hidden float @add_rm(float %"79", float %"80") #0 {
%"128" = alloca float, align 4, addrspace(5)
%"129" = alloca float, align 4, addrspace(5)
%"130" = alloca float, align 4, addrspace(5)
@ -33,7 +33,7 @@ define float @add_rm(float %"79", float %"80") #0 {
ret float %2
}
define float @add_rp(float %"82", float %"83") #0 {
define hidden float @add_rp(float %"82", float %"83") #0 {
%"141" = alloca float, align 4, addrspace(5)
%"142" = alloca float, align 4, addrspace(5)
%"143" = alloca float, align 4, addrspace(5)

View file

@ -14,10 +14,10 @@ define amdgpu_kernel void @cp_async(ptr addrspace(4) byref(i64) %"48", ptr addrs
br label %"47"
"47": ; preds = %1
%"56" = load i64, ptr addrspace(4) %"48", align 4
store i64 %"56", ptr addrspace(5) %"50", align 4
%"57" = load i64, ptr addrspace(4) %"49", align 4
store i64 %"57", ptr addrspace(5) %"51", align 4
%"56" = load i64, ptr addrspace(4) %"48", align 8
store i64 %"56", ptr addrspace(5) %"50", align 8
%"57" = load i64, ptr addrspace(4) %"49", align 8
store i64 %"57", ptr addrspace(5) %"51", align 8
%2 = load i96, ptr addrspace(1) @from, align 128
%3 = zext i96 %2 to i128
store i128 %3, ptr addrspace(3) @to, align 4
@ -29,21 +29,21 @@ define amdgpu_kernel void @cp_async(ptr addrspace(4) byref(i64) %"48", ptr addrs
store i32 %"60", ptr addrspace(5) %"54", align 4
%"61" = load i32, ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @to to ptr), i64 12), align 4
store i32 %"61", ptr addrspace(5) %"55", align 4
%"62" = load i64, ptr addrspace(5) %"51", align 4
%"62" = load i64, ptr addrspace(5) %"51", align 8
%"63" = load i32, ptr addrspace(5) %"52", align 4
%"76" = inttoptr i64 %"62" to ptr
store i32 %"63", ptr %"76", align 4
%"64" = load i64, ptr addrspace(5) %"51", align 4
%"64" = load i64, ptr addrspace(5) %"51", align 8
%"77" = inttoptr i64 %"64" to ptr
%"42" = getelementptr inbounds i8, ptr %"77", i64 4
%"65" = load i32, ptr addrspace(5) %"53", align 4
store i32 %"65", ptr %"42", align 4
%"66" = load i64, ptr addrspace(5) %"51", align 4
%"66" = load i64, ptr addrspace(5) %"51", align 8
%"78" = inttoptr i64 %"66" to ptr
%"44" = getelementptr inbounds i8, ptr %"78", i64 8
%"67" = load i32, ptr addrspace(5) %"54", align 4
store i32 %"67", ptr %"44", align 4
%"68" = load i64, ptr addrspace(5) %"51", align 4
%"68" = load i64, ptr addrspace(5) %"51", align 8
%"79" = inttoptr i64 %"68" to ptr
%"46" = getelementptr inbounds i8, ptr %"79", i64 12
%"69" = load i32, ptr addrspace(5) %"55", align 4

View file

@ -27,4 +27,4 @@ define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr ad
ret void
}
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View file

@ -24,12 +24,10 @@ define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrsp
store float %"46", ptr addrspace(5) %"40", align 4
%"48" = load float, ptr addrspace(5) %"39", align 4
%2 = call float @llvm.roundeven.f32(float %"48")
%"47" = freeze float %2
store float %"47", ptr addrspace(5) %"39", align 4
store float %2, ptr addrspace(5) %"39", align 4
%"50" = load float, ptr addrspace(5) %"40", align 4
%3 = call float @llvm.roundeven.f32(float %"50")
%"49" = freeze float %3
store float %"49", ptr addrspace(5) %"40", align 4
store float %3, ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 8
%"52" = load float, ptr addrspace(5) %"39", align 4
%"57" = inttoptr i64 %"51" to ptr
@ -46,4 +44,4 @@ define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrsp
declare float @llvm.roundeven.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View file

@ -9,7 +9,6 @@ define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrsp
br label %"34"
"34": ; preds = %1
call void @llvm.amdgcn.s.setreg(i32 6145, i32 3)
%"41" = load i64, ptr addrspace(4) %"35", align 8
store i64 %"41", ptr addrspace(5) %"37", align 8
%"42" = load i64, ptr addrspace(4) %"36", align 8
@ -25,12 +24,10 @@ define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrsp
store float %"46", ptr addrspace(5) %"40", align 4
%"48" = load float, ptr addrspace(5) %"39", align 4
%2 = call float @llvm.trunc.f32(float %"48")
%"47" = freeze float %2
store float %"47", ptr addrspace(5) %"39", align 4
store float %2, ptr addrspace(5) %"39", align 4
%"50" = load float, ptr addrspace(5) %"40", align 4
%3 = call float @llvm.trunc.f32(float %"50")
%"49" = freeze float %3
store float %"49", ptr addrspace(5) %"40", align 4
store float %3, ptr addrspace(5) %"40", align 4
%"51" = load i64, ptr addrspace(5) %"38", align 8
%"52" = load float, ptr addrspace(5) %"39", align 4
%"57" = inttoptr i64 %"51" to ptr
@ -43,12 +40,8 @@ define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrsp
ret void
}
; Function Attrs: nocallback nofree nosync nounwind willreturn
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.trunc.f32(float) #2
declare float @llvm.trunc.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind willreturn }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View file

@ -20,7 +20,7 @@ define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr ad
store i32 %"41", ptr addrspace(5) %"36", align 4
%"44" = load i32, ptr addrspace(5) %"36", align 4
%2 = call i32 @llvm.smax.i32(i32 %"44", i32 0)
%3 = call i32 @llvm.umin.i32(i32 %2, i32 -1)
%3 = call i32 @llvm.smin.i32(i32 %2, i32 2147483647)
store i32 %3, ptr addrspace(5) %"37", align 4
%"46" = load i32, ptr addrspace(5) %"37", align 4
store i32 %"46", ptr addrspace(5) %"38", align 4
@ -35,7 +35,7 @@ define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr ad
declare i32 @llvm.smax.i32(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.umin.i32(i32, i32) #1
declare i32 @llvm.smin.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View file

@ -1,8 +1,8 @@
%struct.f32.f32.f32.i8 = type { float, float, float, i8 }
declare %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0
declare hidden %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0
declare float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0
declare hidden float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0
define amdgpu_kernel void @div_ftz(ptr addrspace(4) byref(i64) %"63", ptr addrspace(4) byref(i64) %"64") #1 {
%"65" = alloca i64, align 8, addrspace(5)

View file

@ -1,8 +1,8 @@
%struct.f32.f32.f32.i8 = type { float, float, float, i8 }
declare %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0
declare hidden %struct.f32.f32.f32.i8 @__zluda_ptx_impl_div_f32_part1(float, float) #0
declare float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0
declare hidden float @__zluda_ptx_impl_div_f32_part2(float, float, float, float, float, i8) #0
define amdgpu_kernel void @div_noftz(ptr addrspace(4) byref(i64) %"62", ptr addrspace(4) byref(i64) %"63") #1 {
%"64" = alloca i64, align 8, addrspace(5)

48
ptx/src/test/ll/dp4a.ll Normal file
View file

@ -0,0 +1,48 @@
define amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
%"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
%"43" = alloca i32, align 4, addrspace(5)
%"44" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
br label %"36"
"36": ; preds = %1
%"45" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"45", ptr addrspace(5) %"39", align 8
%"46" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"46", ptr addrspace(5) %"40", align 8
%"48" = load i64, ptr addrspace(5) %"39", align 8
%"59" = inttoptr i64 %"48" to ptr
%"47" = load i32, ptr %"59", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4
%"49" = load i64, ptr addrspace(5) %"39", align 8
%"60" = inttoptr i64 %"49" to ptr
%"33" = getelementptr inbounds i8, ptr %"60", i64 4
%"50" = load i32, ptr %"33", align 4
store i32 %"50", ptr addrspace(5) %"42", align 4
%"51" = load i64, ptr addrspace(5) %"39", align 8
%"61" = inttoptr i64 %"51" to ptr
%"35" = getelementptr inbounds i8, ptr %"61", i64 8
%"52" = load i32, ptr %"35", align 4
store i32 %"52", ptr addrspace(5) %"43", align 4
%"54" = load i32, ptr addrspace(5) %"41", align 4
%"55" = load i32, ptr addrspace(5) %"42", align 4
%"56" = load i32, ptr addrspace(5) %"43", align 4
%"53" = call i32 @llvm.amdgcn.sdot4(i32 %"54", i32 %"55", i32 %"56", i1 false)
store i32 %"53", ptr addrspace(5) %"44", align 4
%"57" = load i64, ptr addrspace(5) %"40", align 8
%"58" = load i32, ptr addrspace(5) %"44", align 4
%"65" = inttoptr i64 %"57" to ptr
store i32 %"58", ptr %"65", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.amdgcn.sdot4(i32, i32, i32, i1 immarg) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View file

@ -1,4 +1,6 @@
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
declare hidden float @__zluda_ptx_impl_ex2_approx_f32(float) #0
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
@ -17,7 +19,7 @@ define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.exp2.f32(float %"40")
%"39" = call float @__zluda_ptx_impl_ex2_approx_f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 8
%"42" = load float, ptr addrspace(5) %"34", align 4
@ -26,8 +28,5 @@ define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.exp2.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View file

@ -1,4 +1,4 @@
declare [16 x i8] @foobar(i64) #0
declare hidden [16 x i8] @foobar(i64) #0
define amdgpu_kernel void @extern_func(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 {
%"46" = alloca i64, align 8, addrspace(5)

View file

@ -1,6 +1,6 @@
@shared_mem = external addrspace(3) global [0 x i32], align 4
define void @incr_shared_2_global() #0 {
define hidden void @incr_shared_2_global() #0 {
%"36" = alloca i64, align 8, addrspace(5)
br label %1

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
declare hidden i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
define amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
%"38" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,6 @@
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
declare hidden float @__zluda_ptx_impl_lg2_approx_f32(float) #0
define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
@ -17,7 +19,7 @@ define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.log.f32(float %"40")
%"39" = call float @__zluda_ptx_impl_lg2_approx_f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 8
%"42" = load float, ptr addrspace(5) %"34", align 4
@ -26,8 +28,5 @@ define amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.log.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View file

@ -1,4 +1,6 @@
define [2 x i32] @do_something(i32 %"10") #0 {
%struct.i32.i1 = type { i32, i1 }
define hidden %struct.i32.i1 @do_something(i32 %"10") #0 {
%"46" = alloca i32, align 4, addrspace(5)
%"47" = alloca i1, align 1, addrspace(5)
br label %1
@ -12,10 +14,9 @@ define [2 x i32] @do_something(i32 %"10") #0 {
store i1 true, ptr addrspace(5) %"47", align 1
%2 = load i32, ptr addrspace(5) %"46", align 4
%3 = load i1, ptr addrspace(5) %"47", align 1
%4 = insertvalue [2 x i32] poison, i32 %2, 0
%5 = zext i1 %3 to i32
%6 = insertvalue [2 x i32] %4, i32 %5, 1
ret [2 x i32] %6
%4 = insertvalue %struct.i32.i1 undef, i32 %2, 0
%5 = insertvalue %struct.i32.i1 %4, i1 %3, 1
ret %struct.i32.i1 %5
}
define amdgpu_kernel void @multiple_return(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #1 {
@ -39,10 +40,9 @@ define amdgpu_kernel void @multiple_return(ptr addrspace(4) byref(i64) %"50", pt
%"59" = load i32, ptr %"68", align 4
store i32 %"59", ptr addrspace(5) %"54", align 4
%"63" = load i32, ptr addrspace(5) %"54", align 4
%2 = call [2 x i32] @do_something(i32 %"63")
%"61" = extractvalue [2 x i32] %2, 0
%3 = extractvalue [2 x i32] %2, 1
%"62" = trunc i32 %3 to i1
%2 = call %struct.i32.i1 @do_something(i32 %"63")
%"61" = extractvalue %struct.i32.i1 %2, 0
%"62" = extractvalue %struct.i32.i1 %2, 1
store i32 %"61", ptr addrspace(5) %"55", align 4
store i1 %"62", ptr addrspace(5) %"56", align 1
br label %"45"

View file

@ -1,4 +1,4 @@
declare void @__zluda_ptx_impl_nanosleep_u32(i32) #0
declare hidden void @__zluda_ptx_impl_nanosleep_u32(i32) #0
define amdgpu_kernel void @nanosleep(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
br label %1

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
define amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
%"37" = alloca i64, align 8, addrspace(5)

View file

@ -26,8 +26,8 @@ define amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"41", ptr addrs
store i64 %"54", ptr addrspace(5) %"46", align 8
%"56" = load i64, ptr addrspace(5) %"45", align 8
%"57" = load i64, ptr addrspace(5) %"46", align 8
%"55" = icmp ult i64 %"56", %"57"
store i1 %"55", ptr addrspace(5) %"48", align 1
%2 = icmp ult i64 %"56", %"57"
store i1 %2, ptr addrspace(5) %"48", align 1
%"59" = load i1, ptr addrspace(5) %"48", align 1
%"58" = xor i1 %"59", true
store i1 %"58", ptr addrspace(5) %"48", align 1

View file

@ -1,4 +1,6 @@
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
declare hidden float @__zluda_ptx_impl_rcp_approx_f32(float) #0
define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 {
%"32" = alloca i64, align 8, addrspace(5)
%"33" = alloca i64, align 8, addrspace(5)
%"34" = alloca float, align 4, addrspace(5)
@ -17,7 +19,7 @@ define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call float @llvm.amdgcn.rcp.f32(float %"40")
%"39" = call float @__zluda_ptx_impl_rcp_approx_f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 8
%"42" = load float, ptr addrspace(5) %"34", align 4
@ -26,8 +28,5 @@ define amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"30", ptr addrspace(
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare float @llvm.amdgcn.rcp.f32(float) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View file

@ -26,8 +26,8 @@ define amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"41", ptr addrspace
store i64 %"54", ptr addrspace(5) %"46", align 8
%"56" = load i64, ptr addrspace(5) %"45", align 8
%"57" = load i64, ptr addrspace(5) %"46", align 8
%"55" = icmp ult i64 %"56", %"57"
store i1 %"55", ptr addrspace(5) %"48", align 1
%2 = icmp ult i64 %"56", %"57"
store i1 %2, ptr addrspace(5) %"48", align 1
%"58" = load i1, ptr addrspace(5) %"48", align 1
br i1 %"58", label %"16", label %"17"

View file

@ -26,8 +26,8 @@ define amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"39", ptr addrsp
store float %"52", ptr addrspace(5) %"44", align 4
%"54" = load float, ptr addrspace(5) %"43", align 4
%"55" = load float, ptr addrspace(5) %"44", align 4
%"53" = fcmp ogt float %"54", %"55"
store i1 %"53", ptr addrspace(5) %"46", align 1
%2 = fcmp ogt float %"54", %"55"
store i1 %2, ptr addrspace(5) %"46", align 1
%"56" = load i1, ptr addrspace(5) %"46", align 1
br i1 %"56", label %"16", label %"17"

View file

@ -26,8 +26,8 @@ define amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"39", ptr addrs
store float %"52", ptr addrspace(5) %"44", align 4
%"54" = load float, ptr addrspace(5) %"43", align 4
%"55" = load float, ptr addrspace(5) %"44", align 4
%"53" = fcmp ule float %"54", %"55"
store i1 %"53", ptr addrspace(5) %"46", align 1
%2 = fcmp ule float %"54", %"55"
store i1 %2, ptr addrspace(5) %"46", align 1
%"56" = load i1, ptr addrspace(5) %"46", align 1
br i1 %"56", label %"16", label %"17"

View file

@ -62,8 +62,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs
store float %"114", ptr addrspace(5) %"94", align 4
%"116" = load float, ptr addrspace(5) %"87", align 4
%"117" = load float, ptr addrspace(5) %"88", align 4
%"115" = fcmp uno float %"116", %"117"
store i1 %"115", ptr addrspace(5) %"96", align 1
%2 = fcmp uno float %"116", %"117"
store i1 %2, ptr addrspace(5) %"96", align 1
%"118" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"118", label %"22", label %"23"
@ -86,8 +86,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs
store i32 %"123", ptr %"159", align 4
%"125" = load float, ptr addrspace(5) %"89", align 4
%"126" = load float, ptr addrspace(5) %"90", align 4
%"124" = fcmp uno float %"125", %"126"
store i1 %"124", ptr addrspace(5) %"96", align 1
%3 = fcmp uno float %"125", %"126"
store i1 %3, ptr addrspace(5) %"96", align 1
%"127" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"127", label %"26", label %"27"
@ -111,8 +111,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs
store i32 %"132", ptr %"73", align 4
%"134" = load float, ptr addrspace(5) %"91", align 4
%"135" = load float, ptr addrspace(5) %"92", align 4
%"133" = fcmp uno float %"134", %"135"
store i1 %"133", ptr addrspace(5) %"96", align 1
%4 = fcmp uno float %"134", %"135"
store i1 %4, ptr addrspace(5) %"96", align 1
%"136" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"136", label %"30", label %"31"
@ -136,8 +136,8 @@ define amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"83", ptr addrs
store i32 %"141", ptr %"77", align 4
%"143" = load float, ptr addrspace(5) %"93", align 4
%"144" = load float, ptr addrspace(5) %"94", align 4
%"142" = fcmp uno float %"143", %"144"
store i1 %"142", ptr addrspace(5) %"96", align 1
%5 = fcmp uno float %"143", %"144"
store i1 %5, ptr addrspace(5) %"96", align 1
%"145" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"145", label %"34", label %"35"

View file

@ -62,8 +62,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs
store float %"114", ptr addrspace(5) %"94", align 4
%"116" = load float, ptr addrspace(5) %"87", align 4
%"117" = load float, ptr addrspace(5) %"88", align 4
%"115" = fcmp ord float %"116", %"117"
store i1 %"115", ptr addrspace(5) %"96", align 1
%2 = fcmp ord float %"116", %"117"
store i1 %2, ptr addrspace(5) %"96", align 1
%"118" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"118", label %"22", label %"23"
@ -86,8 +86,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs
store i32 %"123", ptr %"159", align 4
%"125" = load float, ptr addrspace(5) %"89", align 4
%"126" = load float, ptr addrspace(5) %"90", align 4
%"124" = fcmp ord float %"125", %"126"
store i1 %"124", ptr addrspace(5) %"96", align 1
%3 = fcmp ord float %"125", %"126"
store i1 %3, ptr addrspace(5) %"96", align 1
%"127" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"127", label %"26", label %"27"
@ -111,8 +111,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs
store i32 %"132", ptr %"73", align 4
%"134" = load float, ptr addrspace(5) %"91", align 4
%"135" = load float, ptr addrspace(5) %"92", align 4
%"133" = fcmp ord float %"134", %"135"
store i1 %"133", ptr addrspace(5) %"96", align 1
%4 = fcmp ord float %"134", %"135"
store i1 %4, ptr addrspace(5) %"96", align 1
%"136" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"136", label %"30", label %"31"
@ -136,8 +136,8 @@ define amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"83", ptr addrs
store i32 %"141", ptr %"77", align 4
%"143" = load float, ptr addrspace(5) %"93", align 4
%"144" = load float, ptr addrspace(5) %"94", align 4
%"142" = fcmp ord float %"143", %"144"
store i1 %"142", ptr addrspace(5) %"96", align 1
%5 = fcmp ord float %"143", %"144"
store i1 %5, ptr addrspace(5) %"96", align 1
%"145" = load i1, ptr addrspace(5) %"96", align 1
br i1 %"145", label %"34", label %"35"

View file

@ -1,7 +1,7 @@
@shared_ex = external addrspace(3) global [0 x i32]
@shared_mod = external addrspace(3) global [4 x i32]
define i64 @add() #0 {
define hidden i64 @add() #0 {
%"46" = alloca i64, align 8, addrspace(5)
%"47" = alloca i64, align 8, addrspace(5)
%"48" = alloca i64, align 8, addrspace(5)
@ -23,7 +23,7 @@ define i64 @add() #0 {
ret i64 %2
}
define i64 @set_shared_temp1(i64 %"15") #0 {
define hidden i64 @set_shared_temp1(i64 %"15") #0 {
%"54" = alloca i64, align 8, addrspace(5)
br label %1

View file

@ -1,7 +1,7 @@
@shared_ex = external addrspace(3) global [0 x i32]
@shared_mod = external addrspace(3) global i64, align 4
define i64 @add(i64 %"10") #0 {
define hidden i64 @add(i64 %"10") #0 {
%"47" = alloca i64, align 8, addrspace(5)
%"48" = alloca i64, align 8, addrspace(5)
br label %1
@ -21,7 +21,7 @@ define i64 @add(i64 %"10") #0 {
ret i64 %2
}
define i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 {
define hidden i64 @set_shared_temp1(i64 %"15", i64 %"16") #0 {
%"52" = alloca i64, align 8, addrspace(5)
br label %1

View file

@ -1,6 +1,6 @@
declare [2 x i32] @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32, i32, i32, i32) #0
declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @shfl_sync_bfly_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
%"43" = alloca i64, align 8, addrspace(5)
@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_bfly_b32_pred(ptr addrspace(4) byref(i64) %
"40": ; preds = %"39"
store i32 %"33", ptr addrspace(5) %"45", align 4
%"52" = load i32, ptr addrspace(5) %"45", align 4
%2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32 %"52", i32 3, i32 31, i32 -1)
%"65" = extractvalue [2 x i32] %2, 0
%3 = extractvalue [2 x i32] %2, 1
%"51" = trunc i32 %3 to i1
%"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_bfly_b32_pred(i32 %"52", i32 3, i32 31, i32 -1)
%"65" = extractelement <2 x i32> %"68", i8 0
%"69" = extractelement <2 x i32> %"68", i8 1
%"51" = trunc i32 %"69" to i1
store i32 %"65", ptr addrspace(5) %"46", align 4
store i1 %"51", ptr addrspace(5) %"47", align 1
%"53" = load i1, ptr addrspace(5) %"47", align 1

View file

@ -1,6 +1,6 @@
declare [2 x i32] @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32, i32, i32, i32) #0
declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @shfl_sync_down_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
%"43" = alloca i64, align 8, addrspace(5)
@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_down_b32_pred(ptr addrspace(4) byref(i64) %
"40": ; preds = %"39"
store i32 %"33", ptr addrspace(5) %"45", align 4
%"52" = load i32, ptr addrspace(5) %"45", align 4
%2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32 %"52", i32 3, i32 31, i32 -1)
%"65" = extractvalue [2 x i32] %2, 0
%3 = extractvalue [2 x i32] %2, 1
%"51" = trunc i32 %3 to i1
%"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_down_b32_pred(i32 %"52", i32 3, i32 31, i32 -1)
%"65" = extractelement <2 x i32> %"68", i8 0
%"69" = extractelement <2 x i32> %"68", i8 1
%"51" = trunc i32 %"69" to i1
store i32 %"65", ptr addrspace(5) %"46", align 4
store i1 %"51", ptr addrspace(5) %"47", align 1
%"53" = load i1, ptr addrspace(5) %"47", align 1

View file

@ -1,6 +1,6 @@
declare [2 x i32] @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32, i32, i32, i32) #0
declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @shfl_sync_idx_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
%"43" = alloca i64, align 8, addrspace(5)
@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_idx_b32_pred(ptr addrspace(4) byref(i64) %"
"40": ; preds = %"39"
store i32 %"33", ptr addrspace(5) %"45", align 4
%"52" = load i32, ptr addrspace(5) %"45", align 4
%2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32 %"52", i32 12, i32 31, i32 -1)
%"65" = extractvalue [2 x i32] %2, 0
%3 = extractvalue [2 x i32] %2, 1
%"51" = trunc i32 %3 to i1
%"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_idx_b32_pred(i32 %"52", i32 12, i32 31, i32 -1)
%"65" = extractelement <2 x i32> %"68", i8 0
%"69" = extractelement <2 x i32> %"68", i8 1
%"51" = trunc i32 %"69" to i1
store i32 %"65", ptr addrspace(5) %"46", align 4
store i1 %"51", ptr addrspace(5) %"47", align 1
%"53" = load i1, ptr addrspace(5) %"47", align 1

View file

@ -1,12 +1,12 @@
declare i32 @__zluda_ptx_impl_shfl_sync_down_b32(i32, i32, i32, i32) #0
declare hidden i32 @__zluda_ptx_impl_shfl_sync_down_b32(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_shfl_sync_up_b32(i32, i32, i32, i32) #0
declare hidden i32 @__zluda_ptx_impl_shfl_sync_up_b32(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_shfl_sync_bfly_b32(i32, i32, i32, i32) #0
declare hidden i32 @__zluda_ptx_impl_shfl_sync_bfly_b32(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_shfl_sync_idx_b32(i32, i32, i32, i32) #0
declare hidden i32 @__zluda_ptx_impl_shfl_sync_idx_b32(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @shfl_sync_mode_b32(ptr addrspace(4) byref(i64) %"48") #1 {
%"49" = alloca i64, align 8, addrspace(5)

View file

@ -1,6 +1,6 @@
declare [2 x i32] @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32, i32, i32, i32) #0
declare hidden <2 x i32> @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32, i32, i32, i32) #0
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @shfl_sync_up_b32_pred(ptr addrspace(4) byref(i64) %"42") #1 {
%"43" = alloca i64, align 8, addrspace(5)
@ -22,10 +22,10 @@ define amdgpu_kernel void @shfl_sync_up_b32_pred(ptr addrspace(4) byref(i64) %"4
"40": ; preds = %"39"
store i32 %"33", ptr addrspace(5) %"45", align 4
%"52" = load i32, ptr addrspace(5) %"45", align 4
%2 = call [2 x i32] @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32 %"52", i32 3, i32 0, i32 -1)
%"65" = extractvalue [2 x i32] %2, 0
%3 = extractvalue [2 x i32] %2, 1
%"51" = trunc i32 %3 to i1
%"68" = call <2 x i32> @__zluda_ptx_impl_shfl_sync_up_b32_pred(i32 %"52", i32 3, i32 0, i32 -1)
%"65" = extractelement <2 x i32> %"68", i8 0
%"69" = extractelement <2 x i32> %"68", i8 1
%"51" = trunc i32 %"69" to i1
store i32 %"65", ptr addrspace(5) %"46", align 4
store i1 %"51", ptr addrspace(5) %"47", align 1
%"53" = load i1, ptr addrspace(5) %"47", align 1

View file

@ -8,11 +8,11 @@ define amdgpu_kernel void @shr_oob(ptr addrspace(4) byref(i64) %"31", ptr addrsp
br label %"30"
"30": ; preds = %1
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"37" = load i64, ptr addrspace(4) %"32", align 4
store i64 %"37", ptr addrspace(5) %"34", align 4
%"39" = load i64, ptr addrspace(5) %"33", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 8
store i64 %"36", ptr addrspace(5) %"33", align 8
%"37" = load i64, ptr addrspace(4) %"32", align 8
store i64 %"37", ptr addrspace(5) %"34", align 8
%"39" = load i64, ptr addrspace(5) %"33", align 8
%"44" = inttoptr i64 %"39" to ptr
%"38" = load i16, ptr %"44", align 2
store i16 %"38", ptr addrspace(5) %"35", align 2
@ -21,7 +21,7 @@ define amdgpu_kernel void @shr_oob(ptr addrspace(4) byref(i64) %"31", ptr addrsp
%3 = ashr i16 %"41", 16
%"40" = select i1 true, i16 %2, i16 %3
store i16 %"40", ptr addrspace(5) %"35", align 2
%"42" = load i64, ptr addrspace(5) %"34", align 4
%"42" = load i64, ptr addrspace(5) %"34", align 8
%"43" = load i16, ptr addrspace(5) %"35", align 2
%"45" = inttoptr i64 %"42" to ptr
store i16 %"43", ptr %"45", align 2

View file

@ -1,4 +1,4 @@
declare float @__zluda_ptx_impl_sqrt_approx_f32(float) #0
declare hidden float @__zluda_ptx_impl_sqrt_approx_f32(float) #0
define amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 {
%"32" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
declare float @__zluda_ptx_impl_sqrt_rn_ftz_f32(float) #0
declare hidden float @__zluda_ptx_impl_sqrt_rn_ftz_f32(float) #0
define amdgpu_kernel void @sqrt_rn_ftz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 {
%"32" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @stateful_ld_st_ntid(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
%"38" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @stateful_ld_st_ntid_chain(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 {
%"42" = alloca i64, align 8, addrspace(5)

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @stateful_ld_st_ntid_sub(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #1 {
%"46" = alloca i64, align 8, addrspace(5)

View file

@ -8,18 +8,18 @@ define amdgpu_kernel void @tanh(ptr addrspace(4) byref(i64) %"30", ptr addrspace
br label %"29"
"29": ; preds = %1
%"35" = load i64, ptr addrspace(4) %"30", align 4
store i64 %"35", ptr addrspace(5) %"32", align 4
%"36" = load i64, ptr addrspace(4) %"31", align 4
store i64 %"36", ptr addrspace(5) %"33", align 4
%"38" = load i64, ptr addrspace(5) %"32", align 4
%"35" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"35", ptr addrspace(5) %"32", align 8
%"36" = load i64, ptr addrspace(4) %"31", align 8
store i64 %"36", ptr addrspace(5) %"33", align 8
%"38" = load i64, ptr addrspace(5) %"32", align 8
%"43" = inttoptr i64 %"38" to ptr
%"37" = load float, ptr %"43", align 4
store float %"37", ptr addrspace(5) %"34", align 4
%"40" = load float, ptr addrspace(5) %"34", align 4
%"39" = call afn float @__ocml_tanh_f32(float %"40")
store float %"39", ptr addrspace(5) %"34", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 4
%"41" = load i64, ptr addrspace(5) %"33", align 8
%"42" = load float, ptr addrspace(5) %"34", align 4
%"44" = inttoptr i64 %"41" to ptr
store float %"42", ptr %"44", align 4

View file

@ -1,4 +1,4 @@
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
declare hidden i32 @__zluda_ptx_impl_sreg_tid(i8) #0
define amdgpu_kernel void @tid(ptr addrspace(4) byref(i64) %"34") #1 {
%"35" = alloca i64, align 8, addrspace(5)

16
ptx/src/test/ll/trap.ll Normal file
View file

@ -0,0 +1,16 @@
define amdgpu_kernel void @trap(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
br label %1
1: ; preds = %0
br label %"26"
"26": ; preds = %1
call void @llvm.trap()
ret void
}
; Function Attrs: cold noreturn nounwind
declare void @llvm.trap() #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { cold noreturn nounwind }

View file

@ -1,4 +1,4 @@
define <2 x i32> @impl(<2 x i32> %"9") #0 {
define hidden <2 x i32> @impl(<2 x i32> %"9") #0 {
%"47" = alloca <2 x i32>, align 8, addrspace(5)
%"48" = alloca <2 x i32>, align 8, addrspace(5)
%"49" = alloca i32, align 4, addrspace(5)

View file

@ -0,0 +1,26 @@
.version 6.5
.target sm_70
.address_size 64
.visible .entry dp4a(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 src1;
.reg .b32 src2;
.reg .b32 src3;
.reg .b32 dst;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.b32 src1, [in_addr];
ld.b32 src2, [in_addr+4];
ld.b32 src3, [in_addr+8];
dp4a.s32.s32 dst, src1, src2, src3;
st.b32 [out_addr], dst;
ret;
}

View file

@ -317,12 +317,18 @@ test_ptx!(
[0x12345678u32, 0x9abcdef0u32, 44],
[0xef012345u32]
);
test_ptx!(
dp4a,
[0x8e2da590u32, 0xedeaee14, 0x248a9f70],
[613065134u32]
);
test_ptx!(assertfail);
// TODO: not yet supported
//test_ptx!(func_ptr);
test_ptx!(lanemask_lt);
test_ptx!(extern_func);
test_ptx!(trap);
test_ptx_warp!(
tid,
@ -640,11 +646,15 @@ fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Def
.to_str()
.unwrap(),
&*module.llvm_ir.write_bitcode_to_memory(),
&*module.attributes_ir.write_bitcode_to_memory(),
module.linked_bitcode(),
&*module.attributes_ir.write_bitcode_to_memory(),
None,
)
.unwrap();
// TODO: Re-enable when we are able to privatize function-scoped
// globals and constants
// let fns = comgr::get_symbols(&comgr, &elf_module).unwrap();
// verify_symbols(fns);
let mut module = unsafe { mem::zeroed() };
unsafe { hipModuleLoadData(&mut module, elf_module.as_ptr() as _) }.unwrap();
let mut kernel = unsafe { mem::zeroed() };
@ -704,3 +714,30 @@ fn run_hip<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + Def
}
Ok(result)
}
// TODO: Re-enable when we are able to privatize function-scoped
// globals and constants
/*
fn verify_symbols(mut symbols: Vec<(u32, String)>) {
symbols.sort();
if symbols.len() != 2 {
panic!("Expected exactly two symbols, found: {:?}", symbols);
}
assert_eq!(
symbols[0].0, 1,
"Wrong symbols exported from binary: {:?}",
symbols
);
assert_eq!(
symbols[1].0, 2,
"Wrong symbols exported from binary: {:?}",
symbols
);
assert_eq!(
symbols[0].1,
format!("{}.kd", symbols[1].1),
"Wrong symbols exported from binary: {:?}",
symbols
);
}
*/

View file

@ -0,0 +1,11 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry trap(
.param .u64 input,
.param .u64 output
)
{
trap;
ret;
}

View file

@ -89,6 +89,13 @@ ptx_parser_macros::generate_instruction_type!(
src3: T,
}
},
BarWarp {
type: Type::Scalar(ScalarType::U32),
data: (),
arguments<T>: {
src: T,
}
},
Bar {
type: Type::Scalar(ScalarType::U32),
data: BarData,
@ -245,6 +252,27 @@ ptx_parser_macros::generate_instruction_type!(
src2: T,
}
},
Dp4a {
data: Dp4aDetails,
arguments<T>: {
dst: {
repr: T,
type: { Type::Scalar(ScalarType::B32) },
},
src1: {
repr: T,
type: { Type::Scalar(data.atype) },
},
src2: {
repr: T,
type: { Type::Scalar(data.btype) },
},
src3: {
repr: T,
type: { Type::Scalar(data.ctype()) },
},
}
},
Ex2 {
type: Type::Scalar(ScalarType::F32),
data: TypeFtz,
@ -2059,6 +2087,21 @@ pub enum DivDetails {
Float(DivFloatDetails),
}
#[derive(Copy, Clone)]
pub struct Dp4aDetails {
pub atype: ScalarType,
pub btype: ScalarType,
}
impl Dp4aDetails {
pub fn ctype(self) -> ScalarType {
match (self.atype, self.btype) {
(ScalarType::U32, ScalarType::U32) => ScalarType::U32,
_ => ScalarType::S32,
}
}
}
impl DivDetails {
pub fn type_(&self) -> ScalarType {
match self {

View file

@ -3150,6 +3150,13 @@ derive_parser!(
}
.op: Reduction = { .and, .or };
bar.warp.sync membermask => {
ast::Instruction::BarWarp {
data: (),
arguments: BarWarpArgs { src: membermask }
}
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache_policy} => {
if level_cache_hint || cache_policy.is_some() {
@ -3689,6 +3696,25 @@ derive_parser!(
.dir: ShiftDirection = { .l, .r };
.mode: FunnelShiftMode = { .clamp, .wrap };
trap => {
Instruction::Trap {}
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-dp4a
dp4a.atype.btype d, a, b, c => {
Instruction::Dp4a {
data: Dp4aDetails {
atype,
btype
},
arguments: Dp4aArgs { dst: d, src1: a, src2: b, src3: c }
}
}
.atype: ScalarType = { .u32, .s32 };
.btype: ScalarType = { .u32, .s32 };
);
#[cfg(test)]

View file

@ -82,7 +82,7 @@ pub struct OpcodeDecl(pub Instruction, pub Arguments);
impl OpcodeDecl {
fn peek(input: syn::parse::ParseStream) -> bool {
Instruction::peek(input) && !input.peek2(Token![=])
Instruction::peek(input)
}
}
@ -234,7 +234,13 @@ pub struct Instruction {
}
impl Instruction {
fn peek(input: syn::parse::ParseStream) -> bool {
input.peek(Ident)
if !input.peek(Ident) {
return false;
}
if input.peek2(Token![=]) && input.peek3(syn::token::Brace) {
return false;
}
true
}
}
@ -910,4 +916,24 @@ mod tests {
};
syn::parse2::<super::OpcodeDefinition>(input).unwrap();
}
#[test]
fn instruction_no_options() {
let input = quote! {
trap => {
Instruction::Trap {}
}
foo.bar => {
todo!()
}
ScalarType = { .f32 };
};
let args = syn::parse2::<super::OpcodeDefinition>(input).unwrap();
assert_eq!(args.1.len(), 1);
let (ref opcode1, _) = args.0 .0[0];
assert_eq!(opcode1.0.name.to_string(), "trap");
let (ref opcode2, _) = args.0 .0[1];
assert_eq!(opcode2.0.name.to_string(), "foo");
assert_eq!(args.1.len(), 1);
}
}

View file

@ -145,8 +145,8 @@ fn compile_from_ptx_and_cache(
comgr,
gcn_arch,
&*llvm_module.llvm_ir.write_bitcode_to_memory(),
&*llvm_module.attributes_ir.write_bitcode_to_memory(),
llvm_module.linked_bitcode(),
&*llvm_module.attributes_ir.write_bitcode_to_memory(),
None,
)
.map_err(|_| CUerror::UNKNOWN)?;

View file

@ -1,6 +1,7 @@
use ::dark_api::fatbin::FatbinFileIterator;
use ::dark_api::FnFfi;
use cuda_types::cuda::*;
use cuda_types::dark_api::FatbinHeader;
use dark_api::DarkApiState2;
use log::{CudaFunctionName, ErrorEntry};
use parking_lot::ReentrantMutex;
@ -368,7 +369,7 @@ impl DarkApiTrace {
FatbinFileIterator::new(
fatbin_header
.as_ref()
.ok_or(ErrorEntry::NullPointer("get_module_from_cubin_ext2_post"))?,
.ok_or(ErrorEntry::NullPointer("FatbinHeader"))?,
),
)
});
@ -1232,9 +1233,9 @@ impl Settings {
fn parse_compute_capability(env_string: &str) -> Option<(u32, u32)> {
let regex = Regex::new(r"(\d+)\.(\d+)").unwrap();
let captures = regex.captures(&env_string)?;
let major = captures.get(0)?;
let major = captures.get(1)?;
let major = str::parse::<u32>(major.as_str()).ok()?;
let minor = captures.get(1)?;
let minor = captures.get(2)?;
let minor = str::parse::<u32>(minor.as_str()).ok()?;
Some((major, minor))
}
@ -1353,13 +1354,27 @@ pub(crate) fn cuModuleGetFunction_Post(
#[allow(non_snake_case)]
pub(crate) fn cuDeviceGetAttribute_Post(
_pi: *mut ::std::os::raw::c_int,
_attrib: CUdevice_attribute,
pi: *mut ::std::os::raw::c_int,
attrib: CUdevice_attribute,
_dev: CUdevice,
_state: &mut trace::StateTracker,
state: &mut trace::StateTracker,
_fn_logger: &mut FnCallLog,
_result: CUresult,
) {
if attrib == CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR {
if let Some((major_override, _)) = state.override_cc {
unsafe {
*pi = major_override as i32;
};
}
}
if attrib == CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR {
if let Some((_, minor_override)) = state.override_cc {
unsafe {
*pi = minor_override as i32;
};
}
}
}
#[allow(non_snake_case)]
@ -1381,12 +1396,25 @@ pub(crate) fn cuDeviceComputeCapability_Post(
#[allow(non_snake_case)]
pub(crate) fn cuModuleLoadFatBinary_Post(
_module: *mut CUmodule,
_fatCubin: *const ::std::os::raw::c_void,
_state: &mut trace::StateTracker,
_fn_logger: &mut FnCallLog,
module: *mut CUmodule,
fatbin_header: *const ::std::os::raw::c_void,
state: &mut trace::StateTracker,
fn_logger: &mut FnCallLog,
_result: CUresult,
) {
fn_logger.try_(|fn_logger| unsafe {
trace::record_submodules(
*module,
fn_logger,
state,
FatbinFileIterator::new(
fatbin_header
.cast::<FatbinHeader>()
.as_ref()
.ok_or(ErrorEntry::NullPointer("FatbinHeader"))?,
),
)
});
}
#[allow(non_snake_case)]