mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-10-02 14:19:27 +00:00
Minor fixes and make zoc lazy load HIP runtime
This commit is contained in:
parent
cc47110c24
commit
6b90b5acba
8 changed files with 367 additions and 41 deletions
5
Cargo.lock
generated
5
Cargo.lock
generated
|
@ -357,10 +357,9 @@ dependencies = [
|
||||||
name = "compiler"
|
name = "compiler"
|
||||||
version = "0.0.0"
|
version = "0.0.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"amd_comgr-sys",
|
|
||||||
"bpaf",
|
"bpaf",
|
||||||
"comgr",
|
"comgr",
|
||||||
"hip_runtime-sys",
|
"libloading",
|
||||||
"ptx",
|
"ptx",
|
||||||
"ptx_parser",
|
"ptx_parser",
|
||||||
"thiserror 2.0.12",
|
"thiserror 2.0.12",
|
||||||
|
@ -3791,6 +3790,7 @@ name = "zluda_common"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cuda_types",
|
"cuda_types",
|
||||||
|
"dark_api",
|
||||||
"hip_runtime-sys",
|
"hip_runtime-sys",
|
||||||
"rocblas-sys",
|
"rocblas-sys",
|
||||||
]
|
]
|
||||||
|
@ -3883,6 +3883,7 @@ dependencies = [
|
||||||
"unwrap_or",
|
"unwrap_or",
|
||||||
"wchar",
|
"wchar",
|
||||||
"winapi",
|
"winapi",
|
||||||
|
"zluda_common",
|
||||||
"zluda_trace_common",
|
"zluda_trace_common",
|
||||||
"zstd-safe",
|
"zstd-safe",
|
||||||
]
|
]
|
||||||
|
|
|
@ -10,12 +10,11 @@ name = "zoc"
|
||||||
path = "src/main.rs"
|
path = "src/main.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
amd_comgr-sys = { path = "../ext/amd_comgr-sys" }
|
|
||||||
bpaf = { version = "0.9.19", features = ["derive"] }
|
bpaf = { version = "0.9.19", features = ["derive"] }
|
||||||
comgr = { path = "../comgr" }
|
comgr = { path = "../comgr" }
|
||||||
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
|
||||||
ptx = { path = "../ptx" }
|
ptx = { path = "../ptx" }
|
||||||
ptx_parser = { path = "../ptx_parser" }
|
ptx_parser = { path = "../ptx_parser" }
|
||||||
|
libloading = "0.8"
|
||||||
thiserror = "2.0.12"
|
thiserror = "2.0.12"
|
||||||
|
|
||||||
[package.metadata.zluda]
|
[package.metadata.zluda]
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
|
use ptx::TranslateError;
|
||||||
|
use ptx_parser::PtxError;
|
||||||
use std::ffi::FromBytesUntilNulError;
|
use std::ffi::FromBytesUntilNulError;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::str::Utf8Error;
|
use std::str::Utf8Error;
|
||||||
|
|
||||||
use hip_runtime_sys::hipErrorCode_t;
|
|
||||||
use ptx::TranslateError;
|
|
||||||
use ptx_parser::PtxError;
|
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum CompilerError {
|
pub enum CompilerError {
|
||||||
#[error("HIP error code: {0:?}")]
|
#[error("HIP error code: {0:?}")]
|
||||||
HipError(hipErrorCode_t),
|
HipError(u32),
|
||||||
|
#[error(transparent)]
|
||||||
|
Libloading(#[from] libloading::Error),
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
ComgrError(#[from] comgr::Error),
|
ComgrError(#[from] comgr::Error),
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
|
@ -26,12 +26,6 @@ pub enum CompilerError {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<hipErrorCode_t> for CompilerError {
|
|
||||||
fn from(error_code: hipErrorCode_t) -> Self {
|
|
||||||
CompilerError::HipError(error_code)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Vec<PtxError<'_>>> for CompilerError {
|
impl From<Vec<PtxError<'_>>> for CompilerError {
|
||||||
fn from(causes: Vec<PtxError>) -> Self {
|
fn from(causes: Vec<PtxError>) -> Self {
|
||||||
let errors: Vec<String> = causes
|
let errors: Vec<String> = causes
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
use bpaf::Bpaf;
|
||||||
|
use error::CompilerError;
|
||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
|
@ -6,11 +8,7 @@ use std::process::ExitCode;
|
||||||
use std::str;
|
use std::str;
|
||||||
use std::{env, mem};
|
use std::{env, mem};
|
||||||
|
|
||||||
use bpaf::Bpaf;
|
|
||||||
|
|
||||||
mod error;
|
mod error;
|
||||||
use error::CompilerError;
|
|
||||||
use hip_runtime_sys::{hipDeviceProp_tR0600, hipGetDevicePropertiesR0600, hipInit};
|
|
||||||
|
|
||||||
const DEFAULT_ARCH: &'static str = "gfx1100";
|
const DEFAULT_ARCH: &'static str = "gfx1100";
|
||||||
|
|
||||||
|
@ -60,12 +58,17 @@ fn main_core() -> Result<(), CompilerError> {
|
||||||
let arch: String = match opts.arch {
|
let arch: String = match opts.arch {
|
||||||
Some(s) => s,
|
Some(s) => s,
|
||||||
None => {
|
None => {
|
||||||
unsafe { hipInit(0) }?;
|
(|| {
|
||||||
let mut dev_props: hipDeviceProp_tR0600 = unsafe { mem::zeroed() };
|
let runtime = hip::Runtime::load()?;
|
||||||
unsafe { hipGetDevicePropertiesR0600(&mut dev_props, 0) }?;
|
runtime.init()?;
|
||||||
|
get_gpu_arch(&runtime)
|
||||||
|
})()
|
||||||
|
.unwrap_or_else(|_| DEFAULT_ARCH.to_owned())
|
||||||
|
/*
|
||||||
get_gpu_arch(&mut dev_props)
|
get_gpu_arch(&mut dev_props)
|
||||||
.map(String::from)
|
.map(String::from)
|
||||||
.unwrap_or(DEFAULT_ARCH.to_owned())
|
.unwrap_or(DEFAULT_ARCH.to_owned())
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -122,12 +125,13 @@ struct LLVMArtifacts {
|
||||||
llvm_ir: Vec<u8>,
|
llvm_ir: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_gpu_arch<'a>(dev_props: &'a mut hipDeviceProp_tR0600) -> Result<&'a str, CompilerError> {
|
fn get_gpu_arch(runtime: &hip::Runtime) -> Result<String, CompilerError> {
|
||||||
unsafe { hipGetDevicePropertiesR0600(dev_props, 0) }?;
|
let mut dev_props = unsafe { mem::zeroed() };
|
||||||
|
runtime.device_get_properties(&mut dev_props, 0)?;
|
||||||
let gcn_arch_name = &dev_props.gcnArchName;
|
let gcn_arch_name = &dev_props.gcnArchName;
|
||||||
let gcn_arch_name = unsafe { CStr::from_ptr(gcn_arch_name.as_ptr()) };
|
let gcn_arch_name = unsafe { CStr::from_ptr(gcn_arch_name.as_ptr()) };
|
||||||
let gcn_arch_name = gcn_arch_name.to_str();
|
let gcn_arch_name = gcn_arch_name.to_str()?;
|
||||||
gcn_arch_name.map_err(CompilerError::from)
|
Ok(gcn_arch_name.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
|
fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
|
||||||
|
@ -137,3 +141,316 @@ fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
|
||||||
println!("Wrote to {}", path.to_str().unwrap());
|
println!("Wrote to {}", path.to_str().unwrap());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mod hip {
|
||||||
|
use crate::error::CompilerError;
|
||||||
|
|
||||||
|
// We lazy load HIP runtime because we want to work on systems with no
|
||||||
|
// HIP driver installed
|
||||||
|
pub struct Runtime(libloading::Library);
|
||||||
|
|
||||||
|
impl Runtime {
|
||||||
|
fn hip_check(err: u32) -> Result<(), CompilerError> {
|
||||||
|
match err {
|
||||||
|
0 => Ok(()),
|
||||||
|
err_code => Err(CompilerError::HipError(err_code)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load() -> Result<Self, CompilerError> {
|
||||||
|
#[cfg(windows)]
|
||||||
|
let lib_name = "amdhip64_6.dll\0";
|
||||||
|
#[cfg(unix)]
|
||||||
|
let lib_name = "libamdhip64.so.6\0";
|
||||||
|
let library = unsafe { libloading::Library::new(lib_name)? };
|
||||||
|
Ok(Self(library))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn init(&self) -> Result<(), CompilerError> {
|
||||||
|
unsafe {
|
||||||
|
let hip_init: libloading::Symbol<unsafe extern "C" fn(u32) -> u32> =
|
||||||
|
self.0.get(b"hipInit\0")?;
|
||||||
|
Self::hip_check(hip_init(0))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn device_get_properties(
|
||||||
|
&self,
|
||||||
|
prop: &mut hipDeviceProp_tR0600,
|
||||||
|
device: i32,
|
||||||
|
) -> Result<(), CompilerError> {
|
||||||
|
unsafe {
|
||||||
|
let hip_get_device_properties: libloading::Symbol<
|
||||||
|
unsafe extern "C" fn(*mut hipDeviceProp_tR0600, i32) -> u32,
|
||||||
|
> = self.0.get(b"hipGetDevicePropertiesR0600\0")?;
|
||||||
|
Self::hip_check(hip_get_device_properties(prop, device))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(non_snake_case, non_camel_case_types)]
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
|
||||||
|
pub struct hipDeviceProp_tR0600 {
|
||||||
|
///< Device name.
|
||||||
|
pub name: [::core::ffi::c_char; 256usize],
|
||||||
|
///< UUID of a device
|
||||||
|
pub uuid: hipUUID,
|
||||||
|
///< 8-byte unique identifier. Only valid on windows
|
||||||
|
pub luid: [::core::ffi::c_char; 8usize],
|
||||||
|
///< LUID node mask
|
||||||
|
pub luidDeviceNodeMask: ::core::ffi::c_uint,
|
||||||
|
///< Size of global memory region (in bytes).
|
||||||
|
pub totalGlobalMem: usize,
|
||||||
|
///< Size of shared memory per block (in bytes).
|
||||||
|
pub sharedMemPerBlock: usize,
|
||||||
|
///< Registers per block.
|
||||||
|
pub regsPerBlock: ::core::ffi::c_int,
|
||||||
|
///< Warp size.
|
||||||
|
pub warpSize: ::core::ffi::c_int,
|
||||||
|
/**< Maximum pitch in bytes allowed by memory copies
|
||||||
|
< pitched memory*/
|
||||||
|
pub memPitch: usize,
|
||||||
|
///< Max work items per work group or workgroup max size.
|
||||||
|
pub maxThreadsPerBlock: ::core::ffi::c_int,
|
||||||
|
///< Max number of threads in each dimension (XYZ) of a block.
|
||||||
|
pub maxThreadsDim: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Max grid dimensions (XYZ).
|
||||||
|
pub maxGridSize: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Max clock frequency of the multiProcessors in khz.
|
||||||
|
pub clockRate: ::core::ffi::c_int,
|
||||||
|
/**< Size of shared constant memory region on the device
|
||||||
|
< (in bytes).*/
|
||||||
|
pub totalConstMem: usize,
|
||||||
|
/**< Major compute capability. On HCC, this is an approximation and features may
|
||||||
|
< differ from CUDA CC. See the arch feature flags for portable ways to query
|
||||||
|
< feature caps.*/
|
||||||
|
pub major: ::core::ffi::c_int,
|
||||||
|
/**< Minor compute capability. On HCC, this is an approximation and features may
|
||||||
|
< differ from CUDA CC. See the arch feature flags for portable ways to query
|
||||||
|
< feature caps.*/
|
||||||
|
pub minor: ::core::ffi::c_int,
|
||||||
|
///< Alignment requirement for textures
|
||||||
|
pub textureAlignment: usize,
|
||||||
|
///< Pitch alignment requirement for texture references bound to pitched memory
|
||||||
|
pub texturePitchAlignment: usize,
|
||||||
|
///< Deprecated. Use asyncEngineCount instead
|
||||||
|
pub deviceOverlap: ::core::ffi::c_int,
|
||||||
|
///< Number of multi-processors (compute units).
|
||||||
|
pub multiProcessorCount: ::core::ffi::c_int,
|
||||||
|
///< Run time limit for kernels executed on the device
|
||||||
|
pub kernelExecTimeoutEnabled: ::core::ffi::c_int,
|
||||||
|
///< APU vs dGPU
|
||||||
|
pub integrated: ::core::ffi::c_int,
|
||||||
|
///< Check whether HIP can map host memory
|
||||||
|
pub canMapHostMemory: ::core::ffi::c_int,
|
||||||
|
///< Compute mode.
|
||||||
|
pub computeMode: ::core::ffi::c_int,
|
||||||
|
///< Maximum number of elements in 1D images
|
||||||
|
pub maxTexture1D: ::core::ffi::c_int,
|
||||||
|
///< Maximum 1D mipmap texture size
|
||||||
|
pub maxTexture1DMipmap: ::core::ffi::c_int,
|
||||||
|
///< Maximum size for 1D textures bound to linear memory
|
||||||
|
pub maxTexture1DLinear: ::core::ffi::c_int,
|
||||||
|
///< Maximum dimensions (width, height) of 2D images, in image elements
|
||||||
|
pub maxTexture2D: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Maximum number of elements in 2D array mipmap of images
|
||||||
|
pub maxTexture2DMipmap: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Maximum 2D tex dimensions if tex are bound to pitched memory
|
||||||
|
pub maxTexture2DLinear: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Maximum 2D tex dimensions if gather has to be performed
|
||||||
|
pub maxTexture2DGather: [::core::ffi::c_int; 2usize],
|
||||||
|
/**< Maximum dimensions (width, height, depth) of 3D images, in image
|
||||||
|
< elements*/
|
||||||
|
pub maxTexture3D: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Maximum alternate 3D texture dims
|
||||||
|
pub maxTexture3DAlt: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Maximum cubemap texture dims
|
||||||
|
pub maxTextureCubemap: ::core::ffi::c_int,
|
||||||
|
///< Maximum number of elements in 1D array images
|
||||||
|
pub maxTexture1DLayered: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Maximum number of elements in 2D array images
|
||||||
|
pub maxTexture2DLayered: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Maximum cubemaps layered texture dims
|
||||||
|
pub maxTextureCubemapLayered: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Maximum 1D surface size
|
||||||
|
pub maxSurface1D: ::core::ffi::c_int,
|
||||||
|
///< Maximum 2D surface size
|
||||||
|
pub maxSurface2D: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Maximum 3D surface size
|
||||||
|
pub maxSurface3D: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Maximum 1D layered surface size
|
||||||
|
pub maxSurface1DLayered: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Maximum 2D layered surface size
|
||||||
|
pub maxSurface2DLayered: [::core::ffi::c_int; 3usize],
|
||||||
|
///< Maximum cubemap surface size
|
||||||
|
pub maxSurfaceCubemap: ::core::ffi::c_int,
|
||||||
|
///< Maximum cubemap layered surface size
|
||||||
|
pub maxSurfaceCubemapLayered: [::core::ffi::c_int; 2usize],
|
||||||
|
///< Alignment requirement for surface
|
||||||
|
pub surfaceAlignment: usize,
|
||||||
|
///< Device can possibly execute multiple kernels concurrently.
|
||||||
|
pub concurrentKernels: ::core::ffi::c_int,
|
||||||
|
///< Device has ECC support enabled
|
||||||
|
pub ECCEnabled: ::core::ffi::c_int,
|
||||||
|
///< PCI Bus ID.
|
||||||
|
pub pciBusID: ::core::ffi::c_int,
|
||||||
|
///< PCI Device ID.
|
||||||
|
pub pciDeviceID: ::core::ffi::c_int,
|
||||||
|
///< PCI Domain ID
|
||||||
|
pub pciDomainID: ::core::ffi::c_int,
|
||||||
|
///< 1:If device is Tesla device using TCC driver, else 0
|
||||||
|
pub tccDriver: ::core::ffi::c_int,
|
||||||
|
///< Number of async engines
|
||||||
|
pub asyncEngineCount: ::core::ffi::c_int,
|
||||||
|
///< Does device and host share unified address space
|
||||||
|
pub unifiedAddressing: ::core::ffi::c_int,
|
||||||
|
///< Max global memory clock frequency in khz.
|
||||||
|
pub memoryClockRate: ::core::ffi::c_int,
|
||||||
|
///< Global memory bus width in bits.
|
||||||
|
pub memoryBusWidth: ::core::ffi::c_int,
|
||||||
|
///< L2 cache size.
|
||||||
|
pub l2CacheSize: ::core::ffi::c_int,
|
||||||
|
///< Device's max L2 persisting lines in bytes
|
||||||
|
pub persistingL2CacheMaxSize: ::core::ffi::c_int,
|
||||||
|
///< Maximum resident threads per multi-processor.
|
||||||
|
pub maxThreadsPerMultiProcessor: ::core::ffi::c_int,
|
||||||
|
///< Device supports stream priority
|
||||||
|
pub streamPrioritiesSupported: ::core::ffi::c_int,
|
||||||
|
///< Indicates globals are cached in L1
|
||||||
|
pub globalL1CacheSupported: ::core::ffi::c_int,
|
||||||
|
///< Locals are cached in L1
|
||||||
|
pub localL1CacheSupported: ::core::ffi::c_int,
|
||||||
|
///< Amount of shared memory available per multiprocessor.
|
||||||
|
pub sharedMemPerMultiprocessor: usize,
|
||||||
|
///< registers available per multiprocessor
|
||||||
|
pub regsPerMultiprocessor: ::core::ffi::c_int,
|
||||||
|
///< Device supports allocating managed memory on this system
|
||||||
|
pub managedMemory: ::core::ffi::c_int,
|
||||||
|
///< 1 if device is on a multi-GPU board, 0 if not.
|
||||||
|
pub isMultiGpuBoard: ::core::ffi::c_int,
|
||||||
|
///< Unique identifier for a group of devices on same multiboard GPU
|
||||||
|
pub multiGpuBoardGroupID: ::core::ffi::c_int,
|
||||||
|
///< Link between host and device supports native atomics
|
||||||
|
pub hostNativeAtomicSupported: ::core::ffi::c_int,
|
||||||
|
///< Deprecated. CUDA only.
|
||||||
|
pub singleToDoublePrecisionPerfRatio: ::core::ffi::c_int,
|
||||||
|
/**< Device supports coherently accessing pageable memory
|
||||||
|
< without calling hipHostRegister on it*/
|
||||||
|
pub pageableMemoryAccess: ::core::ffi::c_int,
|
||||||
|
/**< Device can coherently access managed memory concurrently with
|
||||||
|
< the CPU*/
|
||||||
|
pub concurrentManagedAccess: ::core::ffi::c_int,
|
||||||
|
///< Is compute preemption supported on the device
|
||||||
|
pub computePreemptionSupported: ::core::ffi::c_int,
|
||||||
|
/**< Device can access host registered memory with same
|
||||||
|
< address as the host*/
|
||||||
|
pub canUseHostPointerForRegisteredMem: ::core::ffi::c_int,
|
||||||
|
///< HIP device supports cooperative launch
|
||||||
|
pub cooperativeLaunch: ::core::ffi::c_int,
|
||||||
|
/**< HIP device supports cooperative launch on multiple
|
||||||
|
< devices*/
|
||||||
|
pub cooperativeMultiDeviceLaunch: ::core::ffi::c_int,
|
||||||
|
///< Per device max shared mem per block usable by special opt in
|
||||||
|
pub sharedMemPerBlockOptin: usize,
|
||||||
|
/**< Device accesses pageable memory via the host's
|
||||||
|
< page tables*/
|
||||||
|
pub pageableMemoryAccessUsesHostPageTables: ::core::ffi::c_int,
|
||||||
|
/**< Host can directly access managed memory on the device
|
||||||
|
< without migration*/
|
||||||
|
pub directManagedMemAccessFromHost: ::core::ffi::c_int,
|
||||||
|
///< Max number of blocks on CU
|
||||||
|
pub maxBlocksPerMultiProcessor: ::core::ffi::c_int,
|
||||||
|
///< Max value of access policy window
|
||||||
|
pub accessPolicyMaxWindowSize: ::core::ffi::c_int,
|
||||||
|
///< Shared memory reserved by driver per block
|
||||||
|
pub reservedSharedMemPerBlock: usize,
|
||||||
|
///< Device supports hipHostRegister
|
||||||
|
pub hostRegisterSupported: ::core::ffi::c_int,
|
||||||
|
///< Indicates if device supports sparse hip arrays
|
||||||
|
pub sparseHipArraySupported: ::core::ffi::c_int,
|
||||||
|
/**< Device supports using the hipHostRegisterReadOnly flag
|
||||||
|
< with hipHostRegister*/
|
||||||
|
pub hostRegisterReadOnlySupported: ::core::ffi::c_int,
|
||||||
|
///< Indicates external timeline semaphore support
|
||||||
|
pub timelineSemaphoreInteropSupported: ::core::ffi::c_int,
|
||||||
|
///< Indicates if device supports hipMallocAsync and hipMemPool APIs
|
||||||
|
pub memoryPoolsSupported: ::core::ffi::c_int,
|
||||||
|
///< Indicates device support of RDMA APIs
|
||||||
|
pub gpuDirectRDMASupported: ::core::ffi::c_int,
|
||||||
|
/**< Bitmask to be interpreted according to
|
||||||
|
< hipFlushGPUDirectRDMAWritesOptions*/
|
||||||
|
pub gpuDirectRDMAFlushWritesOptions: ::core::ffi::c_uint,
|
||||||
|
///< value of hipGPUDirectRDMAWritesOrdering
|
||||||
|
pub gpuDirectRDMAWritesOrdering: ::core::ffi::c_int,
|
||||||
|
///< Bitmask of handle types support with mempool based IPC
|
||||||
|
pub memoryPoolSupportedHandleTypes: ::core::ffi::c_uint,
|
||||||
|
/**< Device supports deferred mapping HIP arrays and HIP
|
||||||
|
< mipmapped arrays*/
|
||||||
|
pub deferredMappingHipArraySupported: ::core::ffi::c_int,
|
||||||
|
///< Device supports IPC events
|
||||||
|
pub ipcEventSupported: ::core::ffi::c_int,
|
||||||
|
///< Device supports cluster launch
|
||||||
|
pub clusterLaunch: ::core::ffi::c_int,
|
||||||
|
///< Indicates device supports unified function pointers
|
||||||
|
pub unifiedFunctionPointers: ::core::ffi::c_int,
|
||||||
|
///< CUDA Reserved.
|
||||||
|
pub reserved: [::core::ffi::c_int; 63usize],
|
||||||
|
///< Reserved for adding new entries for HIP/CUDA.
|
||||||
|
pub hipReserved: [::core::ffi::c_int; 32usize],
|
||||||
|
///< AMD GCN Arch Name. HIP Only.
|
||||||
|
pub gcnArchName: [::core::ffi::c_char; 256usize],
|
||||||
|
///< Maximum Shared Memory Per CU. HIP Only.
|
||||||
|
pub maxSharedMemoryPerMultiProcessor: usize,
|
||||||
|
/**< Frequency in khz of the timer used by the device-side "clock*"
|
||||||
|
< instructions. New for HIP.*/
|
||||||
|
pub clockInstructionRate: ::core::ffi::c_int,
|
||||||
|
///< Architectural feature flags. New for HIP.
|
||||||
|
pub arch: hipDeviceArch_t,
|
||||||
|
///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
|
||||||
|
pub hdpMemFlushCntl: *mut ::core::ffi::c_uint,
|
||||||
|
///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
|
||||||
|
pub hdpRegFlushCntl: *mut ::core::ffi::c_uint,
|
||||||
|
/**< HIP device supports cooperative launch on
|
||||||
|
< multiple devices with unmatched functions*/
|
||||||
|
pub cooperativeMultiDeviceUnmatchedFunc: ::core::ffi::c_int,
|
||||||
|
/**< HIP device supports cooperative launch on
|
||||||
|
< multiple devices with unmatched grid dimensions*/
|
||||||
|
pub cooperativeMultiDeviceUnmatchedGridDim: ::core::ffi::c_int,
|
||||||
|
/**< HIP device supports cooperative launch on
|
||||||
|
< multiple devices with unmatched block dimensions*/
|
||||||
|
pub cooperativeMultiDeviceUnmatchedBlockDim: ::core::ffi::c_int,
|
||||||
|
/**< HIP device supports cooperative launch on
|
||||||
|
< multiple devices with unmatched shared memories*/
|
||||||
|
pub cooperativeMultiDeviceUnmatchedSharedMem: ::core::ffi::c_int,
|
||||||
|
///< 1: if it is a large PCI bar device, else 0
|
||||||
|
pub isLargeBar: ::core::ffi::c_int,
|
||||||
|
///< Revision of the GPU in this device
|
||||||
|
pub asicRevision: ::core::ffi::c_int,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(non_snake_case, non_camel_case_types)]
|
||||||
|
#[repr(C)]
|
||||||
|
#[repr(align(4))]
|
||||||
|
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
|
||||||
|
pub struct hipDeviceArch_t {
|
||||||
|
pub _bitfield_align_1: [u8; 0],
|
||||||
|
pub _bitfield_1: __BindgenBitfieldUnit<[u8; 3usize]>,
|
||||||
|
pub __bindgen_padding_0: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||||
|
pub struct __BindgenBitfieldUnit<Storage> {
|
||||||
|
storage: Storage,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(non_camel_case_types)]
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
|
||||||
|
pub struct hipUUID_t {
|
||||||
|
pub bytes: [::core::ffi::c_char; 16usize],
|
||||||
|
}
|
||||||
|
#[allow(non_camel_case_types)]
|
||||||
|
pub type hipUUID = hipUUID_t;
|
||||||
|
}
|
||||||
|
|
|
@ -77,13 +77,13 @@ bitflags! {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FatbincWrapper {
|
impl FatbincWrapper {
|
||||||
pub const MAGIC: [u8; 4] = [0x46, 0x62, 0x43, 0xB1];
|
pub const MAGIC: [u8; 4] = 0x466243B1u32.to_le_bytes();
|
||||||
pub const VERSION_V1: c_uint = 0x1;
|
pub const VERSION_V1: c_uint = 0x1;
|
||||||
pub const VERSION_V2: c_uint = 0x2;
|
pub const VERSION_V2: c_uint = 0x2;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FatbinHeader {
|
impl FatbinHeader {
|
||||||
pub const MAGIC: [u8; 4] = [0xBA, 0x55, 0xED, 0x50];
|
pub const MAGIC: [u8; 4] = 0xBA55ED50u32.to_le_bytes();
|
||||||
pub const VERSION: c_ushort = 0x01;
|
pub const VERSION: c_ushort = 0x01;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -125,6 +125,10 @@ pub enum FatbinIter<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> FatbinIter<'a> {
|
impl<'a> FatbinIter<'a> {
|
||||||
|
pub fn multi_module(&self) -> bool {
|
||||||
|
matches!(self, FatbinIter::V2(_))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn next(&mut self) -> Option<Result<FatbinSubmodule<'a>, ParseError>> {
|
pub fn next(&mut self) -> Option<Result<FatbinSubmodule<'a>, ParseError>> {
|
||||||
match self {
|
match self {
|
||||||
FatbinIter::V1(opt) => Ok(opt.take()).transpose(),
|
FatbinIter::V1(opt) => Ok(opt.take()).transpose(),
|
||||||
|
|
|
@ -510,30 +510,38 @@ impl<'a> CodeLibaryRef<'a> {
|
||||||
let module_iter = fatbin.get_submodules();
|
let module_iter = fatbin.get_submodules();
|
||||||
match module_iter {
|
match module_iter {
|
||||||
Ok(mut iter) => {
|
Ok(mut iter) => {
|
||||||
let mut module_index = 0;
|
let mut module_index = if iter.multi_module() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(0usize)
|
||||||
|
};
|
||||||
while let Some(maybe_submodule) = iter.next() {
|
while let Some(maybe_submodule) = iter.next() {
|
||||||
match maybe_submodule {
|
match maybe_submodule {
|
||||||
Ok(submodule) => iterate_modules_fatbin_header(
|
Ok(submodule) => iterate_modules_fatbin_header(
|
||||||
&mut |subindex, module| {
|
|subindex, module| {
|
||||||
let (subindex, _) = subindex.unwrap();
|
let index = match module_index {
|
||||||
fn_(Some((module_index, Some(subindex))), module)
|
Some(index) => (index, Some(subindex)),
|
||||||
|
None => (subindex, None),
|
||||||
|
};
|
||||||
|
fn_(Some(index), module)
|
||||||
},
|
},
|
||||||
&submodule,
|
&submodule,
|
||||||
),
|
),
|
||||||
Err(err) => fn_(
|
Err(err) => fn_(
|
||||||
Some((module_index, None)),
|
module_index.map(|module_index| (module_index, None)),
|
||||||
Err(FatbinError::ParseFailure(err)),
|
Err(FatbinError::ParseFailure(err)),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
module_index += 1;
|
module_index = module_index.map(|index| index + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(err) => fn_(None, Err(err)),
|
Err(err) => fn_(None, Err(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
CodeLibaryRef::FatbinHeader(submodule) => {
|
CodeLibaryRef::FatbinHeader(submodule) => iterate_modules_fatbin_header(
|
||||||
iterate_modules_fatbin_header(&mut fn_, submodule);
|
|index, module| fn_(Some((index, None)), module),
|
||||||
}
|
submodule,
|
||||||
|
),
|
||||||
CodeLibaryRef::Text(text) => fn_(None, Ok(CodeModule::Text(*text))),
|
CodeLibaryRef::Text(text) => fn_(None, Ok(CodeModule::Text(*text))),
|
||||||
CodeLibaryRef::Elf(elf) => fn_(None, Ok(CodeModule::Elf(*elf))),
|
CodeLibaryRef::Elf(elf) => fn_(None, Ok(CodeModule::Elf(*elf))),
|
||||||
CodeLibaryRef::Archive(ar) => fn_(None, Ok(CodeModule::Archive(*ar))),
|
CodeLibaryRef::Archive(ar) => fn_(None, Ok(CodeModule::Archive(*ar))),
|
||||||
|
@ -542,14 +550,14 @@ impl<'a> CodeLibaryRef<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe fn iterate_modules_fatbin_header(
|
unsafe fn iterate_modules_fatbin_header(
|
||||||
fn_: &mut impl FnMut(Option<(usize, Option<usize>)>, Result<CodeModule, FatbinError>),
|
mut fn_: impl FnMut(usize, Result<CodeModule, FatbinError>),
|
||||||
submodule: &FatbinSubmodule<'_>,
|
submodule: &FatbinSubmodule<'_>,
|
||||||
) {
|
) {
|
||||||
let mut iter = submodule.get_files();
|
let mut iter = submodule.get_files();
|
||||||
let mut index = 0;
|
let mut index = 0;
|
||||||
while let Some(file) = iter.next() {
|
while let Some(file) = iter.next() {
|
||||||
fn_(
|
fn_(
|
||||||
Some((index, None)),
|
index,
|
||||||
file.map(CodeModule::File)
|
file.map(CodeModule::File)
|
||||||
.map_err(FatbinError::ParseFailure),
|
.map_err(FatbinError::ParseFailure),
|
||||||
);
|
);
|
||||||
|
|
|
@ -366,12 +366,15 @@ impl DumpWriter {
|
||||||
format!("module_{:04}.{:02}", module_index, kind)
|
format!("module_{:04}.{:02}", module_index, kind)
|
||||||
}
|
}
|
||||||
Some((sub_index, None)) => {
|
Some((sub_index, None)) => {
|
||||||
format!("module_{:04}_{:02}.{}", module_index, sub_index, kind)
|
format!("module_{:04}_{:02}.{}", module_index, sub_index + 1, kind)
|
||||||
}
|
}
|
||||||
Some((sub_index, Some(subsub_index))) => {
|
Some((sub_index, Some(subsub_index))) => {
|
||||||
format!(
|
format!(
|
||||||
"module_{:04}_{:02}_{:02}.{}",
|
"module_{:04}_{:02}_{:02}.{}",
|
||||||
module_index, sub_index, subsub_index, kind
|
module_index,
|
||||||
|
sub_index + 1,
|
||||||
|
subsub_index + 1,
|
||||||
|
kind
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue