Minor fixes and make zoc lazy load HIP runtime

This commit is contained in:
Andrzej Janik 2025-09-06 00:51:43 +00:00
commit 6b90b5acba
8 changed files with 367 additions and 41 deletions

5
Cargo.lock generated
View file

@ -357,10 +357,9 @@ dependencies = [
name = "compiler"
version = "0.0.0"
dependencies = [
"amd_comgr-sys",
"bpaf",
"comgr",
"hip_runtime-sys",
"libloading",
"ptx",
"ptx_parser",
"thiserror 2.0.12",
@ -3791,6 +3790,7 @@ name = "zluda_common"
version = "0.1.0"
dependencies = [
"cuda_types",
"dark_api",
"hip_runtime-sys",
"rocblas-sys",
]
@ -3883,6 +3883,7 @@ dependencies = [
"unwrap_or",
"wchar",
"winapi",
"zluda_common",
"zluda_trace_common",
"zstd-safe",
]

View file

@ -10,12 +10,11 @@ name = "zoc"
path = "src/main.rs"
[dependencies]
amd_comgr-sys = { path = "../ext/amd_comgr-sys" }
bpaf = { version = "0.9.19", features = ["derive"] }
comgr = { path = "../comgr" }
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
ptx = { path = "../ptx" }
ptx_parser = { path = "../ptx_parser" }
libloading = "0.8"
thiserror = "2.0.12"
[package.metadata.zluda]

View file

@ -1,15 +1,15 @@
use ptx::TranslateError;
use ptx_parser::PtxError;
use std::ffi::FromBytesUntilNulError;
use std::io;
use std::str::Utf8Error;
use hip_runtime_sys::hipErrorCode_t;
use ptx::TranslateError;
use ptx_parser::PtxError;
#[derive(Debug, thiserror::Error)]
pub enum CompilerError {
#[error("HIP error code: {0:?}")]
HipError(hipErrorCode_t),
HipError(u32),
#[error(transparent)]
Libloading(#[from] libloading::Error),
#[error(transparent)]
ComgrError(#[from] comgr::Error),
#[error(transparent)]
@ -26,12 +26,6 @@ pub enum CompilerError {
},
}
impl From<hipErrorCode_t> for CompilerError {
fn from(error_code: hipErrorCode_t) -> Self {
CompilerError::HipError(error_code)
}
}
impl From<Vec<PtxError<'_>>> for CompilerError {
fn from(causes: Vec<PtxError>) -> Self {
let errors: Vec<String> = causes

View file

@ -1,3 +1,5 @@
use bpaf::Bpaf;
use error::CompilerError;
use std::ffi::CStr;
use std::fs::{self, File};
use std::io::{self, Write};
@ -6,11 +8,7 @@ use std::process::ExitCode;
use std::str;
use std::{env, mem};
use bpaf::Bpaf;
mod error;
use error::CompilerError;
use hip_runtime_sys::{hipDeviceProp_tR0600, hipGetDevicePropertiesR0600, hipInit};
const DEFAULT_ARCH: &'static str = "gfx1100";
@ -60,12 +58,17 @@ fn main_core() -> Result<(), CompilerError> {
let arch: String = match opts.arch {
Some(s) => s,
None => {
unsafe { hipInit(0) }?;
let mut dev_props: hipDeviceProp_tR0600 = unsafe { mem::zeroed() };
unsafe { hipGetDevicePropertiesR0600(&mut dev_props, 0) }?;
(|| {
let runtime = hip::Runtime::load()?;
runtime.init()?;
get_gpu_arch(&runtime)
})()
.unwrap_or_else(|_| DEFAULT_ARCH.to_owned())
/*
get_gpu_arch(&mut dev_props)
.map(String::from)
.unwrap_or(DEFAULT_ARCH.to_owned())
*/
}
};
@ -122,12 +125,13 @@ struct LLVMArtifacts {
llvm_ir: Vec<u8>,
}
fn get_gpu_arch<'a>(dev_props: &'a mut hipDeviceProp_tR0600) -> Result<&'a str, CompilerError> {
unsafe { hipGetDevicePropertiesR0600(dev_props, 0) }?;
fn get_gpu_arch(runtime: &hip::Runtime) -> Result<String, CompilerError> {
let mut dev_props = unsafe { mem::zeroed() };
runtime.device_get_properties(&mut dev_props, 0)?;
let gcn_arch_name = &dev_props.gcnArchName;
let gcn_arch_name = unsafe { CStr::from_ptr(gcn_arch_name.as_ptr()) };
let gcn_arch_name = gcn_arch_name.to_str();
gcn_arch_name.map_err(CompilerError::from)
let gcn_arch_name = gcn_arch_name.to_str()?;
Ok(gcn_arch_name.to_string())
}
fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
@ -137,3 +141,316 @@ fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
println!("Wrote to {}", path.to_str().unwrap());
Ok(())
}
mod hip {
use crate::error::CompilerError;
/// Dynamically loaded HIP runtime library.
///
/// The library is opened at run time instead of being linked at build time,
/// because this program should keep working on systems with no HIP driver
/// installed.
pub struct Runtime(libloading::Library);

impl Runtime {
    /// Translates a raw HIP status code into a `Result`.
    ///
    /// `0` becomes `Ok(())`; every other value is returned as
    /// `CompilerError::HipError` carrying the code unchanged.
    fn hip_check(err: u32) -> Result<(), CompilerError> {
        if err == 0 {
            return Ok(());
        }
        Err(CompilerError::HipError(err))
    }

    /// Opens the HIP shared library for the current platform.
    ///
    /// # Errors
    /// Propagates the `libloading` error when the library cannot be opened.
    pub fn load() -> Result<Self, CompilerError> {
        // Both names carry an explicit trailing NUL byte.
        #[cfg(windows)]
        let lib_name = "amdhip64_6.dll\0";
        #[cfg(unix)]
        let lib_name = "libamdhip64.so.6\0";
        // SAFETY: NOTE(review): opening a library runs its initialization
        // routines; this assumes the HIP runtime's are sound to run here.
        let handle = unsafe { libloading::Library::new(lib_name) }?;
        Ok(Runtime(handle))
    }

    /// Looks up the `hipInit` symbol and calls it with argument `0`.
    ///
    /// # Errors
    /// Fails when the symbol cannot be resolved or when the call returns a
    /// non-zero status.
    pub fn init(&self) -> Result<(), CompilerError> {
        type HipInit = unsafe extern "C" fn(u32) -> u32;
        // SAFETY: NOTE(review): assumes the exported `hipInit` really has the
        // `HipInit` signature -- confirm against the HIP headers.
        let status = unsafe {
            let entry_point: libloading::Symbol<HipInit> = self.0.get(b"hipInit\0")?;
            entry_point(0)
        };
        Self::hip_check(status)
    }

    /// Looks up `hipGetDevicePropertiesR0600` and calls it with `prop` and
    /// `device`.
    ///
    /// # Errors
    /// Fails when the symbol cannot be resolved or when the call returns a
    /// non-zero status.
    pub fn device_get_properties(
        &self,
        prop: &mut hipDeviceProp_tR0600,
        device: i32,
    ) -> Result<(), CompilerError> {
        type HipGetDeviceProperties =
            unsafe extern "C" fn(*mut hipDeviceProp_tR0600, i32) -> u32;
        // SAFETY: `prop` is a valid, exclusive reference for the whole call.
        // NOTE(review): assumes the exported symbol has the signature above
        // and that `hipDeviceProp_tR0600` matches the library's layout.
        let status = unsafe {
            let query: libloading::Symbol<HipGetDeviceProperties> =
                self.0.get(b"hipGetDevicePropertiesR0600\0")?;
            query(prop, device)
        };
        Self::hip_check(status)
    }
}
/// Device properties record that `Runtime::device_get_properties` hands to the
/// `hipGetDevicePropertiesR0600` symbol to be filled in.
///
/// The struct is `#[repr(C)]`, so field order and types determine its layout
/// and must not be rearranged.
/// NOTE(review): the layout presumably mirrors the HIP 6 `hipDeviceProp_t`
/// C struct -- confirm against the headers of the library being loaded.
#[allow(non_snake_case, non_camel_case_types)]
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct hipDeviceProp_tR0600 {
///< Device name.
pub name: [::core::ffi::c_char; 256usize],
///< UUID of a device
pub uuid: hipUUID,
///< 8-byte unique identifier. Only valid on windows
pub luid: [::core::ffi::c_char; 8usize],
///< LUID node mask
pub luidDeviceNodeMask: ::core::ffi::c_uint,
///< Size of global memory region (in bytes).
pub totalGlobalMem: usize,
///< Size of shared memory per block (in bytes).
pub sharedMemPerBlock: usize,
///< Registers per block.
pub regsPerBlock: ::core::ffi::c_int,
///< Warp size.
pub warpSize: ::core::ffi::c_int,
/**< Maximum pitch in bytes allowed by memory copies
< pitched memory*/
pub memPitch: usize,
///< Max work items per work group or workgroup max size.
pub maxThreadsPerBlock: ::core::ffi::c_int,
///< Max number of threads in each dimension (XYZ) of a block.
pub maxThreadsDim: [::core::ffi::c_int; 3usize],
///< Max grid dimensions (XYZ).
pub maxGridSize: [::core::ffi::c_int; 3usize],
///< Max clock frequency of the multiProcessors in khz.
pub clockRate: ::core::ffi::c_int,
/**< Size of shared constant memory region on the device
< (in bytes).*/
pub totalConstMem: usize,
/**< Major compute capability. On HCC, this is an approximation and features may
< differ from CUDA CC. See the arch feature flags for portable ways to query
< feature caps.*/
pub major: ::core::ffi::c_int,
/**< Minor compute capability. On HCC, this is an approximation and features may
< differ from CUDA CC. See the arch feature flags for portable ways to query
< feature caps.*/
pub minor: ::core::ffi::c_int,
///< Alignment requirement for textures
pub textureAlignment: usize,
///< Pitch alignment requirement for texture references bound to
pub texturePitchAlignment: usize,
///< Deprecated. Use asyncEngineCount instead
pub deviceOverlap: ::core::ffi::c_int,
///< Number of multi-processors (compute units).
pub multiProcessorCount: ::core::ffi::c_int,
///< Run time limit for kernels executed on the device
pub kernelExecTimeoutEnabled: ::core::ffi::c_int,
///< APU vs dGPU
pub integrated: ::core::ffi::c_int,
///< Check whether HIP can map host memory
pub canMapHostMemory: ::core::ffi::c_int,
///< Compute mode.
pub computeMode: ::core::ffi::c_int,
///< Maximum number of elements in 1D images
pub maxTexture1D: ::core::ffi::c_int,
///< Maximum 1D mipmap texture size
pub maxTexture1DMipmap: ::core::ffi::c_int,
///< Maximum size for 1D textures bound to linear memory
pub maxTexture1DLinear: ::core::ffi::c_int,
///< Maximum dimensions (width, height) of 2D images, in image elements
pub maxTexture2D: [::core::ffi::c_int; 2usize],
///< Maximum number of elements in 2D array mipmap of images
pub maxTexture2DMipmap: [::core::ffi::c_int; 2usize],
///< Maximum 2D tex dimensions if tex are bound to pitched memory
pub maxTexture2DLinear: [::core::ffi::c_int; 3usize],
///< Maximum 2D tex dimensions if gather has to be performed
pub maxTexture2DGather: [::core::ffi::c_int; 2usize],
/**< Maximum dimensions (width, height, depth) of 3D images, in image
< elements*/
pub maxTexture3D: [::core::ffi::c_int; 3usize],
///< Maximum alternate 3D texture dims
pub maxTexture3DAlt: [::core::ffi::c_int; 3usize],
///< Maximum cubemap texture dims
pub maxTextureCubemap: ::core::ffi::c_int,
///< Maximum number of elements in 1D array images
pub maxTexture1DLayered: [::core::ffi::c_int; 2usize],
///< Maximum number of elements in 2D array images
pub maxTexture2DLayered: [::core::ffi::c_int; 3usize],
///< Maximum cubemaps layered texture dims
pub maxTextureCubemapLayered: [::core::ffi::c_int; 2usize],
///< Maximum 1D surface size
pub maxSurface1D: ::core::ffi::c_int,
///< Maximum 2D surface size
pub maxSurface2D: [::core::ffi::c_int; 2usize],
///< Maximum 3D surface size
pub maxSurface3D: [::core::ffi::c_int; 3usize],
///< Maximum 1D layered surface size
pub maxSurface1DLayered: [::core::ffi::c_int; 2usize],
///< Maximum 2D layered surface size
pub maxSurface2DLayered: [::core::ffi::c_int; 3usize],
///< Maximum cubemap surface size
pub maxSurfaceCubemap: ::core::ffi::c_int,
///< Maximum cubemap layered surface size
pub maxSurfaceCubemapLayered: [::core::ffi::c_int; 2usize],
///< Alignment requirement for surface
pub surfaceAlignment: usize,
///< Device can possibly execute multiple kernels concurrently.
pub concurrentKernels: ::core::ffi::c_int,
///< Device has ECC support enabled
pub ECCEnabled: ::core::ffi::c_int,
///< PCI Bus ID.
pub pciBusID: ::core::ffi::c_int,
///< PCI Device ID.
pub pciDeviceID: ::core::ffi::c_int,
///< PCI Domain ID
pub pciDomainID: ::core::ffi::c_int,
///< 1:If device is Tesla device using TCC driver, else 0
pub tccDriver: ::core::ffi::c_int,
///< Number of async engines
pub asyncEngineCount: ::core::ffi::c_int,
///< Does device and host share unified address space
pub unifiedAddressing: ::core::ffi::c_int,
///< Max global memory clock frequency in khz.
pub memoryClockRate: ::core::ffi::c_int,
///< Global memory bus width in bits.
pub memoryBusWidth: ::core::ffi::c_int,
///< L2 cache size.
pub l2CacheSize: ::core::ffi::c_int,
///< Device's max L2 persisting lines in bytes
pub persistingL2CacheMaxSize: ::core::ffi::c_int,
///< Maximum resident threads per multi-processor.
pub maxThreadsPerMultiProcessor: ::core::ffi::c_int,
///< Device supports stream priority
pub streamPrioritiesSupported: ::core::ffi::c_int,
///< Indicates globals are cached in L1
pub globalL1CacheSupported: ::core::ffi::c_int,
///< Locals are cached in L1
pub localL1CacheSupported: ::core::ffi::c_int,
///< Amount of shared memory available per multiprocessor.
pub sharedMemPerMultiprocessor: usize,
///< registers available per multiprocessor
pub regsPerMultiprocessor: ::core::ffi::c_int,
///< Device supports allocating managed memory on this system
pub managedMemory: ::core::ffi::c_int,
///< 1 if device is on a multi-GPU board, 0 if not.
pub isMultiGpuBoard: ::core::ffi::c_int,
///< Unique identifier for a group of devices on same multiboard GPU
pub multiGpuBoardGroupID: ::core::ffi::c_int,
///< Link between host and device supports native atomics
pub hostNativeAtomicSupported: ::core::ffi::c_int,
///< Deprecated. CUDA only.
pub singleToDoublePrecisionPerfRatio: ::core::ffi::c_int,
/**< Device supports coherently accessing pageable memory
< without calling hipHostRegister on it*/
pub pageableMemoryAccess: ::core::ffi::c_int,
/**< Device can coherently access managed memory concurrently with
< the CPU*/
pub concurrentManagedAccess: ::core::ffi::c_int,
///< Is compute preemption supported on the device
pub computePreemptionSupported: ::core::ffi::c_int,
/**< Device can access host registered memory with same
< address as the host*/
pub canUseHostPointerForRegisteredMem: ::core::ffi::c_int,
///< HIP device supports cooperative launch
pub cooperativeLaunch: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on multiple
< devices*/
pub cooperativeMultiDeviceLaunch: ::core::ffi::c_int,
///< Per device max shared mem per block usable by special opt in
pub sharedMemPerBlockOptin: usize,
/**< Device accesses pageable memory via the host's
< page tables*/
pub pageableMemoryAccessUsesHostPageTables: ::core::ffi::c_int,
/**< Host can directly access managed memory on the device
< without migration*/
pub directManagedMemAccessFromHost: ::core::ffi::c_int,
///< Max number of blocks on CU
pub maxBlocksPerMultiProcessor: ::core::ffi::c_int,
///< Max value of access policy window
pub accessPolicyMaxWindowSize: ::core::ffi::c_int,
///< Shared memory reserved by driver per block
pub reservedSharedMemPerBlock: usize,
///< Device supports hipHostRegister
pub hostRegisterSupported: ::core::ffi::c_int,
///< Indicates if device supports sparse hip arrays
pub sparseHipArraySupported: ::core::ffi::c_int,
/**< Device supports using the hipHostRegisterReadOnly flag
< with hipHostRegister*/
pub hostRegisterReadOnlySupported: ::core::ffi::c_int,
///< Indicates external timeline semaphore support
pub timelineSemaphoreInteropSupported: ::core::ffi::c_int,
///< Indicates if device supports hipMallocAsync and hipMemPool APIs
pub memoryPoolsSupported: ::core::ffi::c_int,
///< Indicates device support of RDMA APIs
pub gpuDirectRDMASupported: ::core::ffi::c_int,
/**< Bitmask to be interpreted according to
< hipFlushGPUDirectRDMAWritesOptions*/
pub gpuDirectRDMAFlushWritesOptions: ::core::ffi::c_uint,
///< value of hipGPUDirectRDMAWritesOrdering
pub gpuDirectRDMAWritesOrdering: ::core::ffi::c_int,
///< Bitmask of handle types support with mempool based IPC
pub memoryPoolSupportedHandleTypes: ::core::ffi::c_uint,
/**< Device supports deferred mapping HIP arrays and HIP
< mipmapped arrays*/
pub deferredMappingHipArraySupported: ::core::ffi::c_int,
///< Device supports IPC events
pub ipcEventSupported: ::core::ffi::c_int,
///< Device supports cluster launch
pub clusterLaunch: ::core::ffi::c_int,
///< Indicates device supports unified function pointers
pub unifiedFunctionPointers: ::core::ffi::c_int,
///< CUDA Reserved.
pub reserved: [::core::ffi::c_int; 63usize],
///< Reserved for adding new entries for HIP/CUDA.
pub hipReserved: [::core::ffi::c_int; 32usize],
///< AMD GCN Arch Name. HIP Only.
pub gcnArchName: [::core::ffi::c_char; 256usize],
///< Maximum Shared Memory Per CU. HIP Only.
pub maxSharedMemoryPerMultiProcessor: usize,
/**< Frequency in khz of the timer used by the device-side "clock*"
< instructions. New for HIP.*/
pub clockInstructionRate: ::core::ffi::c_int,
///< Architectural feature flags. New for HIP.
pub arch: hipDeviceArch_t,
///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
pub hdpMemFlushCntl: *mut ::core::ffi::c_uint,
///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
pub hdpRegFlushCntl: *mut ::core::ffi::c_uint,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedFunc: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedGridDim: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedBlockDim: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedSharedMem: ::core::ffi::c_int,
///< 1: if it is a large PCI bar device, else 0
pub isLargeBar: ::core::ffi::c_int,
///< Revision of the GPU in this device
pub asicRevision: ::core::ffi::c_int,
}
/// Type of the `arch` field of `hipDeviceProp_tR0600`.
///
/// Four bytes in total with 4-byte alignment: three bytes of opaque bitfield
/// storage followed by one padding byte. This module defines no accessors for
/// the individual bits.
/// NOTE(review): the field names suggest bindgen-generated code -- confirm
/// the bit layout against the HIP headers before reading individual flags.
#[allow(non_snake_case, non_camel_case_types)]
#[repr(C)]
#[repr(align(4))]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct hipDeviceArch_t {
// Zero-length array: occupies no bytes.
pub _bitfield_align_1: [u8; 0],
// Raw storage for the packed flags (3 bytes).
pub _bitfield_1: __BindgenBitfieldUnit<[u8; 3usize]>,
// Explicit byte that brings the struct size to 4.
pub __bindgen_padding_0: u8,
}
/// Thin `#[repr(C)]` wrapper around a raw `Storage` value, used as the backing
/// store of the bitfield in `hipDeviceArch_t`.
///
/// The single field is private and this module defines no methods on the
/// type, so the stored bits are only reachable through the derived traits.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct __BindgenBitfieldUnit<Storage> {
// Backing bytes of the bitfield.
storage: Storage,
}
/// 16-byte identifier stored as C `char`s; used for the `uuid` field of
/// `hipDeviceProp_tR0600`.
#[allow(non_camel_case_types)]
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct hipUUID_t {
// Raw identifier bytes.
pub bytes: [::core::ffi::c_char; 16usize],
}
/// Alias for `hipUUID_t`; this is the name `hipDeviceProp_tR0600` uses.
#[allow(non_camel_case_types)]
pub type hipUUID = hipUUID_t;
}

View file

@ -77,13 +77,13 @@ bitflags! {
}
impl FatbincWrapper {
pub const MAGIC: [u8; 4] = [0x46, 0x62, 0x43, 0xB1];
pub const MAGIC: [u8; 4] = 0x466243B1u32.to_le_bytes();
pub const VERSION_V1: c_uint = 0x1;
pub const VERSION_V2: c_uint = 0x2;
}
impl FatbinHeader {
pub const MAGIC: [u8; 4] = [0xBA, 0x55, 0xED, 0x50];
pub const MAGIC: [u8; 4] = 0xBA55ED50u32.to_le_bytes();
pub const VERSION: c_ushort = 0x01;
}

View file

@ -125,6 +125,10 @@ pub enum FatbinIter<'a> {
}
impl<'a> FatbinIter<'a> {
/// Returns `true` when this iterator is the `FatbinIter::V2` variant and
/// `false` for every other variant.
pub fn multi_module(&self) -> bool {
matches!(self, FatbinIter::V2(_))
}
pub fn next(&mut self) -> Option<Result<FatbinSubmodule<'a>, ParseError>> {
match self {
FatbinIter::V1(opt) => Ok(opt.take()).transpose(),

View file

@ -510,30 +510,38 @@ impl<'a> CodeLibaryRef<'a> {
let module_iter = fatbin.get_submodules();
match module_iter {
Ok(mut iter) => {
let mut module_index = 0;
let mut module_index = if iter.multi_module() {
None
} else {
Some(0usize)
};
while let Some(maybe_submodule) = iter.next() {
match maybe_submodule {
Ok(submodule) => iterate_modules_fatbin_header(
&mut |subindex, module| {
let (subindex, _) = subindex.unwrap();
fn_(Some((module_index, Some(subindex))), module)
|subindex, module| {
let index = match module_index {
Some(index) => (index, Some(subindex)),
None => (subindex, None),
};
fn_(Some(index), module)
},
&submodule,
),
Err(err) => fn_(
Some((module_index, None)),
module_index.map(|module_index| (module_index, None)),
Err(FatbinError::ParseFailure(err)),
),
}
module_index += 1;
module_index = module_index.map(|index| index + 1);
}
}
Err(err) => fn_(None, Err(err)),
}
}
CodeLibaryRef::FatbinHeader(submodule) => {
iterate_modules_fatbin_header(&mut fn_, submodule);
}
CodeLibaryRef::FatbinHeader(submodule) => iterate_modules_fatbin_header(
|index, module| fn_(Some((index, None)), module),
submodule,
),
CodeLibaryRef::Text(text) => fn_(None, Ok(CodeModule::Text(*text))),
CodeLibaryRef::Elf(elf) => fn_(None, Ok(CodeModule::Elf(*elf))),
CodeLibaryRef::Archive(ar) => fn_(None, Ok(CodeModule::Archive(*ar))),
@ -542,14 +550,14 @@ impl<'a> CodeLibaryRef<'a> {
}
unsafe fn iterate_modules_fatbin_header(
fn_: &mut impl FnMut(Option<(usize, Option<usize>)>, Result<CodeModule, FatbinError>),
mut fn_: impl FnMut(usize, Result<CodeModule, FatbinError>),
submodule: &FatbinSubmodule<'_>,
) {
let mut iter = submodule.get_files();
let mut index = 0;
while let Some(file) = iter.next() {
fn_(
Some((index, None)),
index,
file.map(CodeModule::File)
.map_err(FatbinError::ParseFailure),
);

View file

@ -366,12 +366,15 @@ impl DumpWriter {
format!("module_{:04}.{:02}", module_index, kind)
}
Some((sub_index, None)) => {
format!("module_{:04}_{:02}.{}", module_index, sub_index, kind)
format!("module_{:04}_{:02}.{}", module_index, sub_index + 1, kind)
}
Some((sub_index, Some(subsub_index))) => {
format!(
"module_{:04}_{:02}_{:02}.{}",
module_index, sub_index, subsub_index, kind
module_index,
sub_index + 1,
subsub_index + 1,
kind
)
}
}