diff --git a/Cargo.lock b/Cargo.lock index 4d29361..a6ded6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -357,10 +357,9 @@ dependencies = [ name = "compiler" version = "0.0.0" dependencies = [ - "amd_comgr-sys", "bpaf", "comgr", - "hip_runtime-sys", + "libloading", "ptx", "ptx_parser", "thiserror 2.0.12", @@ -3791,6 +3790,7 @@ name = "zluda_common" version = "0.1.0" dependencies = [ "cuda_types", + "dark_api", "hip_runtime-sys", "rocblas-sys", ] @@ -3883,6 +3883,7 @@ dependencies = [ "unwrap_or", "wchar", "winapi", + "zluda_common", "zluda_trace_common", "zstd-safe", ] diff --git a/compiler/Cargo.toml b/compiler/Cargo.toml index 16dca14..7b4c4df 100644 --- a/compiler/Cargo.toml +++ b/compiler/Cargo.toml @@ -10,12 +10,11 @@ name = "zoc" path = "src/main.rs" [dependencies] -amd_comgr-sys = { path = "../ext/amd_comgr-sys" } bpaf = { version = "0.9.19", features = ["derive"] } comgr = { path = "../comgr" } -hip_runtime-sys = { path = "../ext/hip_runtime-sys" } ptx = { path = "../ptx" } ptx_parser = { path = "../ptx_parser" } +libloading = "0.8" thiserror = "2.0.12" [package.metadata.zluda] diff --git a/compiler/src/error.rs b/compiler/src/error.rs index f5bfe11..9da1b7e 100644 --- a/compiler/src/error.rs +++ b/compiler/src/error.rs @@ -1,15 +1,15 @@ +use ptx::TranslateError; +use ptx_parser::PtxError; use std::ffi::FromBytesUntilNulError; use std::io; use std::str::Utf8Error; -use hip_runtime_sys::hipErrorCode_t; -use ptx::TranslateError; -use ptx_parser::PtxError; - #[derive(Debug, thiserror::Error)] pub enum CompilerError { #[error("HIP error code: {0:?}")] - HipError(hipErrorCode_t), + HipError(u32), + #[error(transparent)] + Libloading(#[from] libloading::Error), #[error(transparent)] ComgrError(#[from] comgr::Error), #[error(transparent)] @@ -26,12 +26,6 @@ pub enum CompilerError { }, } -impl From for CompilerError { - fn from(error_code: hipErrorCode_t) -> Self { - CompilerError::HipError(error_code) - } -} - impl From>> for CompilerError { fn from(causes: Vec) -> 
Self { let errors: Vec<String> = causes diff --git a/compiler/src/main.rs b/compiler/src/main.rs index fb8feb0..5effaaf 100644 --- a/compiler/src/main.rs +++ b/compiler/src/main.rs @@ -1,3 +1,5 @@ +use bpaf::Bpaf; +use error::CompilerError; use std::ffi::CStr; use std::fs::{self, File}; use std::io::{self, Write}; @@ -6,11 +8,7 @@ use std::process::ExitCode; use std::str; use std::{env, mem}; -use bpaf::Bpaf; - mod error; -use error::CompilerError; -use hip_runtime_sys::{hipDeviceProp_tR0600, hipGetDevicePropertiesR0600, hipInit}; const DEFAULT_ARCH: &'static str = "gfx1100"; @@ -60,12 +58,17 @@ fn main_core() -> Result<(), CompilerError> { let arch: String = match opts.arch { Some(s) => s, None => { - unsafe { hipInit(0) }?; - let mut dev_props: hipDeviceProp_tR0600 = unsafe { mem::zeroed() }; - unsafe { hipGetDevicePropertiesR0600(&mut dev_props, 0) }?; + (|| { + let runtime = hip::Runtime::load()?; + runtime.init()?; + get_gpu_arch(&runtime) + })() + .unwrap_or_else(|_| DEFAULT_ARCH.to_owned()) + /* get_gpu_arch(&mut dev_props) .map(String::from) .unwrap_or(DEFAULT_ARCH.to_owned()) + */ } }; @@ -122,12 +125,13 @@ struct LLVMArtifacts { llvm_ir: Vec<u8>, } -fn get_gpu_arch<'a>(dev_props: &'a mut hipDeviceProp_tR0600) -> Result<&'a str, CompilerError> { - unsafe { hipGetDevicePropertiesR0600(dev_props, 0) }?; +fn get_gpu_arch(runtime: &hip::Runtime) -> Result<String, CompilerError> { + let mut dev_props = unsafe { mem::zeroed() }; + runtime.device_get_properties(&mut dev_props, 0)?; let gcn_arch_name = &dev_props.gcnArchName; let gcn_arch_name = unsafe { CStr::from_ptr(gcn_arch_name.as_ptr()) }; - let gcn_arch_name = gcn_arch_name.to_str(); - gcn_arch_name.map_err(CompilerError::from) + let gcn_arch_name = gcn_arch_name.to_str()?; + Ok(gcn_arch_name.to_string()) } fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> { @@ -137,3 +141,316 @@ fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> { println!("Wrote to {}", path.to_str().unwrap()); Ok(()) } + +mod hip { + use 
crate::error::CompilerError; + + // We lazy load HIP runtime because we want to work on systems with no + // HIP driver installed + pub struct Runtime(libloading::Library); + + impl Runtime { + fn hip_check(err: u32) -> Result<(), CompilerError> { + match err { + 0 => Ok(()), + err_code => Err(CompilerError::HipError(err_code)), + } + } + + pub fn load() -> Result<Self, CompilerError> { + #[cfg(windows)] + let lib_name = "amdhip64_6.dll\0"; + #[cfg(unix)] + let lib_name = "libamdhip64.so.6\0"; + let library = unsafe { libloading::Library::new(lib_name)? }; + Ok(Self(library)) + } + + pub fn init(&self) -> Result<(), CompilerError> { + unsafe { + let hip_init: libloading::Symbol<unsafe extern "C" fn(u32) -> u32> = + self.0.get(b"hipInit\0")?; + Self::hip_check(hip_init(0)) + } + } + + pub fn device_get_properties( + &self, + prop: &mut hipDeviceProp_tR0600, + device: i32, + ) -> Result<(), CompilerError> { + unsafe { + let hip_get_device_properties: libloading::Symbol< + unsafe extern "C" fn(*mut hipDeviceProp_tR0600, i32) -> u32, + > = self.0.get(b"hipGetDevicePropertiesR0600\0")?; + Self::hip_check(hip_get_device_properties(prop, device)) + } + } + } + + #[allow(non_snake_case, non_camel_case_types)] + #[repr(C)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub struct hipDeviceProp_tR0600 { + ///< Device name. + pub name: [::core::ffi::c_char; 256usize], + ///< UUID of a device + pub uuid: hipUUID, + ///< 8-byte unique identifier. Only valid on windows + pub luid: [::core::ffi::c_char; 8usize], + ///< LUID node mask + pub luidDeviceNodeMask: ::core::ffi::c_uint, + ///< Size of global memory region (in bytes). + pub totalGlobalMem: usize, + ///< Size of shared memory per block (in bytes). + pub sharedMemPerBlock: usize, + ///< Registers per block. + pub regsPerBlock: ::core::ffi::c_int, + ///< Warp size. + pub warpSize: ::core::ffi::c_int, + /**< Maximum pitch in bytes allowed by memory copies + < pitched memory*/ + pub memPitch: usize, + ///< Max work items per work group or workgroup max size. 
+ pub maxThreadsPerBlock: ::core::ffi::c_int, + ///< Max number of threads in each dimension (XYZ) of a block. + pub maxThreadsDim: [::core::ffi::c_int; 3usize], + ///< Max grid dimensions (XYZ). + pub maxGridSize: [::core::ffi::c_int; 3usize], + ///< Max clock frequency of the multiProcessors in khz. + pub clockRate: ::core::ffi::c_int, + /**< Size of shared constant memory region on the device + < (in bytes).*/ + pub totalConstMem: usize, + /**< Major compute capability. On HCC, this is an approximation and features may + < differ from CUDA CC. See the arch feature flags for portable ways to query + < feature caps.*/ + pub major: ::core::ffi::c_int, + /**< Minor compute capability. On HCC, this is an approximation and features may + < differ from CUDA CC. See the arch feature flags for portable ways to query + < feature caps.*/ + pub minor: ::core::ffi::c_int, + ///< Alignment requirement for textures + pub textureAlignment: usize, + ///< Pitch alignment requirement for texture references bound to + pub texturePitchAlignment: usize, + ///< Deprecated. Use asyncEngineCount instead + pub deviceOverlap: ::core::ffi::c_int, + ///< Number of multi-processors (compute units). + pub multiProcessorCount: ::core::ffi::c_int, + ///< Run time limit for kernels executed on the device + pub kernelExecTimeoutEnabled: ::core::ffi::c_int, + ///< APU vs dGPU + pub integrated: ::core::ffi::c_int, + ///< Check whether HIP can map host memory + pub canMapHostMemory: ::core::ffi::c_int, + ///< Compute mode. 
+ pub computeMode: ::core::ffi::c_int, + ///< Maximum number of elements in 1D images + pub maxTexture1D: ::core::ffi::c_int, + ///< Maximum 1D mipmap texture size + pub maxTexture1DMipmap: ::core::ffi::c_int, + ///< Maximum size for 1D textures bound to linear memory + pub maxTexture1DLinear: ::core::ffi::c_int, + ///< Maximum dimensions (width, height) of 2D images, in image elements + pub maxTexture2D: [::core::ffi::c_int; 2usize], + ///< Maximum number of elements in 2D array mipmap of images + pub maxTexture2DMipmap: [::core::ffi::c_int; 2usize], + ///< Maximum 2D tex dimensions if tex are bound to pitched memory + pub maxTexture2DLinear: [::core::ffi::c_int; 3usize], + ///< Maximum 2D tex dimensions if gather has to be performed + pub maxTexture2DGather: [::core::ffi::c_int; 2usize], + /**< Maximum dimensions (width, height, depth) of 3D images, in image + < elements*/ + pub maxTexture3D: [::core::ffi::c_int; 3usize], + ///< Maximum alternate 3D texture dims + pub maxTexture3DAlt: [::core::ffi::c_int; 3usize], + ///< Maximum cubemap texture dims + pub maxTextureCubemap: ::core::ffi::c_int, + ///< Maximum number of elements in 1D array images + pub maxTexture1DLayered: [::core::ffi::c_int; 2usize], + ///< Maximum number of elements in 2D array images + pub maxTexture2DLayered: [::core::ffi::c_int; 3usize], + ///< Maximum cubemaps layered texture dims + pub maxTextureCubemapLayered: [::core::ffi::c_int; 2usize], + ///< Maximum 1D surface size + pub maxSurface1D: ::core::ffi::c_int, + ///< Maximum 2D surface size + pub maxSurface2D: [::core::ffi::c_int; 2usize], + ///< Maximum 3D surface size + pub maxSurface3D: [::core::ffi::c_int; 3usize], + ///< Maximum 1D layered surface size + pub maxSurface1DLayered: [::core::ffi::c_int; 2usize], + ///< Maximum 2D layered surface size + pub maxSurface2DLayered: [::core::ffi::c_int; 3usize], + ///< Maximum cubemap surface size + pub maxSurfaceCubemap: ::core::ffi::c_int, + ///< Maximum cubemap layered surface size + pub 
maxSurfaceCubemapLayered: [::core::ffi::c_int; 2usize], + ///< Alignment requirement for surface + pub surfaceAlignment: usize, + ///< Device can possibly execute multiple kernels concurrently. + pub concurrentKernels: ::core::ffi::c_int, + ///< Device has ECC support enabled + pub ECCEnabled: ::core::ffi::c_int, + ///< PCI Bus ID. + pub pciBusID: ::core::ffi::c_int, + ///< PCI Device ID. + pub pciDeviceID: ::core::ffi::c_int, + ///< PCI Domain ID + pub pciDomainID: ::core::ffi::c_int, + ///< 1:If device is Tesla device using TCC driver, else 0 + pub tccDriver: ::core::ffi::c_int, + ///< Number of async engines + pub asyncEngineCount: ::core::ffi::c_int, + ///< Does device and host share unified address space + pub unifiedAddressing: ::core::ffi::c_int, + ///< Max global memory clock frequency in khz. + pub memoryClockRate: ::core::ffi::c_int, + ///< Global memory bus width in bits. + pub memoryBusWidth: ::core::ffi::c_int, + ///< L2 cache size. + pub l2CacheSize: ::core::ffi::c_int, + ///< Device's max L2 persisting lines in bytes + pub persistingL2CacheMaxSize: ::core::ffi::c_int, + ///< Maximum resident threads per multi-processor. + pub maxThreadsPerMultiProcessor: ::core::ffi::c_int, + ///< Device supports stream priority + pub streamPrioritiesSupported: ::core::ffi::c_int, + ///< Indicates globals are cached in L1 + pub globalL1CacheSupported: ::core::ffi::c_int, + ///< Locals are cached in L1 + pub localL1CacheSupported: ::core::ffi::c_int, + ///< Amount of shared memory available per multiprocessor. + pub sharedMemPerMultiprocessor: usize, + ///< registers available per multiprocessor + pub regsPerMultiprocessor: ::core::ffi::c_int, + ///< Device supports allocating managed memory on this system + pub managedMemory: ::core::ffi::c_int, + ///< 1 if device is on a multi-GPU board, 0 if not. 
+ pub isMultiGpuBoard: ::core::ffi::c_int, + ///< Unique identifier for a group of devices on same multiboard GPU + pub multiGpuBoardGroupID: ::core::ffi::c_int, + ///< Link between host and device supports native atomics + pub hostNativeAtomicSupported: ::core::ffi::c_int, + ///< Deprecated. CUDA only. + pub singleToDoublePrecisionPerfRatio: ::core::ffi::c_int, + /**< Device supports coherently accessing pageable memory + < without calling hipHostRegister on it*/ + pub pageableMemoryAccess: ::core::ffi::c_int, + /**< Device can coherently access managed memory concurrently with + < the CPU*/ + pub concurrentManagedAccess: ::core::ffi::c_int, + ///< Is compute preemption supported on the device + pub computePreemptionSupported: ::core::ffi::c_int, + /**< Device can access host registered memory with same + < address as the host*/ + pub canUseHostPointerForRegisteredMem: ::core::ffi::c_int, + ///< HIP device supports cooperative launch + pub cooperativeLaunch: ::core::ffi::c_int, + /**< HIP device supports cooperative launch on multiple + < devices*/ + pub cooperativeMultiDeviceLaunch: ::core::ffi::c_int, + ///< Per device max shared mem per block usable by special opt in + pub sharedMemPerBlockOptin: usize, + /**< Device accesses pageable memory via the host's + < page tables*/ + pub pageableMemoryAccessUsesHostPageTables: ::core::ffi::c_int, + /**< Host can directly access managed memory on the device + < without migration*/ + pub directManagedMemAccessFromHost: ::core::ffi::c_int, + ///< Max number of blocks on CU + pub maxBlocksPerMultiProcessor: ::core::ffi::c_int, + ///< Max value of access policy window + pub accessPolicyMaxWindowSize: ::core::ffi::c_int, + ///< Shared memory reserved by driver per block + pub reservedSharedMemPerBlock: usize, + ///< Device supports hipHostRegister + pub hostRegisterSupported: ::core::ffi::c_int, + ///< Indicates if device supports sparse hip arrays + pub sparseHipArraySupported: ::core::ffi::c_int, + /**< Device supports 
using the hipHostRegisterReadOnly flag + < with hipHostRegister*/ + pub hostRegisterReadOnlySupported: ::core::ffi::c_int, + ///< Indicates external timeline semaphore support + pub timelineSemaphoreInteropSupported: ::core::ffi::c_int, + ///< Indicates if device supports hipMallocAsync and hipMemPool APIs + pub memoryPoolsSupported: ::core::ffi::c_int, + ///< Indicates device support of RDMA APIs + pub gpuDirectRDMASupported: ::core::ffi::c_int, + /**< Bitmask to be interpreted according to + < hipFlushGPUDirectRDMAWritesOptions*/ + pub gpuDirectRDMAFlushWritesOptions: ::core::ffi::c_uint, + ///< value of hipGPUDirectRDMAWritesOrdering + pub gpuDirectRDMAWritesOrdering: ::core::ffi::c_int, + ///< Bitmask of handle types support with mempool based IPC + pub memoryPoolSupportedHandleTypes: ::core::ffi::c_uint, + /**< Device supports deferred mapping HIP arrays and HIP + < mipmapped arrays*/ + pub deferredMappingHipArraySupported: ::core::ffi::c_int, + ///< Device supports IPC events + pub ipcEventSupported: ::core::ffi::c_int, + ///< Device supports cluster launch + pub clusterLaunch: ::core::ffi::c_int, + ///< Indicates device supports unified function pointers + pub unifiedFunctionPointers: ::core::ffi::c_int, + ///< CUDA Reserved. + pub reserved: [::core::ffi::c_int; 63usize], + ///< Reserved for adding new entries for HIP/CUDA. + pub hipReserved: [::core::ffi::c_int; 32usize], + ///< AMD GCN Arch Name. HIP Only. + pub gcnArchName: [::core::ffi::c_char; 256usize], + ///< Maximum Shared Memory Per CU. HIP Only. + pub maxSharedMemoryPerMultiProcessor: usize, + /**< Frequency in khz of the timer used by the device-side "clock*" + < instructions. New for HIP.*/ + pub clockInstructionRate: ::core::ffi::c_int, + ///< Architectural feature flags. New for HIP. 
+ pub arch: hipDeviceArch_t, + ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register + pub hdpMemFlushCntl: *mut ::core::ffi::c_uint, + ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register + pub hdpRegFlushCntl: *mut ::core::ffi::c_uint, + /**< HIP device supports cooperative launch on + < multiple*/ + pub cooperativeMultiDeviceUnmatchedFunc: ::core::ffi::c_int, + /**< HIP device supports cooperative launch on + < multiple*/ + pub cooperativeMultiDeviceUnmatchedGridDim: ::core::ffi::c_int, + /**< HIP device supports cooperative launch on + < multiple*/ + pub cooperativeMultiDeviceUnmatchedBlockDim: ::core::ffi::c_int, + /**< HIP device supports cooperative launch on + < multiple*/ + pub cooperativeMultiDeviceUnmatchedSharedMem: ::core::ffi::c_int, + ///< 1: if it is a large PCI bar device, else 0 + pub isLargeBar: ::core::ffi::c_int, + ///< Revision of the GPU in this device + pub asicRevision: ::core::ffi::c_int, + } + + #[allow(non_snake_case, non_camel_case_types)] + #[repr(C)] + #[repr(align(4))] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub struct hipDeviceArch_t { + pub _bitfield_align_1: [u8; 0], + pub _bitfield_1: __BindgenBitfieldUnit<[u8; 3usize]>, + pub __bindgen_padding_0: u8, + } + + #[repr(C)] + #[derive(Copy, Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] + pub struct __BindgenBitfieldUnit<Storage> { + storage: Storage, + } + + #[allow(non_camel_case_types)] + #[repr(C)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub struct hipUUID_t { + pub bytes: [::core::ffi::c_char; 16usize], + } + #[allow(non_camel_case_types)] + pub type hipUUID = hipUUID_t; +} diff --git a/cuda_types/src/dark_api.rs b/cuda_types/src/dark_api.rs index 435b472..1db5b2b 100644 --- a/cuda_types/src/dark_api.rs +++ b/cuda_types/src/dark_api.rs @@ -77,13 +77,13 @@ bitflags! 
{ } impl FatbincWrapper { - pub const MAGIC: [u8; 4] = [0x46, 0x62, 0x43, 0xB1]; + pub const MAGIC: [u8; 4] = 0x466243B1u32.to_le_bytes(); pub const VERSION_V1: c_uint = 0x1; pub const VERSION_V2: c_uint = 0x2; } impl FatbinHeader { - pub const MAGIC: [u8; 4] = [0xBA, 0x55, 0xED, 0x50]; + pub const MAGIC: [u8; 4] = 0xBA55ED50u32.to_le_bytes(); pub const VERSION: c_ushort = 0x01; } diff --git a/dark_api/src/fatbin.rs b/dark_api/src/fatbin.rs index c1772e0..9043fc9 100644 --- a/dark_api/src/fatbin.rs +++ b/dark_api/src/fatbin.rs @@ -125,6 +125,10 @@ pub enum FatbinIter<'a> { } impl<'a> FatbinIter<'a> { + pub fn multi_module(&self) -> bool { + matches!(self, FatbinIter::V2(_)) + } + pub fn next(&mut self) -> Option, ParseError>> { match self { FatbinIter::V1(opt) => Ok(opt.take()).transpose(), diff --git a/zluda_common/src/lib.rs b/zluda_common/src/lib.rs index 339d861..28c17c4 100644 --- a/zluda_common/src/lib.rs +++ b/zluda_common/src/lib.rs @@ -510,30 +510,38 @@ impl<'a> CodeLibaryRef<'a> { let module_iter = fatbin.get_submodules(); match module_iter { Ok(mut iter) => { - let mut module_index = 0; + let mut module_index = if iter.multi_module() { + None + } else { + Some(0usize) + }; while let Some(maybe_submodule) = iter.next() { match maybe_submodule { Ok(submodule) => iterate_modules_fatbin_header( - &mut |subindex, module| { - let (subindex, _) = subindex.unwrap(); - fn_(Some((module_index, Some(subindex))), module) + |subindex, module| { + let index = match module_index { + Some(index) => (index, Some(subindex)), + None => (subindex, None), + }; + fn_(Some(index), module) }, &submodule, ), Err(err) => fn_( - Some((module_index, None)), + module_index.map(|module_index| (module_index, None)), Err(FatbinError::ParseFailure(err)), ), } - module_index += 1; + module_index = module_index.map(|index| index + 1); } } Err(err) => fn_(None, Err(err)), } } - CodeLibaryRef::FatbinHeader(submodule) => { - iterate_modules_fatbin_header(&mut fn_, submodule); - } + 
CodeLibaryRef::FatbinHeader(submodule) => iterate_modules_fatbin_header( + |index, module| fn_(Some((index, None)), module), + submodule, + ), CodeLibaryRef::Text(text) => fn_(None, Ok(CodeModule::Text(*text))), CodeLibaryRef::Elf(elf) => fn_(None, Ok(CodeModule::Elf(*elf))), CodeLibaryRef::Archive(ar) => fn_(None, Ok(CodeModule::Archive(*ar))), @@ -542,14 +550,14 @@ impl<'a> CodeLibaryRef<'a> { } unsafe fn iterate_modules_fatbin_header( - fn_: &mut impl FnMut(Option<(usize, Option)>, Result), + mut fn_: impl FnMut(usize, Result), submodule: &FatbinSubmodule<'_>, ) { let mut iter = submodule.get_files(); let mut index = 0; while let Some(file) = iter.next() { fn_( - Some((index, None)), + index, file.map(CodeModule::File) .map_err(FatbinError::ParseFailure), ); diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index ec27fa0..4da53ce 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -366,12 +366,15 @@ impl DumpWriter { format!("module_{:04}.{:02}", module_index, kind) } Some((sub_index, None)) => { - format!("module_{:04}_{:02}.{}", module_index, sub_index, kind) + format!("module_{:04}_{:02}.{}", module_index, sub_index + 1, kind) } Some((sub_index, Some(subsub_index))) => { format!( "module_{:04}_{:02}_{:02}.{}", - module_index, sub_index, subsub_index, kind + module_index, + sub_index + 1, + subsub_index + 1, + kind ) } }