Progress compilation despite parsing errors (#495)

Previously, if we ran into a broken instruction we'd fail the whole compilation. This PR changes it so that (only in Release mode) we try to progress at all cost: if we have trouble parsing an instruction, we simply remove that function from the output and continue.

For some workloads we can still compile a semi-broken but meaningful subset of a module.
This commit is contained in:
Andrzej Janik 2025-09-08 23:35:29 +02:00 committed by GitHub
commit 869d291099
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 1043 additions and 391 deletions

View file

@ -10,12 +10,11 @@ name = "zoc"
path = "src/main.rs"
[dependencies]
amd_comgr-sys = { path = "../ext/amd_comgr-sys" }
bpaf = { version = "0.9.19", features = ["derive"] }
comgr = { path = "../comgr" }
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
ptx = { path = "../ptx" }
ptx_parser = { path = "../ptx_parser" }
libloading = "0.8"
thiserror = "2.0.12"
[package.metadata.zluda]

View file

@ -1,15 +1,15 @@
use ptx::TranslateError;
use ptx_parser::PtxError;
use std::ffi::FromBytesUntilNulError;
use std::io;
use std::str::Utf8Error;
use hip_runtime_sys::hipErrorCode_t;
use ptx::TranslateError;
use ptx_parser::PtxError;
#[derive(Debug, thiserror::Error)]
pub enum CompilerError {
#[error("HIP error code: {0:?}")]
HipError(hipErrorCode_t),
HipError(u32),
#[error(transparent)]
Libloading(#[from] libloading::Error),
#[error(transparent)]
ComgrError(#[from] comgr::Error),
#[error(transparent)]
@ -26,12 +26,6 @@ pub enum CompilerError {
},
}
impl From<hipErrorCode_t> for CompilerError {
fn from(error_code: hipErrorCode_t) -> Self {
CompilerError::HipError(error_code)
}
}
impl From<Vec<PtxError<'_>>> for CompilerError {
fn from(causes: Vec<PtxError>) -> Self {
let errors: Vec<String> = causes

View file

@ -1,3 +1,5 @@
use bpaf::Bpaf;
use error::CompilerError;
use std::ffi::CStr;
use std::fs::{self, File};
use std::io::{self, Write};
@ -6,11 +8,7 @@ use std::process::ExitCode;
use std::str;
use std::{env, mem};
use bpaf::Bpaf;
mod error;
use error::CompilerError;
use hip_runtime_sys::{hipDeviceProp_tR0600, hipGetDevicePropertiesR0600, hipInit};
const DEFAULT_ARCH: &'static str = "gfx1100";
@ -60,12 +58,17 @@ fn main_core() -> Result<(), CompilerError> {
let arch: String = match opts.arch {
Some(s) => s,
None => {
unsafe { hipInit(0) }?;
let mut dev_props: hipDeviceProp_tR0600 = unsafe { mem::zeroed() };
unsafe { hipGetDevicePropertiesR0600(&mut dev_props, 0) }?;
(|| {
let runtime = hip::Runtime::load()?;
runtime.init()?;
get_gpu_arch(&runtime)
})()
.unwrap_or_else(|_| DEFAULT_ARCH.to_owned())
/*
get_gpu_arch(&mut dev_props)
.map(String::from)
.unwrap_or(DEFAULT_ARCH.to_owned())
*/
}
};
@ -122,12 +125,13 @@ struct LLVMArtifacts {
llvm_ir: Vec<u8>,
}
fn get_gpu_arch<'a>(dev_props: &'a mut hipDeviceProp_tR0600) -> Result<&'a str, CompilerError> {
unsafe { hipGetDevicePropertiesR0600(dev_props, 0) }?;
fn get_gpu_arch(runtime: &hip::Runtime) -> Result<String, CompilerError> {
let mut dev_props = unsafe { mem::zeroed() };
runtime.device_get_properties(&mut dev_props, 0)?;
let gcn_arch_name = &dev_props.gcnArchName;
let gcn_arch_name = unsafe { CStr::from_ptr(gcn_arch_name.as_ptr()) };
let gcn_arch_name = gcn_arch_name.to_str();
gcn_arch_name.map_err(CompilerError::from)
let gcn_arch_name = gcn_arch_name.to_str()?;
Ok(gcn_arch_name.to_string())
}
fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
@ -137,3 +141,316 @@ fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
println!("Wrote to {}", path.to_str().unwrap());
Ok(())
}
mod hip {
use crate::error::CompilerError;
// The HIP runtime is loaded lazily so the compiler keeps working on
// machines that have no HIP driver installed.
pub struct Runtime(libloading::Library);

impl Runtime {
    /// Maps a raw `hipError_t` value to a `Result`; 0 is `hipSuccess`,
    /// anything else becomes `CompilerError::HipError`.
    fn hip_check(err: u32) -> Result<(), CompilerError> {
        if err == 0 {
            Ok(())
        } else {
            Err(CompilerError::HipError(err))
        }
    }

    /// Dynamically loads the HIP runtime shared library.
    ///
    /// The trailing NUL lets `libloading` borrow the bytes as a C string
    /// without allocating.
    pub fn load() -> Result<Self, CompilerError> {
        #[cfg(windows)]
        const LIB_NAME: &str = "amdhip64_6.dll\0";
        #[cfg(unix)]
        const LIB_NAME: &str = "libamdhip64.so.6\0";
        let library = unsafe { libloading::Library::new(LIB_NAME) }?;
        Ok(Runtime(library))
    }

    /// Resolves and calls `hipInit(0)` in the loaded library.
    pub fn init(&self) -> Result<(), CompilerError> {
        type HipInit = unsafe extern "C" fn(u32) -> u32;
        let status = unsafe {
            let hip_init: libloading::Symbol<HipInit> = self.0.get(b"hipInit\0")?;
            hip_init(0)
        };
        Self::hip_check(status)
    }

    /// Fills `prop` with the properties of `device` by calling
    /// `hipGetDevicePropertiesR0600` in the loaded library.
    pub fn device_get_properties(
        &self,
        prop: &mut hipDeviceProp_tR0600,
        device: i32,
    ) -> Result<(), CompilerError> {
        type GetDeviceProperties = unsafe extern "C" fn(*mut hipDeviceProp_tR0600, i32) -> u32;
        let status = unsafe {
            let get_device_properties: libloading::Symbol<GetDeviceProperties> =
                self.0.get(b"hipGetDevicePropertiesR0600\0")?;
            get_device_properties(prop, device)
        };
        Self::hip_check(status)
    }
}
/// Device properties as returned by `hipGetDevicePropertiesR0600`.
///
/// Bindgen-style mirror of the HIP 6.x C struct `hipDeviceProp_tR0600`;
/// it is read via FFI, so field order, types, and sizes must match the
/// C ABI exactly — do not reorder or resize fields.
#[allow(non_snake_case, non_camel_case_types)]
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct hipDeviceProp_tR0600 {
///< Device name.
pub name: [::core::ffi::c_char; 256usize],
///< UUID of a device
pub uuid: hipUUID,
///< 8-byte unique identifier. Only valid on windows
pub luid: [::core::ffi::c_char; 8usize],
///< LUID node mask
pub luidDeviceNodeMask: ::core::ffi::c_uint,
///< Size of global memory region (in bytes).
pub totalGlobalMem: usize,
///< Size of shared memory per block (in bytes).
pub sharedMemPerBlock: usize,
///< Registers per block.
pub regsPerBlock: ::core::ffi::c_int,
///< Warp size.
pub warpSize: ::core::ffi::c_int,
/**< Maximum pitch in bytes allowed by memory copies
< pitched memory*/
pub memPitch: usize,
///< Max work items per work group or workgroup max size.
pub maxThreadsPerBlock: ::core::ffi::c_int,
///< Max number of threads in each dimension (XYZ) of a block.
pub maxThreadsDim: [::core::ffi::c_int; 3usize],
///< Max grid dimensions (XYZ).
pub maxGridSize: [::core::ffi::c_int; 3usize],
///< Max clock frequency of the multiProcessors in khz.
pub clockRate: ::core::ffi::c_int,
/**< Size of shared constant memory region on the device
< (in bytes).*/
pub totalConstMem: usize,
/**< Major compute capability. On HCC, this is an approximation and features may
< differ from CUDA CC. See the arch feature flags for portable ways to query
< feature caps.*/
pub major: ::core::ffi::c_int,
/**< Minor compute capability. On HCC, this is an approximation and features may
< differ from CUDA CC. See the arch feature flags for portable ways to query
< feature caps.*/
pub minor: ::core::ffi::c_int,
///< Alignment requirement for textures
pub textureAlignment: usize,
///< Pitch alignment requirement for texture references bound to
pub texturePitchAlignment: usize,
///< Deprecated. Use asyncEngineCount instead
pub deviceOverlap: ::core::ffi::c_int,
///< Number of multi-processors (compute units).
pub multiProcessorCount: ::core::ffi::c_int,
///< Run time limit for kernels executed on the device
pub kernelExecTimeoutEnabled: ::core::ffi::c_int,
///< APU vs dGPU
pub integrated: ::core::ffi::c_int,
///< Check whether HIP can map host memory
pub canMapHostMemory: ::core::ffi::c_int,
///< Compute mode.
pub computeMode: ::core::ffi::c_int,
///< Maximum number of elements in 1D images
pub maxTexture1D: ::core::ffi::c_int,
///< Maximum 1D mipmap texture size
pub maxTexture1DMipmap: ::core::ffi::c_int,
///< Maximum size for 1D textures bound to linear memory
pub maxTexture1DLinear: ::core::ffi::c_int,
///< Maximum dimensions (width, height) of 2D images, in image elements
pub maxTexture2D: [::core::ffi::c_int; 2usize],
///< Maximum number of elements in 2D array mipmap of images
pub maxTexture2DMipmap: [::core::ffi::c_int; 2usize],
///< Maximum 2D tex dimensions if tex are bound to pitched memory
pub maxTexture2DLinear: [::core::ffi::c_int; 3usize],
///< Maximum 2D tex dimensions if gather has to be performed
pub maxTexture2DGather: [::core::ffi::c_int; 2usize],
/**< Maximum dimensions (width, height, depth) of 3D images, in image
< elements*/
pub maxTexture3D: [::core::ffi::c_int; 3usize],
///< Maximum alternate 3D texture dims
pub maxTexture3DAlt: [::core::ffi::c_int; 3usize],
///< Maximum cubemap texture dims
pub maxTextureCubemap: ::core::ffi::c_int,
///< Maximum number of elements in 1D array images
pub maxTexture1DLayered: [::core::ffi::c_int; 2usize],
///< Maximum number of elements in 2D array images
pub maxTexture2DLayered: [::core::ffi::c_int; 3usize],
///< Maximum cubemaps layered texture dims
pub maxTextureCubemapLayered: [::core::ffi::c_int; 2usize],
///< Maximum 1D surface size
pub maxSurface1D: ::core::ffi::c_int,
///< Maximum 2D surface size
pub maxSurface2D: [::core::ffi::c_int; 2usize],
///< Maximum 3D surface size
pub maxSurface3D: [::core::ffi::c_int; 3usize],
///< Maximum 1D layered surface size
pub maxSurface1DLayered: [::core::ffi::c_int; 2usize],
///< Maximum 2D layered surface size
pub maxSurface2DLayered: [::core::ffi::c_int; 3usize],
///< Maximum cubemap surface size
pub maxSurfaceCubemap: ::core::ffi::c_int,
///< Maximum cubemap layered surface size
pub maxSurfaceCubemapLayered: [::core::ffi::c_int; 2usize],
///< Alignment requirement for surface
pub surfaceAlignment: usize,
///< Device can possibly execute multiple kernels concurrently.
pub concurrentKernels: ::core::ffi::c_int,
///< Device has ECC support enabled
pub ECCEnabled: ::core::ffi::c_int,
///< PCI Bus ID.
pub pciBusID: ::core::ffi::c_int,
///< PCI Device ID.
pub pciDeviceID: ::core::ffi::c_int,
///< PCI Domain ID
pub pciDomainID: ::core::ffi::c_int,
///< 1:If device is Tesla device using TCC driver, else 0
pub tccDriver: ::core::ffi::c_int,
///< Number of async engines
pub asyncEngineCount: ::core::ffi::c_int,
///< Does device and host share unified address space
pub unifiedAddressing: ::core::ffi::c_int,
///< Max global memory clock frequency in khz.
pub memoryClockRate: ::core::ffi::c_int,
///< Global memory bus width in bits.
pub memoryBusWidth: ::core::ffi::c_int,
///< L2 cache size.
pub l2CacheSize: ::core::ffi::c_int,
///< Device's max L2 persisting lines in bytes
pub persistingL2CacheMaxSize: ::core::ffi::c_int,
///< Maximum resident threads per multi-processor.
pub maxThreadsPerMultiProcessor: ::core::ffi::c_int,
///< Device supports stream priority
pub streamPrioritiesSupported: ::core::ffi::c_int,
///< Indicates globals are cached in L1
pub globalL1CacheSupported: ::core::ffi::c_int,
///< Locals are cached in L1
pub localL1CacheSupported: ::core::ffi::c_int,
///< Amount of shared memory available per multiprocessor.
pub sharedMemPerMultiprocessor: usize,
///< registers available per multiprocessor
pub regsPerMultiprocessor: ::core::ffi::c_int,
///< Device supports allocating managed memory on this system
pub managedMemory: ::core::ffi::c_int,
///< 1 if device is on a multi-GPU board, 0 if not.
pub isMultiGpuBoard: ::core::ffi::c_int,
///< Unique identifier for a group of devices on same multiboard GPU
pub multiGpuBoardGroupID: ::core::ffi::c_int,
///< Link between host and device supports native atomics
pub hostNativeAtomicSupported: ::core::ffi::c_int,
///< Deprecated. CUDA only.
pub singleToDoublePrecisionPerfRatio: ::core::ffi::c_int,
/**< Device supports coherently accessing pageable memory
< without calling hipHostRegister on it*/
pub pageableMemoryAccess: ::core::ffi::c_int,
/**< Device can coherently access managed memory concurrently with
< the CPU*/
pub concurrentManagedAccess: ::core::ffi::c_int,
///< Is compute preemption supported on the device
pub computePreemptionSupported: ::core::ffi::c_int,
/**< Device can access host registered memory with same
< address as the host*/
pub canUseHostPointerForRegisteredMem: ::core::ffi::c_int,
///< HIP device supports cooperative launch
pub cooperativeLaunch: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on multiple
< devices*/
pub cooperativeMultiDeviceLaunch: ::core::ffi::c_int,
///< Per device max shared mem per block usable by special opt in
pub sharedMemPerBlockOptin: usize,
/**< Device accesses pageable memory via the host's
< page tables*/
pub pageableMemoryAccessUsesHostPageTables: ::core::ffi::c_int,
/**< Host can directly access managed memory on the device
< without migration*/
pub directManagedMemAccessFromHost: ::core::ffi::c_int,
///< Max number of blocks on CU
pub maxBlocksPerMultiProcessor: ::core::ffi::c_int,
///< Max value of access policy window
pub accessPolicyMaxWindowSize: ::core::ffi::c_int,
///< Shared memory reserved by driver per block
pub reservedSharedMemPerBlock: usize,
///< Device supports hipHostRegister
pub hostRegisterSupported: ::core::ffi::c_int,
///< Indicates if device supports sparse hip arrays
pub sparseHipArraySupported: ::core::ffi::c_int,
/**< Device supports using the hipHostRegisterReadOnly flag
< with hipHostRegister*/
pub hostRegisterReadOnlySupported: ::core::ffi::c_int,
///< Indicates external timeline semaphore support
pub timelineSemaphoreInteropSupported: ::core::ffi::c_int,
///< Indicates if device supports hipMallocAsync and hipMemPool APIs
pub memoryPoolsSupported: ::core::ffi::c_int,
///< Indicates device support of RDMA APIs
pub gpuDirectRDMASupported: ::core::ffi::c_int,
/**< Bitmask to be interpreted according to
< hipFlushGPUDirectRDMAWritesOptions*/
pub gpuDirectRDMAFlushWritesOptions: ::core::ffi::c_uint,
///< value of hipGPUDirectRDMAWritesOrdering
pub gpuDirectRDMAWritesOrdering: ::core::ffi::c_int,
///< Bitmask of handle types support with mempool based IPC
pub memoryPoolSupportedHandleTypes: ::core::ffi::c_uint,
/**< Device supports deferred mapping HIP arrays and HIP
< mipmapped arrays*/
pub deferredMappingHipArraySupported: ::core::ffi::c_int,
///< Device supports IPC events
pub ipcEventSupported: ::core::ffi::c_int,
///< Device supports cluster launch
pub clusterLaunch: ::core::ffi::c_int,
///< Indicates device supports unified function pointers
pub unifiedFunctionPointers: ::core::ffi::c_int,
///< CUDA Reserved.
pub reserved: [::core::ffi::c_int; 63usize],
///< Reserved for adding new entries for HIP/CUDA.
pub hipReserved: [::core::ffi::c_int; 32usize],
///< AMD GCN Arch Name. HIP Only. (read by get_gpu_arch)
pub gcnArchName: [::core::ffi::c_char; 256usize],
///< Maximum Shared Memory Per CU. HIP Only.
pub maxSharedMemoryPerMultiProcessor: usize,
/**< Frequency in khz of the timer used by the device-side "clock*"
< instructions. New for HIP.*/
pub clockInstructionRate: ::core::ffi::c_int,
///< Architectural feature flags. New for HIP.
pub arch: hipDeviceArch_t,
///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
pub hdpMemFlushCntl: *mut ::core::ffi::c_uint,
///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
pub hdpRegFlushCntl: *mut ::core::ffi::c_uint,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedFunc: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedGridDim: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedBlockDim: ::core::ffi::c_int,
/**< HIP device supports cooperative launch on
< multiple*/
pub cooperativeMultiDeviceUnmatchedSharedMem: ::core::ffi::c_int,
///< 1: if it is a large PCI bar device, else 0
pub isLargeBar: ::core::ffi::c_int,
///< Revision of the GPU in this device
pub asicRevision: ::core::ffi::c_int,
}
/// Architectural feature flags for a device (HIP-only field of
/// `hipDeviceProp_tR0600`).
///
/// Bindgen-style packed bitfield: 3 bytes of flag bits plus one byte of
/// padding, aligned to 4 bytes to match the C layout. The individual
/// flag accessors are not reproduced here — this crate only needs the
/// struct for correct ABI sizing.
#[allow(non_snake_case, non_camel_case_types)]
#[repr(C)]
#[repr(align(4))]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct hipDeviceArch_t {
// Zero-sized marker enforcing the bitfield's alignment.
pub _bitfield_align_1: [u8; 0],
// Raw storage for the packed architecture flag bits.
pub _bitfield_1: __BindgenBitfieldUnit<[u8; 3usize]>,
// Explicit padding byte so the struct size matches the C definition.
pub __bindgen_padding_0: u8,
}
/// Opaque storage for a C bitfield, as emitted by bindgen.
///
/// `Storage` is a fixed-size byte array covering the bitfield's bits;
/// only the raw bytes are kept since no bit accessors are needed here.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct __BindgenBitfieldUnit<Storage> {
storage: Storage,
}
/// 16-byte device UUID, mirroring HIP's C struct `hipUUID_t`.
#[allow(non_camel_case_types)]
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
pub struct hipUUID_t {
// Raw UUID bytes; uses c_char to match the C declaration exactly.
pub bytes: [::core::ffi::c_char; 16usize],
}
/// Alias matching HIP's C typedef `hipUUID`.
#[allow(non_camel_case_types)]
pub type hipUUID = hipUUID_t;
}