Minor fixes and make zoc lazy load HIP runtime

2025-10-01 13:49:11 +00:00 · 2025-09-06 00:51:43 +00:00 · 2025-09-06 00:51:43 +00:00 · 6b90b5acba
commit 6b90b5acba
parent cc47110c24
8 changed files with 367 additions and 41 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -357,10 +357,9 @@ dependencies = [
 name = "compiler"
 version = "0.0.0"
 dependencies = [
- "amd_comgr-sys",
 "bpaf",
 "comgr",
- "hip_runtime-sys",
+ "libloading",
 "ptx",
 "ptx_parser",
 "thiserror 2.0.12",
@ -3791,6 +3790,7 @@ name = "zluda_common"
 version = "0.1.0"
 dependencies = [
 "cuda_types",
+ "dark_api",
 "hip_runtime-sys",
 "rocblas-sys",
 ]
@ -3883,6 +3883,7 @@ dependencies = [
 "unwrap_or",
 "wchar",
 "winapi",
+ "zluda_common",
 "zluda_trace_common",
 "zstd-safe",
 ]
--- a/compiler/Cargo.toml
+++ b/compiler/Cargo.toml
@ -10,12 +10,11 @@ name = "zoc"
 path = "src/main.rs"

 [dependencies]
-amd_comgr-sys = { path = "../ext/amd_comgr-sys" }
 bpaf = { version = "0.9.19", features = ["derive"] }
 comgr = { path = "../comgr" }
-hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
 ptx = { path = "../ptx" }
 ptx_parser = { path = "../ptx_parser" }
+libloading = "0.8"
 thiserror = "2.0.12"

 [package.metadata.zluda]
--- a/compiler/src/error.rs
+++ b/compiler/src/error.rs
@ -1,15 +1,15 @@
+use ptx::TranslateError;
+use ptx_parser::PtxError;
 use std::ffi::FromBytesUntilNulError;
 use std::io;
 use std::str::Utf8Error;

-use hip_runtime_sys::hipErrorCode_t;
-use ptx::TranslateError;
-use ptx_parser::PtxError;
-
 #[derive(Debug, thiserror::Error)]
 pub enum CompilerError {
    #[error("HIP error code: {0:?}")]
-    HipError(hipErrorCode_t),
+    HipError(u32),
+    #[error(transparent)]
+    Libloading(#[from] libloading::Error),
    #[error(transparent)]
    ComgrError(#[from] comgr::Error),
    #[error(transparent)]
@ -26,12 +26,6 @@ pub enum CompilerError {
    },
 }

-impl From<hipErrorCode_t> for CompilerError {
-    fn from(error_code: hipErrorCode_t) -> Self {
-        CompilerError::HipError(error_code)
-    }
-}
-
 impl From<Vec<PtxError<'_>>> for CompilerError {
    fn from(causes: Vec<PtxError>) -> Self {
        let errors: Vec<String> = causes
--- a/compiler/src/main.rs
+++ b/compiler/src/main.rs
@ -1,3 +1,5 @@
+use bpaf::Bpaf;
+use error::CompilerError;
 use std::ffi::CStr;
 use std::fs::{self, File};
 use std::io::{self, Write};
@ -6,11 +8,7 @@ use std::process::ExitCode;
 use std::str;
 use std::{env, mem};

-use bpaf::Bpaf;
-
 mod error;
-use error::CompilerError;
-use hip_runtime_sys::{hipDeviceProp_tR0600, hipGetDevicePropertiesR0600, hipInit};

 const DEFAULT_ARCH: &'static str = "gfx1100";

@ -60,12 +58,17 @@ fn main_core() -> Result<(), CompilerError> {
    let arch: String = match opts.arch {
        Some(s) => s,
        None => {
-            unsafe { hipInit(0) }?;
-            let mut dev_props: hipDeviceProp_tR0600 = unsafe { mem::zeroed() };
-            unsafe { hipGetDevicePropertiesR0600(&mut dev_props, 0) }?;
+            (|| {
+                let runtime = hip::Runtime::load()?;
+                runtime.init()?;
+                get_gpu_arch(&runtime)
+            })()
+            .unwrap_or_else(|_| DEFAULT_ARCH.to_owned())
+            /*
            get_gpu_arch(&mut dev_props)
                .map(String::from)
                .unwrap_or(DEFAULT_ARCH.to_owned())
+             */
        }
    };

@ -122,12 +125,13 @@ struct LLVMArtifacts {
    llvm_ir: Vec<u8>,
 }

-fn get_gpu_arch<'a>(dev_props: &'a mut hipDeviceProp_tR0600) -> Result<&'a str, CompilerError> {
-    unsafe { hipGetDevicePropertiesR0600(dev_props, 0) }?;
+fn get_gpu_arch(runtime: &hip::Runtime) -> Result<String, CompilerError> {
+    let mut dev_props = unsafe { mem::zeroed() };
+    runtime.device_get_properties(&mut dev_props, 0)?;
    let gcn_arch_name = &dev_props.gcnArchName;
    let gcn_arch_name = unsafe { CStr::from_ptr(gcn_arch_name.as_ptr()) };
-    let gcn_arch_name = gcn_arch_name.to_str();
-    gcn_arch_name.map_err(CompilerError::from)
+    let gcn_arch_name = gcn_arch_name.to_str()?;
+    Ok(gcn_arch_name.to_string())
 }

 fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
@ -137,3 +141,316 @@ fn write_to_file(content: &[u8], path: &Path) -> io::Result<()> {
    println!("Wrote to {}", path.to_str().unwrap());
    Ok(())
 }
+
+mod hip {
+    use crate::error::CompilerError;
+
+    // The HIP runtime is opened dynamically at run time instead of being linked
+    // at build time, so that this tool keeps working on systems where no HIP
+    // driver is installed.
+    pub struct Runtime(libloading::Library);
+
+    impl Runtime {
+        /// Opens the HIP runtime shared library: `amdhip64_6.dll` on Windows,
+        /// `libamdhip64.so.6` on Unix.
+        ///
+        /// # Errors
+        /// Returns the `libloading` error, converted into `CompilerError`, when
+        /// the library cannot be opened.
+        pub fn load() -> Result<Self, CompilerError> {
+            // The names carry an explicit trailing NUL.
+            // NOTE(review): presumably this lets libloading hand them to the OS
+            // loader without re-allocating - confirm against the libloading docs.
+            #[cfg(windows)]
+            const LIBRARY_NAME: &str = "amdhip64_6.dll\0";
+            #[cfg(unix)]
+            const LIBRARY_NAME: &str = "libamdhip64.so.6\0";
+            // SAFETY: opening a shared library may execute its initialization
+            // code; we rely on the installed HIP runtime being well-behaved.
+            let handle = unsafe { libloading::Library::new(LIBRARY_NAME) };
+            handle.map(Runtime).map_err(CompilerError::from)
+        }
+
+        /// Looks up `hipInit` in the loaded library and calls it with flags `0`.
+        ///
+        /// # Errors
+        /// Fails if the symbol is missing or if the call reports a non-zero
+        /// status code.
+        pub fn init(&self) -> Result<(), CompilerError> {
+            type HipInitFn = unsafe extern "C" fn(u32) -> u32;
+            // SAFETY: the symbol is called through the signature declared above.
+            // NOTE(review): that signature must match the C prototype of
+            // `hipInit` - confirm against the HIP headers.
+            let status = unsafe {
+                let entry_point: libloading::Symbol<HipInitFn> = self.0.get(b"hipInit\0")?;
+                entry_point(0)
+            };
+            Self::hip_check(status)
+        }
+
+        /// Looks up `hipGetDevicePropertiesR0600` in the loaded library and
+        /// calls it with `prop` and `device`.
+        ///
+        /// # Errors
+        /// Fails if the symbol is missing or if the call reports a non-zero
+        /// status code.
+        pub fn device_get_properties(
+            &self,
+            prop: &mut hipDeviceProp_tR0600,
+            device: i32,
+        ) -> Result<(), CompilerError> {
+            type GetPropertiesFn = unsafe extern "C" fn(*mut hipDeviceProp_tR0600, i32) -> u32;
+            // SAFETY: `prop` is a valid, exclusive reference for the duration of
+            // the call.
+            // NOTE(review): the declared signature and the layout of
+            // `hipDeviceProp_tR0600` must match the loaded library - confirm
+            // against the HIP headers.
+            let status = unsafe {
+                let entry_point: libloading::Symbol<GetPropertiesFn> =
+                    self.0.get(b"hipGetDevicePropertiesR0600\0")?;
+                entry_point(prop, device)
+            };
+            Self::hip_check(status)
+        }
+
+        /// Converts a raw status code into a `Result`: `0` is success, any
+        /// other value is wrapped in `CompilerError::HipError`.
+        fn hip_check(err: u32) -> Result<(), CompilerError> {
+            if err == 0 {
+                Ok(())
+            } else {
+                Err(CompilerError::HipError(err))
+            }
+        }
+    }
+
+    #[allow(non_snake_case, non_camel_case_types)]
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+    pub struct hipDeviceProp_tR0600 {
+        ///< Device name.
+        pub name: [::core::ffi::c_char; 256usize],
+        ///< UUID of a device
+        pub uuid: hipUUID,
+        ///< 8-byte unique identifier. Only valid on windows
+        pub luid: [::core::ffi::c_char; 8usize],
+        ///< LUID node mask
+        pub luidDeviceNodeMask: ::core::ffi::c_uint,
+        ///< Size of global memory region (in bytes).
+        pub totalGlobalMem: usize,
+        ///< Size of shared memory per block (in bytes).
+        pub sharedMemPerBlock: usize,
+        ///< Registers per block.
+        pub regsPerBlock: ::core::ffi::c_int,
+        ///< Warp size.
+        pub warpSize: ::core::ffi::c_int,
+        /**< Maximum pitch in bytes allowed by memory copies
+        < pitched memory*/
+        pub memPitch: usize,
+        ///< Max work items per work group or workgroup max size.
+        pub maxThreadsPerBlock: ::core::ffi::c_int,
+        ///< Max number of threads in each dimension (XYZ) of a block.
+        pub maxThreadsDim: [::core::ffi::c_int; 3usize],
+        ///< Max grid dimensions (XYZ).
+        pub maxGridSize: [::core::ffi::c_int; 3usize],
+        ///< Max clock frequency of the multiProcessors in khz.
+        pub clockRate: ::core::ffi::c_int,
+        /**< Size of shared constant memory region on the device
+        < (in bytes).*/
+        pub totalConstMem: usize,
+        /**< Major compute capability.  On HCC, this is an approximation and features may
+        < differ from CUDA CC.  See the arch feature flags for portable ways to query
+        < feature caps.*/
+        pub major: ::core::ffi::c_int,
+        /**< Minor compute capability.  On HCC, this is an approximation and features may
+        < differ from CUDA CC.  See the arch feature flags for portable ways to query
+        < feature caps.*/
+        pub minor: ::core::ffi::c_int,
+        ///< Alignment requirement for textures
+        pub textureAlignment: usize,
+        ///< Pitch alignment requirement for texture references bound to pitched memory
+        pub texturePitchAlignment: usize,
+        ///< Deprecated. Use asyncEngineCount instead
+        pub deviceOverlap: ::core::ffi::c_int,
+        ///< Number of multi-processors (compute units).
+        pub multiProcessorCount: ::core::ffi::c_int,
+        ///< Run time limit for kernels executed on the device
+        pub kernelExecTimeoutEnabled: ::core::ffi::c_int,
+        ///< APU vs dGPU
+        pub integrated: ::core::ffi::c_int,
+        ///< Check whether HIP can map host memory
+        pub canMapHostMemory: ::core::ffi::c_int,
+        ///< Compute mode.
+        pub computeMode: ::core::ffi::c_int,
+        ///< Maximum number of elements in 1D images
+        pub maxTexture1D: ::core::ffi::c_int,
+        ///< Maximum 1D mipmap texture size
+        pub maxTexture1DMipmap: ::core::ffi::c_int,
+        ///< Maximum size for 1D textures bound to linear memory
+        pub maxTexture1DLinear: ::core::ffi::c_int,
+        ///< Maximum dimensions (width, height) of 2D images, in image elements
+        pub maxTexture2D: [::core::ffi::c_int; 2usize],
+        ///< Maximum number of elements in 2D array mipmap of images
+        pub maxTexture2DMipmap: [::core::ffi::c_int; 2usize],
+        ///< Maximum 2D tex dimensions if tex are bound to pitched memory
+        pub maxTexture2DLinear: [::core::ffi::c_int; 3usize],
+        ///< Maximum 2D tex dimensions if gather has to be performed
+        pub maxTexture2DGather: [::core::ffi::c_int; 2usize],
+        /**< Maximum dimensions (width, height, depth) of 3D images, in image
+        < elements*/
+        pub maxTexture3D: [::core::ffi::c_int; 3usize],
+        ///< Maximum alternate 3D texture dims
+        pub maxTexture3DAlt: [::core::ffi::c_int; 3usize],
+        ///< Maximum cubemap texture dims
+        pub maxTextureCubemap: ::core::ffi::c_int,
+        ///< Maximum number of elements in 1D array images
+        pub maxTexture1DLayered: [::core::ffi::c_int; 2usize],
+        ///< Maximum number of elements in 2D array images
+        pub maxTexture2DLayered: [::core::ffi::c_int; 3usize],
+        ///< Maximum cubemaps layered texture dims
+        pub maxTextureCubemapLayered: [::core::ffi::c_int; 2usize],
+        ///< Maximum 1D surface size
+        pub maxSurface1D: ::core::ffi::c_int,
+        ///< Maximum 2D surface size
+        pub maxSurface2D: [::core::ffi::c_int; 2usize],
+        ///< Maximum 3D surface size
+        pub maxSurface3D: [::core::ffi::c_int; 3usize],
+        ///< Maximum 1D layered surface size
+        pub maxSurface1DLayered: [::core::ffi::c_int; 2usize],
+        ///< Maximum 2D layered surface size
+        pub maxSurface2DLayered: [::core::ffi::c_int; 3usize],
+        ///< Maximum cubemap surface size
+        pub maxSurfaceCubemap: ::core::ffi::c_int,
+        ///< Maximum cubemap layered surface size
+        pub maxSurfaceCubemapLayered: [::core::ffi::c_int; 2usize],
+        ///< Alignment requirement for surface
+        pub surfaceAlignment: usize,
+        ///< Device can possibly execute multiple kernels concurrently.
+        pub concurrentKernels: ::core::ffi::c_int,
+        ///< Device has ECC support enabled
+        pub ECCEnabled: ::core::ffi::c_int,
+        ///< PCI Bus ID.
+        pub pciBusID: ::core::ffi::c_int,
+        ///< PCI Device ID.
+        pub pciDeviceID: ::core::ffi::c_int,
+        ///< PCI Domain ID
+        pub pciDomainID: ::core::ffi::c_int,
+        ///< 1:If device is Tesla device using TCC driver, else 0
+        pub tccDriver: ::core::ffi::c_int,
+        ///< Number of async engines
+        pub asyncEngineCount: ::core::ffi::c_int,
+        ///< Does device and host share unified address space
+        pub unifiedAddressing: ::core::ffi::c_int,
+        ///< Max global memory clock frequency in khz.
+        pub memoryClockRate: ::core::ffi::c_int,
+        ///< Global memory bus width in bits.
+        pub memoryBusWidth: ::core::ffi::c_int,
+        ///< L2 cache size.
+        pub l2CacheSize: ::core::ffi::c_int,
+        ///< Device's max L2 persisting lines in bytes
+        pub persistingL2CacheMaxSize: ::core::ffi::c_int,
+        ///< Maximum resident threads per multi-processor.
+        pub maxThreadsPerMultiProcessor: ::core::ffi::c_int,
+        ///< Device supports stream priority
+        pub streamPrioritiesSupported: ::core::ffi::c_int,
+        ///< Indicates globals are cached in L1
+        pub globalL1CacheSupported: ::core::ffi::c_int,
+        ///< Locals are cached in L1
+        pub localL1CacheSupported: ::core::ffi::c_int,
+        ///< Amount of shared memory available per multiprocessor.
+        pub sharedMemPerMultiprocessor: usize,
+        ///< registers available per multiprocessor
+        pub regsPerMultiprocessor: ::core::ffi::c_int,
+        ///< Device supports allocating managed memory on this system
+        pub managedMemory: ::core::ffi::c_int,
+        ///< 1 if device is on a multi-GPU board, 0 if not.
+        pub isMultiGpuBoard: ::core::ffi::c_int,
+        ///< Unique identifier for a group of devices on same multiboard GPU
+        pub multiGpuBoardGroupID: ::core::ffi::c_int,
+        ///< Link between host and device supports native atomics
+        pub hostNativeAtomicSupported: ::core::ffi::c_int,
+        ///< Deprecated. CUDA only.
+        pub singleToDoublePrecisionPerfRatio: ::core::ffi::c_int,
+        /**< Device supports coherently accessing pageable memory
+        < without calling hipHostRegister on it*/
+        pub pageableMemoryAccess: ::core::ffi::c_int,
+        /**< Device can coherently access managed memory concurrently with
+        < the CPU*/
+        pub concurrentManagedAccess: ::core::ffi::c_int,
+        ///< Is compute preemption supported on the device
+        pub computePreemptionSupported: ::core::ffi::c_int,
+        /**< Device can access host registered memory with same
+        < address as the host*/
+        pub canUseHostPointerForRegisteredMem: ::core::ffi::c_int,
+        ///< HIP device supports cooperative launch
+        pub cooperativeLaunch: ::core::ffi::c_int,
+        /**< HIP device supports cooperative launch on multiple
+        < devices*/
+        pub cooperativeMultiDeviceLaunch: ::core::ffi::c_int,
+        ///< Per device max shared mem per block usable by special opt in
+        pub sharedMemPerBlockOptin: usize,
+        /**< Device accesses pageable memory via the host's
+        < page tables*/
+        pub pageableMemoryAccessUsesHostPageTables: ::core::ffi::c_int,
+        /**< Host can directly access managed memory on the device
+        < without migration*/
+        pub directManagedMemAccessFromHost: ::core::ffi::c_int,
+        ///< Max number of blocks on CU
+        pub maxBlocksPerMultiProcessor: ::core::ffi::c_int,
+        ///< Max value of access policy window
+        pub accessPolicyMaxWindowSize: ::core::ffi::c_int,
+        ///< Shared memory reserved by driver per block
+        pub reservedSharedMemPerBlock: usize,
+        ///< Device supports hipHostRegister
+        pub hostRegisterSupported: ::core::ffi::c_int,
+        ///< Indicates if device supports sparse hip arrays
+        pub sparseHipArraySupported: ::core::ffi::c_int,
+        /**< Device supports using the hipHostRegisterReadOnly flag
+        < with hipHostRegister*/
+        pub hostRegisterReadOnlySupported: ::core::ffi::c_int,
+        ///< Indicates external timeline semaphore support
+        pub timelineSemaphoreInteropSupported: ::core::ffi::c_int,
+        ///< Indicates if device supports hipMallocAsync and hipMemPool APIs
+        pub memoryPoolsSupported: ::core::ffi::c_int,
+        ///< Indicates device support of RDMA APIs
+        pub gpuDirectRDMASupported: ::core::ffi::c_int,
+        /**< Bitmask to be interpreted according to
+        < hipFlushGPUDirectRDMAWritesOptions*/
+        pub gpuDirectRDMAFlushWritesOptions: ::core::ffi::c_uint,
+        ///< value of hipGPUDirectRDMAWritesOrdering
+        pub gpuDirectRDMAWritesOrdering: ::core::ffi::c_int,
+        ///< Bitmask of handle types support with mempool based IPC
+        pub memoryPoolSupportedHandleTypes: ::core::ffi::c_uint,
+        /**< Device supports deferred mapping HIP arrays and HIP
+        < mipmapped arrays*/
+        pub deferredMappingHipArraySupported: ::core::ffi::c_int,
+        ///< Device supports IPC events
+        pub ipcEventSupported: ::core::ffi::c_int,
+        ///< Device supports cluster launch
+        pub clusterLaunch: ::core::ffi::c_int,
+        ///< Indicates device supports unified function pointers
+        pub unifiedFunctionPointers: ::core::ffi::c_int,
+        ///< CUDA Reserved.
+        pub reserved: [::core::ffi::c_int; 63usize],
+        ///< Reserved for adding new entries for HIP/CUDA.
+        pub hipReserved: [::core::ffi::c_int; 32usize],
+        ///< AMD GCN Arch Name. HIP Only.
+        pub gcnArchName: [::core::ffi::c_char; 256usize],
+        ///< Maximum Shared Memory Per CU. HIP Only.
+        pub maxSharedMemoryPerMultiProcessor: usize,
+        /**< Frequency in khz of the timer used by the device-side "clock*"
+        < instructions.  New for HIP.*/
+        pub clockInstructionRate: ::core::ffi::c_int,
+        ///< Architectural feature flags.  New for HIP.
+        pub arch: hipDeviceArch_t,
+        ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+        pub hdpMemFlushCntl: *mut ::core::ffi::c_uint,
+        ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+        pub hdpRegFlushCntl: *mut ::core::ffi::c_uint,
+        /**< HIP device supports cooperative launch on
+        < multiple devices with unmatched functions*/
+        pub cooperativeMultiDeviceUnmatchedFunc: ::core::ffi::c_int,
+        /**< HIP device supports cooperative launch on
+        < multiple devices with unmatched grid dimensions*/
+        pub cooperativeMultiDeviceUnmatchedGridDim: ::core::ffi::c_int,
+        /**< HIP device supports cooperative launch on
+        < multiple devices with unmatched block dimensions*/
+        pub cooperativeMultiDeviceUnmatchedBlockDim: ::core::ffi::c_int,
+        /**< HIP device supports cooperative launch on
+        < multiple devices with unmatched shared memories*/
+        pub cooperativeMultiDeviceUnmatchedSharedMem: ::core::ffi::c_int,
+        ///< 1: if it is a large PCI bar device, else 0
+        pub isLargeBar: ::core::ffi::c_int,
+        ///< Revision of the GPU in this device
+        pub asicRevision: ::core::ffi::c_int,
+    }
+
+    /// Architecture feature flags, used as the type of the `arch` field of
+    /// `hipDeviceProp_tR0600`.
+    ///
+    /// The flags are kept as three bytes of opaque bitfield storage followed by
+    /// one padding byte, with 4-byte alignment. This module defines no accessors
+    /// for individual flags.
+    #[allow(non_snake_case, non_camel_case_types)]
+    #[repr(C)]
+    #[repr(align(4))]
+    #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+    pub struct hipDeviceArch_t {
+        // Zero-sized member.
+        // NOTE(review): looks like a bindgen alignment marker - confirm.
+        pub _bitfield_align_1: [u8; 0],
+        // Raw storage for the packed flag bits.
+        pub _bitfield_1: __BindgenBitfieldUnit<[u8; 3usize]>,
+        // Explicit trailing byte that rounds the struct up to four bytes.
+        pub __bindgen_padding_0: u8,
+    }
+
+    /// Opaque `#[repr(C)]` wrapper around the raw storage of a bitfield group.
+    ///
+    /// The `storage` field is private and this module defines no accessors for
+    /// it, so the wrapper only contributes size and layout to the struct that
+    /// embeds it.
+    #[repr(C)]
+    #[derive(Copy, Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
+    pub struct __BindgenBitfieldUnit<Storage> {
+        storage: Storage,
+    }
+
+    /// 16-byte identifier, used (through the `hipUUID` alias) as the type of the
+    /// `uuid` field of `hipDeviceProp_tR0600`.
+    #[allow(non_camel_case_types)]
+    #[repr(C)]
+    #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
+    pub struct hipUUID_t {
+        pub bytes: [::core::ffi::c_char; 16usize],
+    }
+    /// Alias for `hipUUID_t`; this is the name `hipDeviceProp_tR0600` refers to.
+    #[allow(non_camel_case_types)]
+    pub type hipUUID = hipUUID_t;
+}
--- a/cuda_types/src/dark_api.rs
+++ b/cuda_types/src/dark_api.rs
@ -77,13 +77,13 @@ bitflags! {
 }

 impl FatbincWrapper {
-    pub const MAGIC: [u8; 4] = [0x46, 0x62, 0x43, 0xB1];
+    pub const MAGIC: [u8; 4] = 0x466243B1u32.to_le_bytes();
    pub const VERSION_V1: c_uint = 0x1;
    pub const VERSION_V2: c_uint = 0x2;
 }

 impl FatbinHeader {
-    pub const MAGIC: [u8; 4] = [0xBA, 0x55, 0xED, 0x50];
+    pub const MAGIC: [u8; 4] = 0xBA55ED50u32.to_le_bytes();
    pub const VERSION: c_ushort = 0x01;
 }

--- a/dark_api/src/fatbin.rs
+++ b/dark_api/src/fatbin.rs
@ -125,6 +125,10 @@ pub enum FatbinIter<'a> {
 }

 impl<'a> FatbinIter<'a> {
+    /// Returns `true` when this iterator is the `V2` variant and `false` for
+    /// any other variant.
+    pub fn multi_module(&self) -> bool {
+        matches!(self, FatbinIter::V2(_))
+    }
+
    pub fn next(&mut self) -> Option<Result<FatbinSubmodule<'a>, ParseError>> {
        match self {
            FatbinIter::V1(opt) => Ok(opt.take()).transpose(),
--- a/zluda_common/src/lib.rs
+++ b/zluda_common/src/lib.rs
@ -510,30 +510,38 @@ impl<'a> CodeLibaryRef<'a> {
                let module_iter = fatbin.get_submodules();
                match module_iter {
                    Ok(mut iter) => {
-                        let mut module_index = 0;
+                        let mut module_index = if iter.multi_module() {
+                            None
+                        } else {
+                            Some(0usize)
+                        };
                        while let Some(maybe_submodule) = iter.next() {
                            match maybe_submodule {
                                Ok(submodule) => iterate_modules_fatbin_header(
-                                    &mut |subindex, module| {
-                                        let (subindex, _) = subindex.unwrap();
-                                        fn_(Some((module_index, Some(subindex))), module)
+                                    |subindex, module| {
+                                        let index = match module_index {
+                                            Some(index) => (index, Some(subindex)),
+                                            None => (subindex, None),
+                                        };
+                                        fn_(Some(index), module)
                                    },
                                    &submodule,
                                ),
                                Err(err) => fn_(
-                                    Some((module_index, None)),
+                                    module_index.map(|module_index| (module_index, None)),
                                    Err(FatbinError::ParseFailure(err)),
                                ),
                            }
-                            module_index += 1;
+                            module_index = module_index.map(|index| index + 1);
                        }
                    }
                    Err(err) => fn_(None, Err(err)),
                }
            }
-            CodeLibaryRef::FatbinHeader(submodule) => {
-                iterate_modules_fatbin_header(&mut fn_, submodule);
-            }
+            CodeLibaryRef::FatbinHeader(submodule) => iterate_modules_fatbin_header(
+                |index, module| fn_(Some((index, None)), module),
+                submodule,
+            ),
            CodeLibaryRef::Text(text) => fn_(None, Ok(CodeModule::Text(*text))),
            CodeLibaryRef::Elf(elf) => fn_(None, Ok(CodeModule::Elf(*elf))),
            CodeLibaryRef::Archive(ar) => fn_(None, Ok(CodeModule::Archive(*ar))),
@ -542,14 +550,14 @@ impl<'a> CodeLibaryRef<'a> {
 }

 unsafe fn iterate_modules_fatbin_header(
-    fn_: &mut impl FnMut(Option<(usize, Option<usize>)>, Result<CodeModule, FatbinError>),
+    mut fn_: impl FnMut(usize, Result<CodeModule, FatbinError>),
    submodule: &FatbinSubmodule<'_>,
 ) {
    let mut iter = submodule.get_files();
    let mut index = 0;
    while let Some(file) = iter.next() {
        fn_(
-            Some((index, None)),
+            index,
            file.map(CodeModule::File)
                .map_err(FatbinError::ParseFailure),
        );
--- a/zluda_trace/src/trace.rs
+++ b/zluda_trace/src/trace.rs
@ -366,12 +366,15 @@ impl DumpWriter {
                format!("module_{:04}.{:02}", module_index, kind)
            }
            Some((sub_index, None)) => {
-                format!("module_{:04}_{:02}.{}", module_index, sub_index, kind)
+                format!("module_{:04}_{:02}.{}", module_index, sub_index + 1, kind)
            }
            Some((sub_index, Some(subsub_index))) => {
                format!(
                    "module_{:04}_{:02}_{:02}.{}",
-                    module_index, sub_index, subsub_index, kind
+                    module_index,
+                    sub_index + 1,
+                    subsub_index + 1,
+                    kind
                )
            }
        }