From bfef3317dc3d494aee11b1231d70b1492b1b3328 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 19 Sep 2025 00:39:27 +0000 Subject: [PATCH 01/11] Start working on trace replay --- Cargo.lock | 21 ++- cuda_macros/Cargo.toml | 2 +- ptx_parser/src/lib.rs | 30 ++++- zluda/Cargo.toml | 2 +- zluda_bindgen/Cargo.toml | 2 +- zluda_trace/Cargo.toml | 2 +- zluda_trace/src/lib.rs | 222 +++++++++++++++++++++++++------ zluda_trace/src/log.rs | 166 ++++++++++++----------- zluda_trace/src/replay.rs | 110 +++++++++++++++ zluda_trace/src/trace.rs | 139 +++++++++++++++---- zluda_trace_common/Cargo.toml | 4 + zluda_trace_common/src/lib.rs | 2 + zluda_trace_common/src/replay.rs | 83 ++++++++++++ 13 files changed, 621 insertions(+), 164 deletions(-) create mode 100644 zluda_trace/src/replay.rs create mode 100644 zluda_trace_common/src/replay.rs diff --git a/Cargo.lock b/Cargo.lock index cfe4cff..ee0d570 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -420,7 +420,7 @@ version = "0.0.0" dependencies = [ "proc-macro2", "quote", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "syn 2.0.89", ] @@ -3706,7 +3706,7 @@ dependencies = [ "paste", "ptx", "ptx_parser", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "serde", "serde_json", "tempfile", @@ -3726,7 +3726,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "syn 2.0.89", ] @@ -3854,7 +3854,7 @@ dependencies = [ "ptx", "ptx_parser", "regex", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "unwrap_or", "wchar", "winapi", @@ -3903,6 +3903,10 @@ dependencies = [ "format", "libc", "libloading", + "serde", + "serde_json", + "tar", + "zstd", ] [[package]] @@ -3979,6 +3983,15 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + [[package]] name = "zstd-safe" version = "7.2.4" diff --git a/cuda_macros/Cargo.toml b/cuda_macros/Cargo.toml index cfefc62..aa4e377 100644 --- a/cuda_macros/Cargo.toml +++ b/cuda_macros/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" quote = "1.0" syn = { version = "2.0", features = ["full", "visit-mut", "extra-traits"] } proc-macro2 = "1.0" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" [lib] proc-macro = true diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index a4f9080..fc19499 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -2,6 +2,7 @@ use derive_more::Display; use logos::Logos; use ptx_parser_macros::derive_parser; use rustc_hash::FxHashMap; +use std::alloc::Layout; use std::fmt::Debug; use std::num::{NonZeroU8, ParseFloatError, ParseIntError}; use std::{iter, usize}; @@ -345,7 +346,9 @@ fn reg_or_immediate<'a, 'input>( .parse_next(stream) } -pub fn parse_for_errors<'input>(text: &'input str) -> Vec> { +pub fn parse_for_errors_and_params<'input>( + text: &'input str, +) -> (Vec>, FxHashMap>) { let (tokens, mut errors) = lex_with_span_unchecked(text); let parse_result = { let state = PtxParserState::new(text, &mut errors); @@ -357,13 +360,30 @@ pub fn parse_for_errors<'input>(text: &'input str) -> Vec> { .parse(parser) .map_err(|err| PtxError::Parser(err.into_inner())) }; - match parse_result { - Ok(_) => {} + let params = match parse_result { + Ok(module) => module + .directives + .into_iter() + .filter_map(|directive| { + if let ast::Directive::Method(_, func) = directive { + let layouts = func + .func_directive + .input_arguments + .iter() + .map(|arg| arg.v_type.layout()) + .collect(); + Some((func.func_directive.name().to_string(), layouts)) + } else { + None + } + }) + .collect(), Err(err) => { errors.push(err); + FxHashMap::default() } - } - errors + }; + (errors, params) } fn lex_with_span_unchecked<'input>( diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml index d0a65f4..1060e2b 100644 --- a/zluda/Cargo.toml +++ b/zluda/Cargo.toml @@ -22,7 +22,7 @@ num_enum = "0.4" lz4-sys = "1.9" tempfile = "3" paste = "1.0" -rustc-hash = "1.1" +rustc-hash = "2.0.0" zluda_common = { path = "../zluda_common" } blake3 = "1.8.2" serde = "1.0.219" diff --git a/zluda_bindgen/Cargo.toml b/zluda_bindgen/Cargo.toml index 5753307..8e7bb4d 100644 --- a/zluda_bindgen/Cargo.toml +++ b/zluda_bindgen/Cargo.toml @@ -9,6 +9,6 @@ syn = { version = "2.0", features = ["full", "visit-mut"] } proc-macro2 = "1.0.89" quote = "1.0" prettyplease = "0.2.25" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" libloading = "0.8" cuda_types = { path = "../cuda_types" } diff --git a/zluda_trace/Cargo.toml b/zluda_trace/Cargo.toml index a6c4120..0925c1a 100644 --- a/zluda_trace/Cargo.toml +++ b/zluda_trace/Cargo.toml @@ -24,7 +24,7 @@ paste = "1.0" cuda_macros = { path = "../cuda_macros" } cuda_types = { path = "../cuda_types" } parking_lot = "0.12.3" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" cglue = "0.3.5" zstd-safe = { version = "7.2.4", features = ["std"] } unwrap_or = "1.0.1" diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index f61fed6..a0c56b4 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -12,6 +12,7 @@ use std::ptr::NonNull; use std::sync::LazyLock; use std::{env, error::Error, fs, path::PathBuf, sync::Mutex}; use std::{io, mem, ptr, usize}; +use unwrap_or::unwrap_some_or; extern crate cuda_types; @@ -110,7 +111,7 @@ macro_rules! override_fn_core { ).ok(); formatted_args }; - let extract_fn_ptr = |_: &mut GlobalDelayedState, _: &mut FnCallLog| Some(()); + let extract_fn_ptr = |_: &mut GlobalDelayedState, _: &mut FnCallLog| Some(((), ())); let cuda_call = |_| { paste!{ [<$fn_name _impl >] ( $($arg_id),* ) } }; @@ -121,7 +122,7 @@ macro_rules! override_fn_core { format_curesult, extract_fn_ptr, cuda_call, - move |_, _, _, _| {} + move |_, _, _, _, _| {} ) } )* @@ -157,9 +158,9 @@ impl ::dark_api::zluda_trace::CudaDarkApi for InternalTableImpl { Some(|| args.call().to_vec()), internal_error, |status| format_status(status).to_vec(), - |_, _| Some(()), + |_, _| Some(((), ())), |_| fn_.call(), - move |_, _, _, _| {}, + move |_, _, _, _, _| {}, ) } } @@ -201,7 +202,7 @@ macro_rules! dark_api_fn_redirect_log { ).ok(); formatted_args }; - let extract_fn_ptr = |_: &mut GlobalDelayedState, _: &mut FnCallLog| { Some(()) }; + let extract_fn_ptr = |_: &mut GlobalDelayedState, _: &mut FnCallLog| { Some(((), ())) }; let cuda_call = |_: () | { ReprUsize::to_usize(original_fn( $( $arg_id ),* )) }; @@ -215,7 +216,7 @@ macro_rules! dark_api_fn_redirect_log { |status| <$ret_type as ReprUsize>::format_status(status).to_vec(), extract_fn_ptr, cuda_call, - move |_, _, _, _| {} + move |_, _, _, _, _| {} )) } )+ @@ -256,7 +257,7 @@ macro_rules! dark_api_fn_redirect_log_post { ).ok(); formatted_args }; - let extract_fn_ptr = |_: &mut GlobalDelayedState, _: &mut FnCallLog| { Some(()) }; + let extract_fn_ptr = |_: &mut GlobalDelayedState, _: &mut FnCallLog| { Some(((), ())) }; let cuda_call = |_: () | { ReprUsize::to_usize(original_fn( $( $arg_id ),* )) }; @@ -270,7 +271,7 @@ macro_rules! dark_api_fn_redirect_log_post { |status| <$ret_type as ReprUsize>::format_status(status).to_vec(), extract_fn_ptr, cuda_call, - move |state, logger, _, cuda_result| paste! { Self:: [<$fn_ _post>] } ( $( $arg_id ),* , &mut state.cuda_state, logger, <$ret_type as ReprUsize>::from_usize(cuda_result)) + move |state, logger, _, _, cuda_result| paste! { Self:: [<$fn_ _post>] } ( $( $arg_id ),* , &mut state.cuda_state, logger, <$ret_type as ReprUsize>::from_usize(cuda_result)) )) } )+ @@ -287,7 +288,11 @@ impl DarkApiTrace { fn_logger: &mut FnCallLog, _result: CUresult, ) { - state.record_new_library(unsafe { *module }, fatbinc_wrapper.cast(), fn_logger) + state.record_new_library( + unsafe { *module }.0.cast(), + fatbinc_wrapper.cast(), + fn_logger, + ) } fn get_module_from_cubin_ext1_post( @@ -321,7 +326,11 @@ impl DarkApiTrace { observed: UInt::U32(arg5), }); } - state.record_new_library(unsafe { *module }, fatbinc_wrapper.cast(), fn_logger) + state.record_new_library( + unsafe { *module }.0.cast(), + fatbinc_wrapper.cast(), + fn_logger, + ) } fn get_module_from_cubin_ext2_post( @@ -355,7 +364,7 @@ impl DarkApiTrace { observed: UInt::U32(arg5), }); } - state.record_new_library(unsafe { *module }, fatbin_header.cast(), fn_logger) + state.record_new_library(unsafe { *module }.0.cast(), fatbin_header.cast(), fn_logger) } } @@ -770,7 +779,7 @@ macro_rules! extern_redirect { }; let extract_fn_ptr = |state: &mut GlobalDelayedState, _: &mut FnCallLog| { paste::paste! { - state.libcuda. []() + state.libcuda. []().map(|x| ((), x) ) } }; let cuda_call = |fn_ptr: extern $abi fn ( $($arg_type),* ) -> $ret_type | { @@ -783,7 +792,7 @@ macro_rules! extern_redirect { format_curesult, extract_fn_ptr, cuda_call, - move |_, _, _, _| {} + move |_, _, _, _, _| {} ) } )* @@ -806,7 +815,7 @@ macro_rules! extern_redirect_with_post { }; let extract_fn_ptr = |state: &mut GlobalDelayedState, _: &mut FnCallLog| { paste::paste! { - state.libcuda. []() + state.libcuda. []().map(|x| ((), x) ) } }; let cuda_call = |fn_ptr: extern $abi fn ( $($arg_type),* ) -> $ret_type | { @@ -819,7 +828,43 @@ macro_rules! extern_redirect_with_post { format_curesult, extract_fn_ptr, cuda_call, - move |state, logger, _, cuda_result| paste! { [<$fn_name _Post>] } ( $( $arg_id ),* , &mut state.cuda_state, logger, cuda_result ) + move |state, logger, _, _, cuda_result| paste! { [<$fn_name _Post>] } ( $( $arg_id ),* , &mut state.cuda_state, logger, cuda_result ) + ) + } + )* + }; +} + +macro_rules! extern_redirect_with_pre_post { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + $( + #[no_mangle] + #[allow(improper_ctypes_definitions)] + pub extern $abi fn $fn_name ( $( $arg_id : $arg_type),* ) -> $ret_type { + let format_args = || { + let mut formatted_args = Vec::new(); + (paste! { format :: [] }) ( + &mut formatted_args + $(,$arg_id)* + ).ok(); + formatted_args + }; + let extract_fn_ptr = |state: &mut GlobalDelayedState, logger: &mut FnCallLog| { + paste::paste! { + state.libcuda. []().map(|x| (paste! { [<$fn_name _Pre>] } ( $( $arg_id ),* , &mut state.libcuda, &mut state.cuda_state, logger ), x )) + } + }; + let cuda_call = |fn_ptr: extern $abi fn ( $($arg_type),* ) -> $ret_type | { + fn_ptr( $( $arg_id ),* ) + }; + GlobalState2::under_lock( + CudaFunctionName::Normal(stringify!($fn_name)), + Some(format_args), + CUresult::INTERNAL_ERROR, + format_curesult, + extract_fn_ptr, + cuda_call, + move |state, logger, pre_state, _, cuda_result| paste! { [<$fn_name _Post>] } ( $( $arg_id ),* , pre_state, &mut state.libcuda, &mut state.cuda_state, logger, cuda_result ) ) } )* @@ -843,13 +888,15 @@ cuda_function_declarations!( cuModuleLoad, cuModuleLoadData, cuModuleLoadDataEx, + cuLibraryGetFunction, cuModuleGetFunction, cuDeviceGetAttribute, cuDeviceComputeCapability, cuModuleLoadFatBinary, cuLibraryGetModule, - cuLibraryLoadData + cuLibraryLoadData, ], + extern_redirect_with_pre_post <= [cuLaunchKernelEx], override_fn_core <= [cuGetProcAddress, cuGetProcAddress_v2], override_fn_full <= [cuGetExportTable], ); @@ -859,6 +906,7 @@ mod log; #[cfg_attr(windows, path = "os_win.rs")] #[cfg_attr(not(windows), path = "os_unix.rs")] mod os; +mod replay; mod trace; struct GlobalState2 { @@ -907,27 +955,33 @@ impl GlobalState2 { // * Post-call: // We log the output of the CUDA function and any errors that may have occurred. This phase // is also covered by a drop guard which will flush the log buffer in case of panic - fn under_lock<'a, FnPtr: Copy, InnerResult: Copy>( + fn under_lock<'a, PreState, FnPtr: Copy, InnerResult: Copy>( name: CudaFunctionName, args: Option Vec>, internal_error: InnerResult, format_status: impl FnOnce(InnerResult) -> Vec, - pre_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog) -> Option, + pre_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog) -> Option<(PreState, FnPtr)>, inner_call: impl FnOnce(FnPtr) -> InnerResult, - post_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog, FnPtr, InnerResult), + post_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog, PreState, FnPtr, InnerResult), ) -> InnerResult { - fn under_lock_impl<'a, FnPtr: Copy, InnerResult: Copy>( + fn under_lock_impl<'a, PreState, FnPtr: Copy, InnerResult: Copy>( name: CudaFunctionName, args: Option Vec>, internal_error: InnerResult, format_status: impl FnOnce(InnerResult) -> Vec, - pre_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog) -> Option, + pre_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog) -> Option<(PreState, FnPtr)>, inner_call: impl FnOnce(FnPtr) -> InnerResult, - post_call: impl FnOnce(&mut GlobalDelayedState, &mut FnCallLog, FnPtr, InnerResult), + post_call: impl FnOnce( + &mut GlobalDelayedState, + &mut FnCallLog, + PreState, + FnPtr, + InnerResult, + ), ) -> InnerResult { let global_state = GLOBAL_STATE2.lock(); let global_state_ref_cell = &*global_state; - let pre_value = { + let (pre_state, pre_ptr) = { let mut global_state_ref_mut = global_state_ref_cell.borrow_mut(); let global_state = &mut *global_state_ref_mut; let panic_guard = OuterCallGuard { @@ -963,7 +1017,7 @@ impl GlobalState2 { } }; let panic_guard = InnerCallGuard(global_state_ref_cell); - let inner_result = inner_call(pre_value); + let inner_result = inner_call(pre_ptr); let global_state = &mut *global_state_ref_cell.borrow_mut(); mem::forget(panic_guard); let _drop_guard = OuterCallGuard { @@ -978,7 +1032,8 @@ impl GlobalState2 { post_call( global_state.delayed_state.as_mut().unwrap(), &mut logger, - pre_value, + pre_state, + pre_ptr, inner_result, ); inner_result @@ -1098,6 +1153,22 @@ impl FnCallLog { } } + fn try_cuda(&mut self, fn_: impl FnOnce() -> Option) -> Option<()> { + match fn_() { + Some(Ok(())) => Some(()), + None => { + self.subcalls + .push(LogEntry::Error(ErrorEntry::CudaError(None))); + None + } + Some(Err(err)) => { + self.subcalls + .push(LogEntry::Error(ErrorEntry::CudaError(Some(err)))); + None + } + } + } + fn try_(&mut self, f: impl FnOnce(&mut Self) -> Result) -> Option { match f(self) { Err(e) => { @@ -1307,7 +1378,7 @@ pub(crate) fn cuModuleLoadData_Post( fn_logger: &mut FnCallLog, _result: CUresult, ) { - state.record_new_library(unsafe { *module }, raw_image, fn_logger) + state.record_new_library(unsafe { *module }.0.cast(), raw_image, fn_logger) } #[allow(non_snake_case)] @@ -1326,13 +1397,17 @@ pub(crate) fn cuModuleLoadDataEx_Post( #[allow(non_snake_case)] pub(crate) fn cuModuleGetFunction_Post( - _hfunc: *mut CUfunction, - _hmod: CUmodule, - _name: *const ::std::os::raw::c_char, - _state: &mut trace::StateTracker, - _fn_logger: &mut FnCallLog, - _result: CUresult, + hfunc: *mut CUfunction, + hmod: CUmodule, + name: *const ::std::os::raw::c_char, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, + result: CUresult, ) { + if !result.is_ok() { + return; + } + state.record_function_from_module(fn_logger, unsafe { *hfunc }, hmod, name); } #[allow(non_snake_case)] @@ -1385,7 +1460,7 @@ pub(crate) fn cuModuleLoadFatBinary_Post( fn_logger: &mut FnCallLog, _result: CUresult, ) { - state.record_new_library(unsafe { *module }, fatbin_header.cast(), fn_logger) + state.record_new_library(unsafe { *module }.0.cast(), fatbin_header.cast(), fn_logger) } #[allow(non_snake_case)] @@ -1393,13 +1468,13 @@ pub(crate) fn cuLibraryGetModule_Post( module: *mut cuda_types::cuda::CUmodule, library: cuda_types::cuda::CUlibrary, state: &mut trace::StateTracker, - fn_logger: &mut FnCallLog, - _result: CUresult, + _fn_logger: &mut FnCallLog, + result: CUresult, ) { - match state.libraries.get(&library).copied() { - None => fn_logger.log(log::ErrorEntry::UnknownLibrary(library)), - Some(code) => state.record_new_library(unsafe { *module }, code.0, fn_logger), + if !result.is_ok() { + return; } + state.record_module_in_library(unsafe { *module }, library); } #[allow(non_snake_case)] @@ -1416,10 +1491,69 @@ pub(crate) fn cuLibraryLoadData_Post( fn_logger: &mut FnCallLog, _result: CUresult, ) { - state - .libraries - .insert(unsafe { *library }, trace::CodePointer(code)); - // TODO: this is not correct, but it's enough for now, we just want to - // save the binary to disk - state.record_new_library(unsafe { CUmodule((*library).0.cast()) }, code, fn_logger); + state.record_new_library(unsafe { *library }.0.cast(), code, fn_logger); +} + +/* +#[allow(non_snake_case)] +pub(crate) fn cuLaunchKernel_Post( + f: cuda_types::cuda::CUfunction, + gridDimX: ::core::ffi::c_uint, + gridDimY: ::core::ffi::c_uint, + gridDimZ: ::core::ffi::c_uint, + blockDimX: ::core::ffi::c_uint, + blockDimY: ::core::ffi::c_uint, + blockDimZ: ::core::ffi::c_uint, + sharedMemBytes: ::core::ffi::c_uint, + hStream: cuda_types::cuda::CUstream, + kernelParams: *mut *mut ::core::ffi::c_void, + extra: *mut *mut ::core::ffi::c_void, + libcuda: &mut CudaDynamicFns, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, + _result: CUresult, +) { + todo!() +} + */ + +#[allow(non_snake_case)] +pub(crate) fn cuLaunchKernelEx_Pre( + _config: *const cuda_types::cuda::CUlaunchConfig, + f: cuda_types::cuda::CUfunction, + kernel_params: *mut *mut ::core::ffi::c_void, + _extra: *mut *mut ::core::ffi::c_void, + libcuda: &mut CudaDynamicFns, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, +) -> Option> { + state.enqueue_counter += 1; + if kernel_params.is_null() { + fn_logger.log(ErrorEntry::NullPointer("kernel_params")); + return None; + } + replay::pre_kernel_launch(libcuda, state, fn_logger, f, kernel_params) +} + +#[allow(non_snake_case)] +pub(crate) fn cuLaunchKernelEx_Post( + _config: *const cuda_types::cuda::CUlaunchConfig, + _f: cuda_types::cuda::CUfunction, + kernel_params: *mut *mut ::core::ffi::c_void, + _extra: *mut *mut ::core::ffi::c_void, + pre_state: Option>, + libcuda: &mut CudaDynamicFns, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, + _result: CUresult, +) { + let pre_state = unwrap_some_or!(pre_state, return); + replay::post_kernel_launch( + libcuda, + fn_logger, + kernel_params, + pre_state, + state.enqueue_counter, + "".to_string(), + ); } diff --git a/zluda_trace/src/log.rs b/zluda_trace/src/log.rs index 9cbb9cc..b87ff49 100644 --- a/zluda_trace/src/log.rs +++ b/zluda_trace/src/log.rs @@ -1,8 +1,8 @@ use super::Settings; +use crate::trace::SendablePtr; use crate::FnCallLog; use crate::LogEntry; use cuda_types::cuda::*; -use format::CudaDisplay; use std::error::Error; use std::ffi::c_void; use std::ffi::NulError; @@ -267,13 +267,12 @@ pub(crate) enum ErrorEntry { CreatedDumpDirectory(PathBuf), ErrorBox(Box), UnsupportedModule { - module: CUmodule, + handle: *mut c_void, raw_image: *const c_void, kind: &'static str, }, FunctionNotFound(CudaFunctionName), - MalformedModulePath(Utf8Error), - NonUtf8ModuleText(Utf8Error), + Utf8Error(Utf8Error), NulInsideModuleText(NulError), ModuleParsingError(String), Lz4DecompressionFailure, @@ -302,8 +301,11 @@ pub(crate) enum ErrorEntry { overriden: [u64; 2], }, NullPointer(&'static str), - UnknownLibrary(CUlibrary), SavedModule(String), + UnknownFunctionHandle(CUfunction), + UnknownLibrary(CUfunction, SendablePtr), + UnknownFunction(CUfunction, SendablePtr, String), + CudaError(Option), } unsafe impl Send for ErrorEntry {} @@ -345,94 +347,100 @@ impl Display for ErrorEntry { match self { ErrorEntry::IoError(e) => e.fmt(f), ErrorEntry::CreatedDumpDirectory(dir) => { - write!( - f, - "Created trace directory {} ", - dir.as_os_str().to_string_lossy() - ) - } + write!( + f, + "Created trace directory {} ", + dir.as_os_str().to_string_lossy() + ) + } ErrorEntry::ErrorBox(e) => e.fmt(f), ErrorEntry::UnsupportedModule { - module, - raw_image, - kind, - } => { - write!( - f, - "Unsupported {} module {:?} loaded from module image {:?}", - kind, module, raw_image - ) - } - ErrorEntry::MalformedModulePath(e) => e.fmt(f), - ErrorEntry::NonUtf8ModuleText(e) => e.fmt(f), + handle, + raw_image, + kind, + } => { + write!( + f, + "Unsupported {} module {:p} loaded from module image {:p}", + kind, handle, raw_image + ) + } + ErrorEntry::Utf8Error(e) => e.fmt(f), ErrorEntry::ModuleParsingError(file_name) => { - write!( - f, - "Error parsing module, log has been written to {}", - file_name - ) - } + write!( + f, + "Error parsing module, log has been written to {}", + file_name + ) + } ErrorEntry::NulInsideModuleText(e) => e.fmt(f), ErrorEntry::Lz4DecompressionFailure => write!(f, "LZ4 decompression failure"), ErrorEntry::ZstdDecompressionFailure(err_code) => write!(f, "Zstd decompression failure: {}", zstd_safe::get_error_name(*err_code)), ErrorEntry::UnexpectedBinaryField { - field_name, - expected, - observed, - } => write!( - f, - "Unexpected field {}. Expected one of: [{}], observed: {}", - field_name, - expected - .iter() - .map(|x| x.to_string()) - .collect::>() - .join(", "), - observed - ), + field_name, + expected, + observed, + } => write!( + f, + "Unexpected field {}. Expected one of: [{}], observed: {}", + field_name, + expected + .iter() + .map(|x| x.to_string()) + .collect::>() + .join(", "), + observed + ), ErrorEntry::UnexpectedArgument { - arg_name, - expected, - observed, - } => write!( - f, - "Unexpected argument {}. Expected one of: {{{}}}, observed: {}", - arg_name, - expected - .iter() - .map(|x| x.to_string()) - .collect::>() - .join(", "), - observed - ), + arg_name, + expected, + observed, + } => write!( + f, + "Unexpected argument {}. Expected one of: {{{}}}, observed: {}", + arg_name, + expected + .iter() + .map(|x| x.to_string()) + .collect::>() + .join(", "), + observed + ), ErrorEntry::InvalidEnvVar { - var, - pattern, - value, - } => write!( - f, - "Unexpected value of environment variable {var}. Expected pattern: {pattern}, got value: {value}" - ), + var, + pattern, + value, + } => write!( + f, + "Unexpected value of environment variable {var}. Expected pattern: {pattern}, got value: {value}" + ), ErrorEntry::FunctionNotFound(cuda_function_name) => write!( - f, - "No function {cuda_function_name} in the underlying library" - ), + f, + "No function {cuda_function_name} in the underlying library" + ), ErrorEntry::UnexpectedExportTableSize { expected, computed } => { - write!(f, "Table length mismatch. Expected: {expected}, got: {computed}") - } + write!(f, "Table length mismatch. Expected: {expected}, got: {computed}") + } ErrorEntry::IntegrityCheck { original, overriden } => { - write!(f, "Overriding integrity check hash. Original: {original:?}, overriden: {overriden:?}") - } + write!(f, "Overriding integrity check hash. Original: {original:?}, overriden: {overriden:?}") + } ErrorEntry::NullPointer(type_) => { - write!(f, "Null pointer of type {type_} encountered") - } - ErrorEntry::UnknownLibrary(culibrary) => { - write!(f, "Unknown library: ")?; - let mut temp_buffer = Vec::new(); - CudaDisplay::write(culibrary, "", 0, &mut temp_buffer).ok(); - f.write_str(&unsafe { String::from_utf8_unchecked(temp_buffer) }) - } + write!(f, "Null pointer of type {type_} encountered") + } ErrorEntry::SavedModule(file) => write!(f, "Saved module to {file}"), + ErrorEntry::UnknownFunctionHandle(cuda_function_name) => { + write!(f, "Function with unknown provenance: {cuda_function_name:p}") + } + ErrorEntry::UnknownLibrary(cuda_function_name, owner) => { + write!(f, "Function with unknown provenance: {cuda_function_name:p}, owner: {owner:p}") + } + ErrorEntry::UnknownFunction(cuda_function_name, owner, name) => { + write!(f, "Function with unknown provenance: {cuda_function_name:p}, owner: {owner:p}, name: {name}") + } + ErrorEntry::CudaError(cuerror) => { + let cuerror = cuerror.map(|e| e.0); + write!(f, "CUDA error encountered: {cuerror:#?}") + }, } } } diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs new file mode 100644 index 0000000..fd30836 --- /dev/null +++ b/zluda_trace/src/replay.rs @@ -0,0 +1,110 @@ +use crate::{ + log::ErrorEntry, + trace::{self, ParsedModule, SavedKernel}, + CudaDynamicFns, FnCallLog, +}; +use cuda_types::cuda::*; +use zluda_trace_common::replay::KernelParameter; + +pub(crate) fn pre_kernel_launch( + libcuda: &mut CudaDynamicFns, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, + f: CUfunction, + args: *mut *mut std::ffi::c_void, +) -> Option> { + let SavedKernel { name, owner } = fn_logger.try_return(|| { + state + .kernels + .get(&f) + .ok_or(ErrorEntry::UnknownFunctionHandle(f)) + })?; + let ParsedModule { kernels } = fn_logger.try_return(|| { + state + .parsed_libraries + .get(owner) + .ok_or(ErrorEntry::UnknownLibrary(f, *owner)) + })?; + let kernel_params = fn_logger.try_return(|| { + kernels + .get(name) + .ok_or_else(|| ErrorEntry::UnknownFunction(f, *owner, name.clone())) + })?; + let raw_args = unsafe { std::slice::from_raw_parts(args, kernel_params.len()) }; + let mut all_params = Vec::new(); + for (raw_arg, layout) in raw_args.iter().zip(kernel_params.iter()) { + let mut offset = 0; + let mut ptr_overrides = Vec::new(); + while offset + std::mem::size_of::() <= layout.size() { + let maybe_ptr = unsafe { raw_arg.cast::().add(offset) }; + let maybe_ptr = unsafe { maybe_ptr.cast::().read_unaligned() }; + let attrs = &mut [ + CUpointer_attribute_enum::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + CUpointer_attribute_enum::CU_POINTER_ATTRIBUTE_RANGE_SIZE, + ]; + let mut start = 0usize; + let mut size = 0usize; + let mut data = [ + (&mut start as *mut usize).cast::(), + (&mut size as *mut usize).cast::(), + ]; + if let Some(Ok(())) = libcuda.cuPointerGetAttributes( + 2, + attrs.as_mut_ptr(), + data.as_mut_ptr(), + CUdeviceptr_v2(maybe_ptr as _), + ) { + let mut pre_buffer = vec![0u8; size]; + let post_buffer = vec![0u8; size]; + fn_logger.try_cuda(|| { + libcuda.cuMemcpyDtoH_v2( + pre_buffer.as_mut_ptr().cast(), + CUdeviceptr_v2(start as _), + size, + ) + })?; + let buffer_offset = maybe_ptr - start; + ptr_overrides.push((offset, buffer_offset, pre_buffer, post_buffer)); + } + offset += std::mem::size_of::(); + } + all_params.push(KernelParameter { + data: unsafe { std::slice::from_raw_parts(raw_arg.cast::(), layout.size()) } + .to_vec(), + device_ptrs: ptr_overrides, + }); + } + Some(all_params) +} + +pub(crate) fn post_kernel_launch( + libcuda: &mut CudaDynamicFns, + fn_logger: &mut FnCallLog, + args: *mut *mut std::ffi::c_void, + mut kernel_params: Vec, + enqueue_counter: usize, + kernel_name: String, +) -> Option<()> { + let raw_args = unsafe { std::slice::from_raw_parts(args, kernel_params.len()) }; + for (raw_arg, param) in raw_args.iter().zip(kernel_params.iter_mut()) { + for (offset_in_param, offset_in_buffer, _, data_after) in param.device_ptrs.iter_mut() { + let dev_ptr_param = unsafe { raw_arg.cast::().add(*offset_in_param) }; + let mut dev_ptr = unsafe { dev_ptr_param.cast::().read_unaligned() }; + dev_ptr -= *offset_in_buffer; + fn_logger.try_cuda(|| { + libcuda.cuMemcpyDtoH_v2( + data_after.as_mut_ptr().cast(), + CUdeviceptr_v2(dev_ptr as _), + data_after.len(), + ) + })?; + } + } + let path = format!("kernel_{enqueue_counter}_.tar.zst"); + let file = + fn_logger.try_return(|| std::fs::File::create_new(path).map_err(ErrorEntry::IoError))?; + fn_logger.try_return(|| { + zluda_trace_common::replay::save(file, kernel_name, kernel_params) + .map_err(ErrorEntry::IoError) + }) +} diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index f397d34..1740d31 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -4,8 +4,9 @@ use crate::{ }; use cuda_types::cuda::*; use goblin::{elf, elf32, elf64}; -use rustc_hash::{FxHashMap, FxHashSet}; +use rustc_hash::FxHashMap; use std::{ + alloc::Layout, ffi::{c_void, CStr, CString}, fs::{self, File}, io::{self, Read, Write}, @@ -20,25 +21,38 @@ use unwrap_or::unwrap_some_or; // * writes out relevant state change and details to disk and log pub(crate) struct StateTracker { writer: DumpWriter, - pub(crate) libraries: FxHashMap, - saved_modules: FxHashSet, + pub(crate) parsed_libraries: FxHashMap, + pub(crate) submodules: FxHashMap, + pub(crate) kernels: FxHashMap, library_counter: usize, + pub(crate) enqueue_counter: usize, pub(crate) override_cc: Option<(u32, u32)>, } -#[derive(Clone, Copy)] -pub(crate) struct CodePointer(pub *const c_void); +pub(crate) struct ParsedModule { + pub kernels: FxHashMap>, +} -unsafe impl Send for CodePointer {} -unsafe impl Sync for CodePointer {} +pub(crate) struct SavedKernel { + pub name: String, + pub owner: SendablePtr, +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub(crate) struct SendablePtr(*mut c_void); + +unsafe impl Send for SendablePtr {} +unsafe impl Sync for SendablePtr {} impl StateTracker { pub(crate) fn new(settings: &Settings) -> Self { StateTracker { writer: DumpWriter::new(settings.dump_dir.clone()), - libraries: FxHashMap::default(), - saved_modules: FxHashSet::default(), + parsed_libraries: FxHashMap::default(), + submodules: FxHashMap::default(), + kernels: FxHashMap::default(), library_counter: 0, + enqueue_counter: 0, override_cc: settings.override_cc, } } @@ -52,7 +66,7 @@ impl StateTracker { let file_name = match unsafe { CStr::from_ptr(file_name) }.to_str() { Ok(f) => f, Err(err) => { - fn_logger.log(log::ErrorEntry::MalformedModulePath(err)); + fn_logger.log(log::ErrorEntry::Utf8Error(err)); return; } }; @@ -69,21 +83,26 @@ impl StateTracker { let mut module_file = fs::File::open(file_name)?; let mut read_buff = Vec::new(); module_file.read_to_end(&mut read_buff)?; - self.record_new_library(module, read_buff.as_ptr() as *const _, fn_logger); + self.record_new_library(module.0.cast(), read_buff.as_ptr() as *const _, fn_logger); Ok(()) } pub(crate) fn record_new_library( &mut self, - cu_module: CUmodule, + handle: *mut c_void, raw_image: *const c_void, fn_logger: &mut FnCallLog, ) { - self.saved_modules.insert(cu_module); + fn overwrite(current: &mut Option, value: Option) { + if value.is_some() { + *current = value; + } + } + let mut kernel_arguments = None; self.library_counter += 1; let code_ref = fn_logger.try_return(|| { unsafe { zluda_common::CodeLibraryRef::try_load(raw_image) } - .map_err(ErrorEntry::NonUtf8ModuleText) + .map_err(ErrorEntry::Utf8Error) }); let code_ref = unwrap_some_or!(code_ref, return); unsafe { @@ -92,17 +111,20 @@ impl StateTracker { Ok(zluda_common::CodeModuleRef::Elf(elf)) => match get_elf_size(elf) { Some(len) => { let elf_image = std::slice::from_raw_parts(elf.cast::(), len); - self.record_new_submodule(index, elf_image, fn_logger, "elf"); + overwrite( + &mut kernel_arguments, + self.record_new_submodule(index, elf_image, fn_logger, "elf"), + ); } None => fn_logger.log(log::ErrorEntry::UnsupportedModule { - module: cu_module, + handle, raw_image: elf, kind: "ELF", }), }, Ok(zluda_common::CodeModuleRef::Archive(archive)) => { fn_logger.log(log::ErrorEntry::UnsupportedModule { - module: cu_module, + handle, raw_image: archive, kind: "archive", }) @@ -111,23 +133,36 @@ impl StateTracker { if let Some(buffer) = fn_logger .try_(|_| file.get_or_decompress_content().map_err(ErrorEntry::from)) { - self.record_new_submodule(index, &*buffer, fn_logger, file.kind()); + overwrite( + &mut kernel_arguments, + self.record_new_submodule(index, &*buffer, fn_logger, file.kind()), + ); } } Ok(zluda_common::CodeModuleRef::Text(ptx)) => { - self.record_new_submodule(index, ptx.as_bytes(), fn_logger, "ptx"); + overwrite( + &mut kernel_arguments, + self.record_new_submodule(index, ptx.as_bytes(), fn_logger, "ptx"), + ); } }); }; + self.parsed_libraries.insert( + SendablePtr(handle), + ParsedModule { + kernels: kernel_arguments.unwrap_or_default(), + }, + ); } + #[must_use] pub(crate) fn record_new_submodule( &mut self, index: Option<(usize, Option)>, submodule: &[u8], fn_logger: &mut FnCallLog, type_: &'static str, - ) { + ) -> Option>> { fn_logger.try_(|fn_logger| { self.writer .save_module(fn_logger, self.library_counter, index, submodule, type_) @@ -135,28 +170,36 @@ impl StateTracker { }); if type_ == "ptx" { match CString::new(submodule) { - Err(e) => fn_logger.log(log::ErrorEntry::NulInsideModuleText(e)), + Err(e) => { + fn_logger.log(log::ErrorEntry::NulInsideModuleText(e)); + None + } Ok(submodule_cstring) => match submodule_cstring.to_str() { - Err(e) => fn_logger.log(log::ErrorEntry::NonUtf8ModuleText(e)), - Ok(submodule_text) => self.try_parse_and_record_kernels( + Err(e) => { + fn_logger.log(log::ErrorEntry::Utf8Error(e)); + None + } + Ok(submodule_text) => Some(self.try_parse_and_record_kernels( fn_logger, self.library_counter, index, submodule_text, - ), + )), }, } + } else { + None } } - fn try_parse_and_record_kernels( + fn try_parse_and_record_kernels<'input>( &mut self, fn_logger: &mut FnCallLog, module_index: usize, submodule_index: Option<(usize, Option)>, - module_text: &str, - ) { - let errors = ptx_parser::parse_for_errors(module_text); + module_text: &'input str, + ) -> FxHashMap> { + let (errors, params) = ptx_parser::parse_for_errors_and_params(module_text); if !errors.is_empty() { fn_logger.log(log::ErrorEntry::ModuleParsingError( DumpWriter::get_file_name(module_index, submodule_index, "log"), @@ -167,6 +210,46 @@ impl StateTracker { &*errors, )); } + params + } + + pub(crate) fn record_module_in_library(&mut self, module: CUmodule, library: CUlibrary) { + self.submodules.insert(module, library); + } + + pub(crate) fn record_function_from_module( + &mut self, + fn_logger: &mut FnCallLog, + func: CUfunction, + hmod: CUmodule, + name: *const i8, + ) { + let owner = match self.submodules.get(&hmod) { + Some(m) => m.0.cast::(), + None => hmod.0.cast::(), + }; + self.record_function_from_impl(fn_logger, func, owner, name); + } + + fn record_function_from_impl( + &mut self, + fn_logger: &mut FnCallLog, + func: CUfunction, + owner: *mut c_void, + name: *const i8, + ) { + let name = match unsafe { CStr::from_ptr(name) }.to_str() { + Ok(f) => f, + Err(err) => { + fn_logger.log(log::ErrorEntry::Utf8Error(err)); + return; + } + }; + let saved_kernel = SavedKernel { + name: name.to_string(), + owner: SendablePtr(owner), + }; + self.kernels.insert(func, saved_kernel); } } diff --git a/zluda_trace_common/Cargo.toml b/zluda_trace_common/Cargo.toml index 3bf40b5..3eb012a 100644 --- a/zluda_trace_common/Cargo.toml +++ b/zluda_trace_common/Cargo.toml @@ -11,6 +11,10 @@ cuda_types = { path = "../cuda_types" } dark_api = { path = "../dark_api" } format = { path = "../format" } cglue = "0.3.5" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.142" +tar = "0.4" +zstd = "0.13" [target.'cfg(not(windows))'.dependencies] libc = "0.2" diff --git a/zluda_trace_common/src/lib.rs b/zluda_trace_common/src/lib.rs index 547e982..b2118fc 100644 --- a/zluda_trace_common/src/lib.rs +++ b/zluda_trace_common/src/lib.rs @@ -8,6 +8,8 @@ use cuda_types::{ use dark_api::ByteVecFfi; use std::{borrow::Cow, ffi::c_void, num::NonZero, ptr, sync::LazyLock}; +pub mod replay; + pub fn get_export_table() -> Option<::dark_api::zluda_trace::ZludaTraceInternal> { static CU_GET_EXPORT_TABLE: LazyLock< Result< diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs new file mode 100644 index 0000000..9b7ee68 --- /dev/null +++ b/zluda_trace_common/src/replay.rs @@ -0,0 +1,83 @@ +use std::io::Write; +use tar::Header; + +#[derive(serde::Serialize, serde::Deserialize)] +struct Manifest { + kernel_name: String, + parameters: Vec, +} + +#[derive(serde::Serialize, serde::Deserialize)] +struct Parameter { + pointer_offsets: Vec, +} + +#[derive(serde::Serialize, serde::Deserialize)] +struct ParameterPointer { + offset_in_param: usize, + offset_in_buffer: usize, +} + +impl Manifest { + const PATH: &'static str = "manifest.json"; + + fn serialize(&self) -> std::io::Result<(Header, Vec)> { + let vec = serde_json::to_vec(self)?; + let mut header = Header::new_gnu(); + header.set_size(vec.len() as u64); + Ok((header, vec)) + } +} + +pub struct KernelParameter { + pub data: Vec, + pub device_ptrs: Vec<(usize, usize, Vec, Vec)>, +} + +pub fn save( + writer: impl Write, + kernel_name: String, + kernel_params: Vec, +) -> std::io::Result<()> { + let archive = zstd::Encoder::new(writer, 0)?; + let mut builder = tar::Builder::new(archive); + let (mut header, manifest) = Manifest { + kernel_name, + parameters: kernel_params + .iter() + .map(|param| Parameter { + pointer_offsets: param + .device_ptrs + .iter() + .map( + |(offset_in_param, offset_in_buffer, _, _)| ParameterPointer { + offset_in_param: *offset_in_param, + offset_in_buffer: *offset_in_buffer, + }, + ) + .collect(), + }) + .collect(), + } + .serialize()?; + builder.append_data(&mut header, Manifest::PATH, &*manifest)?; + for (i, param) in kernel_params.into_iter().enumerate() { + let path = format!("param_{i}.bin"); + let mut header = Header::new_gnu(); + header.set_size(param.data.len() as u64); + builder.append_data(&mut header, &*path, &*param.data)?; + for (offset_in_param, _, data_before, data_after) in param.device_ptrs { + let path = format!("param_{i}_ptr_{offset_in_param}_pre.bin"); + let mut header = Header::new_gnu(); + header.set_size(data_before.len() as u64); + builder.append_data(&mut header, &*path, &*data_before)?; + let path = format!("param_{i}_ptr_{offset_in_param}_post.bin"); + let mut header = Header::new_gnu(); + header.set_size(data_after.len() as u64); + builder.append_data(&mut header, &*path, &*data_after)?; + } + } + builder.finish()?; + builder.into_inner()?.finish()?; + Ok(()) +} From d880ee78b5bfeb21bbdeec5bb01fa644be9332f7 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 19 Sep 2025 00:58:42 +0000 Subject: [PATCH 02/11] Fix some bugs --- zluda_trace/src/lib.rs | 70 ++++++++++++++++++++++++++++----------- zluda_trace/src/replay.rs | 6 ++-- 2 files changed, 54 insertions(+), 22 deletions(-) diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index a0c56b4..e0f5daf 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -896,7 +896,7 @@ cuda_function_declarations!( cuLibraryGetModule, cuLibraryLoadData, ], - extern_redirect_with_pre_post <= [cuLaunchKernelEx], + extern_redirect_with_pre_post <= [cuLaunchKernel, cuLaunchKernelEx], override_fn_core <= [cuGetProcAddress, cuGetProcAddress_v2], override_fn_full <= [cuGetExportTable], ); @@ -1494,28 +1494,60 @@ pub(crate) fn cuLibraryLoadData_Post( state.record_new_library(unsafe { *library }.0.cast(), code, fn_logger); } -/* +#[allow(non_snake_case)] +pub(crate) fn cuLaunchKernel_Pre( + f: cuda_types::cuda::CUfunction, + _gridDimX: ::core::ffi::c_uint, + _gridDimY: ::core::ffi::c_uint, + _gridDimZ: ::core::ffi::c_uint, + _blockDimX: ::core::ffi::c_uint, + _blockDimY: ::core::ffi::c_uint, + _blockDimZ: ::core::ffi::c_uint, + _sharedMemBytes: ::core::ffi::c_uint, + _hStream: cuda_types::cuda::CUstream, + kernel_params: *mut *mut ::core::ffi::c_void, + _extra: *mut *mut ::core::ffi::c_void, + libcuda: &mut CudaDynamicFns, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, +) -> Option<(String, Vec)> { + state.enqueue_counter += 1; + if kernel_params.is_null() { + fn_logger.log(ErrorEntry::NullPointer("kernel_params")); + return None; + } + replay::pre_kernel_launch(libcuda, state, fn_logger, f, kernel_params) +} + #[allow(non_snake_case)] pub(crate) fn cuLaunchKernel_Post( - f: cuda_types::cuda::CUfunction, - gridDimX: ::core::ffi::c_uint, - gridDimY: ::core::ffi::c_uint, - gridDimZ: ::core::ffi::c_uint, - blockDimX: ::core::ffi::c_uint, - blockDimY: ::core::ffi::c_uint, - blockDimZ: ::core::ffi::c_uint, - sharedMemBytes: ::core::ffi::c_uint, - hStream: cuda_types::cuda::CUstream, - kernelParams: *mut *mut ::core::ffi::c_void, - extra: *mut *mut ::core::ffi::c_void, + _f: cuda_types::cuda::CUfunction, + _gridDimX: ::core::ffi::c_uint, + _gridDimY: ::core::ffi::c_uint, + _gridDimZ: ::core::ffi::c_uint, + _blockDimX: ::core::ffi::c_uint, + _blockDimY: ::core::ffi::c_uint, + _blockDimZ: ::core::ffi::c_uint, + _sharedMemBytes: ::core::ffi::c_uint, + _hStream: cuda_types::cuda::CUstream, + kernel_params: *mut *mut ::core::ffi::c_void, + _extra: *mut *mut ::core::ffi::c_void, + pre_state: Option<(String, Vec)>, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, _result: CUresult, ) { - todo!() + let (kernel_name, pre_state) = unwrap_some_or!(pre_state, return); + replay::post_kernel_launch( + libcuda, + fn_logger, + kernel_params, + pre_state, + state.enqueue_counter, + kernel_name, + ); } - */ #[allow(non_snake_case)] pub(crate) fn cuLaunchKernelEx_Pre( @@ -1526,7 +1558,7 @@ pub(crate) fn cuLaunchKernelEx_Pre( libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, -) -> Option> { +) -> Option<(String, Vec)> { state.enqueue_counter += 1; if kernel_params.is_null() { fn_logger.log(ErrorEntry::NullPointer("kernel_params")); @@ -1541,19 +1573,19 @@ pub(crate) fn cuLaunchKernelEx_Post( _f: cuda_types::cuda::CUfunction, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, - pre_state: Option>, + pre_state: Option<(String, Vec)>, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, _result: CUresult, ) { - let pre_state = unwrap_some_or!(pre_state, return); + let (kernel_name, pre_state) = unwrap_some_or!(pre_state, return); replay::post_kernel_launch( libcuda, fn_logger, kernel_params, pre_state, state.enqueue_counter, - "".to_string(), + kernel_name, ); } diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index fd30836..7bfd7da 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -12,7 +12,7 @@ pub(crate) fn pre_kernel_launch( fn_logger: &mut FnCallLog, f: CUfunction, args: *mut *mut std::ffi::c_void, -) -> Option> { +) -> Option<(String, Vec)> { let SavedKernel { name, owner } = fn_logger.try_return(|| { state .kernels @@ -74,7 +74,7 @@ pub(crate) fn pre_kernel_launch( device_ptrs: ptr_overrides, }); } - Some(all_params) + Some((name.to_string(), all_params)) } pub(crate) fn post_kernel_launch( @@ -100,7 +100,7 @@ pub(crate) fn post_kernel_launch( })?; } } - let path = format!("kernel_{enqueue_counter}_.tar.zst"); + let path = format!("kernel_{enqueue_counter}_{kernel_name}.tar.zst"); let file = fn_logger.try_return(|| std::fs::File::create_new(path).map_err(ErrorEntry::IoError))?; fn_logger.try_return(|| { From f3e143d8ddda55c3f4db5328409fedf2dbba2ce1 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 19 Sep 2025 01:53:01 +0000 Subject: [PATCH 03/11] Save source ptx and save to the right path --- zluda_trace/src/lib.rs | 63 ++++++++++++++++++-------------- zluda_trace/src/replay.rs | 46 +++++++++++++++++------ zluda_trace/src/trace.rs | 26 ++++++++----- zluda_trace_common/src/replay.rs | 4 ++ 4 files changed, 91 insertions(+), 48 deletions(-) diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index e0f5daf..8aa78bc 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1504,19 +1504,33 @@ pub(crate) fn cuLaunchKernel_Pre( _blockDimY: ::core::ffi::c_uint, _blockDimZ: ::core::ffi::c_uint, _sharedMemBytes: ::core::ffi::c_uint, - _hStream: cuda_types::cuda::CUstream, + stream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, -) -> Option<(String, Vec)> { +) -> Option { + launch_kernel_pre(f, stream, kernel_params, libcuda, state, fn_logger) +} + +fn launch_kernel_pre( + f: cuda_types::cuda::CUfunction, + stream: cuda_types::cuda::CUstream, + kernel_params: *mut *mut ::core::ffi::c_void, + libcuda: &mut CudaDynamicFns, + state: &mut trace::StateTracker, + fn_logger: &mut FnCallLog, +) -> Option { state.enqueue_counter += 1; if kernel_params.is_null() { fn_logger.log(ErrorEntry::NullPointer("kernel_params")); return None; } - replay::pre_kernel_launch(libcuda, state, fn_logger, f, kernel_params) + if state.dump_dir().is_none() { + return None; + } + replay::pre_kernel_launch(libcuda, state, fn_logger, f, stream, kernel_params) } #[allow(non_snake_case)] @@ -1529,63 +1543,58 @@ pub(crate) fn cuLaunchKernel_Post( _blockDimY: ::core::ffi::c_uint, _blockDimZ: ::core::ffi::c_uint, _sharedMemBytes: ::core::ffi::c_uint, - _hStream: cuda_types::cuda::CUstream, + stream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, - pre_state: Option<(String, Vec)>, + pre_state: Option, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, _result: CUresult, ) { - let (kernel_name, pre_state) = unwrap_some_or!(pre_state, return); - replay::post_kernel_launch( - libcuda, - fn_logger, - kernel_params, - pre_state, - state.enqueue_counter, - kernel_name, - ); + let pre_state = unwrap_some_or!(pre_state, return); + replay::post_kernel_launch(libcuda, state, fn_logger, stream, kernel_params, pre_state); } #[allow(non_snake_case)] pub(crate) fn cuLaunchKernelEx_Pre( - _config: *const cuda_types::cuda::CUlaunchConfig, + config: *const cuda_types::cuda::CUlaunchConfig, f: cuda_types::cuda::CUfunction, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, -) -> Option<(String, Vec)> { - state.enqueue_counter += 1; - if kernel_params.is_null() { - fn_logger.log(ErrorEntry::NullPointer("kernel_params")); - return None; - } - replay::pre_kernel_launch(libcuda, state, fn_logger, f, kernel_params) +) -> Option { + launch_kernel_pre( + f, + unsafe { *config }.hStream, + kernel_params, + libcuda, + state, + fn_logger, + ) } #[allow(non_snake_case)] pub(crate) fn cuLaunchKernelEx_Post( - _config: *const cuda_types::cuda::CUlaunchConfig, + config: *const cuda_types::cuda::CUlaunchConfig, _f: cuda_types::cuda::CUfunction, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, - pre_state: Option<(String, Vec)>, + pre_state: Option, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, _result: CUresult, ) { - let (kernel_name, pre_state) = unwrap_some_or!(pre_state, return); + let pre_state = unwrap_some_or!(pre_state, return); replay::post_kernel_launch( libcuda, + state, fn_logger, + unsafe { *config }.hStream, kernel_params, pre_state, - state.enqueue_counter, - kernel_name, ); } diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 7bfd7da..8056e07 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -6,20 +6,28 @@ use crate::{ use cuda_types::cuda::*; use zluda_trace_common::replay::KernelParameter; +pub struct LaunchPreState { + kernel_name: String, + source: String, + kernel_params: Vec, +} + pub(crate) fn pre_kernel_launch( libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, f: CUfunction, + stream: CUstream, args: *mut *mut std::ffi::c_void, -) -> Option<(String, Vec)> { +) -> Option { + fn_logger.try_cuda(|| libcuda.cuStreamSynchronize(stream))?; let SavedKernel { name, owner } = fn_logger.try_return(|| { state .kernels .get(&f) .ok_or(ErrorEntry::UnknownFunctionHandle(f)) })?; - let ParsedModule { kernels } = fn_logger.try_return(|| { + let ParsedModule { source, kernels } = fn_logger.try_return(|| { state .parsed_libraries .get(owner) @@ -74,19 +82,25 @@ pub(crate) fn pre_kernel_launch( device_ptrs: ptr_overrides, }); } - Some((name.to_string(), all_params)) + Some(LaunchPreState { + kernel_name: name.to_string(), + source: source.to_string(), + kernel_params: all_params, + }) } pub(crate) fn post_kernel_launch( libcuda: &mut CudaDynamicFns, + state: &trace::StateTracker, fn_logger: &mut FnCallLog, - args: *mut *mut std::ffi::c_void, - mut kernel_params: Vec, - enqueue_counter: usize, - kernel_name: String, + stream: CUstream, + kernel_params: *mut *mut std::ffi::c_void, + mut pre_state: LaunchPreState, ) -> Option<()> { - let raw_args = unsafe { std::slice::from_raw_parts(args, kernel_params.len()) }; - for (raw_arg, param) in raw_args.iter().zip(kernel_params.iter_mut()) { + fn_logger.try_cuda(|| libcuda.cuStreamSynchronize(stream))?; + let raw_args = + unsafe { std::slice::from_raw_parts(kernel_params, pre_state.kernel_params.len()) }; + for (raw_arg, param) in raw_args.iter().zip(pre_state.kernel_params.iter_mut()) { for (offset_in_param, offset_in_buffer, _, data_after) in param.device_ptrs.iter_mut() { let dev_ptr_param = unsafe { raw_arg.cast::().add(*offset_in_param) }; let mut dev_ptr = unsafe { dev_ptr_param.cast::().read_unaligned() }; @@ -100,11 +114,19 @@ pub(crate) fn post_kernel_launch( })?; } } - let path = format!("kernel_{enqueue_counter}_{kernel_name}.tar.zst"); + let enqueue_counter = state.enqueue_counter; + let kernel_name = &pre_state.kernel_name; + let mut path = state.dump_dir()?.to_path_buf(); + path.push(format!("kernel_{enqueue_counter}_{kernel_name}.tar.zst")); let file = fn_logger.try_return(|| std::fs::File::create_new(path).map_err(ErrorEntry::IoError))?; fn_logger.try_return(|| { - zluda_trace_common::replay::save(file, kernel_name, kernel_params) - .map_err(ErrorEntry::IoError) + zluda_trace_common::replay::save( + file, + pre_state.kernel_name, + pre_state.source, + pre_state.kernel_params, + ) + .map_err(ErrorEntry::IoError) }) } diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index 1740d31..219d032 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -30,6 +30,7 @@ pub(crate) struct StateTracker { } pub(crate) struct ParsedModule { + pub source: String, pub kernels: FxHashMap>, } @@ -57,6 +58,10 @@ impl StateTracker { } } + pub(crate) fn dump_dir(&self) -> Option<&PathBuf> { + self.writer.dump_dir.as_ref() + } + pub(crate) fn record_new_module_file( &mut self, module: CUmodule, @@ -147,12 +152,15 @@ impl StateTracker { } }); }; - self.parsed_libraries.insert( - SendablePtr(handle), - ParsedModule { - kernels: kernel_arguments.unwrap_or_default(), - }, - ); + if let Some((source, kernel_arguments)) = kernel_arguments { + self.parsed_libraries.insert( + SendablePtr(handle), + ParsedModule { + source, + kernels: kernel_arguments, + }, + ); + } } #[must_use] @@ -162,7 +170,7 @@ impl StateTracker { submodule: &[u8], fn_logger: &mut FnCallLog, type_: &'static str, - ) -> Option>> { + ) -> Option<(String, FxHashMap>)> { fn_logger.try_(|fn_logger| { self.writer .save_module(fn_logger, self.library_counter, index, submodule, type_) @@ -198,7 +206,7 @@ impl StateTracker { module_index: usize, submodule_index: Option<(usize, Option)>, module_text: &'input str, - ) -> FxHashMap> { + ) -> (String, FxHashMap>) { let (errors, params) = ptx_parser::parse_for_errors_and_params(module_text); if !errors.is_empty() { fn_logger.log(log::ErrorEntry::ModuleParsingError( @@ -210,7 +218,7 @@ impl StateTracker { &*errors, )); } - params + (module_text.to_string(), params) } pub(crate) fn record_module_in_library(&mut self, module: CUmodule, library: CUlibrary) { diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index 9b7ee68..fe98e7c 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -37,6 +37,7 @@ pub struct KernelParameter { pub fn save( writer: impl Write, kernel_name: String, + source: String, kernel_params: Vec, ) -> std::io::Result<()> { let archive = zstd::Encoder::new(writer, 0)?; @@ -61,6 +62,9 @@ pub fn save( } .serialize()?; builder.append_data(&mut header, Manifest::PATH, &*manifest)?; + let mut header = Header::new_gnu(); + header.set_size(source.len() as u64); + builder.append_data(&mut header, "source.ptx", source.as_bytes())?; for (i, param) in kernel_params.into_iter().enumerate() { let path = format!("param_{i}.bin"); let mut header = Header::new_gnu(); From 3289d92f598da0c50f3fa9ea8a028ca9bebd9eca Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 19 Sep 2025 02:07:59 +0000 Subject: [PATCH 04/11] Filter enqueues to be saved --- zluda_trace/src/lib.rs | 16 ++++++++++++++++ zluda_trace/src/replay.rs | 4 ++++ zluda_trace/src/trace.rs | 2 ++ 3 files changed, 22 insertions(+) diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index 8aa78bc..46ef901 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1280,6 +1280,7 @@ struct Settings { dump_dir: Option, libcuda_path: String, override_cc: Option<(u32, u32)>, + kernel_name_filter: Option, } impl Settings { @@ -1328,10 +1329,25 @@ impl Settings { }) }), }; + let kernel_name_filter = match env::var("ZLUDA_SAVE_KERNELS") { + Err(env::VarError::NotPresent) => None, + Err(e) => { + logger.log(log::ErrorEntry::ErrorBox(Box::new(e) as _)); + None + } + Ok(env_string) => logger.try_return(|| { + regex::Regex::new(&env_string).map_err(|e| ErrorEntry::InvalidEnvVar { + var: "ZLUDA_SAVE_KERNELS", + pattern: "valid regex", + value: format!("{} ({})", env_string, e), + }) + }), + }; Settings { dump_dir, libcuda_path, override_cc, + kernel_name_filter, } } diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 8056e07..74fe292 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -27,6 +27,10 @@ pub(crate) fn pre_kernel_launch( .get(&f) .ok_or(ErrorEntry::UnknownFunctionHandle(f)) })?; + let kernel_name_filter = state.kernel_name_filter.as_ref()?; + if !kernel_name_filter.is_match(name) { + return None; + } let ParsedModule { source, kernels } = fn_logger.try_return(|| { state .parsed_libraries diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index 219d032..4b57920 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -27,6 +27,7 @@ pub(crate) struct StateTracker { library_counter: usize, pub(crate) enqueue_counter: usize, pub(crate) override_cc: Option<(u32, u32)>, + pub(crate) kernel_name_filter: Option, } pub(crate) struct ParsedModule { @@ -55,6 +56,7 @@ impl StateTracker { library_counter: 0, enqueue_counter: 0, override_cc: settings.override_cc, + kernel_name_filter: settings.kernel_name_filter.clone(), } } From 160048a293e98b4e9a424c3dbc2d357562768f30 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 19 Sep 2025 23:30:29 +0000 Subject: [PATCH 05/11] Fix cuCtxPopCurrent --- zluda/src/impl/context.rs | 49 ++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index 8933116..e6fb35e 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -188,12 +188,25 @@ pub(crate) unsafe fn push_current_v2(ctx: CUcontext) -> CUresult { push_current(ctx) } -pub(crate) unsafe fn pop_current(ctx: &mut CUcontext) -> CUresult { - STACK.with(|stack| { - if let Some((_ctx, _)) = stack.borrow_mut().pop() { - *ctx = _ctx; - } +pub(crate) unsafe fn pop_current(result: Option<&mut CUcontext>) -> CUresult { + let old_ctx_and_new_device = STACK.with(|stack| { + let mut stack = stack.borrow_mut(); + stack + .pop() + .map(|(ctx, _)| (ctx, stack.last().map(|(_, dev)| *dev))) }); + let ctx = match old_ctx_and_new_device { + Some((old_ctx, new_device)) => { + if let Some(new_device) = new_device { + hipSetDevice(new_device)?; + } + old_ctx + } + None => return CUresult::ERROR_INVALID_CONTEXT, + }; + if let Some(out) = result { + *out = ctx; + } Ok(()) } @@ -213,7 +226,7 @@ pub(crate) unsafe fn destroy_v2(ctx: CUcontext) -> CUresult { zluda_common::drop_checked::(ctx) } -pub(crate) unsafe fn pop_current_v2(ctx: &mut CUcontext) -> CUresult { +pub(crate) unsafe fn pop_current_v2(ctx: Option<&mut CUcontext>) -> CUresult { pop_current(ctx) } @@ -241,3 +254,27 @@ pub(crate) unsafe fn get_api_version( *version = 3020; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::CudaApi; + use cuda_macros::test_cuda; + use std::mem; + + #[test_cuda] + fn empty_pop_fails(api: impl CudaApi) { + api.cuInit(0); + assert_eq!( + api.cuCtxPopCurrent_v2_unchecked(&mut unsafe { mem::zeroed() }), + CUresult::ERROR_INVALID_CONTEXT + ); + } + + #[test_cuda] + fn pop_into_null_succeeds(api: impl CudaApi) { + api.cuInit(0); + api.cuCtxCreate_v2(&mut unsafe { mem::zeroed() }, 0, 0); + api.cuCtxPopCurrent_v2(ptr::null_mut()); + } +} From 2b9c8946ecb8b0a59fc09c1a42b99a37a4df545c Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Sat, 20 Sep 2025 00:43:29 +0000 Subject: [PATCH 06/11] Add replayer --- Cargo.lock | 11 ++++ Cargo.toml | 1 + ptx_parser/src/lib.rs | 2 +- zluda_replay/Cargo.toml | 17 ++++++ zluda_replay/src/main.rs | 98 ++++++++++++++++++++++++++++++++ zluda_trace/src/lib.rs | 38 +++++++++---- zluda_trace/src/replay.rs | 9 ++- zluda_trace_common/Cargo.toml | 1 + zluda_trace_common/src/replay.rs | 60 ++++++++++++++++--- 9 files changed, 215 insertions(+), 22 deletions(-) create mode 100644 zluda_replay/Cargo.toml create mode 100644 zluda_replay/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index ee0d570..31ec2d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3826,6 +3826,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "zluda_replay" +version = "0.0.0" +dependencies = [ + "cuda_macros", + "cuda_types", + "libloading", + "zluda_trace_common", +] + [[package]] name = "zluda_sparse" version = "0.0.0" @@ -3903,6 +3913,7 @@ dependencies = [ "format", "libc", "libloading", + "rustc-hash 2.0.0", "serde", "serde_json", "tar", diff --git a/Cargo.toml b/Cargo.toml index ca051ac..63a82c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ members = [ "zluda_inject", "zluda_ld", "zluda_ml", + "zluda_replay", "zluda_redirect", "zluda_sparse", "compiler", diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 359669a..6078dc5 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -370,7 +370,7 @@ pub fn parse_for_errors_and_params<'input>( .func_directive .input_arguments .iter() - .map(|arg| arg.v_type.layout()) + .map(|arg| arg.info.v_type.layout()) .collect(); Some((func.func_directive.name().to_string(), layouts)) } else { diff --git a/zluda_replay/Cargo.toml b/zluda_replay/Cargo.toml new file mode 100644 index 0000000..73295d4 --- /dev/null +++ b/zluda_replay/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "zluda_replay" +version = "0.0.0" +authors = ["Andrzej Janik "] +edition = "2021" + +[[bin]] +name = "zluda_replay" + +[dependencies] +zluda_trace_common = { path = "../zluda_trace_common" } +cuda_macros = { path = "../cuda_macros" } +cuda_types = { path = "../cuda_types" } +libloading = "0.8" + +[package.metadata.zluda] +debug_only = true diff --git a/zluda_replay/src/main.rs b/zluda_replay/src/main.rs new file mode 100644 index 0000000..50d2d99 --- /dev/null +++ b/zluda_replay/src/main.rs @@ -0,0 +1,98 @@ +use std::mem; + +use cuda_types::cuda::{CUdeviceptr_v2, CUstream}; + +struct CudaDynamicFns { + handle: libloading::Library, +} + +impl CudaDynamicFns { + unsafe fn new(path: &str) -> Result { + let handle = libloading::Library::new(path)?; + Ok(Self { handle }) + } +} + +macro_rules! emit_cuda_fn_table { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + impl CudaDynamicFns { + $( + #[allow(dead_code)] + unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) -> $ret_type { + let func = self.handle.get:: $ret_type>(concat!(stringify!($fn_name), "\0").as_bytes()); + (func.unwrap())($($arg_id),*) + } + )* + } + }; +} + +cuda_macros::cuda_function_declarations!(emit_cuda_fn_table); + +fn main() { + let args: Vec = std::env::args().collect(); + let libcuda = unsafe { CudaDynamicFns::new(&args[1]).unwrap() }; + unsafe { libcuda.cuInit(0) }.unwrap(); + unsafe { libcuda.cuCtxCreate_v2(&mut mem::zeroed(), 0, 0) }.unwrap(); + let reader = std::fs::File::open(&args[2]).unwrap(); + let (mut manifest, mut source, mut buffers) = zluda_trace_common::replay::load(reader); + let mut args = manifest + .parameters + .iter() + .enumerate() + .map(|(i, param)| { + let mut buffer = buffers.remove(&format!("param_{i}.bin")).unwrap(); + for param_ptr in param.pointer_offsets.iter() { + let buffer_param_slice = &mut buffer[param_ptr.offset_in_param + ..param_ptr.offset_in_param + std::mem::size_of::()]; + let mut dev_ptr = unsafe { mem::zeroed() }; + let host_buffer = buffers + .remove(&format!( + "param_{i}_ptr_{}_pre.bin", + param_ptr.offset_in_param + )) + .unwrap(); + unsafe { libcuda.cuMemAlloc_v2(&mut dev_ptr, host_buffer.len()) }.unwrap(); + unsafe { + libcuda.cuMemcpyHtoD_v2(dev_ptr, host_buffer.as_ptr().cast(), host_buffer.len()) + } + .unwrap(); + dev_ptr = CUdeviceptr_v2(unsafe { + dev_ptr + .0 + .cast::() + .add(param_ptr.offset_in_buffer) + .cast() + }); + buffer_param_slice.copy_from_slice(&(dev_ptr.0 as usize).to_ne_bytes()); + } + }) + .collect::>(); + let mut module = unsafe { mem::zeroed() }; + std::fs::write("/tmp/source.ptx", &source).unwrap(); + source.push('\0'); + unsafe { libcuda.cuModuleLoadData(&mut module, source.as_ptr().cast()) }.unwrap(); + let mut function = unsafe { mem::zeroed() }; + manifest.kernel_name.push('\0'); + unsafe { + libcuda.cuModuleGetFunction(&mut function, module, manifest.kernel_name.as_ptr().cast()) + } + .unwrap(); + unsafe { + libcuda.cuLaunchKernel( + function, + manifest.config.grid_dim.0, + manifest.config.grid_dim.1, + manifest.config.grid_dim.2, + manifest.config.block_dim.0, + manifest.config.block_dim.1, + manifest.config.block_dim.2, + manifest.config.shared_mem_bytes, + CUstream(std::ptr::null_mut()), + args.as_mut_ptr().cast(), + std::ptr::null_mut(), + ) + } + .unwrap(); + todo!(); +} diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index 46ef901..fe2e41d 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1552,14 +1552,14 @@ fn launch_kernel_pre( #[allow(non_snake_case)] pub(crate) fn cuLaunchKernel_Post( _f: cuda_types::cuda::CUfunction, - _gridDimX: ::core::ffi::c_uint, - _gridDimY: ::core::ffi::c_uint, - _gridDimZ: ::core::ffi::c_uint, - _blockDimX: ::core::ffi::c_uint, - _blockDimY: ::core::ffi::c_uint, - _blockDimZ: ::core::ffi::c_uint, - _sharedMemBytes: ::core::ffi::c_uint, - stream: cuda_types::cuda::CUstream, + gridDimX: ::core::ffi::c_uint, + gridDimY: ::core::ffi::c_uint, + gridDimZ: ::core::ffi::c_uint, + blockDimX: ::core::ffi::c_uint, + blockDimY: ::core::ffi::c_uint, + blockDimZ: ::core::ffi::c_uint, + sharedMemBytes: ::core::ffi::c_uint, + hStream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, pre_state: Option, @@ -1569,7 +1569,25 @@ pub(crate) fn cuLaunchKernel_Post( _result: CUresult, ) { let pre_state = unwrap_some_or!(pre_state, return); - replay::post_kernel_launch(libcuda, state, fn_logger, stream, kernel_params, pre_state); + replay::post_kernel_launch( + libcuda, + state, + fn_logger, + CUlaunchConfig { + gridDimX, + gridDimY, + gridDimZ, + blockDimX, + blockDimY, + blockDimZ, + sharedMemBytes, + hStream, + attrs: ptr::null_mut(), + numAttrs: 0, + }, + kernel_params, + pre_state, + ); } #[allow(non_snake_case)] @@ -1609,7 +1627,7 @@ pub(crate) fn cuLaunchKernelEx_Post( libcuda, state, fn_logger, - unsafe { *config }.hStream, + unsafe { *config }, kernel_params, pre_state, ); diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 74fe292..1b6c01d 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -97,11 +97,11 @@ pub(crate) fn post_kernel_launch( libcuda: &mut CudaDynamicFns, state: &trace::StateTracker, fn_logger: &mut FnCallLog, - stream: CUstream, + config: CUlaunchConfig, kernel_params: *mut *mut std::ffi::c_void, mut pre_state: LaunchPreState, ) -> Option<()> { - fn_logger.try_cuda(|| libcuda.cuStreamSynchronize(stream))?; + fn_logger.try_cuda(|| libcuda.cuStreamSynchronize(config.hStream))?; let raw_args = unsafe { std::slice::from_raw_parts(kernel_params, pre_state.kernel_params.len()) }; for (raw_arg, param) in raw_args.iter().zip(pre_state.kernel_params.iter_mut()) { @@ -128,6 +128,11 @@ pub(crate) fn post_kernel_launch( zluda_trace_common::replay::save( file, pre_state.kernel_name, + zluda_trace_common::replay::LaunchConfig { + grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), + block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), + shared_mem_bytes: config.sharedMemBytes, + }, pre_state.source, pre_state.kernel_params, ) diff --git a/zluda_trace_common/Cargo.toml b/zluda_trace_common/Cargo.toml index 3eb012a..fc24d59 100644 --- a/zluda_trace_common/Cargo.toml +++ b/zluda_trace_common/Cargo.toml @@ -15,6 +15,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.142" tar = "0.4" zstd = "0.13" +rustc-hash = "2.0.0" [target.'cfg(not(windows))'.dependencies] libc = "0.2" diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index fe98e7c..53005dc 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -1,21 +1,30 @@ -use std::io::Write; +use rustc_hash::FxHashMap; +use std::io::{Read, Write}; use tar::Header; #[derive(serde::Serialize, serde::Deserialize)] -struct Manifest { - kernel_name: String, - parameters: Vec, +pub struct Manifest { + pub kernel_name: String, + pub config: LaunchConfig, + pub parameters: Vec, } #[derive(serde::Serialize, serde::Deserialize)] -struct Parameter { - pointer_offsets: Vec, +pub struct LaunchConfig { + pub grid_dim: (u32, u32, u32), + pub block_dim: (u32, u32, u32), + pub shared_mem_bytes: u32, } #[derive(serde::Serialize, serde::Deserialize)] -struct ParameterPointer { - offset_in_param: usize, - offset_in_buffer: usize, +pub struct Parameter { + pub pointer_offsets: Vec, +} + +#[derive(serde::Serialize, serde::Deserialize)] +pub struct ParameterPointer { + pub offset_in_param: usize, + pub offset_in_buffer: usize, } impl Manifest { @@ -37,6 +46,7 @@ pub struct KernelParameter { pub fn save( writer: impl Write, kernel_name: String, + config: LaunchConfig, source: String, kernel_params: Vec, ) -> std::io::Result<()> { @@ -44,6 +54,7 @@ pub fn save( let mut builder = tar::Builder::new(archive); let (mut header, manifest) = Manifest { kernel_name, + config, parameters: kernel_params .iter() .map(|param| Parameter { @@ -85,3 +96,34 @@ pub fn save( builder.into_inner()?.finish()?; Ok(()) } + +pub fn load(reader: impl Read) -> (Manifest, String, FxHashMap>) { + let archive = zstd::Decoder::new(reader).unwrap(); + let mut archive = tar::Archive::new(archive); + let mut manifest = None; + let mut source = None; + let mut buffers = FxHashMap::default(); + for entry in archive.entries().unwrap() { + let mut entry = entry.unwrap(); + let path = entry.path().unwrap().to_string_lossy().to_string(); + match &*path { + Manifest::PATH => { + manifest = Some(serde_json::from_reader::<_, Manifest>(&mut entry).unwrap()); + } + "source.ptx" => { + let mut string = String::new(); + entry.read_to_string(&mut string).unwrap(); + dbg!(string.len()); + source = Some(string); + } + _ => { + let mut buffer = Vec::new(); + entry.read_to_end(&mut buffer).unwrap(); + buffers.insert(path, buffer); + } + } + } + let manifest = manifest.unwrap(); + let source = source.unwrap(); + (manifest, source, buffers) +} From 18a2b765f759817bc6f0982482dc8c5a5cb03d10 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Sat, 20 Sep 2025 01:54:40 +0000 Subject: [PATCH 07/11] Pass correct arguments --- zluda_replay/src/main.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/zluda_replay/src/main.rs b/zluda_replay/src/main.rs index 50d2d99..47b3ccd 100644 --- a/zluda_replay/src/main.rs +++ b/zluda_replay/src/main.rs @@ -66,6 +66,7 @@ fn main() { }); buffer_param_slice.copy_from_slice(&(dev_ptr.0 as usize).to_ne_bytes()); } + buffer }) .collect::>(); let mut module = unsafe { mem::zeroed() }; @@ -78,6 +79,10 @@ fn main() { libcuda.cuModuleGetFunction(&mut function, module, manifest.kernel_name.as_ptr().cast()) } .unwrap(); + let mut cuda_args = args + .iter_mut() + .map(|arg| arg.as_mut_ptr().cast::()) + .collect::>(); unsafe { libcuda.cuLaunchKernel( function, @@ -89,10 +94,10 @@ fn main() { manifest.config.block_dim.2, manifest.config.shared_mem_bytes, CUstream(std::ptr::null_mut()), - args.as_mut_ptr().cast(), + cuda_args.as_mut_ptr().cast(), std::ptr::null_mut(), ) } .unwrap(); - todo!(); + unsafe { libcuda.cuCtxSynchronize() }.unwrap(); } From 3bad9852a588850d21d5f855e8a24a458accaf86 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Mon, 22 Sep 2025 20:29:22 +0000 Subject: [PATCH 08/11] Minor compiler improvements --- ptx/src/pass/llvm/emit.rs | 46 +++++++++++++++++++-------------------- ptx_parser/src/lib.rs | 5 +++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 76717e1..c27f1aa 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -1656,25 +1656,23 @@ impl<'a> MethodEmitContext<'a> { .ok_or_else(|| error_mismatched_type())?, ); let src2 = self.resolver.value(src2)?; - self.resolver.with_result(arguments.dst, |dst| { - let vec = unsafe { - LLVMBuildInsertElement( - self.builder, - LLVMGetPoison(dst_type), - llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), - LLVM_UNNAMED.as_ptr(), - ) - }; - unsafe { - LLVMBuildInsertElement( - self.builder, - vec, - llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), - dst, - ) - } + let vec = unsafe { + LLVMBuildInsertElement( + self.builder, + LLVMGetPoison(dst_type), + llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), + LLVM_UNNAMED.as_ptr(), + ) + }; + self.resolver.with_result(arguments.dst, |dst| unsafe { + LLVMBuildInsertElement( + self.builder, + vec, + llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), + dst, + ) }) } else { self.resolver.with_result(arguments.dst, |dst| unsafe { @@ -2200,7 +2198,7 @@ impl<'a> MethodEmitContext<'a> { Some(&ast::ScalarType::F32.into()), vec![( self.resolver.value(arguments.src)?, - get_scalar_type(self.context, ast::ScalarType::F32.into()), + get_scalar_type(self.context, ast::ScalarType::F32), )], )?; Ok(()) @@ -2703,14 +2701,14 @@ impl<'a> MethodEmitContext<'a> { let load = unsafe { LLVMBuildLoad2(self.builder, from_type, from, LLVM_UNNAMED.as_ptr()) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(load, cp_size.as_u64() as u32); } let extended = unsafe { LLVMBuildZExt(self.builder, load, to_type, LLVM_UNNAMED.as_ptr()) }; - unsafe { LLVMBuildStore(self.builder, extended, to) }; + let store = unsafe { LLVMBuildStore(self.builder, extended, to) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(store, cp_size.as_u64() as u32); } Ok(()) } @@ -2990,7 +2988,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> { Ok(match scope { ast::MemScope::Cta => c"workgroup", ast::MemScope::Gpu => c"agent", - ast::MemScope::Sys => c"", + ast::MemScope::Sys => c"system", ast::MemScope::Cluster => todo!(), } .as_ptr()) diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 6078dc5..152f72b 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -227,8 +227,9 @@ fn int_immediate<'a, 'input>(input: &mut PtxParser<'a, 'input>) -> PResult Ok(ast::ImmediateValue::S64(-x)), + let full_number = format!("-{num}"); + match i64::from_str_radix(&full_number, radix) { + Ok(x) => Ok(ast::ImmediateValue::S64(x)), Err(err) => Err((ast::ImmediateValue::S64(0), PtxError::from(err))), } } else if is_unsigned { From 07acc64d332b51a92b822ea75e3fa4f63642b790 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Mon, 22 Sep 2025 21:18:01 +0000 Subject: [PATCH 09/11] Allow skipping post-values --- zluda_trace/src/lib.rs | 59 ++++++++++++++++++++++++++------ zluda_trace/src/replay.rs | 52 ++++++++++++++++++++++------ zluda_trace/src/trace.rs | 2 ++ zluda_trace_common/src/replay.rs | 6 ++++ 4 files changed, 98 insertions(+), 21 deletions(-) diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index fe2e41d..4a52791 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1281,6 +1281,7 @@ struct Settings { libcuda_path: String, override_cc: Option<(u32, u32)>, kernel_name_filter: Option, + kernel_no_output: Option, } impl Settings { @@ -1343,11 +1344,28 @@ impl Settings { }) }), }; + let kernel_no_output = match env::var("ZLUDA_SAVE_KERNELS_NO_OUTPUT") { + Err(env::VarError::NotPresent) => None, + Err(e) => { + logger.log(log::ErrorEntry::ErrorBox(Box::new(e) as _)); + None + } + Ok(env_string) => logger + .try_return(|| { + str::parse::(&env_string).map_err(|err| ErrorEntry::InvalidEnvVar { + var: "ZLUDA_SAVE_KERNELS_NO_OUTPUT", + pattern: "number", + value: format!("{} ({})", env_string, err), + }) + }) + .map(|x| x != 0), + }; Settings { dump_dir, libcuda_path, override_cc, kernel_name_filter, + kernel_no_output, } } @@ -1513,25 +1531,45 @@ pub(crate) fn cuLibraryLoadData_Post( #[allow(non_snake_case)] pub(crate) fn cuLaunchKernel_Pre( f: cuda_types::cuda::CUfunction, - _gridDimX: ::core::ffi::c_uint, - _gridDimY: ::core::ffi::c_uint, - _gridDimZ: ::core::ffi::c_uint, - _blockDimX: ::core::ffi::c_uint, - _blockDimY: ::core::ffi::c_uint, - _blockDimZ: ::core::ffi::c_uint, - _sharedMemBytes: ::core::ffi::c_uint, - stream: cuda_types::cuda::CUstream, + gridDimX: ::core::ffi::c_uint, + gridDimY: ::core::ffi::c_uint, + gridDimZ: ::core::ffi::c_uint, + blockDimX: ::core::ffi::c_uint, + blockDimY: ::core::ffi::c_uint, + blockDimZ: ::core::ffi::c_uint, + sharedMemBytes: ::core::ffi::c_uint, + hStream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, ) -> Option { - launch_kernel_pre(f, stream, kernel_params, libcuda, state, fn_logger) + launch_kernel_pre( + f, + CUlaunchConfig { + gridDimX, + gridDimY, + gridDimZ, + blockDimX, + blockDimY, + blockDimZ, + sharedMemBytes, + hStream, + attrs: ptr::null_mut(), + numAttrs: 0, + }, + hStream, + kernel_params, + libcuda, + state, + fn_logger, + ) } fn launch_kernel_pre( f: cuda_types::cuda::CUfunction, + config: CUlaunchConfig, stream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, @@ -1546,7 +1584,7 @@ fn launch_kernel_pre( if state.dump_dir().is_none() { return None; } - replay::pre_kernel_launch(libcuda, state, fn_logger, f, stream, kernel_params) + replay::pre_kernel_launch(libcuda, state, fn_logger, config, f, stream, kernel_params) } #[allow(non_snake_case)] @@ -1602,6 +1640,7 @@ pub(crate) fn cuLaunchKernelEx_Pre( ) -> Option { launch_kernel_pre( f, + unsafe { *config }, unsafe { *config }.hStream, kernel_params, libcuda, diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 1b6c01d..3201d0f 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -16,6 +16,7 @@ pub(crate) fn pre_kernel_launch( libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, + config: CUlaunchConfig, f: CUfunction, stream: CUstream, args: *mut *mut std::ffi::c_void, @@ -60,12 +61,15 @@ pub(crate) fn pre_kernel_launch( (&mut start as *mut usize).cast::(), (&mut size as *mut usize).cast::(), ]; - if let Some(Ok(())) = libcuda.cuPointerGetAttributes( - 2, - attrs.as_mut_ptr(), - data.as_mut_ptr(), - CUdeviceptr_v2(maybe_ptr as _), - ) { + fn_logger.try_cuda(|| { + libcuda.cuPointerGetAttributes( + 2, + attrs.as_mut_ptr(), + data.as_mut_ptr(), + CUdeviceptr_v2(maybe_ptr as _), + ) + })?; + if size != 0 { let mut pre_buffer = vec![0u8; size]; let post_buffer = vec![0u8; size]; fn_logger.try_cuda(|| { @@ -86,11 +90,36 @@ pub(crate) fn pre_kernel_launch( device_ptrs: ptr_overrides, }); } - Some(LaunchPreState { - kernel_name: name.to_string(), - source: source.to_string(), - kernel_params: all_params, - }) + if state.kernel_no_output { + let enqueue_counter = state.enqueue_counter; + let kernel_name = name; + let mut path = state.dump_dir()?.to_path_buf(); + path.push(format!("kernel_{enqueue_counter}_{kernel_name}.tar.zst")); + let file = fn_logger + .try_return(|| std::fs::File::create_new(path).map_err(ErrorEntry::IoError))?; + fn_logger.try_return(|| { + zluda_trace_common::replay::save( + file, + name.to_string(), + false, + zluda_trace_common::replay::LaunchConfig { + grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), + block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), + shared_mem_bytes: config.sharedMemBytes, + }, + source.to_string(), + all_params, + ) + .map_err(ErrorEntry::IoError) + }); + None + } else { + Some(LaunchPreState { + kernel_name: name.to_string(), + source: source.to_string(), + kernel_params: all_params, + }) + } } pub(crate) fn post_kernel_launch( @@ -128,6 +157,7 @@ pub(crate) fn post_kernel_launch( zluda_trace_common::replay::save( file, pre_state.kernel_name, + true, zluda_trace_common::replay::LaunchConfig { grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index 4b57920..9fe8660 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -28,6 +28,7 @@ pub(crate) struct StateTracker { pub(crate) enqueue_counter: usize, pub(crate) override_cc: Option<(u32, u32)>, pub(crate) kernel_name_filter: Option, + pub(crate) kernel_no_output: bool, } pub(crate) struct ParsedModule { @@ -57,6 +58,7 @@ impl StateTracker { enqueue_counter: 0, override_cc: settings.override_cc, kernel_name_filter: settings.kernel_name_filter.clone(), + kernel_no_output: settings.kernel_no_output.unwrap_or(false), } } diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index 53005dc..6933f8d 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -5,6 +5,7 @@ use tar::Header; #[derive(serde::Serialize, serde::Deserialize)] pub struct Manifest { pub kernel_name: String, + pub outputs: bool, pub config: LaunchConfig, pub parameters: Vec, } @@ -46,6 +47,7 @@ pub struct KernelParameter { pub fn save( writer: impl Write, kernel_name: String, + has_outputs: bool, config: LaunchConfig, source: String, kernel_params: Vec, @@ -54,6 +56,7 @@ pub fn save( let mut builder = tar::Builder::new(archive); let (mut header, manifest) = Manifest { kernel_name, + outputs: has_outputs, config, parameters: kernel_params .iter() @@ -86,6 +89,9 @@ pub fn save( let mut header = Header::new_gnu(); header.set_size(data_before.len() as u64); builder.append_data(&mut header, &*path, &*data_before)?; + if !has_outputs { + continue; + } let path = format!("param_{i}_ptr_{offset_in_param}_post.bin"); let mut header = Header::new_gnu(); header.set_size(data_after.len() as u64); From a66fddc0fa70dbfbfcca49618891e70cbc56176a Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Tue, 23 Sep 2025 01:27:53 +0000 Subject: [PATCH 10/11] Fail linking on undefined --- comgr/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comgr/src/lib.rs b/comgr/src/lib.rs index 8546203..2b3c65d 100644 --- a/comgr/src/lib.rs +++ b/comgr/src/lib.rs @@ -219,6 +219,8 @@ pub fn compile_bitcode( compile_to_exec.set_isa_name(gcn_arch)?; compile_to_exec.set_language(Language::LlvmIr)?; let common_options = [ + c"-Xlinker", + c"--no-undefined", c"-mllvm", c"-ignore-tti-inline-compatible", // c"-mllvm", From 7b007074bd24a38b9465c8ab20e769264567e4dd Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Tue, 23 Sep 2025 17:43:26 +0000 Subject: [PATCH 11/11] Set modes --- zluda_trace_common/src/replay.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index 6933f8d..85650ca 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -33,8 +33,7 @@ impl Manifest { fn serialize(&self) -> std::io::Result<(Header, Vec)> { let vec = serde_json::to_vec(self)?; - let mut header = Header::new_gnu(); - header.set_size(vec.len() as u64); + let header = tar_header(vec.len()); Ok((header, vec)) } } @@ -76,25 +75,21 @@ pub fn save( } .serialize()?; builder.append_data(&mut header, Manifest::PATH, &*manifest)?; - let mut header = Header::new_gnu(); - header.set_size(source.len() as u64); + let mut header = tar_header(source.len()); builder.append_data(&mut header, "source.ptx", source.as_bytes())?; for (i, param) in kernel_params.into_iter().enumerate() { let path = format!("param_{i}.bin"); - let mut header = Header::new_gnu(); - header.set_size(param.data.len() as u64); + let mut header = tar_header(param.data.len()); builder.append_data(&mut header, &*path, &*param.data)?; for (offset_in_param, _, data_before, data_after) in param.device_ptrs { let path = format!("param_{i}_ptr_{offset_in_param}_pre.bin"); - let mut header = Header::new_gnu(); - header.set_size(data_before.len() as u64); + let mut header = tar_header(data_before.len()); builder.append_data(&mut header, &*path, &*data_before)?; if !has_outputs { continue; } let path = format!("param_{i}_ptr_{offset_in_param}_post.bin"); - let mut header = Header::new_gnu(); - header.set_size(data_after.len() as u64); + let mut header = tar_header(data_after.len()); builder.append_data(&mut header, &*path, &*data_after)?; } } @@ -103,6 +98,13 @@ pub fn save( Ok(()) } +fn tar_header(size: usize) -> Header { + let mut header = Header::new_gnu(); + header.set_mode(0o644); + header.set_size(size as u64); + header +} + pub fn load(reader: impl Read) -> (Manifest, String, FxHashMap>) { let archive = zstd::Decoder::new(reader).unwrap(); let mut archive = tar::Archive::new(archive);