From 07acc64d332b51a92b822ea75e3fa4f63642b790 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Mon, 22 Sep 2025 21:18:01 +0000 Subject: [PATCH] Allow skipping post-values --- zluda_trace/src/lib.rs | 59 ++++++++++++++++++++++++++------ zluda_trace/src/replay.rs | 52 ++++++++++++++++++++++------ zluda_trace/src/trace.rs | 2 ++ zluda_trace_common/src/replay.rs | 6 ++++ 4 files changed, 98 insertions(+), 21 deletions(-) diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index fe2e41d..4a52791 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1281,6 +1281,7 @@ struct Settings { libcuda_path: String, override_cc: Option<(u32, u32)>, kernel_name_filter: Option, + kernel_no_output: Option, } impl Settings { @@ -1343,11 +1344,28 @@ impl Settings { }) }), }; + let kernel_no_output = match env::var("ZLUDA_SAVE_KERNELS_NO_OUTPUT") { + Err(env::VarError::NotPresent) => None, + Err(e) => { + logger.log(log::ErrorEntry::ErrorBox(Box::new(e) as _)); + None + } + Ok(env_string) => logger + .try_return(|| { + str::parse::(&env_string).map_err(|err| ErrorEntry::InvalidEnvVar { + var: "ZLUDA_SAVE_KERNELS_NO_OUTPUT", + pattern: "number", + value: format!("{} ({})", env_string, err), + }) + }) + .map(|x| x != 0), + }; Settings { dump_dir, libcuda_path, override_cc, kernel_name_filter, + kernel_no_output, } } @@ -1513,25 +1531,45 @@ pub(crate) fn cuLibraryLoadData_Post( #[allow(non_snake_case)] pub(crate) fn cuLaunchKernel_Pre( f: cuda_types::cuda::CUfunction, - _gridDimX: ::core::ffi::c_uint, - _gridDimY: ::core::ffi::c_uint, - _gridDimZ: ::core::ffi::c_uint, - _blockDimX: ::core::ffi::c_uint, - _blockDimY: ::core::ffi::c_uint, - _blockDimZ: ::core::ffi::c_uint, - _sharedMemBytes: ::core::ffi::c_uint, - stream: cuda_types::cuda::CUstream, + gridDimX: ::core::ffi::c_uint, + gridDimY: ::core::ffi::c_uint, + gridDimZ: ::core::ffi::c_uint, + blockDimX: ::core::ffi::c_uint, + blockDimY: ::core::ffi::c_uint, + blockDimZ: ::core::ffi::c_uint, + sharedMemBytes: ::core::ffi::c_uint, + hStream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, ) -> Option { - launch_kernel_pre(f, stream, kernel_params, libcuda, state, fn_logger) + launch_kernel_pre( + f, + CUlaunchConfig { + gridDimX, + gridDimY, + gridDimZ, + blockDimX, + blockDimY, + blockDimZ, + sharedMemBytes, + hStream, + attrs: ptr::null_mut(), + numAttrs: 0, + }, + hStream, + kernel_params, + libcuda, + state, + fn_logger, + ) } fn launch_kernel_pre( f: cuda_types::cuda::CUfunction, + config: CUlaunchConfig, stream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, @@ -1546,7 +1584,7 @@ fn launch_kernel_pre( if state.dump_dir().is_none() { return None; } - replay::pre_kernel_launch(libcuda, state, fn_logger, f, stream, kernel_params) + replay::pre_kernel_launch(libcuda, state, fn_logger, config, f, stream, kernel_params) } #[allow(non_snake_case)] @@ -1602,6 +1640,7 @@ pub(crate) fn cuLaunchKernelEx_Pre( ) -> Option { launch_kernel_pre( f, + unsafe { *config }, unsafe { *config }.hStream, kernel_params, libcuda, diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 1b6c01d..3201d0f 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -16,6 +16,7 @@ pub(crate) fn pre_kernel_launch( libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, + config: CUlaunchConfig, f: CUfunction, stream: CUstream, args: *mut *mut std::ffi::c_void, @@ -60,12 +61,15 @@ pub(crate) fn pre_kernel_launch( (&mut start as *mut usize).cast::(), (&mut size as *mut usize).cast::(), ]; - if let Some(Ok(())) = libcuda.cuPointerGetAttributes( - 2, - attrs.as_mut_ptr(), - data.as_mut_ptr(), - CUdeviceptr_v2(maybe_ptr as _), - ) { + fn_logger.try_cuda(|| { + libcuda.cuPointerGetAttributes( + 2, + attrs.as_mut_ptr(), + data.as_mut_ptr(), + CUdeviceptr_v2(maybe_ptr as _), + ) + })?; + if size != 0 { let mut pre_buffer = vec![0u8; size]; let post_buffer = vec![0u8; size]; fn_logger.try_cuda(|| { @@ -86,11 +90,36 @@ pub(crate) fn pre_kernel_launch( device_ptrs: ptr_overrides, }); } - Some(LaunchPreState { - kernel_name: name.to_string(), - source: source.to_string(), - kernel_params: all_params, - }) + if state.kernel_no_output { + let enqueue_counter = state.enqueue_counter; + let kernel_name = name; + let mut path = state.dump_dir()?.to_path_buf(); + path.push(format!("kernel_{enqueue_counter}_{kernel_name}.tar.zst")); + let file = fn_logger + .try_return(|| std::fs::File::create_new(path).map_err(ErrorEntry::IoError))?; + fn_logger.try_return(|| { + zluda_trace_common::replay::save( + file, + name.to_string(), + false, + zluda_trace_common::replay::LaunchConfig { + grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), + block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), + shared_mem_bytes: config.sharedMemBytes, + }, + source.to_string(), + all_params, + ) + .map_err(ErrorEntry::IoError) + }); + None + } else { + Some(LaunchPreState { + kernel_name: name.to_string(), + source: source.to_string(), + kernel_params: all_params, + }) + } } pub(crate) fn post_kernel_launch( @@ -128,6 +157,7 @@ pub(crate) fn post_kernel_launch( zluda_trace_common::replay::save( file, pre_state.kernel_name, + true, zluda_trace_common::replay::LaunchConfig { grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index 4b57920..9fe8660 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -28,6 +28,7 @@ pub(crate) struct StateTracker { pub(crate) enqueue_counter: usize, pub(crate) override_cc: Option<(u32, u32)>, pub(crate) kernel_name_filter: Option, + pub(crate) kernel_no_output: bool, } pub(crate) struct ParsedModule { @@ -57,6 +58,7 @@ impl StateTracker { enqueue_counter: 0, override_cc: settings.override_cc, kernel_name_filter: settings.kernel_name_filter.clone(), + kernel_no_output: settings.kernel_no_output.unwrap_or(false), } } diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index 53005dc..6933f8d 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -5,6 +5,7 @@ use tar::Header; #[derive(serde::Serialize, serde::Deserialize)] pub struct Manifest { pub kernel_name: String, + pub outputs: bool, pub config: LaunchConfig, pub parameters: Vec, } @@ -46,6 +47,7 @@ pub struct KernelParameter { pub fn save( writer: impl Write, kernel_name: String, + has_outputs: bool, config: LaunchConfig, source: String, kernel_params: Vec, @@ -54,6 +56,7 @@ pub fn save( let mut builder = tar::Builder::new(archive); let (mut header, manifest) = Manifest { kernel_name, + outputs: has_outputs, config, parameters: kernel_params .iter() @@ -86,6 +89,9 @@ pub fn save( let mut header = Header::new_gnu(); header.set_size(data_before.len() as u64); builder.append_data(&mut header, &*path, &*data_before)?; + if !has_outputs { + continue; + } let path = format!("param_{i}_ptr_{offset_in_param}_post.bin"); let mut header = Header::new_gnu(); header.set_size(data_after.len() as u64);