From 3bad9852a588850d21d5f855e8a24a458accaf86 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Mon, 22 Sep 2025 20:29:22 +0000 Subject: [PATCH 1/2] Minor compiler improvements --- ptx/src/pass/llvm/emit.rs | 46 +++++++++++++++++++-------------------- ptx_parser/src/lib.rs | 5 +++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 76717e1..c27f1aa 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -1656,25 +1656,23 @@ impl<'a> MethodEmitContext<'a> { .ok_or_else(|| error_mismatched_type())?, ); let src2 = self.resolver.value(src2)?; - self.resolver.with_result(arguments.dst, |dst| { - let vec = unsafe { - LLVMBuildInsertElement( - self.builder, - LLVMGetPoison(dst_type), - llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), - LLVM_UNNAMED.as_ptr(), - ) - }; - unsafe { - LLVMBuildInsertElement( - self.builder, - vec, - llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), - dst, - ) - } + let vec = unsafe { + LLVMBuildInsertElement( + self.builder, + LLVMGetPoison(dst_type), + llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), + LLVM_UNNAMED.as_ptr(), + ) + }; + self.resolver.with_result(arguments.dst, |dst| unsafe { + LLVMBuildInsertElement( + self.builder, + vec, + llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), + dst, + ) }) } else { self.resolver.with_result(arguments.dst, |dst| unsafe { @@ -2200,7 +2198,7 @@ impl<'a> MethodEmitContext<'a> { Some(&ast::ScalarType::F32.into()), vec![( self.resolver.value(arguments.src)?, - get_scalar_type(self.context, ast::ScalarType::F32.into()), + get_scalar_type(self.context, ast::ScalarType::F32), )], )?; Ok(()) @@ -2703,14 +2701,14 @@ impl<'a> MethodEmitContext<'a> { let load = unsafe { LLVMBuildLoad2(self.builder, from_type, from, LLVM_UNNAMED.as_ptr()) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(load, cp_size.as_u64() as u32); } let extended = unsafe { LLVMBuildZExt(self.builder, load, to_type, LLVM_UNNAMED.as_ptr()) }; - unsafe { LLVMBuildStore(self.builder, extended, to) }; + let store = unsafe { LLVMBuildStore(self.builder, extended, to) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(store, cp_size.as_u64() as u32); } Ok(()) } @@ -2990,7 +2988,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> { Ok(match scope { ast::MemScope::Cta => c"workgroup", ast::MemScope::Gpu => c"agent", - ast::MemScope::Sys => c"", + ast::MemScope::Sys => c"system", ast::MemScope::Cluster => todo!(), } .as_ptr()) diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 6078dc5..152f72b 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -227,8 +227,9 @@ fn int_immediate<'a, 'input>(input: &mut PtxParser<'a, 'input>) -> PResult Ok(ast::ImmediateValue::S64(-x)), + let full_number = format!("-{num}"); + match i64::from_str_radix(&full_number, radix) { + Ok(x) => Ok(ast::ImmediateValue::S64(x)), Err(err) => Err((ast::ImmediateValue::S64(0), PtxError::from(err))), } } else if is_unsigned { From 07acc64d332b51a92b822ea75e3fa4f63642b790 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Mon, 22 Sep 2025 21:18:01 +0000 Subject: [PATCH 2/2] Allow skipping post-values --- zluda_trace/src/lib.rs | 59 ++++++++++++++++++++++++++------ zluda_trace/src/replay.rs | 52 ++++++++++++++++++++++------ zluda_trace/src/trace.rs | 2 ++ zluda_trace_common/src/replay.rs | 6 ++++ 4 files changed, 98 insertions(+), 21 deletions(-) diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index fe2e41d..4a52791 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1281,6 +1281,7 @@ struct Settings { libcuda_path: String, override_cc: Option<(u32, u32)>, kernel_name_filter: Option, + kernel_no_output: Option, } impl Settings { @@ -1343,11 +1344,28 @@ impl Settings { }) }), }; + let kernel_no_output = match env::var("ZLUDA_SAVE_KERNELS_NO_OUTPUT") { + Err(env::VarError::NotPresent) => None, + Err(e) => { + logger.log(log::ErrorEntry::ErrorBox(Box::new(e) as _)); + None + } + Ok(env_string) => logger + .try_return(|| { + str::parse::(&env_string).map_err(|err| ErrorEntry::InvalidEnvVar { + var: "ZLUDA_SAVE_KERNELS_NO_OUTPUT", + pattern: "number", + value: format!("{} ({})", env_string, err), + }) + }) + .map(|x| x != 0), + }; Settings { dump_dir, libcuda_path, override_cc, kernel_name_filter, + kernel_no_output, } } @@ -1513,25 +1531,45 @@ pub(crate) fn cuLibraryLoadData_Post( #[allow(non_snake_case)] pub(crate) fn cuLaunchKernel_Pre( f: cuda_types::cuda::CUfunction, - _gridDimX: ::core::ffi::c_uint, - _gridDimY: ::core::ffi::c_uint, - _gridDimZ: ::core::ffi::c_uint, - _blockDimX: ::core::ffi::c_uint, - _blockDimY: ::core::ffi::c_uint, - _blockDimZ: ::core::ffi::c_uint, - _sharedMemBytes: ::core::ffi::c_uint, - stream: cuda_types::cuda::CUstream, + gridDimX: ::core::ffi::c_uint, + gridDimY: ::core::ffi::c_uint, + gridDimZ: ::core::ffi::c_uint, + blockDimX: ::core::ffi::c_uint, + blockDimY: ::core::ffi::c_uint, + blockDimZ: ::core::ffi::c_uint, + sharedMemBytes: ::core::ffi::c_uint, + hStream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, ) -> Option { - launch_kernel_pre(f, stream, kernel_params, libcuda, state, fn_logger) + launch_kernel_pre( + f, + CUlaunchConfig { + gridDimX, + gridDimY, + gridDimZ, + blockDimX, + blockDimY, + blockDimZ, + sharedMemBytes, + hStream, + attrs: ptr::null_mut(), + numAttrs: 0, + }, + hStream, + kernel_params, + libcuda, + state, + fn_logger, + ) } fn launch_kernel_pre( f: cuda_types::cuda::CUfunction, + config: CUlaunchConfig, stream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, libcuda: &mut CudaDynamicFns, @@ -1546,7 +1584,7 @@ fn launch_kernel_pre( if state.dump_dir().is_none() { return None; } - replay::pre_kernel_launch(libcuda, state, fn_logger, f, stream, kernel_params) + replay::pre_kernel_launch(libcuda, state, fn_logger, config, f, stream, kernel_params) } #[allow(non_snake_case)] @@ -1602,6 +1640,7 @@ pub(crate) fn cuLaunchKernelEx_Pre( ) -> Option { launch_kernel_pre( f, + unsafe { *config }, unsafe { *config }.hStream, kernel_params, libcuda, diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 1b6c01d..3201d0f 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -16,6 +16,7 @@ pub(crate) fn pre_kernel_launch( libcuda: &mut CudaDynamicFns, state: &mut trace::StateTracker, fn_logger: &mut FnCallLog, + config: CUlaunchConfig, f: CUfunction, stream: CUstream, args: *mut *mut std::ffi::c_void, @@ -60,12 +61,15 @@ pub(crate) fn pre_kernel_launch( (&mut start as *mut usize).cast::(), (&mut size as *mut usize).cast::(), ]; - if let Some(Ok(())) = libcuda.cuPointerGetAttributes( - 2, - attrs.as_mut_ptr(), - data.as_mut_ptr(), - CUdeviceptr_v2(maybe_ptr as _), - ) { + fn_logger.try_cuda(|| { + libcuda.cuPointerGetAttributes( + 2, + attrs.as_mut_ptr(), + data.as_mut_ptr(), + CUdeviceptr_v2(maybe_ptr as _), + ) + })?; + if size != 0 { let mut pre_buffer = vec![0u8; size]; let post_buffer = vec![0u8; size]; fn_logger.try_cuda(|| { @@ -86,11 +90,36 @@ pub(crate) fn pre_kernel_launch( device_ptrs: ptr_overrides, }); } - Some(LaunchPreState { - kernel_name: name.to_string(), - source: source.to_string(), - kernel_params: all_params, - }) + if state.kernel_no_output { + let enqueue_counter = state.enqueue_counter; + let kernel_name = name; + let mut path = state.dump_dir()?.to_path_buf(); + path.push(format!("kernel_{enqueue_counter}_{kernel_name}.tar.zst")); + let file = fn_logger + .try_return(|| std::fs::File::create_new(path).map_err(ErrorEntry::IoError))?; + fn_logger.try_return(|| { + zluda_trace_common::replay::save( + file, + name.to_string(), + false, + zluda_trace_common::replay::LaunchConfig { + grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), + block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), + shared_mem_bytes: config.sharedMemBytes, + }, + source.to_string(), + all_params, + ) + .map_err(ErrorEntry::IoError) + }); + None + } else { + Some(LaunchPreState { + kernel_name: name.to_string(), + source: source.to_string(), + kernel_params: all_params, + }) + } } pub(crate) fn post_kernel_launch( @@ -128,6 +157,7 @@ pub(crate) fn post_kernel_launch( zluda_trace_common::replay::save( file, pre_state.kernel_name, + true, zluda_trace_common::replay::LaunchConfig { grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), diff --git a/zluda_trace/src/trace.rs b/zluda_trace/src/trace.rs index 4b57920..9fe8660 100644 --- a/zluda_trace/src/trace.rs +++ b/zluda_trace/src/trace.rs @@ -28,6 +28,7 @@ pub(crate) struct StateTracker { pub(crate) enqueue_counter: usize, pub(crate) override_cc: Option<(u32, u32)>, pub(crate) kernel_name_filter: Option, + pub(crate) kernel_no_output: bool, } pub(crate) struct ParsedModule { @@ -57,6 +58,7 @@ impl StateTracker { enqueue_counter: 0, override_cc: settings.override_cc, kernel_name_filter: settings.kernel_name_filter.clone(), + kernel_no_output: settings.kernel_no_output.unwrap_or(false), } } diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index 53005dc..6933f8d 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -5,6 +5,7 @@ use tar::Header; #[derive(serde::Serialize, serde::Deserialize)] pub struct Manifest { pub kernel_name: String, + pub outputs: bool, pub config: LaunchConfig, pub parameters: Vec, } @@ -46,6 +47,7 @@ pub struct KernelParameter { pub fn save( writer: impl Write, kernel_name: String, + has_outputs: bool, config: LaunchConfig, source: String, kernel_params: Vec, @@ -54,6 +56,7 @@ pub fn save( let mut builder = tar::Builder::new(archive); let (mut header, manifest) = Manifest { kernel_name, + outputs: has_outputs, config, parameters: kernel_params .iter() @@ -86,6 +89,9 @@ pub fn save( let mut header = Header::new_gnu(); header.set_size(data_before.len() as u64); builder.append_data(&mut header, &*path, &*data_before)?; + if !has_outputs { + continue; + } let path = format!("param_{i}_ptr_{offset_in_param}_post.bin"); let mut header = Header::new_gnu(); header.set_size(data_after.len() as u64);