diff --git a/Cargo.lock b/Cargo.lock index ee0d570..31ec2d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3826,6 +3826,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "zluda_replay" +version = "0.0.0" +dependencies = [ + "cuda_macros", + "cuda_types", + "libloading", + "zluda_trace_common", +] + [[package]] name = "zluda_sparse" version = "0.0.0" @@ -3903,6 +3913,7 @@ dependencies = [ "format", "libc", "libloading", + "rustc-hash 2.0.0", "serde", "serde_json", "tar", diff --git a/Cargo.toml b/Cargo.toml index ca051ac..63a82c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ members = [ "zluda_inject", "zluda_ld", "zluda_ml", + "zluda_replay", "zluda_redirect", "zluda_sparse", "compiler", diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 359669a..6078dc5 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -370,7 +370,7 @@ pub fn parse_for_errors_and_params<'input>( .func_directive .input_arguments .iter() - .map(|arg| arg.v_type.layout()) + .map(|arg| arg.info.v_type.layout()) .collect(); Some((func.func_directive.name().to_string(), layouts)) } else { diff --git a/zluda_replay/Cargo.toml b/zluda_replay/Cargo.toml new file mode 100644 index 0000000..73295d4 --- /dev/null +++ b/zluda_replay/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "zluda_replay" +version = "0.0.0" +authors = ["Andrzej Janik "] +edition = "2021" + +[[bin]] +name = "zluda_replay" + +[dependencies] +zluda_trace_common = { path = "../zluda_trace_common" } +cuda_macros = { path = "../cuda_macros" } +cuda_types = { path = "../cuda_types" } +libloading = "0.8" + +[package.metadata.zluda] +debug_only = true diff --git a/zluda_replay/src/main.rs b/zluda_replay/src/main.rs new file mode 100644 index 0000000..50d2d99 --- /dev/null +++ b/zluda_replay/src/main.rs @@ -0,0 +1,98 @@ +use std::mem; + +use cuda_types::cuda::{CUdeviceptr_v2, CUstream}; + +struct CudaDynamicFns { + handle: libloading::Library, +} + +impl CudaDynamicFns { + unsafe fn new(path: &str) -> Result { + let handle = libloading::Library::new(path)?; + Ok(Self { handle }) + } +} + +macro_rules! emit_cuda_fn_table { + ($($abi:literal fn $fn_name:ident( $($arg_id:ident : $arg_type:ty),* ) -> $ret_type:ty;)*) => { + impl CudaDynamicFns { + $( + #[allow(dead_code)] + unsafe fn $fn_name(&self, $($arg_id : $arg_type),*) -> $ret_type { + let func = self.handle.get:: $ret_type>(concat!(stringify!($fn_name), "\0").as_bytes()); + (func.unwrap())($($arg_id),*) + } + )* + } + }; +} + +cuda_macros::cuda_function_declarations!(emit_cuda_fn_table); + +fn main() { + let args: Vec = std::env::args().collect(); + let libcuda = unsafe { CudaDynamicFns::new(&args[1]).unwrap() }; + unsafe { libcuda.cuInit(0) }.unwrap(); + unsafe { libcuda.cuCtxCreate_v2(&mut mem::zeroed(), 0, 0) }.unwrap(); + let reader = std::fs::File::open(&args[2]).unwrap(); + let (mut manifest, mut source, mut buffers) = zluda_trace_common::replay::load(reader); + let mut args = manifest + .parameters + .iter() + .enumerate() + .map(|(i, param)| { + let mut buffer = buffers.remove(&format!("param_{i}.bin")).unwrap(); + for param_ptr in param.pointer_offsets.iter() { + let buffer_param_slice = &mut buffer[param_ptr.offset_in_param + ..param_ptr.offset_in_param + std::mem::size_of::()]; + let mut dev_ptr = unsafe { mem::zeroed() }; + let host_buffer = buffers + .remove(&format!( + "param_{i}_ptr_{}_pre.bin", + param_ptr.offset_in_param + )) + .unwrap(); + unsafe { libcuda.cuMemAlloc_v2(&mut dev_ptr, host_buffer.len()) }.unwrap(); + unsafe { + libcuda.cuMemcpyHtoD_v2(dev_ptr, host_buffer.as_ptr().cast(), host_buffer.len()) + } + .unwrap(); + dev_ptr = CUdeviceptr_v2(unsafe { + dev_ptr + .0 + .cast::() + .add(param_ptr.offset_in_buffer) + .cast() + }); + buffer_param_slice.copy_from_slice(&(dev_ptr.0 as usize).to_ne_bytes()); + } + }) + .collect::>(); + let mut module = unsafe { mem::zeroed() }; + std::fs::write("/tmp/source.ptx", &source).unwrap(); + source.push('\0'); + unsafe { libcuda.cuModuleLoadData(&mut module, source.as_ptr().cast()) }.unwrap(); + let mut function = unsafe { mem::zeroed() }; + manifest.kernel_name.push('\0'); + unsafe { + libcuda.cuModuleGetFunction(&mut function, module, manifest.kernel_name.as_ptr().cast()) + } + .unwrap(); + unsafe { + libcuda.cuLaunchKernel( + function, + manifest.config.grid_dim.0, + manifest.config.grid_dim.1, + manifest.config.grid_dim.2, + manifest.config.block_dim.0, + manifest.config.block_dim.1, + manifest.config.block_dim.2, + manifest.config.shared_mem_bytes, + CUstream(std::ptr::null_mut()), + args.as_mut_ptr().cast(), + std::ptr::null_mut(), + ) + } + .unwrap(); + todo!(); +} diff --git a/zluda_trace/src/lib.rs b/zluda_trace/src/lib.rs index 46ef901..fe2e41d 100644 --- a/zluda_trace/src/lib.rs +++ b/zluda_trace/src/lib.rs @@ -1552,14 +1552,14 @@ fn launch_kernel_pre( #[allow(non_snake_case)] pub(crate) fn cuLaunchKernel_Post( _f: cuda_types::cuda::CUfunction, - _gridDimX: ::core::ffi::c_uint, - _gridDimY: ::core::ffi::c_uint, - _gridDimZ: ::core::ffi::c_uint, - _blockDimX: ::core::ffi::c_uint, - _blockDimY: ::core::ffi::c_uint, - _blockDimZ: ::core::ffi::c_uint, - _sharedMemBytes: ::core::ffi::c_uint, - stream: cuda_types::cuda::CUstream, + gridDimX: ::core::ffi::c_uint, + gridDimY: ::core::ffi::c_uint, + gridDimZ: ::core::ffi::c_uint, + blockDimX: ::core::ffi::c_uint, + blockDimY: ::core::ffi::c_uint, + blockDimZ: ::core::ffi::c_uint, + sharedMemBytes: ::core::ffi::c_uint, + hStream: cuda_types::cuda::CUstream, kernel_params: *mut *mut ::core::ffi::c_void, _extra: *mut *mut ::core::ffi::c_void, pre_state: Option, @@ -1569,7 +1569,25 @@ pub(crate) fn cuLaunchKernel_Post( _result: CUresult, ) { let pre_state = unwrap_some_or!(pre_state, return); - replay::post_kernel_launch(libcuda, state, fn_logger, stream, kernel_params, pre_state); + replay::post_kernel_launch( + libcuda, + state, + fn_logger, + CUlaunchConfig { + gridDimX, + gridDimY, + gridDimZ, + blockDimX, + blockDimY, + blockDimZ, + sharedMemBytes, + hStream, + attrs: ptr::null_mut(), + numAttrs: 0, + }, + kernel_params, + pre_state, + ); } #[allow(non_snake_case)] @@ -1609,7 +1627,7 @@ pub(crate) fn cuLaunchKernelEx_Post( libcuda, state, fn_logger, - unsafe { *config }.hStream, + unsafe { *config }, kernel_params, pre_state, ); diff --git a/zluda_trace/src/replay.rs b/zluda_trace/src/replay.rs index 74fe292..1b6c01d 100644 --- a/zluda_trace/src/replay.rs +++ b/zluda_trace/src/replay.rs @@ -97,11 +97,11 @@ pub(crate) fn post_kernel_launch( libcuda: &mut CudaDynamicFns, state: &trace::StateTracker, fn_logger: &mut FnCallLog, - stream: CUstream, + config: CUlaunchConfig, kernel_params: *mut *mut std::ffi::c_void, mut pre_state: LaunchPreState, ) -> Option<()> { - fn_logger.try_cuda(|| libcuda.cuStreamSynchronize(stream))?; + fn_logger.try_cuda(|| libcuda.cuStreamSynchronize(config.hStream))?; let raw_args = unsafe { std::slice::from_raw_parts(kernel_params, pre_state.kernel_params.len()) }; for (raw_arg, param) in raw_args.iter().zip(pre_state.kernel_params.iter_mut()) { @@ -128,6 +128,11 @@ pub(crate) fn post_kernel_launch( zluda_trace_common::replay::save( file, pre_state.kernel_name, + zluda_trace_common::replay::LaunchConfig { + grid_dim: (config.gridDimX, config.gridDimY, config.gridDimZ), + block_dim: (config.blockDimX, config.blockDimY, config.blockDimZ), + shared_mem_bytes: config.sharedMemBytes, + }, pre_state.source, pre_state.kernel_params, ) diff --git a/zluda_trace_common/Cargo.toml b/zluda_trace_common/Cargo.toml index 3eb012a..fc24d59 100644 --- a/zluda_trace_common/Cargo.toml +++ b/zluda_trace_common/Cargo.toml @@ -15,6 +15,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.142" tar = "0.4" zstd = "0.13" +rustc-hash = "2.0.0" [target.'cfg(not(windows))'.dependencies] libc = "0.2" diff --git a/zluda_trace_common/src/replay.rs b/zluda_trace_common/src/replay.rs index fe98e7c..53005dc 100644 --- a/zluda_trace_common/src/replay.rs +++ b/zluda_trace_common/src/replay.rs @@ -1,21 +1,30 @@ -use std::io::Write; +use rustc_hash::FxHashMap; +use std::io::{Read, Write}; use tar::Header; #[derive(serde::Serialize, serde::Deserialize)] -struct Manifest { - kernel_name: String, - parameters: Vec, +pub struct Manifest { + pub kernel_name: String, + pub config: LaunchConfig, + pub parameters: Vec, } #[derive(serde::Serialize, serde::Deserialize)] -struct Parameter { - pointer_offsets: Vec, +pub struct LaunchConfig { + pub grid_dim: (u32, u32, u32), + pub block_dim: (u32, u32, u32), + pub shared_mem_bytes: u32, } #[derive(serde::Serialize, serde::Deserialize)] -struct ParameterPointer { - offset_in_param: usize, - offset_in_buffer: usize, +pub struct Parameter { + pub pointer_offsets: Vec, +} + +#[derive(serde::Serialize, serde::Deserialize)] +pub struct ParameterPointer { + pub offset_in_param: usize, + pub offset_in_buffer: usize, } impl Manifest { @@ -37,6 +46,7 @@ pub struct KernelParameter { pub fn save( writer: impl Write, kernel_name: String, + config: LaunchConfig, source: String, kernel_params: Vec, ) -> std::io::Result<()> { @@ -44,6 +54,7 @@ pub fn save( let mut builder = tar::Builder::new(archive); let (mut header, manifest) = Manifest { kernel_name, + config, parameters: kernel_params .iter() .map(|param| Parameter { @@ -85,3 +96,34 @@ pub fn save( builder.into_inner()?.finish()?; Ok(()) } + +pub fn load(reader: impl Read) -> (Manifest, String, FxHashMap>) { + let archive = zstd::Decoder::new(reader).unwrap(); + let mut archive = tar::Archive::new(archive); + let mut manifest = None; + let mut source = None; + let mut buffers = FxHashMap::default(); + for entry in archive.entries().unwrap() { + let mut entry = entry.unwrap(); + let path = entry.path().unwrap().to_string_lossy().to_string(); + match &*path { + Manifest::PATH => { + manifest = Some(serde_json::from_reader::<_, Manifest>(&mut entry).unwrap()); + } + "source.ptx" => { + let mut string = String::new(); + entry.read_to_string(&mut string).unwrap(); + dbg!(string.len()); + source = Some(string); + } + _ => { + let mut buffer = Vec::new(); + entry.read_to_end(&mut buffer).unwrap(); + buffers.insert(path, buffer); + } + } + } + let manifest = manifest.unwrap(); + let source = source.unwrap(); + (manifest, source, buffers) +}