Add script for replaying dumped kernel (#34)

zluda_dump can already create traces of GPU execution; this script can replay those traces. Additionally, this change adds just enough code in core ZLUDA to support simple PyCUDA execution.
2025-08-06 16:19:29 +00:00 · 2021-01-20 01:49:54 +01:00 · 2021-01-20 01:49:54 +01:00 · 3e2e73ac33
commit 3e2e73ac33
parent ff8135e8a3
7 changed files with 397 additions and 25 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,18 +1,61 @@
-## Dependencies
+# Dependencies

 Development builds of ZLUDA requires following dependencies:

 * CMake
 * Python 3

-Additionally repository have to be clone with Git submodules initalized. If you cloned the repo without initalizing submodules, do this:
+Additionally the repository has to be cloned with Git submodules initialized. If you cloned the repo without initializing submodules, do this:
 ```
 git submodule update --init --recursive
 ```

-## Tests
+# Tests

 Tests should be executed with `--workspace` option to test non-default targets:
 ```
 cargo test --workspace
 ```
+
+# Debugging
+
+## Debugging CUDA applications
+
+When running an application with ZLUDA, quite often you will run into subtle bugs or incompatibilities in the generated GPU code. The best way to debug an application's GPU CUDA code is to use the ZLUDA dumper.
+
+Library `zluda_dump` can be injected into a CUDA application and produce a trace which, for every launched GPU function, contains:
+* PTX source
+* Launch arguments (block size, grid size, shared memory size)
+* Dump of function arguments, both before and after the launch
+
+Example use with GeekBench:
+```
+set ZLUDA_DUMP_KERNEL=knn_match
+set ZLUDA_DUMP_DIR=C:\temp\zluda_dump
+"<ZLUDA_PATH>\zluda_with.exe" "<ZLUDA_PATH>\zluda_dump.dll" -- "geekbench_x86_64.exe" --compute CUDA
+```
+
+The example above, for every execution of GPU function `knn_match`, will save its details into the directory `C:\temp\zluda_dump`.
+
+This dump can be replayed with the `replay.py` script from the `zluda_dump` source directory. Use it like this:
+```
+python replay.py "C:\temp\zluda_dump\geekbench_x86_64.exe"
+```
+You must copy (or symlink) the ZLUDA `nvcuda.dll` into the PyCUDA directory, so that PyCUDA runs using ZLUDA. Example output:
+```
+Intel(R) Graphics [0x3e92] [github.com/vosen/ZLUDA]
+C:\temp\zluda_dump\geekbench_x86_64.exe\4140_scale_pyramid
+C:\temp\zluda_dump\geekbench_x86_64.exe\4345_convolve_1d_vertical_grayscale
+    Skipping, launch block size (512) bigger than maximum block size (256)
+C:\temp\zluda_dump\geekbench_x86_64.exe\4480_scale_pyramid
+6: 
+Arrays are not equal
+
+Mismatched elements: 1200 / 19989588 (0.006%)
+Max absolute difference: 255
+Max relative difference: 255.
+ x: array([  7,   6,   8, ..., 193, 195, 193], dtype=uint8)
+ y: array([  7,   6,   8, ..., 193, 195, 193], dtype=uint8)
+```
+From this output one can observe that in kernel launch 4480, the argument at index 6 (as printed by `replay.py`, counting from zero) of function `scale_pyramid` differs between what was executed on an NVIDIA GPU using CUDA and on an Intel GPU using ZLUDA.  
+__Important__: It's impossible to infer the type (and semantics) of an argument passed to a GPU function. At our level it's a buffer of bytes, and by default `replay.py` simply checks whether the two buffers are byte-equal. That means you will get a ton of false negatives (buffers reported as different although they are equivalent for your purposes) when running `replay.py`. You should override the comparison for your particular case in `replay.py` - it already contains some overrides for GeekBench kernels.
--- a/zluda/src/cuda.rs
+++ b/zluda/src/cuda.rs
@ -2186,7 +2186,7 @@ pub extern "C" fn cuGetErrorString(
    error: CUresult,
    pStr: *mut *const ::std::os::raw::c_char,
 ) -> CUresult {
-    r#impl::unimplemented()
+    r#impl::get_error_string(error,  pStr).encuda()
 }

 #[cfg_attr(not(test), no_mangle)]
@ -2344,7 +2344,7 @@ pub extern "C" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult {

 #[cfg_attr(not(test), no_mangle)]
 pub extern "C" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult {
-    r#impl::unimplemented()
+    r#impl::context::push_current_v2(ctx.decuda())
 }

 #[cfg_attr(not(test), no_mangle)]
@ -2443,7 +2443,7 @@ pub extern "C" fn cuModuleLoad(
    module: *mut CUmodule,
    fname: *const ::std::os::raw::c_char,
 ) -> CUresult {
-    r#impl::unimplemented()
+    r#impl::module::load(module.decuda(), fname).encuda()
 }

 #[cfg_attr(not(test), no_mangle)]
@ -3671,7 +3671,7 @@ pub extern "C" fn cuFuncSetBlockShape(
    y: ::std::os::raw::c_int,
    z: ::std::os::raw::c_int,
 ) -> CUresult {
-    r#impl::unimplemented()
+    r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda()
 }

 #[cfg_attr(not(test), no_mangle)]
@ -4503,3 +4503,33 @@ pub extern "C" fn cuGetExportTable(
 pub extern "C" fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult {
    r#impl::unimplemented()
 }
+
+impl CUoutput_mode_enum {
+    pub const CU_OUT_KEY_VALUE_PAIR: CUoutput_mode_enum = CUoutput_mode_enum(0);
+}
+impl CUoutput_mode_enum {
+    pub const CU_OUT_CSV: CUoutput_mode_enum = CUoutput_mode_enum(1);
+}
+#[repr(transparent)]
+#[derive(Copy, Clone, Hash, PartialEq, Eq)]
+pub struct CUoutput_mode_enum(pub ::std::os::raw::c_uint);
+pub use self::CUoutput_mode_enum as CUoutput_mode;
+
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn cuProfilerInitialize(
+    configFile: *const ::std::os::raw::c_char,
+    outputFile: *const ::std::os::raw::c_char,
+    outputMode: CUoutput_mode,
+) -> CUresult {
+    r#impl::unimplemented()
+}
+
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn cuProfilerStart() -> CUresult {
+    r#impl::unimplemented()
+}
+
+#[cfg_attr(not(test), no_mangle)]
+pub extern "C" fn cuProfilerStop() -> CUresult {
+    r#impl::unimplemented()
+}
--- a/zluda/src/impl/context.rs
+++ b/zluda/src/impl/context.rs
@ -169,6 +169,14 @@ pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> {
    GlobalState::lock(|_| Context::destroy_impl(ctx))?
 }

+pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult {
+    if pctx == ptr::null_mut() {
+        return CUresult::CUDA_ERROR_INVALID_VALUE;
+    }
+    CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx));
+    CUresult::CUDA_SUCCESS
+}
+
 pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult {
    if pctx == ptr::null_mut() {
        return CUresult::CUDA_ERROR_INVALID_VALUE;
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@ -1,9 +1,11 @@
+use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
+use crate::cuda::CUfunction_attribute;
 use ::std::os::raw::{c_uint, c_void};
 use std::{hint, ptr};

-use crate::cuda::CUfunction_attribute;
-
-use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
+const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
+const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
+const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;

 pub type Function = LiveCheck<FunctionData>;

@ -26,6 +28,26 @@ pub struct FunctionData {
    pub arg_size: Vec<usize>,
    pub use_shared_mem: bool,
    pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
+    pub legacy_args: LegacyArguments,
+}
+
+pub struct LegacyArguments {
+    block_shape: Option<(i32, i32, i32)>,
+}
+
+impl LegacyArguments {
+    pub fn new() -> Self {
+        LegacyArguments { block_shape: None }
+    }
+
+    #[allow(dead_code)]
+    pub fn is_initialized(&self) -> bool {
+        self.block_shape.is_some()
+    }
+
+    pub fn reset(&mut self) {
+        self.block_shape = None;
+    }
 }

 impl FunctionData {
@ -53,19 +75,62 @@ pub fn launch_kernel(
    kernel_params: *mut *mut c_void,
    extra: *mut *mut c_void,
 ) -> Result<(), CUresult> {
-    if f == ptr::null_mut() {
+    if f == ptr::null_mut()
+        || (kernel_params == ptr::null_mut() && extra == ptr::null_mut())
+        || (kernel_params != ptr::null_mut() && extra != ptr::null_mut())
+    {
        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
    }
-    if extra != ptr::null_mut() {
-        return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
-    }
    GlobalState::lock_stream(hstream, |stream| {
        let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
-        for (i, arg_size) in func.arg_size.iter().enumerate() {
-            unsafe {
-                func.base
-                    .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))?
-            };
+        if kernel_params != ptr::null_mut() {
+            for (i, arg_size) in func.arg_size.iter().enumerate() {
+                unsafe {
+                    func.base
+                        .set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))?
+                };
+            }
+        } else {
+            let mut offset = 0;
+            let mut buffer_ptr = None;
+            let mut buffer_size = None;
+            loop {
+                match unsafe { *extra.add(offset) } {
+                    CU_LAUNCH_PARAM_END => break,
+                    CU_LAUNCH_PARAM_BUFFER_POINTER => {
+                        buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 });
+                    }
+                    CU_LAUNCH_PARAM_BUFFER_SIZE => {
+                        buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) });
+                    }
+                    _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+                }
+                offset += 2;
+            }
+            match (buffer_size, buffer_ptr) {
+                (Some(buffer_size), Some(buffer_ptr)) => {
+                    let sum_of_kernel_argument_sizes =
+                        func.arg_size.iter().fold(0, |offset, size_of_arg| {
+                            size_of_arg + round_up_to_multiple(offset, *size_of_arg)
+                        });
+                    if buffer_size != sum_of_kernel_argument_sizes {
+                        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+                    }
+                    let mut offset = 0;
+                    for (i, arg_size) in func.arg_size.iter().enumerate() {
+                        let buffer_offset = round_up_to_multiple(offset, *arg_size);
+                        unsafe {
+                            func.base.set_arg_raw(
+                                i as u32,
+                                *arg_size,
+                                buffer_ptr.add(buffer_offset) as *const _,
+                            )?
+                        };
+                        offset = buffer_offset + *arg_size;
+                    }
+                }
+                _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
+            }
        }
        if func.use_shared_mem {
            unsafe {
@ -78,6 +143,7 @@ pub fn launch_kernel(
        }
        func.base
            .set_group_size(block_dim_x, block_dim_y, block_dim_z)?;
+        func.legacy_args.reset();
        let mut cmd_list = stream.command_list()?;
        cmd_list.append_launch_kernel(
            &mut func.base,
@ -90,6 +156,10 @@ pub fn launch_kernel(
    })?
 }

+fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
+    ((x + multiple - 1) / multiple) * multiple
+}
+
 pub(crate) fn get_attribute(
    pi: *mut i32,
    attrib: CUfunction_attribute,
@ -110,3 +180,12 @@ pub(crate) fn get_attribute(
        _ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
    }
 }
+
+pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> {
+    if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    GlobalState::lock_function(func, |func| {
+        func.legacy_args.block_shape = Some((x, y, z));
+    })
+}
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@ -138,10 +138,10 @@ impl From<l0::sys::ze_result_t> for CUresult {
            l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => {
                CUresult::CUDA_ERROR_NOT_INITIALIZED
            }
-            l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION => {
-                CUresult::CUDA_ERROR_INVALID_VALUE
-            }
-            l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT => {
+            l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION
+            | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT
+            | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
+            | l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => {
                CUresult::CUDA_ERROR_INVALID_VALUE
            }
            l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => {
@ -306,6 +306,110 @@ pub fn init() -> Result<(), CUresult> {
    Ok(())
 }

+macro_rules! stringify_curesult {
+    ($x:ident => [ $($variant:ident),+ ]) => {
+        match $x {
+            $(
+                CUresult::$variant => Some(concat!(stringify!($variant), "\0")),
+            )+
+            _ => None
+        }
+    }
+}
+
+pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult {
+    if str == ptr::null_mut() {
+        return CUresult::CUDA_ERROR_INVALID_VALUE;
+    }
+    let text = stringify_curesult!(
+        error => [
+            CUDA_SUCCESS,
+            CUDA_ERROR_INVALID_VALUE,
+            CUDA_ERROR_OUT_OF_MEMORY,
+            CUDA_ERROR_NOT_INITIALIZED,
+            CUDA_ERROR_DEINITIALIZED,
+            CUDA_ERROR_PROFILER_DISABLED,
+            CUDA_ERROR_PROFILER_NOT_INITIALIZED,
+            CUDA_ERROR_PROFILER_ALREADY_STARTED,
+            CUDA_ERROR_PROFILER_ALREADY_STOPPED,
+            CUDA_ERROR_NO_DEVICE,
+            CUDA_ERROR_INVALID_DEVICE,
+            CUDA_ERROR_INVALID_IMAGE,
+            CUDA_ERROR_INVALID_CONTEXT,
+            CUDA_ERROR_CONTEXT_ALREADY_CURRENT,
+            CUDA_ERROR_MAP_FAILED,
+            CUDA_ERROR_UNMAP_FAILED,
+            CUDA_ERROR_ARRAY_IS_MAPPED,
+            CUDA_ERROR_ALREADY_MAPPED,
+            CUDA_ERROR_NO_BINARY_FOR_GPU,
+            CUDA_ERROR_ALREADY_ACQUIRED,
+            CUDA_ERROR_NOT_MAPPED,
+            CUDA_ERROR_NOT_MAPPED_AS_ARRAY,
+            CUDA_ERROR_NOT_MAPPED_AS_POINTER,
+            CUDA_ERROR_ECC_UNCORRECTABLE,
+            CUDA_ERROR_UNSUPPORTED_LIMIT,
+            CUDA_ERROR_CONTEXT_ALREADY_IN_USE,
+            CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+            CUDA_ERROR_INVALID_PTX,
+            CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
+            CUDA_ERROR_NVLINK_UNCORRECTABLE,
+            CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+            CUDA_ERROR_INVALID_SOURCE,
+            CUDA_ERROR_FILE_NOT_FOUND,
+            CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+            CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+            CUDA_ERROR_OPERATING_SYSTEM,
+            CUDA_ERROR_INVALID_HANDLE,
+            CUDA_ERROR_ILLEGAL_STATE,
+            CUDA_ERROR_NOT_FOUND,
+            CUDA_ERROR_NOT_READY,
+            CUDA_ERROR_ILLEGAL_ADDRESS,
+            CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+            CUDA_ERROR_LAUNCH_TIMEOUT,
+            CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+            CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
+            CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+            CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE,
+            CUDA_ERROR_CONTEXT_IS_DESTROYED,
+            CUDA_ERROR_ASSERT,
+            CUDA_ERROR_TOO_MANY_PEERS,
+            CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+            CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+            CUDA_ERROR_HARDWARE_STACK_ERROR,
+            CUDA_ERROR_ILLEGAL_INSTRUCTION,
+            CUDA_ERROR_MISALIGNED_ADDRESS,
+            CUDA_ERROR_INVALID_ADDRESS_SPACE,
+            CUDA_ERROR_INVALID_PC,
+            CUDA_ERROR_LAUNCH_FAILED,
+            CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+            CUDA_ERROR_NOT_PERMITTED,
+            CUDA_ERROR_NOT_SUPPORTED,
+            CUDA_ERROR_SYSTEM_NOT_READY,
+            CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
+            CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE,
+            CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED,
+            CUDA_ERROR_STREAM_CAPTURE_INVALIDATED,
+            CUDA_ERROR_STREAM_CAPTURE_MERGE,
+            CUDA_ERROR_STREAM_CAPTURE_UNMATCHED,
+            CUDA_ERROR_STREAM_CAPTURE_UNJOINED,
+            CUDA_ERROR_STREAM_CAPTURE_ISOLATION,
+            CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
+            CUDA_ERROR_CAPTURED_EVENT,
+            CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD,
+            CUDA_ERROR_TIMEOUT,
+            CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
+            CUDA_ERROR_UNKNOWN
+        ]
+    );
+    match text {
+        Some(text) => {
+            unsafe { *str = text.as_ptr() as *const _ };
+            CUresult::CUDA_SUCCESS
+        }
+        None => CUresult::CUDA_ERROR_INVALID_VALUE,
+    }
+}
+
 unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T {
    mem::transmute(t)
 }
--- a/zluda/src/impl/module.rs
+++ b/zluda/src/impl/module.rs
@ -4,8 +4,10 @@ use std::{
 };

 use super::{
-    device, function::Function, function::FunctionData, CUresult, GlobalState, HasLivenessCookie,
-    LiveCheck,
+    device,
+    function::Function,
+    function::{FunctionData, LegacyArguments},
+    CUresult, GlobalState, HasLivenessCookie, LiveCheck,
 };
 use ptx;

@ -145,6 +147,7 @@ pub fn get_function(
                    arg_size: kernel_info.arguments_sizes.clone(),
                    use_shared_mem: kernel_info.uses_shared_mem,
                    properties: None,
+                    legacy_args: LegacyArguments::new(),
                })))
            }
        };
@ -186,3 +189,17 @@ pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> {
    }
    GlobalState::lock(|_| Module::destroy_impl(module))?
 }
+
+pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> {
+    if pmod == ptr::null_mut() || fname == ptr::null() {
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+    }
+    let path = unsafe { CStr::from_ptr(fname) };
+    let path_utf8 = path
+        .to_str()
+        .map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
+    let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?;
+    let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
+    let spirv_data = SpirvModule::new(module_text)?;
+    load_data_impl(pmod, spirv_data)
+}
--- a/zluda_dump/src/replay.py
+++ b/zluda_dump/src/replay.py
@ -0,0 +1,91 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+import pycuda.tools as py_tools
+from pathlib import PurePath
+import numpy as np
+from os import path
+import os
+import itertools
+import sys
+
+
+# It's impossible to discern the type of a buffer, so equality checks can be overridden here
+def assert_array_equal_override(kernel_name, idx, arr1, arr2):
+    if kernel_name == 'knn_match' and idx == 6:
+        arr1_view = np.frombuffer(arr1, dtype=np.dtype([('f1', np.uint32), ('f2', np.uint32), ('f3', np.uint32)]))
+        np.ndarray.sort(arr1_view)
+        arr2_view = np.frombuffer(arr2, dtype=np.dtype([('f1', np.uint32), ('f2', np.uint32), ('f3', np.uint32)]))
+        np.ndarray.sort(arr2_view)
+    if kernel_name == 'nonmax_suppression' and idx == 7:
+        arr1_view = np.frombuffer(arr1, dtype=np.dtype(np.uint32))
+        np.ndarray.sort(arr1_view)
+        arr2_view = np.frombuffer(arr2, dtype=np.dtype(np.uint32))
+        np.ndarray.sort(arr2_view)
+    np.testing.assert_array_equal(arr1, arr2)
+
+
+def load_arguments(arg_path):
+    is_buffer = arg_path.endswith(".buffer")
+    with open(arg_path, "rb") as f:
+        arg_bytes = f.read()
+    if not is_buffer:
+        if len(arg_bytes) == 1:
+            return np.frombuffer(arg_bytes, dtype=np.uint8)[0], None
+        elif len(arg_bytes) == 2:
+            return np.frombuffer(arg_bytes, dtype=np.uint16)[0], None
+        elif len(arg_bytes) == 4:
+            return np.frombuffer(arg_bytes, dtype=np.uint32)[0], None
+        elif len(arg_bytes) == 8:
+            return np.frombuffer(arg_bytes, dtype=np.uint64)[0], None
+        else:
+            raise Exception('Incorrect size of {}: {}'.format(arg_path, len(arg_bytes)))
+    else:
+        buff = np.frombuffer(bytearray(arg_bytes), dtype=np.uint8)
+        buff.setflags(write=1, align=1)
+        return drv.InOut(buff), buff
+
+
+def parse_arguments(dump_path, prefix):
+    dir = path.join(dump_path, prefix)
+    arg_files = os.listdir(dir)
+    return [load_arguments(path.join(dir, f)) for f in sorted(arg_files)]
+
+def verify_single_dump(input_path, max_block_threads):
+    print(input_path)
+    kernel_name = path.basename(input_path).split("_", 1)[1]
+    with open(path.join(input_path, "launch.txt"), "r") as launch_f:
+        launch_lines = list(map(int, launch_f.readlines()))
+    block = tuple(launch_lines[3:6])
+    launch_block_size = block[0] * block[1] * block[2]
+    if launch_block_size > max_block_threads:
+        print(f"    Skipping, launch block size ({launch_block_size}) bigger than maximum block size ({max_block_threads})")
+        return
+    module = drv.module_from_file(path.join(input_path, "module.ptx"))
+    kernel = module.get_function(kernel_name)
+    pre_args = parse_arguments(input_path, "pre")
+    kernel_pre_args, host_pre_args = zip(*pre_args)
+    kernel(*list(kernel_pre_args), grid=tuple(launch_lines[:3]), block=block, shared=launch_lines[6])
+    post_args = parse_arguments(input_path, "post")
+    _, host_post_args_args = zip(*post_args)
+    for idx, (pre_arg, post_arg) in enumerate(zip(host_pre_args, host_post_args_args)):
+        if pre_arg is None:
+            continue
+        try:
+            assert_array_equal_override(kernel_name, idx, pre_arg, post_arg)
+        except Exception as e:
+            print(f"{idx}: {e}")
+
+def main(argv):
+    device = drv.Device(0)
+    max_threads = device.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
+    print(device.name())
+    input_path = argv[1]
+    if os.path.exists(path.join(input_path, "launch.txt")):
+        verify_single_dump(input_path, max_threads)
+    else:
+        for input_subdir in sorted([path.join(input_path, dir_name) for dir_name in os.listdir(input_path)]):
+            verify_single_dump(input_subdir, max_threads)
+
+
+if __name__ == "__main__":
+    main(sys.argv)