Add script for replaying dumped kernel (#34)

zluda_dump can already create traces of GPU execution, this script can replay those traces.
Additionally, added just enough code in core ZLUDA to support simple PyCUDA execution.
This commit is contained in:
Andrzej Janik 2021-01-20 01:49:54 +01:00
parent ff8135e8a3
commit 3e2e73ac33
7 changed files with 397 additions and 25 deletions

View file

@ -1,18 +1,61 @@
## Dependencies
# Dependencies
Development builds of ZLUDA require the following dependencies:
* CMake
* Python 3
Additionally repository have to be clone with Git submodules initalized. If you cloned the repo without initalizing submodules, do this:
Additionally the repository has to be cloned with Git submodules initialized. If you cloned the repo without initializing submodules, do this:
```
git submodule update --init --recursive
```
## Tests
# Tests
Tests should be executed with `--workspace` option to test non-default targets:
```
cargo test --workspace
```
# Debugging
## Debugging CUDA applications
When running an application with ZLUDA quite often you will run into subtle bugs or incompatibilities in the generated GPU code. The best way to debug an application's GPU CUDA code is to use ZLUDA dumper.
Library `zluda_dump` can be injected into a CUDA application and produce a trace which, for every launched GPU function, contains:
* PTX source
* Launch arguments (block size, grid size, shared memory size)
* Dump of function arguments, both before and after the launch
Example use with GeekBench:
```
set ZLUDA_DUMP_KERNEL=knn_match
set ZLUDA_DUMP_DIR=C:\temp\zluda_dump
"<ZLUDA_PATH>\zluda_with.exe" "<ZLUDA_PATH>\zluda_dump.dll" -- "geekbench_x86_64.exe" --compute CUDA
```
The example above, for every execution of GPU function `knn_match`, will save its details into the directory `C:\temp\zluda_dump`
This dump can be replayed with `replay.py` script from `zluda_dump` source directory. Use it like this:
```
python replay.py "C:\temp\zluda_dump\geekbench_x86_64.exe"
```
You must copy (or symlink) ZLUDA `nvcuda.dll` into PyCUDA directory, so it will run using ZLUDA. Example output:
```
Intel(R) Graphics [0x3e92] [github.com/vosen/ZLUDA]
C:\temp\zluda_dump\geekbench_x86_64.exe\4140_scale_pyramid
C:\temp\zluda_dump\geekbench_x86_64.exe\4345_convolve_1d_vertical_grayscale
Skipping, launch block size (512) bigger than maximum block size (256)
C:\temp\zluda_dump\geekbench_x86_64.exe\4480_scale_pyramid
6:
Arrays are not equal
Mismatched elements: 1200 / 19989588 (0.006%)
Max absolute difference: 255
Max relative difference: 255.
x: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
y: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
```
From this output one can observe that in kernel launch 4480, argument number 6 (counting from zero) of function `scale_pyramid` differs between what was executed on an NVIDIA GPU using CUDA and an Intel GPU using ZLUDA.
__Important__: It's impossible to infer the type (and semantics) of an argument passed to a GPU function. At our level it's a buffer of bytes, and by default `replay.py` simply checks if two buffers are byte-equal. That means you will see a lot of spurious mismatches (false alarms) when running `replay.py`. You should override the comparison for your particular case in `replay.py` - it already contains some overrides for GeekBench kernels.

View file

@ -2186,7 +2186,7 @@ pub extern "C" fn cuGetErrorString(
error: CUresult,
pStr: *mut *const ::std::os::raw::c_char,
) -> CUresult {
r#impl::unimplemented()
r#impl::get_error_string(error, pStr).encuda()
}
#[cfg_attr(not(test), no_mangle)]
@ -2344,7 +2344,7 @@ pub extern "C" fn cuCtxDestroy_v2(ctx: CUcontext) -> CUresult {
#[cfg_attr(not(test), no_mangle)]
pub extern "C" fn cuCtxPushCurrent_v2(ctx: CUcontext) -> CUresult {
r#impl::unimplemented()
r#impl::context::push_current_v2(ctx.decuda())
}
#[cfg_attr(not(test), no_mangle)]
@ -2443,7 +2443,7 @@ pub extern "C" fn cuModuleLoad(
module: *mut CUmodule,
fname: *const ::std::os::raw::c_char,
) -> CUresult {
r#impl::unimplemented()
r#impl::module::load(module.decuda(), fname).encuda()
}
#[cfg_attr(not(test), no_mangle)]
@ -3671,7 +3671,7 @@ pub extern "C" fn cuFuncSetBlockShape(
y: ::std::os::raw::c_int,
z: ::std::os::raw::c_int,
) -> CUresult {
r#impl::unimplemented()
r#impl::function::set_block_shape(hfunc.decuda(), x, y, z).encuda()
}
#[cfg_attr(not(test), no_mangle)]
@ -4503,3 +4503,33 @@ pub extern "C" fn cuGetExportTable(
// Stub entry point: both arguments are ignored and the call is forwarded to
// `r#impl::unimplemented()`.
pub extern "C" fn cuFuncGetModule(hmod: *mut CUmodule, hfunc: CUfunction) -> CUresult {
r#impl::unimplemented()
}
/// Output mode selector accepted by `cuProfilerInitialize`.
///
/// Transparent wrapper around a C `unsigned int`, so it can cross the FFI boundary by value.
#[repr(transparent)]
#[derive(Copy, Clone, Hash, PartialEq, Eq)]
pub struct CUoutput_mode_enum(pub ::std::os::raw::c_uint);

impl CUoutput_mode_enum {
    /// Raw value `0`.
    pub const CU_OUT_KEY_VALUE_PAIR: Self = Self(0);
    /// Raw value `1`.
    pub const CU_OUT_CSV: Self = Self(1);
}

// Short alias for `CUoutput_mode_enum`.
pub use self::CUoutput_mode_enum as CUoutput_mode;
/// `cuProfilerInitialize` entry point (exported unmangled outside of test builds).
///
/// Currently a stub: `configFile`, `outputFile` and `outputMode` are ignored and the call is
/// forwarded to `r#impl::unimplemented()`.
#[cfg_attr(not(test), no_mangle)]
pub extern "C" fn cuProfilerInitialize(
configFile: *const ::std::os::raw::c_char,
outputFile: *const ::std::os::raw::c_char,
outputMode: CUoutput_mode,
) -> CUresult {
r#impl::unimplemented()
}
/// `cuProfilerStart` entry point (exported unmangled outside of test builds).
///
/// Currently a stub that forwards to `r#impl::unimplemented()`.
#[cfg_attr(not(test), no_mangle)]
pub extern "C" fn cuProfilerStart() -> CUresult {
r#impl::unimplemented()
}
/// `cuProfilerStop` entry point (exported unmangled outside of test builds).
///
/// Currently a stub that forwards to `r#impl::unimplemented()`.
#[cfg_attr(not(test), no_mangle)]
pub extern "C" fn cuProfilerStop() -> CUresult {
r#impl::unimplemented()
}

View file

@ -169,6 +169,14 @@ pub fn destroy_v2(ctx: *mut Context) -> Result<(), CUresult> {
GlobalState::lock(|_| Context::destroy_impl(ctx))?
}
pub(crate) fn push_current_v2(pctx: *mut Context) -> CUresult {
if pctx == ptr::null_mut() {
return CUresult::CUDA_ERROR_INVALID_VALUE;
}
CONTEXT_STACK.with(|stack| stack.borrow_mut().push(pctx));
CUresult::CUDA_SUCCESS
}
pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult {
if pctx == ptr::null_mut() {
return CUresult::CUDA_ERROR_INVALID_VALUE;

View file

@ -1,9 +1,11 @@
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
use crate::cuda::CUfunction_attribute;
use ::std::os::raw::{c_uint, c_void};
use std::{hint, ptr};
use crate::cuda::CUfunction_attribute;
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
// Marker values recognised while `launch_kernel` scans its `extra` array. The array is read
// as (marker, value) pairs, two slots at a time, until the end marker is found.

// Terminates the `extra` array.
const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
// The slot that follows holds the pointer to the packed kernel-argument buffer.
const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
// The slot that follows holds a pointer to a `usize` with the size of that buffer.
const CU_LAUNCH_PARAM_BUFFER_SIZE: *mut c_void = 2 as *mut _;
pub type Function = LiveCheck<FunctionData>;
@ -26,6 +28,26 @@ pub struct FunctionData {
pub arg_size: Vec<usize>,
pub use_shared_mem: bool,
pub properties: Option<Box<l0::sys::ze_kernel_properties_t>>,
pub legacy_args: LegacyArguments,
}
/// Per-function launch state recorded by `set_block_shape` and cleared again by
/// `launch_kernel` through [`LegacyArguments::reset`].
#[derive(Debug, Default)]
pub struct LegacyArguments {
    /// Block dimensions `(x, y, z)` from the most recent `set_block_shape` call, if any.
    block_shape: Option<(i32, i32, i32)>,
}

impl LegacyArguments {
    /// Creates an instance with no block shape recorded (same as `Default::default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` when a block shape is currently recorded.
    #[allow(dead_code)]
    pub fn is_initialized(&self) -> bool {
        self.block_shape.is_some()
    }

    /// Drops any recorded block shape, returning to the state produced by [`LegacyArguments::new`].
    pub fn reset(&mut self) {
        self.block_shape = None;
    }
}
impl FunctionData {
@ -53,19 +75,62 @@ pub fn launch_kernel(
kernel_params: *mut *mut c_void,
extra: *mut *mut c_void,
) -> Result<(), CUresult> {
if f == ptr::null_mut() {
if f == ptr::null_mut()
|| (kernel_params == ptr::null_mut() && extra == ptr::null_mut())
|| (kernel_params != ptr::null_mut() && extra != ptr::null_mut())
{
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
if extra != ptr::null_mut() {
return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED);
}
GlobalState::lock_stream(hstream, |stream| {
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
for (i, arg_size) in func.arg_size.iter().enumerate() {
unsafe {
func.base
.set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))?
};
if kernel_params != ptr::null_mut() {
for (i, arg_size) in func.arg_size.iter().enumerate() {
unsafe {
func.base
.set_arg_raw(i as u32, *arg_size, *kernel_params.add(i))?
};
}
} else {
let mut offset = 0;
let mut buffer_ptr = None;
let mut buffer_size = None;
loop {
match unsafe { *extra.add(offset) } {
CU_LAUNCH_PARAM_END => break,
CU_LAUNCH_PARAM_BUFFER_POINTER => {
buffer_ptr = Some(unsafe { *extra.add(offset + 1) as *mut u8 });
}
CU_LAUNCH_PARAM_BUFFER_SIZE => {
buffer_size = Some(unsafe { *(*extra.add(offset + 1) as *mut usize) });
}
_ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
}
offset += 2;
}
match (buffer_size, buffer_ptr) {
(Some(buffer_size), Some(buffer_ptr)) => {
let sum_of_kernel_argument_sizes =
func.arg_size.iter().fold(0, |offset, size_of_arg| {
size_of_arg + round_up_to_multiple(offset, *size_of_arg)
});
if buffer_size != sum_of_kernel_argument_sizes {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
let mut offset = 0;
for (i, arg_size) in func.arg_size.iter().enumerate() {
let buffer_offset = round_up_to_multiple(offset, *arg_size);
unsafe {
func.base.set_arg_raw(
i as u32,
*arg_size,
buffer_ptr.add(buffer_offset) as *const _,
)?
};
offset = buffer_offset + *arg_size;
}
}
_ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
}
}
if func.use_shared_mem {
unsafe {
@ -78,6 +143,7 @@ pub fn launch_kernel(
}
func.base
.set_group_size(block_dim_x, block_dim_y, block_dim_z)?;
func.legacy_args.reset();
let mut cmd_list = stream.command_list()?;
cmd_list.append_launch_kernel(
&mut func.base,
@ -90,6 +156,10 @@ pub fn launch_kernel(
})?
}
/// Rounds `x` up to the nearest multiple of `multiple`.
///
/// A `multiple` of zero imposes no alignment and returns `x` unchanged instead of dividing
/// by zero. The remainder-based form also avoids the intermediate overflow of
/// `x + multiple - 1` when `x` is already close to `usize::MAX`.
fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
    if multiple == 0 {
        return x;
    }
    match x % multiple {
        0 => x,
        remainder => x + (multiple - remainder),
    }
}
pub(crate) fn get_attribute(
pi: *mut i32,
attrib: CUfunction_attribute,
@ -110,3 +180,12 @@ pub(crate) fn get_attribute(
_ => Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
}
}
pub(crate) fn set_block_shape(func: *mut Function, x: i32, y: i32, z: i32) -> Result<(), CUresult> {
if func == ptr::null_mut() || x < 0 || y < 0 || z < 0 {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
GlobalState::lock_function(func, |func| {
func.legacy_args.block_shape = Some((x, y, z));
})
}

View file

@ -138,10 +138,10 @@ impl From<l0::sys::ze_result_t> for CUresult {
l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => {
CUresult::CUDA_ERROR_NOT_INITIALIZED
}
l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION => {
CUresult::CUDA_ERROR_INVALID_VALUE
}
l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT => {
l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION
| l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT
| l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
| l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => {
CUresult::CUDA_ERROR_INVALID_VALUE
}
l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => {
@ -306,6 +306,110 @@ pub fn init() -> Result<(), CUresult> {
Ok(())
}
// Expands to a `match` on `$x` that maps each listed `CUresult` constant to
// `Some("<CONSTANT_NAME>\0")` and every other value to `None`.
// The trailing `\0` is baked into the literal so its bytes form a NUL-terminated string.
macro_rules! stringify_curesult {
($x:ident => [ $($variant:ident),+ ]) => {
match $x {
$(
CUresult::$variant => Some(concat!(stringify!($variant), "\0")),
)+
// Anything not listed by the caller is reported as unknown.
_ => None
}
}
}
/// Writes a pointer to a static, NUL-terminated string for `error` into `*str`.
///
/// The string is the name of the matching `CUresult` constant (for example
/// `"CUDA_ERROR_INVALID_VALUE"`).
///
/// Returns:
/// * `CUDA_SUCCESS` when `error` is one of the listed codes; `*str` then points at the name.
/// * `CUDA_ERROR_INVALID_VALUE` when `str` is null, or when `error` is not recognised. In the
///   latter case `*str` is set to null, as the CUDA driver API specifies, so callers never
///   read a stale or uninitialised pointer.
pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult {
    if str.is_null() {
        return CUresult::CUDA_ERROR_INVALID_VALUE;
    }
    let text = stringify_curesult!(
        error => [
            CUDA_SUCCESS,
            CUDA_ERROR_INVALID_VALUE,
            CUDA_ERROR_OUT_OF_MEMORY,
            CUDA_ERROR_NOT_INITIALIZED,
            CUDA_ERROR_DEINITIALIZED,
            CUDA_ERROR_PROFILER_DISABLED,
            CUDA_ERROR_PROFILER_NOT_INITIALIZED,
            CUDA_ERROR_PROFILER_ALREADY_STARTED,
            CUDA_ERROR_PROFILER_ALREADY_STOPPED,
            CUDA_ERROR_NO_DEVICE,
            CUDA_ERROR_INVALID_DEVICE,
            CUDA_ERROR_INVALID_IMAGE,
            CUDA_ERROR_INVALID_CONTEXT,
            CUDA_ERROR_CONTEXT_ALREADY_CURRENT,
            CUDA_ERROR_MAP_FAILED,
            CUDA_ERROR_UNMAP_FAILED,
            CUDA_ERROR_ARRAY_IS_MAPPED,
            CUDA_ERROR_ALREADY_MAPPED,
            CUDA_ERROR_NO_BINARY_FOR_GPU,
            CUDA_ERROR_ALREADY_ACQUIRED,
            CUDA_ERROR_NOT_MAPPED,
            CUDA_ERROR_NOT_MAPPED_AS_ARRAY,
            CUDA_ERROR_NOT_MAPPED_AS_POINTER,
            CUDA_ERROR_ECC_UNCORRECTABLE,
            CUDA_ERROR_UNSUPPORTED_LIMIT,
            CUDA_ERROR_CONTEXT_ALREADY_IN_USE,
            CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
            CUDA_ERROR_INVALID_PTX,
            CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
            CUDA_ERROR_NVLINK_UNCORRECTABLE,
            CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
            CUDA_ERROR_INVALID_SOURCE,
            CUDA_ERROR_FILE_NOT_FOUND,
            CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
            CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
            CUDA_ERROR_OPERATING_SYSTEM,
            CUDA_ERROR_INVALID_HANDLE,
            CUDA_ERROR_ILLEGAL_STATE,
            CUDA_ERROR_NOT_FOUND,
            CUDA_ERROR_NOT_READY,
            CUDA_ERROR_ILLEGAL_ADDRESS,
            CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
            CUDA_ERROR_LAUNCH_TIMEOUT,
            CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
            CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
            CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
            CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE,
            CUDA_ERROR_CONTEXT_IS_DESTROYED,
            CUDA_ERROR_ASSERT,
            CUDA_ERROR_TOO_MANY_PEERS,
            CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
            CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
            CUDA_ERROR_HARDWARE_STACK_ERROR,
            CUDA_ERROR_ILLEGAL_INSTRUCTION,
            CUDA_ERROR_MISALIGNED_ADDRESS,
            CUDA_ERROR_INVALID_ADDRESS_SPACE,
            CUDA_ERROR_INVALID_PC,
            CUDA_ERROR_LAUNCH_FAILED,
            CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
            CUDA_ERROR_NOT_PERMITTED,
            CUDA_ERROR_NOT_SUPPORTED,
            CUDA_ERROR_SYSTEM_NOT_READY,
            CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
            CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE,
            CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED,
            CUDA_ERROR_STREAM_CAPTURE_INVALIDATED,
            CUDA_ERROR_STREAM_CAPTURE_MERGE,
            CUDA_ERROR_STREAM_CAPTURE_UNMATCHED,
            CUDA_ERROR_STREAM_CAPTURE_UNJOINED,
            CUDA_ERROR_STREAM_CAPTURE_ISOLATION,
            CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
            CUDA_ERROR_CAPTURED_EVENT,
            CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD,
            CUDA_ERROR_TIMEOUT,
            CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
            CUDA_ERROR_UNKNOWN
        ]
    );
    match text {
        Some(text) => {
            // SAFETY: `str` was checked for null above; `text` is a `'static` literal that
            // already ends with a NUL byte.
            unsafe { *str = text.as_ptr() as *const _ };
            CUresult::CUDA_SUCCESS
        }
        None => {
            // SAFETY: `str` was checked for null above.
            unsafe { *str = ptr::null() };
            CUresult::CUDA_ERROR_INVALID_VALUE
        }
    }
}
/// Re-borrows `t` under an unrelated, caller-chosen lifetime `'b`.
///
/// # Safety
/// The returned reference is no longer tied to `'a` by the compiler. The caller must make
/// sure the referent stays alive, and is not accessed through any other reference, for as
/// long as the returned `&'b mut T` is in use.
unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T {
    let raw: *mut T = t;
    &mut *raw
}

View file

@ -4,8 +4,10 @@ use std::{
};
use super::{
device, function::Function, function::FunctionData, CUresult, GlobalState, HasLivenessCookie,
LiveCheck,
device,
function::Function,
function::{FunctionData, LegacyArguments},
CUresult, GlobalState, HasLivenessCookie, LiveCheck,
};
use ptx;
@ -145,6 +147,7 @@ pub fn get_function(
arg_size: kernel_info.arguments_sizes.clone(),
use_shared_mem: kernel_info.uses_shared_mem,
properties: None,
legacy_args: LegacyArguments::new(),
})))
}
};
@ -186,3 +189,17 @@ pub(crate) fn unload(module: *mut Module) -> Result<(), CUresult> {
}
GlobalState::lock(|_| Module::destroy_impl(module))?
}
pub(crate) fn load(pmod: *mut *mut Module, fname: *const i8) -> Result<(), CUresult> {
if pmod == ptr::null_mut() || fname == ptr::null() {
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
}
let path = unsafe { CStr::from_ptr(fname) };
let path_utf8 = path
.to_str()
.map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?;
let file = std::fs::read(path_utf8).map_err(|_| CUresult::CUDA_ERROR_FILE_NOT_FOUND)?;
let module_text = std::str::from_utf8(&file).map_err(|_| CUresult::CUDA_ERROR_INVALID_PTX)?;
let spirv_data = SpirvModule::new(module_text)?;
load_data_impl(pmod, spirv_data)
}

91
zluda_dump/src/replay.py Normal file
View file

@ -0,0 +1,91 @@
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.tools as py_tools
from pathlib import PurePath
import numpy as np
from os import path
import os
import itertools
import sys
# It's impossible to discern what is the type of a buffer, here you can override equality checks
def assert_array_equal_override(kernel_name, idx, arr1, arr2):
    """Assert that two dumped argument buffers are equal, with per-kernel overrides.

    For the (kernel name, argument index) pairs listed below, both buffers are
    reinterpreted with the given element type and sorted IN PLACE before the
    comparison, so only the set of elements has to match, not their order.
    Every other argument is compared element by element as-is.

    Raises AssertionError (from numpy.testing) when the buffers differ.
    """
    order_insensitive = {
        ('knn_match', 6): np.dtype([('f1', np.uint32), ('f2', np.uint32), ('f3', np.uint32)]),
        ('nonmax_suppression', 7): np.dtype(np.uint32),
    }
    element_type = order_insensitive.get((kernel_name, idx))
    if element_type is not None:
        for raw in (arr1, arr2):
            # The view shares memory with `raw`, so sorting it reorders `raw` itself.
            np.frombuffer(raw, dtype=element_type).sort()
    np.testing.assert_array_equal(arr1, arr2)
def load_arguments(arg_path):
    """Load one dumped kernel argument from `arg_path`.

    Returns a pair `(kernel_argument, host_buffer)`:
      * files ending in ".buffer" become a writable uint8 array wrapped in
        `drv.InOut`, with the same array returned as `host_buffer`;
      * any other file must be 1, 2, 4 or 8 bytes long and is returned as the
        matching unsigned numpy scalar, with `host_buffer` set to None.

    Raises Exception when a non-buffer file has an unsupported size.
    """
    is_buffer = arg_path.endswith(".buffer")
    with open(arg_path, "rb") as arg_file:
        raw = arg_file.read()
    if is_buffer:
        # Copy into a bytearray so the resulting array is writable.
        host_copy = np.frombuffer(bytearray(raw), dtype=np.uint8)
        host_copy.setflags(write=1, align=1)
        return drv.InOut(host_copy), host_copy
    scalar_types = {1: np.uint8, 2: np.uint16, 4: np.uint32, 8: np.uint64}
    if len(raw) not in scalar_types:
        raise Exception('Incorrect size of {}: {}'.format(arg_path, len(raw)))
    return np.frombuffer(raw, dtype=scalar_types[len(raw)])[0], None
def parse_arguments(dump_path, prefix):
    """Load every argument file found in `<dump_path>/<prefix>`.

    Files are processed in sorted (lexicographic) file-name order; the result is
    the list of `load_arguments` return values in that order.
    """
    args_dir = path.join(dump_path, prefix)
    loaded = []
    for file_name in sorted(os.listdir(args_dir)):
        loaded.append(load_arguments(path.join(args_dir, file_name)))
    return loaded
def verify_single_dump(input_path, max_block_threads):
    """Replay one dumped kernel launch and report arguments that differ.

    `input_path` is a dump directory named `<prefix>_<kernel name>` containing:
      * `launch.txt` - one integer per line: grid x/y/z, block x/y/z, then the
        shared memory size;
      * `module.ptx` - the module to load the kernel from;
      * `pre/` and `post/` - argument dumps, loaded with `parse_arguments`.

    The kernel is launched with the `pre` arguments. Afterwards each host buffer
    handed to the kernel is compared with the matching `post` dump; it is
    expected to hold the kernel's output by then (the buffers are wrapped in
    `drv.InOut` - confirm against the PyCUDA docs). Each mismatch is printed as
    `<argument index>: <details>`, with a zero-based index.

    Launches whose block has more threads than `max_block_threads` are skipped.
    Kernels without any dumped arguments are handled: the argument pairs are
    split with comprehensions, since unpacking `zip(*[])` would raise.
    """
    print(input_path)
    kernel_name = path.basename(input_path).split("_", 1)[1]
    with open(path.join(input_path, "launch.txt"), "r") as launch_f:
        launch_lines = list(map(int, launch_f.readlines()))
    block = tuple(launch_lines[3:6])
    launch_block_size = block[0] * block[1] * block[2]
    if launch_block_size > max_block_threads:
        print(f" Skipping, launch block size ({launch_block_size}) bigger than maximum block size ({max_block_threads})")
        return
    module = drv.module_from_file(path.join(input_path, "module.ptx"))
    kernel = module.get_function(kernel_name)
    pre_args = parse_arguments(input_path, "pre")
    kernel_pre_args = [kernel_arg for kernel_arg, _ in pre_args]
    host_pre_args = [host_arg for _, host_arg in pre_args]
    kernel(*kernel_pre_args, grid=tuple(launch_lines[:3]), block=block, shared=launch_lines[6])
    post_args = parse_arguments(input_path, "post")
    host_post_args = [host_arg for _, host_arg in post_args]
    for idx, (pre_arg, post_arg) in enumerate(zip(host_pre_args, host_post_args)):
        # Scalars have no host buffer and cannot be modified by the kernel.
        if pre_arg is None:
            continue
        try:
            assert_array_equal_override(kernel_name, idx, pre_arg, post_arg)
        except Exception as e:
            print(f"{idx}: {e}")
def main(argv):
    """Replay the dump(s) found at `argv[1]` on device 0.

    `argv[1]` is either a single dump directory (it contains `launch.txt`) or a
    directory of dumps, which are replayed in sorted path order. Entries of the
    latter that are not directories are ignored.

    Exits with a usage message when no path is given.
    """
    if len(argv) < 2:
        program = argv[0] if argv else "replay.py"
        sys.exit(f"Usage: {program} <dump directory>")
    device = drv.Device(0)
    max_threads = device.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
    print(device.name())
    input_path = argv[1]
    if os.path.exists(path.join(input_path, "launch.txt")):
        verify_single_dump(input_path, max_threads)
    else:
        for input_subdir in sorted(path.join(input_path, dir_name) for dir_name in os.listdir(input_path)):
            # Stray files next to the dump directories are not dumps.
            if not path.isdir(input_subdir):
                continue
            verify_single_dump(input_subdir, max_threads)
# Script entry point: forward the full argument vector (program name included) to main().
if __name__ == "__main__":
    main(sys.argv)