mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-20 08:24:44 +00:00
Tune generated code, add a workaround for geekbench
This commit is contained in:
parent
9d4f26bd07
commit
8f68287b18
14 changed files with 212 additions and 80 deletions
|
@ -261,6 +261,7 @@ pub enum Instruction<P: ArgParams> {
|
|||
Call(CallInst<P>),
|
||||
Abs(AbsDetails, Arg2<P>),
|
||||
Mad(MulDetails, Arg4<P>),
|
||||
Fma(ArithFloat, Arg4<P>),
|
||||
Or(ScalarType, Arg3<P>),
|
||||
Sub(ArithDetails, Arg3<P>),
|
||||
Min(MinMaxDetails, Arg3<P>),
|
||||
|
|
|
@ -743,6 +743,7 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
|||
InstCall,
|
||||
InstAbs,
|
||||
InstMad,
|
||||
InstFma,
|
||||
InstOr,
|
||||
InstAnd,
|
||||
InstSub,
|
||||
|
@ -1345,7 +1346,11 @@ InstAbs: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
|||
InstMad: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
||||
"mad" <d:MulDetails> <a:Arg4> => ast::Instruction::Mad(d, a),
|
||||
"mad" ".hi" ".sat" ".s32" => todo!(),
|
||||
"fma" <f:ArithFloatMustRound> <a:Arg4> => ast::Instruction::Mad(ast::MulDetails::Float(f), a),
|
||||
};
|
||||
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma
|
||||
InstFma: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
||||
"fma" <f:ArithFloatMustRound> <a:Arg4> => ast::Instruction::Fma(f, a),
|
||||
};
|
||||
|
||||
SignedIntType: ast::ScalarType = {
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
%11 = OpLoad %float %17 Aligned 4
|
||||
OpStore %6 %11
|
||||
%14 = OpLoad %float %6
|
||||
%13 = OpExtInst %float %21 cos %14
|
||||
%13 = OpExtInst %float %21 native_cos %14
|
||||
OpStore %6 %13
|
||||
%15 = OpLoad %ulong %5
|
||||
%16 = OpLoad %float %6
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
%11 = OpLoad %float %17 Aligned 4
|
||||
OpStore %6 %11
|
||||
%14 = OpLoad %float %6
|
||||
%13 = OpExtInst %float %21 exp2 %14
|
||||
%13 = OpExtInst %float %21 native_exp2 %14
|
||||
OpStore %6 %13
|
||||
%15 = OpLoad %ulong %5
|
||||
%16 = OpLoad %float %6
|
||||
|
|
|
@ -59,7 +59,7 @@
|
|||
%20 = OpLoad %float %6
|
||||
%21 = OpLoad %float %7
|
||||
%22 = OpLoad %float %8
|
||||
%19 = OpExtInst %float %35 mad %20 %21 %22
|
||||
%19 = OpExtInst %float %35 fma %20 %21 %22
|
||||
OpStore %6 %19
|
||||
%23 = OpLoad %ulong %5
|
||||
%24 = OpLoad %float %6
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
%11 = OpLoad %float %17 Aligned 4
|
||||
OpStore %6 %11
|
||||
%14 = OpLoad %float %6
|
||||
%13 = OpExtInst %float %21 log2 %14
|
||||
%13 = OpExtInst %float %21 native_log2 %14
|
||||
OpStore %6 %13
|
||||
%15 = OpLoad %ulong %5
|
||||
%16 = OpLoad %float %6
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
%21 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "rcp"
|
||||
OpDecorate %13 FPFastMathMode AllowRecip
|
||||
OpExecutionMode %1 ContractionOff
|
||||
%void = OpTypeVoid
|
||||
%ulong = OpTypeInt 64 0
|
||||
%24 = OpTypeFunction %void %ulong %ulong
|
||||
|
@ -18,7 +18,6 @@
|
|||
%float = OpTypeFloat 32
|
||||
%_ptr_Function_float = OpTypePointer Function %float
|
||||
%_ptr_Generic_float = OpTypePointer Generic %float
|
||||
%float_1 = OpConstant %float 1
|
||||
%1 = OpFunction %void None %24
|
||||
%7 = OpFunctionParameter %ulong
|
||||
%8 = OpFunctionParameter %ulong
|
||||
|
@ -39,11 +38,11 @@
|
|||
%11 = OpLoad %float %17 Aligned 4
|
||||
OpStore %6 %11
|
||||
%14 = OpLoad %float %6
|
||||
%13 = OpFDiv %float %float_1 %14
|
||||
%13 = OpExtInst %float %21 native_recip %14
|
||||
OpStore %6 %13
|
||||
%15 = OpLoad %ulong %5
|
||||
%16 = OpLoad %float %6
|
||||
%18 = OpConvertUToPtr %_ptr_Generic_float %15
|
||||
OpStore %18 %16 Aligned 4
|
||||
OpReturn
|
||||
OpFunctionEnd
|
||||
OpFunctionEnd
|
|
@ -37,7 +37,7 @@
|
|||
%11 = OpLoad %float %17 Aligned 4
|
||||
OpStore %6 %11
|
||||
%14 = OpLoad %float %6
|
||||
%13 = OpExtInst %float %21 sin %14
|
||||
%13 = OpExtInst %float %21 native_sin %14
|
||||
OpStore %6 %13
|
||||
%15 = OpLoad %ulong %5
|
||||
%16 = OpLoad %float %6
|
||||
|
|
|
@ -559,25 +559,29 @@ fn emit_directives<'input>(
|
|||
&directives,
|
||||
kernel_info,
|
||||
)?;
|
||||
for t in f.tuning.iter() {
|
||||
match *t {
|
||||
ast::TuningDirective::MaxNtid(nx, ny, nz) => {
|
||||
builder.execution_mode(
|
||||
fn_id,
|
||||
spirv_headers::ExecutionMode::MaxWorkgroupSizeINTEL,
|
||||
[nx, ny, nz],
|
||||
);
|
||||
if func_decl.name.is_kernel() {
|
||||
// FP contraction happens when compiling source -> PTX and is illegal at this stage (unless you force it in cuModuleLoadDataEx)
|
||||
builder.execution_mode(fn_id, spirv_headers::ExecutionMode::ContractionOff, []);
|
||||
for t in f.tuning.iter() {
|
||||
match *t {
|
||||
ast::TuningDirective::MaxNtid(nx, ny, nz) => {
|
||||
builder.execution_mode(
|
||||
fn_id,
|
||||
spirv_headers::ExecutionMode::MaxWorkgroupSizeINTEL,
|
||||
[nx, ny, nz],
|
||||
);
|
||||
}
|
||||
ast::TuningDirective::ReqNtid(nx, ny, nz) => {
|
||||
builder.execution_mode(
|
||||
fn_id,
|
||||
spirv_headers::ExecutionMode::LocalSize,
|
||||
[nx, ny, nz],
|
||||
);
|
||||
}
|
||||
// Too architecture specific
|
||||
ast::TuningDirective::MaxNReg(..)
|
||||
| ast::TuningDirective::MinNCtaPerSm(..) => {}
|
||||
}
|
||||
ast::TuningDirective::ReqNtid(nx, ny, nz) => {
|
||||
builder.execution_mode(
|
||||
fn_id,
|
||||
spirv_headers::ExecutionMode::LocalSize,
|
||||
[nx, ny, nz],
|
||||
);
|
||||
}
|
||||
// Too architecture specific
|
||||
ast::TuningDirective::MaxNReg(..)
|
||||
| ast::TuningDirective::MinNCtaPerSm(..) => {}
|
||||
}
|
||||
}
|
||||
emit_function_body_ops(builder, map, opencl_id, &f_body)?;
|
||||
|
@ -2772,6 +2776,7 @@ fn emit_function_body_ops(
|
|||
emit_mad_float(builder, map, opencl, desc, arg)?
|
||||
}
|
||||
},
|
||||
ast::Instruction::Fma(fma, arg) => emit_fma_float(builder, map, opencl, fma, arg)?,
|
||||
ast::Instruction::Or(t, a) => {
|
||||
let result_type = map.get_or_add_scalar(builder, ast::ScalarType::from(*t));
|
||||
if *t == ast::ScalarType::Pred {
|
||||
|
@ -2798,7 +2803,7 @@ fn emit_function_body_ops(
|
|||
emit_max(builder, map, opencl, d, a)?;
|
||||
}
|
||||
ast::Instruction::Rcp(d, a) => {
|
||||
emit_rcp(builder, map, d, a)?;
|
||||
emit_rcp(builder, map, opencl, d, a)?;
|
||||
}
|
||||
ast::Instruction::And(t, a) => {
|
||||
let result_type = map.get_or_add_scalar(builder, ast::ScalarType::from(*t));
|
||||
|
@ -2901,7 +2906,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::sin as u32,
|
||||
spirv::CLOp::native_sin as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2911,7 +2916,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::cos as u32,
|
||||
spirv::CLOp::native_cos as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2921,7 +2926,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::log2 as u32,
|
||||
spirv::CLOp::native_log2 as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2931,7 +2936,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::exp2 as u32,
|
||||
spirv::CLOp::native_exp2 as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -3237,20 +3242,31 @@ fn emit_mul_float(
|
|||
fn emit_rcp(
|
||||
builder: &mut dr::Builder,
|
||||
map: &mut TypeWordMap,
|
||||
opencl: spirv::Word,
|
||||
desc: &ast::RcpDetails,
|
||||
a: &ast::Arg2<ExpandedArgParams>,
|
||||
arg: &ast::Arg2<ExpandedArgParams>,
|
||||
) -> Result<(), TranslateError> {
|
||||
let (instr_type, constant) = if desc.is_f64 {
|
||||
(ast::ScalarType::F64, vec_repr(1.0f64))
|
||||
} else {
|
||||
(ast::ScalarType::F32, vec_repr(1.0f32))
|
||||
};
|
||||
let one = map.get_or_add_constant(builder, &ast::Type::Scalar(instr_type), &constant)?;
|
||||
let result_type = map.get_or_add_scalar(builder, instr_type);
|
||||
builder.f_div(result_type, Some(a.dst), one, a.src)?;
|
||||
emit_rounding_decoration(builder, a.dst, desc.rounding);
|
||||
if !desc.is_f64 && desc.rounding.is_none() {
|
||||
builder.ext_inst(
|
||||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::native_recip as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
return Ok(());
|
||||
}
|
||||
let one = map.get_or_add_constant(builder, &ast::Type::Scalar(instr_type), &constant)?;
|
||||
builder.f_div(result_type, Some(arg.dst), one, arg.src)?;
|
||||
emit_rounding_decoration(builder, arg.dst, desc.rounding);
|
||||
builder.decorate(
|
||||
a.dst,
|
||||
arg.dst,
|
||||
spirv::Decoration::FPFastMathMode,
|
||||
[dr::Operand::FPFastMathMode(
|
||||
spirv::FPFastMathMode::ALLOW_RECIP,
|
||||
|
@ -3372,6 +3388,30 @@ fn emit_mad_sint(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_fma_float(
|
||||
builder: &mut dr::Builder,
|
||||
map: &mut TypeWordMap,
|
||||
opencl: spirv::Word,
|
||||
desc: &ast::ArithFloat,
|
||||
arg: &ast::Arg4<ExpandedArgParams>,
|
||||
) -> Result<(), dr::Error> {
|
||||
let inst_type = map.get_or_add(builder, SpirvType::from(ast::ScalarType::from(desc.typ)));
|
||||
builder.ext_inst(
|
||||
inst_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::fma as spirv::Word,
|
||||
[
|
||||
dr::Operand::IdRef(arg.src1),
|
||||
dr::Operand::IdRef(arg.src2),
|
||||
dr::Operand::IdRef(arg.src3),
|
||||
]
|
||||
.iter()
|
||||
.cloned(),
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_mad_float(
|
||||
builder: &mut dr::Builder,
|
||||
map: &mut TypeWordMap,
|
||||
|
@ -5713,6 +5753,10 @@ impl<T: ArgParamsEx> ast::Instruction<T> {
|
|||
let is_wide = d.is_wide();
|
||||
ast::Instruction::Mad(d, a.map(visitor, &inst_type, is_wide)?)
|
||||
}
|
||||
ast::Instruction::Fma(d, a) => {
|
||||
let inst_type = ast::Type::Scalar(d.typ);
|
||||
ast::Instruction::Fma(d, a.map(visitor, &inst_type, false)?)
|
||||
}
|
||||
ast::Instruction::Or(t, a) => ast::Instruction::Or(
|
||||
t,
|
||||
a.map_non_shift(visitor, &ast::Type::Scalar(t.into()), false)?,
|
||||
|
@ -6106,6 +6150,7 @@ impl ast::Instruction<ExpandedArgParams> {
|
|||
| ast::Instruction::Mad(ast::MulDetails::Float(float_control), _) => float_control
|
||||
.flush_to_zero
|
||||
.map(|ftz| (ftz, ast::ScalarType::from(float_control.typ).size_of())),
|
||||
ast::Instruction::Fma(d, _) => d.flush_to_zero.map(|ftz| (ftz, d.typ.size_of())),
|
||||
ast::Instruction::Setp(details, _) => details
|
||||
.flush_to_zero
|
||||
.map(|ftz| (ftz, details.typ.size_of())),
|
||||
|
|
|
@ -494,7 +494,7 @@ pub fn get_attribute(
|
|||
l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE,
|
||||
))
|
||||
*/
|
||||
return Ok(());
|
||||
0
|
||||
}
|
||||
};
|
||||
unsafe { *pi = value };
|
||||
|
|
|
@ -51,6 +51,37 @@ impl LegacyArguments {
|
|||
}
|
||||
}
|
||||
|
||||
unsafe fn set_arg(
|
||||
kernel: &ocl_core::Kernel,
|
||||
arg_index: usize,
|
||||
arg_size: usize,
|
||||
arg_value: *const c_void,
|
||||
is_mem: bool,
|
||||
) -> Result<(), CUresult> {
|
||||
if is_mem {
|
||||
let error = 0;
|
||||
unsafe {
|
||||
ocl_core::ffi::clSetKernelArgSVMPointer(
|
||||
kernel.as_ptr(),
|
||||
arg_index as u32,
|
||||
*(arg_value as *const _),
|
||||
)
|
||||
};
|
||||
if error != 0 {
|
||||
panic!("clSetKernelArgSVMPointer");
|
||||
}
|
||||
} else {
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
kernel,
|
||||
arg_index as u32,
|
||||
ocl_core::ArgVal::from_raw(arg_size, arg_value, is_mem),
|
||||
)?;
|
||||
};
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn launch_kernel(
|
||||
f: *mut Function,
|
||||
grid_dim_x: c_uint,
|
||||
|
@ -74,27 +105,7 @@ pub fn launch_kernel(
|
|||
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
|
||||
if kernel_params != ptr::null_mut() {
|
||||
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||
if is_mem {
|
||||
let error = 0;
|
||||
unsafe {
|
||||
ocl_core::ffi::clSetKernelArgSVMPointer(
|
||||
func.base.as_ptr(),
|
||||
i as u32,
|
||||
*(*kernel_params.add(i) as *const _),
|
||||
)
|
||||
};
|
||||
if error != 0 {
|
||||
panic!("clSetKernelArgSVMPointer");
|
||||
}
|
||||
} else {
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
&func.base,
|
||||
i as u32,
|
||||
ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem),
|
||||
)?;
|
||||
};
|
||||
}
|
||||
unsafe { set_arg(&func.base, i, arg_size, *kernel_params.add(i), is_mem)? };
|
||||
}
|
||||
} else {
|
||||
let mut offset = 0;
|
||||
|
@ -126,15 +137,13 @@ pub fn launch_kernel(
|
|||
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||
let buffer_offset = round_up_to_multiple(offset, arg_size);
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
set_arg(
|
||||
&func.base,
|
||||
i as u32,
|
||||
ocl_core::ArgVal::from_raw(
|
||||
arg_size,
|
||||
buffer_ptr.add(buffer_offset) as *const _,
|
||||
is_mem,
|
||||
),
|
||||
)?;
|
||||
i,
|
||||
arg_size,
|
||||
buffer_ptr.add(buffer_offset) as *const _,
|
||||
is_mem,
|
||||
)?
|
||||
};
|
||||
offset = buffer_offset + arg_size;
|
||||
}
|
||||
|
@ -144,11 +153,13 @@ pub fn launch_kernel(
|
|||
}
|
||||
if func.use_shared_mem {
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
set_arg(
|
||||
&func.base,
|
||||
func.arg_size.len() as u32,
|
||||
ocl_core::ArgVal::from_raw(shared_mem_bytes as usize, ptr::null(), false),
|
||||
)?;
|
||||
func.arg_size.len(),
|
||||
shared_mem_bytes as usize,
|
||||
ptr::null(),
|
||||
false,
|
||||
)?
|
||||
};
|
||||
}
|
||||
let global_dims = [
|
||||
|
@ -192,9 +203,9 @@ pub(crate) fn get_attribute(
|
|||
CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
|
||||
let max_threads = GlobalState::lock_function(func, |func| {
|
||||
if let ocl_core::KernelWorkGroupInfoResult::WorkGroupSize(size) =
|
||||
ocl_core::get_kernel_work_group_info::<ocl_core::DeviceId>(
|
||||
ocl_core::get_kernel_work_group_info::<()>(
|
||||
&func.base,
|
||||
unsafe { ocl_core::DeviceId::null() },
|
||||
(),
|
||||
ocl_core::KernelWorkGroupInfo::WorkGroupSize,
|
||||
)?
|
||||
{
|
||||
|
|
|
@ -1,16 +1,32 @@
|
|||
use super::{stream, CUresult, GlobalState};
|
||||
use super::{
|
||||
stream::{self, CU_STREAM_LEGACY},
|
||||
CUresult, GlobalState,
|
||||
};
|
||||
use std::{
|
||||
ffi::c_void,
|
||||
mem::{self, size_of},
|
||||
};
|
||||
|
||||
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
|
||||
let ptr = GlobalState::lock_current_context(|ctx| {
|
||||
let dev = unsafe { &mut *ctx.device };
|
||||
Ok::<_, CUresult>(unsafe {
|
||||
let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
|
||||
let dev = unsafe { &*(*stream_data.context).device };
|
||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||
let ptr = unsafe {
|
||||
dev.ocl_ext
|
||||
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
|
||||
})
|
||||
};
|
||||
// CUDA does the same thing and e.g. GeekBench relies on this behavior
|
||||
let event = unsafe {
|
||||
dev.ocl_ext.enqueue_memfill(
|
||||
queue,
|
||||
ptr,
|
||||
&0u8 as *const u8 as *const c_void,
|
||||
1,
|
||||
bytesize,
|
||||
)?
|
||||
};
|
||||
ocl_core::wait_for_event(&event)?;
|
||||
Ok::<_, CUresult>(ptr)
|
||||
})??;
|
||||
unsafe { *dptr = ptr };
|
||||
Ok(())
|
||||
|
|
55
zluda_dump/src/debug.ptx
Normal file
55
zluda_dump/src/debug.ptx
Normal file
|
@ -0,0 +1,55 @@
|
|||
/*
|
||||
This collection of functions is here to assist with debugging
|
||||
Use it by manually pasting it into a module.ptx that was generated by zluda_dump
|
||||
and inspecting the contents of the additional debug buffer in replay.py
|
||||
*/
|
||||
|
||||
// Appends the 16-bit `value` to the debug buffer at `debug_addr`, but only from
// the thread whose linear id (%ctaid.x * %ntid.x + %tid.x) equals `global_id_0`.
// Buffer layout used here: a u32 element counter at offset 0, followed by the
// stored b16 values starting at offset 4.
.func debug_dump_from_thread_16(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b16 value)
{
    .reg.u32 local_id;
    mov.u32 local_id, %tid.x;
    .reg.u32 local_size;
    mov.u32 local_size, %ntid.x;
    .reg.u32 group_id;
    mov.u32 group_id, %ctaid.x;
    .reg.b32 global_id;
    mad.lo.u32 global_id, group_id, local_size, local_id;
    // Every thread other than the selected one leaves without touching the buffer
    .reg.pred should_exit;
    setp.ne.u32 should_exit, global_id, global_id_0;
    @should_exit bra END;
    // Read the current element count and write back count + 1
    .reg.b32 index;
    ld.global.u32 index, [debug_addr];
    // Explicit add instead of an `index+1` operand: the `reg+imm` form is an
    // address-operand syntax, so it is not relied on for the stored value
    .reg.u32 next_index;
    add.u32 next_index, index, 1;
    st.global.u32 [debug_addr], next_index;
    // Byte offset of the new element: index * sizeof(b16) + sizeof(u32 counter)
    .reg.u64 st_offset;
    cvt.u64.u32 st_offset, index;
    mad.lo.u64 st_offset, st_offset, 2, 4; // sizeof(b16), sizeof(u32)
    add.u64 debug_addr, debug_addr, st_offset;
    st.global.u16 [debug_addr], value;
END:
    ret;
}
|
||||
|
||||
// Appends the 32-bit `value` to the debug buffer at `debug_addr`, but only from
// the thread whose linear id (%ctaid.x * %ntid.x + %tid.x) equals `global_id_0`.
// Buffer layout used here: a u32 element counter at offset 0, followed by the
// stored b32 values starting at offset 4.
.func debug_dump_from_thread_32(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b32 value)
{
    .reg.u32 local_id;
    mov.u32 local_id, %tid.x;
    .reg.u32 local_size;
    mov.u32 local_size, %ntid.x;
    .reg.u32 group_id;
    mov.u32 group_id, %ctaid.x;
    .reg.b32 global_id;
    mad.lo.u32 global_id, group_id, local_size, local_id;
    // Every thread other than the selected one leaves without touching the buffer
    .reg.pred should_exit;
    setp.ne.u32 should_exit, global_id, global_id_0;
    @should_exit bra END;
    // Read the current element count and write back count + 1
    .reg.b32 index;
    ld.global.u32 index, [debug_addr];
    // Explicit add instead of an `index+1` operand: the `reg+imm` form is an
    // address-operand syntax, so it is not relied on for the stored value
    .reg.u32 next_index;
    add.u32 next_index, index, 1;
    st.global.u32 [debug_addr], next_index;
    // Byte offset of the new element: index * sizeof(b32) + sizeof(u32 counter)
    .reg.u64 st_offset;
    cvt.u64.u32 st_offset, index;
    mad.lo.u64 st_offset, st_offset, 4, 4; // sizeof(b32), sizeof(u32)
    add.u64 debug_addr, debug_addr, st_offset;
    st.global.u32 [debug_addr], value;
END:
    ret;
}
|
|
@ -53,7 +53,7 @@ def parse_arguments(dump_path, prefix):
|
|||
|
||||
def append_debug_buffer(args, grid, block):
    """Return a copy of ``args`` with a zero-filled debug buffer appended.

    The buffer holds one ``uint32`` element per launched thread, i.e. the
    product of all three ``grid`` dimensions and all three ``block``
    dimensions.

    :param args: iterable of existing kernel arguments; it is copied into a
        new list and not modified.
    :param grid: indexable with at least three integer grid dimensions.
    :param block: indexable with at least three integer block dimensions.
    :return: new list ending with a ``(drv.InOut(buffer), buffer)`` tuple.
    """
    args = list(args)
    # One slot per thread in the whole launch (grid size times block size).
    items = grid[0] * grid[1] * grid[2] * block[0] * block[1] * block[2]
    debug_buff = np.zeros(items, dtype=np.uint32)
    args.append((drv.InOut(debug_buff), debug_buff))
    return args
|
||||
|
|
Loading…
Add table
Reference in a new issue