Tune generated code, add a workaround for geekbench

2025-04-20 08:24:44 +00:00 · 2021-07-25 15:19:43 +02:00 · 2021-07-25 15:19:43 +02:00 · 8f68287b18
commit 8f68287b18
parent 9d4f26bd07
14 changed files with 212 additions and 80 deletions
--- a/ptx/src/ast.rs
+++ b/ptx/src/ast.rs
@ -261,6 +261,7 @@ pub enum Instruction<P: ArgParams> {
    Call(CallInst<P>),
    Abs(AbsDetails, Arg2<P>),
    Mad(MulDetails, Arg4<P>),
+    Fma(ArithFloat, Arg4<P>),
    Or(ScalarType, Arg3<P>),
    Sub(ArithDetails, Arg3<P>),
    Min(MinMaxDetails, Arg3<P>),
--- a/ptx/src/ptx.lalrpop
+++ b/ptx/src/ptx.lalrpop
@ -743,6 +743,7 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
    InstCall,
    InstAbs,
    InstMad,
+    InstFma,
    InstOr,
    InstAnd,
    InstSub,
@ -1345,7 +1346,11 @@ InstAbs: ast::Instruction<ast::ParsedArgParams<'input>> = {
 InstMad: ast::Instruction<ast::ParsedArgParams<'input>> = {
    "mad" <d:MulDetails> <a:Arg4> => ast::Instruction::Mad(d, a),
    "mad" ".hi" ".sat" ".s32" => todo!(),
-    "fma" <f:ArithFloatMustRound> <a:Arg4> => ast::Instruction::Mad(ast::MulDetails::Float(f), a),
+};
+
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-fma
+InstFma: ast::Instruction<ast::ParsedArgParams<'input>> = {
+    "fma" <f:ArithFloatMustRound> <a:Arg4> => ast::Instruction::Fma(f, a),
 };

 SignedIntType: ast::ScalarType = {
--- a/ptx/src/test/spirv_run/cos.spvtxt
+++ b/ptx/src/test/spirv_run/cos.spvtxt
@ -37,7 +37,7 @@
         %11 = OpLoad %float %17 Aligned 4
               OpStore %6 %11
         %14 = OpLoad %float %6
-         %13 = OpExtInst %float %21 cos %14
+         %13 = OpExtInst %float %21 native_cos %14
               OpStore %6 %13
         %15 = OpLoad %ulong %5
         %16 = OpLoad %float %6
--- a/ptx/src/test/spirv_run/ex2.spvtxt
+++ b/ptx/src/test/spirv_run/ex2.spvtxt
@ -37,7 +37,7 @@
         %11 = OpLoad %float %17 Aligned 4
               OpStore %6 %11
         %14 = OpLoad %float %6
-         %13 = OpExtInst %float %21 exp2 %14
+         %13 = OpExtInst %float %21 native_exp2 %14
               OpStore %6 %13
         %15 = OpLoad %ulong %5
         %16 = OpLoad %float %6
--- a/ptx/src/test/spirv_run/fma.spvtxt
+++ b/ptx/src/test/spirv_run/fma.spvtxt
@ -59,7 +59,7 @@
         %20 = OpLoad %float %6
         %21 = OpLoad %float %7
         %22 = OpLoad %float %8
-         %19 = OpExtInst %float %35 mad %20 %21 %22
+         %19 = OpExtInst %float %35 fma %20 %21 %22
               OpStore %6 %19
         %23 = OpLoad %ulong %5
         %24 = OpLoad %float %6
--- a/ptx/src/test/spirv_run/lg2.spvtxt
+++ b/ptx/src/test/spirv_run/lg2.spvtxt
@ -37,7 +37,7 @@
         %11 = OpLoad %float %17 Aligned 4
               OpStore %6 %11
         %14 = OpLoad %float %6
-         %13 = OpExtInst %float %21 log2 %14
+         %13 = OpExtInst %float %21 native_log2 %14
               OpStore %6 %13
         %15 = OpLoad %ulong %5
         %16 = OpLoad %float %6
--- a/ptx/src/test/spirv_run/rcp.spvtxt
+++ b/ptx/src/test/spirv_run/rcp.spvtxt
@ -10,7 +10,7 @@
         %21 = OpExtInstImport "OpenCL.std"
               OpMemoryModel Physical64 OpenCL
               OpEntryPoint Kernel %1 "rcp"
-               OpDecorate %13 FPFastMathMode AllowRecip
+               OpExecutionMode %1 ContractionOff
       %void = OpTypeVoid
      %ulong = OpTypeInt 64 0
         %24 = OpTypeFunction %void %ulong %ulong
@ -18,7 +18,6 @@
      %float = OpTypeFloat 32
 %_ptr_Function_float = OpTypePointer Function %float
 %_ptr_Generic_float = OpTypePointer Generic %float
-    %float_1 = OpConstant %float 1
          %1 = OpFunction %void None %24
          %7 = OpFunctionParameter %ulong
          %8 = OpFunctionParameter %ulong
@ -39,11 +38,11 @@
         %11 = OpLoad %float %17 Aligned 4
               OpStore %6 %11
         %14 = OpLoad %float %6
-         %13 = OpFDiv %float %float_1 %14
+         %13 = OpExtInst %float %21 native_recip %14
               OpStore %6 %13
         %15 = OpLoad %ulong %5
         %16 = OpLoad %float %6
         %18 = OpConvertUToPtr %_ptr_Generic_float %15
               OpStore %18 %16 Aligned 4
               OpReturn
-               OpFunctionEnd
+               OpFunctionEnd
--- a/ptx/src/test/spirv_run/sin.spvtxt
+++ b/ptx/src/test/spirv_run/sin.spvtxt
@ -37,7 +37,7 @@
         %11 = OpLoad %float %17 Aligned 4
               OpStore %6 %11
         %14 = OpLoad %float %6
-         %13 = OpExtInst %float %21 sin %14
+         %13 = OpExtInst %float %21 native_sin %14
               OpStore %6 %13
         %15 = OpLoad %ulong %5
         %16 = OpLoad %float %6
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@ -559,25 +559,29 @@ fn emit_directives<'input>(
                    &directives,
                    kernel_info,
                )?;
-                for t in f.tuning.iter() {
-                    match *t {
-                        ast::TuningDirective::MaxNtid(nx, ny, nz) => {
-                            builder.execution_mode(
-                                fn_id,
-                                spirv_headers::ExecutionMode::MaxWorkgroupSizeINTEL,
-                                [nx, ny, nz],
-                            );
+                if func_decl.name.is_kernel() {
+                    // FP contraction happens when compiling source -> PTX and is illegal at this stage (unless you force it in cuModuleLoadDataEx)
+                    builder.execution_mode(fn_id, spirv_headers::ExecutionMode::ContractionOff, []);
+                    for t in f.tuning.iter() {
+                        match *t {
+                            ast::TuningDirective::MaxNtid(nx, ny, nz) => {
+                                builder.execution_mode(
+                                    fn_id,
+                                    spirv_headers::ExecutionMode::MaxWorkgroupSizeINTEL,
+                                    [nx, ny, nz],
+                                );
+                            }
+                            ast::TuningDirective::ReqNtid(nx, ny, nz) => {
+                                builder.execution_mode(
+                                    fn_id,
+                                    spirv_headers::ExecutionMode::LocalSize,
+                                    [nx, ny, nz],
+                                );
+                            }
+                            // Too architecture specific
+                            ast::TuningDirective::MaxNReg(..)
+                            | ast::TuningDirective::MinNCtaPerSm(..) => {}
                        }
-                        ast::TuningDirective::ReqNtid(nx, ny, nz) => {
-                            builder.execution_mode(
-                                fn_id,
-                                spirv_headers::ExecutionMode::LocalSize,
-                                [nx, ny, nz],
-                            );
-                        }
-                        // Too architecture specific
-                        ast::TuningDirective::MaxNReg(..)
-                        | ast::TuningDirective::MinNCtaPerSm(..) => {}
                    }
                }
                emit_function_body_ops(builder, map, opencl_id, &f_body)?;
@ -2772,6 +2776,7 @@ fn emit_function_body_ops(
                        emit_mad_float(builder, map, opencl, desc, arg)?
                    }
                },
+                ast::Instruction::Fma(fma, arg) => emit_fma_float(builder, map, opencl, fma, arg)?,
                ast::Instruction::Or(t, a) => {
                    let result_type = map.get_or_add_scalar(builder, ast::ScalarType::from(*t));
                    if *t == ast::ScalarType::Pred {
@ -2798,7 +2803,7 @@ fn emit_function_body_ops(
                    emit_max(builder, map, opencl, d, a)?;
                }
                ast::Instruction::Rcp(d, a) => {
-                    emit_rcp(builder, map, d, a)?;
+                    emit_rcp(builder, map, opencl, d, a)?;
                }
                ast::Instruction::And(t, a) => {
                    let result_type = map.get_or_add_scalar(builder, ast::ScalarType::from(*t));
@ -2901,7 +2906,7 @@ fn emit_function_body_ops(
                        result_type,
                        Some(arg.dst),
                        opencl,
-                        spirv::CLOp::sin as u32,
+                        spirv::CLOp::native_sin as u32,
                        [dr::Operand::IdRef(arg.src)].iter().cloned(),
                    )?;
                }
@ -2911,7 +2916,7 @@ fn emit_function_body_ops(
                        result_type,
                        Some(arg.dst),
                        opencl,
-                        spirv::CLOp::cos as u32,
+                        spirv::CLOp::native_cos as u32,
                        [dr::Operand::IdRef(arg.src)].iter().cloned(),
                    )?;
                }
@ -2921,7 +2926,7 @@ fn emit_function_body_ops(
                        result_type,
                        Some(arg.dst),
                        opencl,
-                        spirv::CLOp::log2 as u32,
+                        spirv::CLOp::native_log2 as u32,
                        [dr::Operand::IdRef(arg.src)].iter().cloned(),
                    )?;
                }
@ -2931,7 +2936,7 @@ fn emit_function_body_ops(
                        result_type,
                        Some(arg.dst),
                        opencl,
-                        spirv::CLOp::exp2 as u32,
+                        spirv::CLOp::native_exp2 as u32,
                        [dr::Operand::IdRef(arg.src)].iter().cloned(),
                    )?;
                }
@ -3237,20 +3242,31 @@ fn emit_mul_float(
 fn emit_rcp(
    builder: &mut dr::Builder,
    map: &mut TypeWordMap,
+    opencl: spirv::Word,
    desc: &ast::RcpDetails,
-    a: &ast::Arg2<ExpandedArgParams>,
+    arg: &ast::Arg2<ExpandedArgParams>,
 ) -> Result<(), TranslateError> {
    let (instr_type, constant) = if desc.is_f64 {
        (ast::ScalarType::F64, vec_repr(1.0f64))
    } else {
        (ast::ScalarType::F32, vec_repr(1.0f32))
    };
-    let one = map.get_or_add_constant(builder, &ast::Type::Scalar(instr_type), &constant)?;
    let result_type = map.get_or_add_scalar(builder, instr_type);
-    builder.f_div(result_type, Some(a.dst), one, a.src)?;
-    emit_rounding_decoration(builder, a.dst, desc.rounding);
+    if !desc.is_f64 && desc.rounding.is_none() {
+        builder.ext_inst(
+            result_type,
+            Some(arg.dst),
+            opencl,
+            spirv::CLOp::native_recip as u32,
+            [dr::Operand::IdRef(arg.src)].iter().cloned(),
+        )?;
+        return Ok(());
+    }
+    let one = map.get_or_add_constant(builder, &ast::Type::Scalar(instr_type), &constant)?;
+    builder.f_div(result_type, Some(arg.dst), one, arg.src)?;
+    emit_rounding_decoration(builder, arg.dst, desc.rounding);
    builder.decorate(
-        a.dst,
+        arg.dst,
        spirv::Decoration::FPFastMathMode,
        [dr::Operand::FPFastMathMode(
            spirv::FPFastMathMode::ALLOW_RECIP,
@ -3372,6 +3388,30 @@ fn emit_mad_sint(
    Ok(())
 }

+fn emit_fma_float(
+    builder: &mut dr::Builder,
+    map: &mut TypeWordMap,
+    opencl: spirv::Word,
+    desc: &ast::ArithFloat,
+    arg: &ast::Arg4<ExpandedArgParams>,
+) -> Result<(), dr::Error> {
+    let inst_type = map.get_or_add(builder, SpirvType::from(ast::ScalarType::from(desc.typ)));
+    builder.ext_inst(
+        inst_type,
+        Some(arg.dst),
+        opencl,
+        spirv::CLOp::fma as spirv::Word,
+        [
+            dr::Operand::IdRef(arg.src1),
+            dr::Operand::IdRef(arg.src2),
+            dr::Operand::IdRef(arg.src3),
+        ]
+        .iter()
+        .cloned(),
+    )?;
+    Ok(())
+}
+
 fn emit_mad_float(
    builder: &mut dr::Builder,
    map: &mut TypeWordMap,
@ -5713,6 +5753,10 @@ impl<T: ArgParamsEx> ast::Instruction<T> {
                let is_wide = d.is_wide();
                ast::Instruction::Mad(d, a.map(visitor, &inst_type, is_wide)?)
            }
+            ast::Instruction::Fma(d, a) => {
+                let inst_type = ast::Type::Scalar(d.typ);
+                ast::Instruction::Fma(d, a.map(visitor, &inst_type, false)?)
+            }
            ast::Instruction::Or(t, a) => ast::Instruction::Or(
                t,
                a.map_non_shift(visitor, &ast::Type::Scalar(t.into()), false)?,
@ -6106,6 +6150,7 @@ impl ast::Instruction<ExpandedArgParams> {
            | ast::Instruction::Mad(ast::MulDetails::Float(float_control), _) => float_control
                .flush_to_zero
                .map(|ftz| (ftz, ast::ScalarType::from(float_control.typ).size_of())),
+            ast::Instruction::Fma(d, _) => d.flush_to_zero.map(|ftz| (ftz, d.typ.size_of())),
            ast::Instruction::Setp(details, _) => details
                .flush_to_zero
                .map(|ftz| (ftz, details.typ.size_of())),
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@ -494,7 +494,7 @@ pub fn get_attribute(
                l0::sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE,
            ))
            */
-            return Ok(());
+            0
        }
    };
    unsafe { *pi = value };
--- a/zluda/src/impl/function.rs
+++ b/zluda/src/impl/function.rs
@ -51,6 +51,37 @@ impl LegacyArguments {
    }
 }

+unsafe fn set_arg(
+    kernel: &ocl_core::Kernel,
+    arg_index: usize,
+    arg_size: usize,
+    arg_value: *const c_void,
+    is_mem: bool,
+) -> Result<(), CUresult> {
+    if is_mem {
+        let error = 0;
+        unsafe {
+            ocl_core::ffi::clSetKernelArgSVMPointer(
+                kernel.as_ptr(),
+                arg_index as u32,
+                *(arg_value as *const _),
+            )
+        };
+        if error != 0 {
+            panic!("clSetKernelArgSVMPointer");
+        }
+    } else {
+        unsafe {
+            ocl_core::set_kernel_arg(
+                kernel,
+                arg_index as u32,
+                ocl_core::ArgVal::from_raw(arg_size, arg_value, is_mem),
+            )?;
+        };
+    }
+    Ok(())
+}
+
 pub fn launch_kernel(
    f: *mut Function,
    grid_dim_x: c_uint,
@ -74,27 +105,7 @@ pub fn launch_kernel(
        let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
        if kernel_params != ptr::null_mut() {
            for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
-                if is_mem {
-                    let error = 0;
-                    unsafe {
-                        ocl_core::ffi::clSetKernelArgSVMPointer(
-                            func.base.as_ptr(),
-                            i as u32,
-                            *(*kernel_params.add(i) as *const _),
-                        )
-                    };
-                    if error != 0 {
-                        panic!("clSetKernelArgSVMPointer");
-                    }
-                } else {
-                    unsafe {
-                        ocl_core::set_kernel_arg(
-                            &func.base,
-                            i as u32,
-                            ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem),
-                        )?;
-                    };
-                }
+                unsafe { set_arg(&func.base, i, arg_size, *kernel_params.add(i), is_mem)? };
            }
        } else {
            let mut offset = 0;
@ -126,15 +137,13 @@ pub fn launch_kernel(
                    for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
                        let buffer_offset = round_up_to_multiple(offset, arg_size);
                        unsafe {
-                            ocl_core::set_kernel_arg(
+                            set_arg(
                                &func.base,
-                                i as u32,
-                                ocl_core::ArgVal::from_raw(
-                                    arg_size,
-                                    buffer_ptr.add(buffer_offset) as *const _,
-                                    is_mem,
-                                ),
-                            )?;
+                                i,
+                                arg_size,
+                                buffer_ptr.add(buffer_offset) as *const _,
+                                is_mem,
+                            )?
                        };
                        offset = buffer_offset + arg_size;
                    }
@ -144,11 +153,13 @@ pub fn launch_kernel(
        }
        if func.use_shared_mem {
            unsafe {
-                ocl_core::set_kernel_arg(
+                set_arg(
                    &func.base,
-                    func.arg_size.len() as u32,
-                    ocl_core::ArgVal::from_raw(shared_mem_bytes as usize, ptr::null(), false),
-                )?;
+                    func.arg_size.len(),
+                    shared_mem_bytes as usize,
+                    ptr::null(),
+                    false,
+                )?
            };
        }
        let global_dims = [
@ -192,9 +203,9 @@ pub(crate) fn get_attribute(
        CUfunction_attribute::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
            let max_threads = GlobalState::lock_function(func, |func| {
                if let ocl_core::KernelWorkGroupInfoResult::WorkGroupSize(size) =
-                    ocl_core::get_kernel_work_group_info::<ocl_core::DeviceId>(
+                    ocl_core::get_kernel_work_group_info::<()>(
                        &func.base,
-                        unsafe { ocl_core::DeviceId::null() },
+                        (),
                        ocl_core::KernelWorkGroupInfo::WorkGroupSize,
                    )?
                {
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@ -1,16 +1,32 @@
-use super::{stream, CUresult, GlobalState};
+use super::{
+    stream::{self, CU_STREAM_LEGACY},
+    CUresult, GlobalState,
+};
 use std::{
    ffi::c_void,
    mem::{self, size_of},
 };

 pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
-    let ptr = GlobalState::lock_current_context(|ctx| {
-        let dev = unsafe { &mut *ctx.device };
-        Ok::<_, CUresult>(unsafe {
+    let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
+        let dev = unsafe { &*(*stream_data.context).device };
+        let queue = stream_data.cmd_list.as_ref().unwrap();
+        let ptr = unsafe {
            dev.ocl_ext
                .device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
-        })
+        };
+        // CUDA does the same thing and e.g. GeekBench relies on this behavior
+        let event = unsafe {
+            dev.ocl_ext.enqueue_memfill(
+                queue,
+                ptr,
+                &0u8 as *const u8 as *const c_void,
+                1,
+                bytesize,
+            )?
+        };
+        ocl_core::wait_for_event(&event)?;
+        Ok::<_, CUresult>(ptr)
    })??;
    unsafe { *dptr = ptr };
    Ok(())
--- a/zluda_dump/src/debug.ptx
+++ b/zluda_dump/src/debug.ptx
@ -0,0 +1,55 @@
+/*
+    This collection of functions is here to assist with debugging
+    You use it by manually pasting into a module.ptx that was generated by zluda_dump
+    and inspecting content of additional debug buffer in replay.py
+*/
+
+.func debug_dump_from_thread_16(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b16 value)
+{
+    .reg.u32        local_id;
+    mov.u32         local_id, %tid.x;
+    .reg.u32        local_size;
+    mov.u32         local_size, %ntid.x;
+    .reg.u32        group_id;
+    mov.u32         group_id, %ctaid.x;
+    .reg.b32        global_id;
+    mad.lo.u32      global_id, group_id, local_size, local_id;
+    .reg.pred       should_exit;
+    setp.ne.u32     should_exit, global_id, global_id_0;
+    @should_exit bra END;
+    .reg.b32        index;
+    ld.global.u32   index, [debug_addr];
+    st.global.u32   [debug_addr], index+1;
+    .reg.u64        st_offset;
+    cvt.u64.u32     st_offset, index;
+    mad.lo.u64      st_offset, st_offset, 2, 4; // sizeof(b16), sizeof(32)
+    add.u64         debug_addr, debug_addr, st_offset;
+    st.global.u16   [debug_addr], value;
+END:
+    ret;
+}
+
+.func debug_dump_from_thread_32(.reg.b64 debug_addr, .reg.u32 global_id_0, .reg.b32 value)
+{
+    .reg.u32        local_id;
+    mov.u32         local_id, %tid.x;
+    .reg.u32        local_size;
+    mov.u32         local_size, %ntid.x;
+    .reg.u32        group_id;
+    mov.u32         group_id, %ctaid.x;
+    .reg.b32        global_id;
+    mad.lo.u32      global_id, group_id, local_size, local_id;
+    .reg.pred       should_exit;
+    setp.ne.u32     should_exit, global_id, global_id_0;
+    @should_exit bra END;
+    .reg.b32        index;
+    ld.global.u32   index, [debug_addr];
+    st.global.u32   [debug_addr], index+1;
+    .reg.u64        st_offset;
+    cvt.u64.u32     st_offset, index;
+    mad.lo.u64      st_offset, st_offset, 4, 4; // sizeof(b32), sizeof(32)
+    add.u64         debug_addr, debug_addr, st_offset;
+    st.global.u32   [debug_addr], value;
+END:
+    ret;
+} 
--- a/zluda_dump/src/replay.py
+++ b/zluda_dump/src/replay.py
@ -53,7 +53,7 @@ def parse_arguments(dump_path, prefix):

 def append_debug_buffer(args, grid, block):
    args = list(args)
-    items = block[0] * block[1] * block[2] * block[0] * block[1] * block[2]
+    items = grid[0] * grid[1] * grid[2] * block[0] * block[1] * block[2]
    debug_buff = np.zeros(items, dtype=np.uint32)
    args.append((drv.InOut(debug_buff), debug_buff))
    return args