mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-19 16:04:44 +00:00
Implement atomic float add
This commit is contained in:
parent
efd91e270c
commit
17291019e3
6 changed files with 279 additions and 24 deletions
|
@ -1,7 +1,10 @@
|
|||
// Every time this file changes it must be rebuilt:
|
||||
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only
|
||||
// Additionally you should strip names:
|
||||
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv
|
||||
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
|
||||
|
||||
#define FUNC(NAME) __zluda_ptx_impl__ ## NAME
|
||||
|
||||
|
@ -25,6 +28,20 @@
|
|||
return expected; \
|
||||
}
|
||||
|
||||
#define atomic_add(NAME, SUCCESS, FAILURE, SCOPE, SPACE, TYPE, ATOMIC_TYPE, INT_TYPE) \
|
||||
TYPE FUNC(NAME)(SPACE TYPE* ptr, TYPE value) { \
|
||||
volatile SPACE ATOMIC_TYPE* atomic_ptr = (volatile SPACE ATOMIC_TYPE*)ptr; \
|
||||
union { \
|
||||
INT_TYPE int_view; \
|
||||
TYPE float_view; \
|
||||
} expected, desired; \
|
||||
expected.float_view = *ptr; \
|
||||
do { \
|
||||
desired.float_view = expected.float_view + value; \
|
||||
} while (!atomic_compare_exchange_strong_explicit(atomic_ptr, &expected.int_view, desired.int_view, SUCCESS, FAILURE, SCOPE)); \
|
||||
return expected.float_view; \
|
||||
}
|
||||
|
||||
// We are doing all this mess instead of accepting memory_order and memory_scope parameters
|
||||
// because ocloc emits broken (failing spirv-dis) SPIR-V when memory_order or memory_scope is a parameter
|
||||
|
||||
|
@ -120,6 +137,98 @@ atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acqui
|
|||
atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
||||
|
||||
// atom.add.f32
|
||||
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
// atom.add.f64
|
||||
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
|
||||
return intel_ubfe(base, pos, len);
|
||||
}
|
||||
|
@ -136,11 +245,11 @@ long FUNC(bfe_s64)(long base, uint pos, uint len) {
|
|||
return intel_sbfe(base, pos, len);
|
||||
}
|
||||
|
||||
uint FUNC(bfi_b32)(uint base, uint insert, uint offset, uint count) {
|
||||
uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
|
||||
return intel_bfi(base, insert, offset, count);
|
||||
}
|
||||
|
||||
ulong FUNC(bfi_b64)(ulong base, ulong insert, uint offset, uint count) {
|
||||
ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
|
||||
return intel_bfi(base, insert, offset, count);
|
||||
}
|
||||
|
||||
|
|
Binary file not shown.
28
ptx/src/test/spirv_run/atom_add_float.ptx
Normal file
28
ptx/src/test/spirv_run/atom_add_float.ptx
Normal file
|
@ -0,0 +1,28 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
.visible .entry atom_add_float(
|
||||
.param .u64 input,
|
||||
.param .u64 output
|
||||
)
|
||||
{
|
||||
.shared .align 4 .b8 shared_mem[1024];
|
||||
|
||||
.reg .u64 in_addr;
|
||||
.reg .u64 out_addr;
|
||||
.reg .f32 temp1;
|
||||
.reg .f32 temp2;
|
||||
|
||||
ld.param.u64 in_addr, [input];
|
||||
ld.param.u64 out_addr, [output];
|
||||
|
||||
ld.f32 temp1, [in_addr];
|
||||
ld.f32 temp2, [in_addr+4];
|
||||
st.shared.f32 [shared_mem], temp1;
|
||||
atom.shared.add.f32 temp1, [shared_mem], temp2;
|
||||
ld.shared.f32 temp2, [shared_mem];
|
||||
st.f32 [out_addr], temp1;
|
||||
st.f32 [out_addr+4], temp2;
|
||||
ret;
|
||||
}
|
81
ptx/src/test/spirv_run/atom_add_float.spvtxt
Normal file
81
ptx/src/test/spirv_run/atom_add_float.spvtxt
Normal file
|
@ -0,0 +1,81 @@
|
|||
OpCapability GenericPointer
|
||||
OpCapability Linkage
|
||||
OpCapability Addresses
|
||||
OpCapability Kernel
|
||||
OpCapability Int8
|
||||
OpCapability Int16
|
||||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
%42 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "atom_add_float" %4
|
||||
OpDecorate %37 LinkageAttributes "__zluda_ptx_impl__atom_relaxed_gpu_shared_add_f32" Import
|
||||
OpDecorate %4 Alignment 4
|
||||
%void = OpTypeVoid
|
||||
%float = OpTypeFloat 32
|
||||
%_ptr_Workgroup_float = OpTypePointer Workgroup %float
|
||||
%46 = OpTypeFunction %float %_ptr_Workgroup_float %float
|
||||
%uint = OpTypeInt 32 0
|
||||
%uchar = OpTypeInt 8 0
|
||||
%uint_1024 = OpConstant %uint 1024
|
||||
%_arr_uchar_uint_1024 = OpTypeArray %uchar %uint_1024
|
||||
%_ptr_Workgroup__arr_uchar_uint_1024 = OpTypePointer Workgroup %_arr_uchar_uint_1024
|
||||
%4 = OpVariable %_ptr_Workgroup__arr_uchar_uint_1024 Workgroup
|
||||
%ulong = OpTypeInt 64 0
|
||||
%53 = OpTypeFunction %void %ulong %ulong
|
||||
%_ptr_Function_ulong = OpTypePointer Function %ulong
|
||||
%_ptr_Function_float = OpTypePointer Function %float
|
||||
%_ptr_Generic_float = OpTypePointer Generic %float
|
||||
%ulong_4 = OpConstant %ulong 4
|
||||
%ulong_4_0 = OpConstant %ulong 4
|
||||
%37 = OpFunction %float None %46
|
||||
%39 = OpFunctionParameter %_ptr_Workgroup_float
|
||||
%40 = OpFunctionParameter %float
|
||||
OpFunctionEnd
|
||||
%1 = OpFunction %void None %53
|
||||
%9 = OpFunctionParameter %ulong
|
||||
%10 = OpFunctionParameter %ulong
|
||||
%36 = OpLabel
|
||||
%2 = OpVariable %_ptr_Function_ulong Function
|
||||
%3 = OpVariable %_ptr_Function_ulong Function
|
||||
%5 = OpVariable %_ptr_Function_ulong Function
|
||||
%6 = OpVariable %_ptr_Function_ulong Function
|
||||
%7 = OpVariable %_ptr_Function_float Function
|
||||
%8 = OpVariable %_ptr_Function_float Function
|
||||
OpStore %2 %9
|
||||
OpStore %3 %10
|
||||
%11 = OpLoad %ulong %2 Aligned 8
|
||||
OpStore %5 %11
|
||||
%12 = OpLoad %ulong %3 Aligned 8
|
||||
OpStore %6 %12
|
||||
%14 = OpLoad %ulong %5
|
||||
%29 = OpConvertUToPtr %_ptr_Generic_float %14
|
||||
%13 = OpLoad %float %29 Aligned 4
|
||||
OpStore %7 %13
|
||||
%16 = OpLoad %ulong %5
|
||||
%26 = OpIAdd %ulong %16 %ulong_4
|
||||
%30 = OpConvertUToPtr %_ptr_Generic_float %26
|
||||
%15 = OpLoad %float %30 Aligned 4
|
||||
OpStore %8 %15
|
||||
%17 = OpLoad %float %7
|
||||
%31 = OpBitcast %_ptr_Workgroup_float %4
|
||||
OpStore %31 %17 Aligned 4
|
||||
%19 = OpLoad %float %8
|
||||
%32 = OpBitcast %_ptr_Workgroup_float %4
|
||||
%18 = OpFunctionCall %float %37 %32 %19
|
||||
OpStore %7 %18
|
||||
%33 = OpBitcast %_ptr_Workgroup_float %4
|
||||
%20 = OpLoad %float %33 Aligned 4
|
||||
OpStore %8 %20
|
||||
%21 = OpLoad %ulong %6
|
||||
%22 = OpLoad %float %7
|
||||
%34 = OpConvertUToPtr %_ptr_Generic_float %21
|
||||
OpStore %34 %22 Aligned 4
|
||||
%23 = OpLoad %ulong %6
|
||||
%24 = OpLoad %float %8
|
||||
%28 = OpIAdd %ulong %23 %ulong_4_0
|
||||
%35 = OpConvertUToPtr %_ptr_Generic_float %28
|
||||
OpStore %35 %24 Aligned 4
|
||||
OpReturn
|
||||
OpFunctionEnd
|
|
@ -155,6 +155,7 @@ test_ptx!(cvt_s64_s32, [-1i32], [-1i64]);
|
|||
test_ptx!(add_tuning, [2u64], [3u64]);
|
||||
test_ptx!(add_non_coherent, [3u64], [4u64]);
|
||||
test_ptx!(sign_extend, [-1i16], [-1i32]);
|
||||
test_ptx!(atom_add_float, [1.25f32, 0.5f32], [1.25f32, 1.75f32]);
|
||||
|
||||
struct DisplayError<T: Debug> {
|
||||
err: T,
|
||||
|
|
|
@ -1505,6 +1505,7 @@ fn extract_globals<'input, 'b>(
|
|||
d,
|
||||
a,
|
||||
"inc",
|
||||
ast::SizedScalarType::U32,
|
||||
));
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Atom(
|
||||
|
@ -1526,6 +1527,44 @@ fn extract_globals<'input, 'b>(
|
|||
d,
|
||||
a,
|
||||
"dec",
|
||||
ast::SizedScalarType::U32,
|
||||
));
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Atom(
|
||||
ast::AtomDetails {
|
||||
inner:
|
||||
ast::AtomInnerDetails::Float {
|
||||
op: ast::AtomFloatOp::Add,
|
||||
typ,
|
||||
},
|
||||
semantics,
|
||||
scope,
|
||||
space,
|
||||
},
|
||||
a,
|
||||
)) => {
|
||||
let details = ast::AtomDetails {
|
||||
inner: ast::AtomInnerDetails::Float {
|
||||
op: ast::AtomFloatOp::Add,
|
||||
typ,
|
||||
},
|
||||
semantics,
|
||||
scope,
|
||||
space,
|
||||
};
|
||||
let (op, typ) = match typ {
|
||||
ast::FloatType::F32 => ("add_f32", ast::SizedScalarType::F32),
|
||||
ast::FloatType::F64 => ("add_f64", ast::SizedScalarType::F64),
|
||||
ast::FloatType::F16 => unreachable!(),
|
||||
ast::FloatType::F16x2 => unreachable!(),
|
||||
};
|
||||
local.push(to_ptx_impl_atomic_call(
|
||||
id_def,
|
||||
ptx_impl_imports,
|
||||
details,
|
||||
a,
|
||||
op,
|
||||
typ,
|
||||
));
|
||||
}
|
||||
s => local.push(s),
|
||||
|
@ -1696,6 +1735,7 @@ fn to_ptx_impl_atomic_call(
|
|||
details: ast::AtomDetails,
|
||||
arg: ast::Arg3<ExpandedArgParams>,
|
||||
op: &'static str,
|
||||
typ: ast::SizedScalarType,
|
||||
) -> ExpandedStatement {
|
||||
let semantics = ptx_semantics_name(details.semantics);
|
||||
let scope = ptx_scope_name(details.scope);
|
||||
|
@ -1710,15 +1750,14 @@ fn to_ptx_impl_atomic_call(
|
|||
ast::AtomSpace::Global => ast::PointerStateSpace::Global,
|
||||
ast::AtomSpace::Shared => ast::PointerStateSpace::Shared,
|
||||
};
|
||||
let scalar_typ = ast::ScalarType::from(typ);
|
||||
let fn_id = match ptx_impl_imports.entry(fn_name) {
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
let fn_id = id_defs.new_non_variable(None);
|
||||
let func_decl = ast::MethodDecl::Func::<spirv::Word>(
|
||||
vec![ast::FnArgument {
|
||||
align: None,
|
||||
v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(
|
||||
ast::ScalarType::U32,
|
||||
)),
|
||||
v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
|
||||
name: id_defs.new_non_variable(None),
|
||||
array_init: Vec::new(),
|
||||
}],
|
||||
|
@ -1727,17 +1766,14 @@ fn to_ptx_impl_atomic_call(
|
|||
ast::FnArgument {
|
||||
align: None,
|
||||
v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(
|
||||
ast::SizedScalarType::U32,
|
||||
ptr_space,
|
||||
typ, ptr_space,
|
||||
)),
|
||||
name: id_defs.new_non_variable(None),
|
||||
array_init: Vec::new(),
|
||||
},
|
||||
ast::FnArgument {
|
||||
align: None,
|
||||
v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(
|
||||
ast::ScalarType::U32,
|
||||
)),
|
||||
v_type: ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
|
||||
name: id_defs.new_non_variable(None),
|
||||
array_init: Vec::new(),
|
||||
},
|
||||
|
@ -1768,19 +1804,16 @@ fn to_ptx_impl_atomic_call(
|
|||
func: fn_id,
|
||||
ret_params: vec![(
|
||||
arg.dst,
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(ast::ScalarType::U32)),
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
|
||||
)],
|
||||
param_list: vec![
|
||||
(
|
||||
arg.src1,
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(
|
||||
ast::SizedScalarType::U32,
|
||||
ptr_space,
|
||||
)),
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Pointer(typ, ptr_space)),
|
||||
),
|
||||
(
|
||||
arg.src2,
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(ast::ScalarType::U32)),
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(scalar_typ)),
|
||||
),
|
||||
],
|
||||
})
|
||||
|
@ -1963,14 +1996,13 @@ fn to_ptx_impl_bfi_call(
|
|||
arg.dst,
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())),
|
||||
)],
|
||||
// Note, for some reason PTX and SPIR-V order base&insert arguments differently
|
||||
param_list: vec![
|
||||
(
|
||||
arg.src2,
|
||||
arg.src1,
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())),
|
||||
),
|
||||
(
|
||||
arg.src1,
|
||||
arg.src2,
|
||||
ast::FnArgumentType::Reg(ast::VariableRegType::Scalar(typ.into())),
|
||||
),
|
||||
(
|
||||
|
@ -3476,8 +3508,12 @@ fn emit_atom(
|
|||
};
|
||||
(spirv_op, typ.into())
|
||||
}
|
||||
// TODO: Hardware is capable of this, implement it through builtin
|
||||
ast::AtomInnerDetails::Float { .. } => todo!(),
|
||||
ast::AtomInnerDetails::Float { op, typ } => {
|
||||
let spirv_op: fn(&mut dr::Builder, _, _, _, _, _, _) -> _ = match op {
|
||||
ast::AtomFloatOp::Add => dr::Builder::atomic_f_add_ext,
|
||||
};
|
||||
(spirv_op, typ.into())
|
||||
}
|
||||
};
|
||||
let result_type = map.get_or_add_scalar(builder, typ);
|
||||
let memory_const = map.get_or_add_constant(
|
||||
|
@ -4287,8 +4323,8 @@ fn emit_implicit_conversion(
|
|||
}
|
||||
(TypeKind::Scalar, TypeKind::Scalar, ConversionKind::SignExtend) => {
|
||||
let result_type = map.get_or_add(builder, SpirvType::from(cv.to.clone()));
|
||||
builder.s_convert(result_type , Some(cv.dst), cv.src)?;
|
||||
},
|
||||
builder.s_convert(result_type, Some(cv.dst), cv.src)?;
|
||||
}
|
||||
(TypeKind::Vector, TypeKind::Scalar, ConversionKind::Default)
|
||||
| (TypeKind::Scalar, TypeKind::Array, ConversionKind::Default)
|
||||
| (TypeKind::Array, TypeKind::Scalar, ConversionKind::Default) => {
|
||||
|
|
Loading…
Add table
Reference in a new issue