Mirror of https://github.com/vosen/ZLUDA.git, synced 2025-04-20 00:14:45 +00:00
Hack enough functionality that AMD GPU code builds
This commit is contained in:
parent b4de21fbc5
commit 638786b0ec
5 changed files with 195 additions and 124 deletions
BIN ptx/lib/zluda_ptx_impl.bc (normal file)
Binary file not shown.
@@ -1,5 +1,6 @@
// Every time this file changes it must be rebuilt:
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions -DINTEL" -out_dir . -device kbl -output_no_suffix -spv_only
// /opt/amdgpu-pro/bin/clang -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
// Additionally you should strip names:
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
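For convenience, here is a minimal Rust sketch (not part of this commit) of running the AMD rebuild step above from a helper tool; the clang path and flags are copied verbatim from the comment and assume the amdgpu-pro toolchain is installed at that location:

// Hypothetical helper; the clang path and flags come from the comment above.
use std::process::Command;

fn main() -> std::io::Result<()> {
    // Rebuild ptx/lib/zluda_ptx_impl.bc from zluda_ptx_impl.cl (run from ptx/lib).
    let status = Command::new("/opt/amdgpu-pro/bin/clang")
        .args([
            "-x", "cl",
            "-Xclang", "-finclude-default-header",
            "zluda_ptx_impl.cl",
            "-cl-std=CL2.0",
            "-c",
            "-target", "amdgcn-amd-amdhsa",
            "-o", "zluda_ptx_impl.bc",
            "-emit-llvm",
        ])
        .status()?;
    assert!(status.success(), "clang failed to rebuild zluda_ptx_impl.bc");
    Ok(())
}

The two ocloc invocations and the spirv-opt pass for the Intel .spv could be scripted the same way.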
@@ -129,137 +130,142 @@ atomic_dec(atom_acq_rel_cta_shared_dec, memory_order_acq_rel, memory_order_acqui

atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_release_gpu_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_gpu_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);

atomic_dec(atom_relaxed_sys_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);

// atom.add.f32
|
||||
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
#ifdef INTEL
|
||||
// atom.add.f32
|
||||
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
// atom.add.f64
|
||||
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
// atom.add.f64
|
||||
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);

uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
    return intel_ubfe(base, pos, len);
}
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
    return intel_ubfe(base, pos, len);
}

ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
    return intel_ubfe(base, pos, len);
}
ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
    return intel_ubfe(base, pos, len);
}

int FUNC(bfe_s32)(int base, uint pos, uint len) {
    return intel_sbfe(base, pos, len);
}
int FUNC(bfe_s32)(int base, uint pos, uint len) {
    return intel_sbfe(base, pos, len);
}

long FUNC(bfe_s64)(long base, uint pos, uint len) {
    return intel_sbfe(base, pos, len);
}
long FUNC(bfe_s64)(long base, uint pos, uint len) {
    return intel_sbfe(base, pos, len);
}

uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
    return intel_bfi(base, insert, offset, count);
}
uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
    return intel_bfi(base, insert, offset, count);
}

ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
    return intel_bfi(base, insert, offset, count);
}
ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
    return intel_bfi(base, insert, offset, count);
}

uint FUNC(brev_b32)(uint base) {
    return intel_bfrev(base);
}
uint FUNC(brev_b32)(uint base) {
    return intel_bfrev(base);
}

ulong FUNC(brev_b64)(ulong base) {
    return intel_bfrev(base);
}
ulong FUNC(brev_b64)(ulong base) {
    return intel_bfrev(base);
}
#else
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
    return amd_bfe(base, pos, len);
}

extern __attribute__((const)) int __llvm_bitreverse_i32(int) __asm("llvm.bitreverse.i32");
uint FUNC(brev_b32)(uint base) {
    return __llvm_bitreverse_i32(base);
}
#endif

void FUNC(__assertfail)(
    __private ulong* message,
@@ -251,7 +251,7 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
    let ctx = ze::Context::new(drv, None)?;
    let queue = ze::CommandQueue::new(&ctx, dev)?;
    let (module, maybe_log) = match module.should_link_ptx_impl {
        Some(ptx_impl) => ze::Module::build_link_spirv(
        Some((ptx_impl, _)) => ze::Module::build_link_spirv(
            &ctx,
            dev,
            &[ptx_impl, byte_il],
@@ -7,7 +7,8 @@ use std::{borrow::Cow, collections::BTreeSet, ffi::CString, hash::Hash, iter, me
use rspirv::binary::Assemble;

static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
static ZLUDA_PTX_IMPL_INTEL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
static ZLUDA_PTX_IMPL_AMD: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.bc");
static ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__";

quick_error! {
@@ -405,7 +406,7 @@ impl TypeWordMap {
pub struct Module {
    pub spirv: dr::Module,
    pub kernel_info: HashMap<String, KernelInfo>,
    pub should_link_ptx_impl: Option<&'static [u8]>,
    pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
    pub build_options: CString,
}
impl Module {
@@ -466,7 +467,7 @@ pub fn to_spirv_module<'a>(ast: ast::Module<'a>) -> Result<Module, TranslateErro
        spirv,
        kernel_info,
        should_link_ptx_impl: if must_link_ptx_impl {
            Some(ZLUDA_PTX_IMPL)
            Some((ZLUDA_PTX_IMPL_INTEL, ZLUDA_PTX_IMPL_AMD))
        } else {
            None
        },
@@ -7,6 +7,7 @@ use std::{
    io::{self, Write},
    mem,
    os::raw::{c_char, c_int, c_uint},
    path::PathBuf,
    process::{Command, Stdio},
    ptr, slice,
};
@@ -49,7 +50,7 @@ pub struct ModuleData {
pub struct SpirvModule {
    pub binaries: Vec<u32>,
    pub kernel_info: HashMap<String, ptx::KernelInfo>,
    pub should_link_ptx_impl: Option<&'static [u8]>,
    pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
    pub build_options: CString,
}

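For orientation, should_link_ptx_impl now carries a pair rather than a single blob: the Intel SPIR-V implementation and the AMD bitcode. A minimal sketch (illustration only; the hunks below do the real wiring, and the field and type names are taken from the diff, with hypothetical payloads) of how a consumer picks the AMD half:

struct SpirvModule {
    should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
}

fn amd_impl_bitcode(module: &SpirvModule) -> Option<&'static [u8]> {
    // The first element is the Intel SPIR-V used by the existing paths,
    // the second is the AMD LLVM bitcode added in this commit.
    module
        .should_link_ptx_impl
        .map(|(_intel_spirv, amd_bitcode)| amd_bitcode)
}

fn main() {
    // Hypothetical payloads, just to make the sketch runnable.
    static INTEL: &[u8] = b"spv";
    static AMD: &[u8] = b"bc";
    let module = SpirvModule {
        should_link_ptx_impl: Some((INTEL, AMD)),
    };
    assert_eq!(amd_impl_bitcode(&module), Some(&b"bc"[..]));
}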
@@ -93,31 +94,93 @@ impl SpirvModule {

    const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
    const AMDGPU: &'static str = "/opt/amdgpu-pro/";
    const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
    const AMDGPU_BITCODE: [&'static str; 8] = [
        "opencl",
        "ocml",
        "ockl",
        "oclc_correctly_rounded_sqrt_off",
        "oclc_daz_opt_on",
        "oclc_finite_only_off",
        "oclc_unsafe_math_off",
        "oclc_wavefrontsize64_off",
        "opencl.bc",
        "ocml.bc",
        "ockl.bc",
        "oclc_correctly_rounded_sqrt_off.bc",
        "oclc_daz_opt_on.bc",
        "oclc_finite_only_off.bc",
        "oclc_unsafe_math_off.bc",
        "oclc_wavefrontsize64_off.bc",
    ];
    const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
    const AMDGPU_DEVICE: &'static str = "gfx1010";

    fn compile_amd(spirv_il: &[u8]) -> io::Result<()> {
    fn get_bitcode_paths() -> impl Iterator<Item = PathBuf> {
        let generic_paths = Self::AMDGPU_BITCODE.iter().map(|x| {
            let mut path = PathBuf::from(Self::AMDGPU);
            path.push("amdgcn");
            path.push("bitcode");
            path.push(x);
            path
        });
        let mut additional_path = PathBuf::from(Self::AMDGPU);
        additional_path.push("amdgcn");
        additional_path.push("bitcode");
        additional_path.push(format!(
            "{}{}{}",
            Self::AMDGPU_BITCODE_DEVICE_PREFIX,
            &Self::AMDGPU_DEVICE[3..],
            ".bc"
        ));
        generic_paths.chain(std::iter::once(additional_path))
    }

    fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> {
        let dir = tempfile::tempdir()?;
        let mut spirv = NamedTempFile::new_in(&dir)?;
        let llvm = NamedTempFile::new_in(&dir)?;
        spirv.write_all(spirv_il)?;
        let mut cmd = Command::new(Self::LLVM_SPIRV)
        let to_llvm_cmd = Command::new(Self::LLVM_SPIRV)
            .arg("-r")
            .arg("-o")
            .arg(llvm.path())
            .arg(spirv.path())
            .status()?;
        assert!(cmd.success());
        assert!(to_llvm_cmd.success());
        let linked_binary = NamedTempFile::new_in(&dir)?;
        let mut llvm_link = PathBuf::from(Self::AMDGPU);
        llvm_link.push("bin");
        llvm_link.push("llvm-link");
        let mut linker_cmd = Command::new(&llvm_link);
        linker_cmd
            .arg("--only-needed")
            .arg("-o")
            .arg(linked_binary.path())
            .arg(llvm.path())
            .args(Self::get_bitcode_paths());
        if cfg!(debug_assertions) {
            linker_cmd.arg("-v");
        }
        let status = linker_cmd.status()?;
        assert!(status.success());
        let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
        let compiled_binary = NamedTempFile::new_in(&dir)?;
        let mut cland_exe = PathBuf::from(Self::AMDGPU);
        cland_exe.push("bin");
        cland_exe.push("clang");
        let mut compiler_cmd = Command::new(&cland_exe);
        compiler_cmd
            .arg(format!("-mcpu={}", Self::AMDGPU_DEVICE))
            .arg("-O3")
            .arg("-Xlinker")
            .arg("--no-undefined")
            .arg("-target")
            .arg(Self::AMDGPU_TARGET)
            .arg("-o")
            .arg(compiled_binary.path())
            .arg(linked_binary.path());
        if let Some(bitcode) = ptx_lib {
            ptx_lib_bitcode.write_all(bitcode)?;
            compiler_cmd.arg(ptx_lib_bitcode.path());
        };
        if cfg!(debug_assertions) {
            compiler_cmd.arg("-v");
        }
        let status = compiler_cmd.status()?;
        assert!(status.success());
        Ok(())
    }

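As a side note, the device-specific library name produced by get_bitcode_paths above is built by dropping the "gfx" prefix from the device string. A standalone sketch of just that path construction, with the constants copied from the diff (the resulting file exists only where the amdgpu-pro device libraries are installed):

use std::path::PathBuf;

const AMDGPU: &str = "/opt/amdgpu-pro/";
const AMDGPU_BITCODE_DEVICE_PREFIX: &str = "oclc_isa_version_";
const AMDGPU_DEVICE: &str = "gfx1010";

fn device_bitcode_path() -> PathBuf {
    let mut path = PathBuf::from(AMDGPU);
    path.push("amdgcn");
    path.push("bitcode");
    // "gfx1010"[3..] == "1010", so this pushes "oclc_isa_version_1010.bc".
    path.push(format!(
        "{}{}{}",
        AMDGPU_BITCODE_DEVICE_PREFIX,
        &AMDGPU_DEVICE[3..],
        ".bc"
    ));
    path
}

fn main() {
    // Prints /opt/amdgpu-pro/amdgcn/bitcode/oclc_isa_version_1010.bc
    println!("{}", device_bitcode_path().display());
}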
@@ -132,10 +195,10 @@ impl SpirvModule {
                self.binaries.len() * mem::size_of::<u32>(),
            )
        };
        Self::compile_amd(byte_il).unwrap();
        let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
        let main_module = match self.should_link_ptx_impl {
            None => {
                Self::compile_amd(byte_il, None).unwrap();
                ocl_core::build_program(
                    &main_module,
                    Some(&[dev]),
@@ -145,8 +208,9 @@ impl SpirvModule {
                )?;
                main_module
            }
            Some(ptx_impl) => {
                let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
            Some((ptx_impl_intel, ptx_impl_amd)) => {
                Self::compile_amd(byte_il, Some(ptx_impl_amd)).unwrap();
                let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl_intel, None)?;
                ocl_core::compile_program(
                    &main_module,
                    Some(&[dev]),