mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-08-05 07:41:25 +00:00
Hack enough functionality that AMD GPU code builds
This commit is contained in:
parent
b4de21fbc5
commit
638786b0ec
5 changed files with 195 additions and 124 deletions
BIN
ptx/lib/zluda_ptx_impl.bc
Normal file
BIN
ptx/lib/zluda_ptx_impl.bc
Normal file
Binary file not shown.
|
@ -1,5 +1,6 @@
|
||||||
// Every time this file changes it must be rebuilt:
|
// Every time this file changes it must be rebuilt:
|
||||||
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions" -out_dir . -device kbl -output_no_suffix -spv_only
|
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions -DINTEL" -out_dir . -device kbl -output_no_suffix -spv_only
|
||||||
|
// /opt/amdgpu-pro/bin/clang -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
|
||||||
// Additionally you should strip names:
|
// Additionally you should strip names:
|
||||||
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
|
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
|
||||||
|
|
||||||
|
@ -129,137 +130,142 @@ atomic_dec(atom_acq_rel_cta_shared_dec, memory_order_acq_rel, memory_order_acqui
|
||||||
|
|
||||||
atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
||||||
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
||||||
atomic_dec(atom_release_gpu_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
|
||||||
atomic_dec(atom_acq_rel_gpu_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
|
||||||
|
|
||||||
atomic_dec(atom_relaxed_sys_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
|
||||||
atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
|
||||||
atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
|
||||||
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
||||||
|
|
||||||
// atom.add.f32
|
#ifdef INTEL
|
||||||
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
// atom.add.f32
|
||||||
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||||
|
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||||
|
|
||||||
// atom.add.f64
|
// atom.add.f64
|
||||||
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
|
|
||||||
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||||
|
|
||||||
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
|
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
|
||||||
return intel_ubfe(base, pos, len);
|
return intel_ubfe(base, pos, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
|
ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
|
||||||
return intel_ubfe(base, pos, len);
|
return intel_ubfe(base, pos, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
int FUNC(bfe_s32)(int base, uint pos, uint len) {
|
int FUNC(bfe_s32)(int base, uint pos, uint len) {
|
||||||
return intel_sbfe(base, pos, len);
|
return intel_sbfe(base, pos, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
long FUNC(bfe_s64)(long base, uint pos, uint len) {
|
long FUNC(bfe_s64)(long base, uint pos, uint len) {
|
||||||
return intel_sbfe(base, pos, len);
|
return intel_sbfe(base, pos, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
|
uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
|
||||||
return intel_bfi(base, insert, offset, count);
|
return intel_bfi(base, insert, offset, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
|
ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
|
||||||
return intel_bfi(base, insert, offset, count);
|
return intel_bfi(base, insert, offset, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint FUNC(brev_b32)(uint base) {
|
uint FUNC(brev_b32)(uint base) {
|
||||||
return intel_bfrev(base);
|
return intel_bfrev(base);
|
||||||
}
|
}
|
||||||
|
|
||||||
ulong FUNC(brev_b64)(ulong base) {
|
ulong FUNC(brev_b64)(ulong base) {
|
||||||
return intel_bfrev(base);
|
return intel_bfrev(base);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
|
||||||
|
return amd_bfe(base, pos, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __attribute__((const)) int __llvm_bitreverse_i32(int) __asm("llvm.bitreverse.i32");
|
||||||
|
uint FUNC(brev_b32)(uint base) {
|
||||||
|
return __llvm_bitreverse_i32(base);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void FUNC(__assertfail)(
|
void FUNC(__assertfail)(
|
||||||
__private ulong* message,
|
__private ulong* message,
|
||||||
|
|
|
@ -251,7 +251,7 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
|
||||||
let ctx = ze::Context::new(drv, None)?;
|
let ctx = ze::Context::new(drv, None)?;
|
||||||
let queue = ze::CommandQueue::new(&ctx, dev)?;
|
let queue = ze::CommandQueue::new(&ctx, dev)?;
|
||||||
let (module, maybe_log) = match module.should_link_ptx_impl {
|
let (module, maybe_log) = match module.should_link_ptx_impl {
|
||||||
Some(ptx_impl) => ze::Module::build_link_spirv(
|
Some((ptx_impl, _)) => ze::Module::build_link_spirv(
|
||||||
&ctx,
|
&ctx,
|
||||||
dev,
|
dev,
|
||||||
&[ptx_impl, byte_il],
|
&[ptx_impl, byte_il],
|
||||||
|
|
|
@ -7,7 +7,8 @@ use std::{borrow::Cow, collections::BTreeSet, ffi::CString, hash::Hash, iter, me
|
||||||
|
|
||||||
use rspirv::binary::Assemble;
|
use rspirv::binary::Assemble;
|
||||||
|
|
||||||
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
|
static ZLUDA_PTX_IMPL_INTEL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv");
|
||||||
|
static ZLUDA_PTX_IMPL_AMD: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.bc");
|
||||||
static ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__";
|
static ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__";
|
||||||
|
|
||||||
quick_error! {
|
quick_error! {
|
||||||
|
@ -405,7 +406,7 @@ impl TypeWordMap {
|
||||||
pub struct Module {
|
pub struct Module {
|
||||||
pub spirv: dr::Module,
|
pub spirv: dr::Module,
|
||||||
pub kernel_info: HashMap<String, KernelInfo>,
|
pub kernel_info: HashMap<String, KernelInfo>,
|
||||||
pub should_link_ptx_impl: Option<&'static [u8]>,
|
pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
|
||||||
pub build_options: CString,
|
pub build_options: CString,
|
||||||
}
|
}
|
||||||
impl Module {
|
impl Module {
|
||||||
|
@ -466,7 +467,7 @@ pub fn to_spirv_module<'a>(ast: ast::Module<'a>) -> Result<Module, TranslateErro
|
||||||
spirv,
|
spirv,
|
||||||
kernel_info,
|
kernel_info,
|
||||||
should_link_ptx_impl: if must_link_ptx_impl {
|
should_link_ptx_impl: if must_link_ptx_impl {
|
||||||
Some(ZLUDA_PTX_IMPL)
|
Some((ZLUDA_PTX_IMPL_INTEL, ZLUDA_PTX_IMPL_AMD))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
|
|
|
@ -7,6 +7,7 @@ use std::{
|
||||||
io::{self, Write},
|
io::{self, Write},
|
||||||
mem,
|
mem,
|
||||||
os::raw::{c_char, c_int, c_uint},
|
os::raw::{c_char, c_int, c_uint},
|
||||||
|
path::PathBuf,
|
||||||
process::{Command, Stdio},
|
process::{Command, Stdio},
|
||||||
ptr, slice,
|
ptr, slice,
|
||||||
};
|
};
|
||||||
|
@ -49,7 +50,7 @@ pub struct ModuleData {
|
||||||
pub struct SpirvModule {
|
pub struct SpirvModule {
|
||||||
pub binaries: Vec<u32>,
|
pub binaries: Vec<u32>,
|
||||||
pub kernel_info: HashMap<String, ptx::KernelInfo>,
|
pub kernel_info: HashMap<String, ptx::KernelInfo>,
|
||||||
pub should_link_ptx_impl: Option<&'static [u8]>,
|
pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
|
||||||
pub build_options: CString,
|
pub build_options: CString,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -93,31 +94,93 @@ impl SpirvModule {
|
||||||
|
|
||||||
const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
|
const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
|
||||||
const AMDGPU: &'static str = "/opt/amdgpu-pro/";
|
const AMDGPU: &'static str = "/opt/amdgpu-pro/";
|
||||||
|
const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
|
||||||
const AMDGPU_BITCODE: [&'static str; 8] = [
|
const AMDGPU_BITCODE: [&'static str; 8] = [
|
||||||
"opencl",
|
"opencl.bc",
|
||||||
"ocml",
|
"ocml.bc",
|
||||||
"ockl",
|
"ockl.bc",
|
||||||
"oclc_correctly_rounded_sqrt_off",
|
"oclc_correctly_rounded_sqrt_off.bc",
|
||||||
"oclc_daz_opt_on",
|
"oclc_daz_opt_on.bc",
|
||||||
"oclc_finite_only_off",
|
"oclc_finite_only_off.bc",
|
||||||
"oclc_unsafe_math_off",
|
"oclc_unsafe_math_off.bc",
|
||||||
"oclc_wavefrontsize64_off",
|
"oclc_wavefrontsize64_off.bc",
|
||||||
];
|
];
|
||||||
const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
|
const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
|
||||||
const AMDGPU_DEVICE: &'static str = "gfx1010";
|
const AMDGPU_DEVICE: &'static str = "gfx1010";
|
||||||
|
|
||||||
fn compile_amd(spirv_il: &[u8]) -> io::Result<()> {
|
fn get_bitcode_paths() -> impl Iterator<Item = PathBuf> {
|
||||||
|
let generic_paths = Self::AMDGPU_BITCODE.iter().map(|x| {
|
||||||
|
let mut path = PathBuf::from(Self::AMDGPU);
|
||||||
|
path.push("amdgcn");
|
||||||
|
path.push("bitcode");
|
||||||
|
path.push(x);
|
||||||
|
path
|
||||||
|
});
|
||||||
|
let mut additional_path = PathBuf::from(Self::AMDGPU);
|
||||||
|
additional_path.push("amdgcn");
|
||||||
|
additional_path.push("bitcode");
|
||||||
|
additional_path.push(format!(
|
||||||
|
"{}{}{}",
|
||||||
|
Self::AMDGPU_BITCODE_DEVICE_PREFIX,
|
||||||
|
&Self::AMDGPU_DEVICE[3..],
|
||||||
|
".bc"
|
||||||
|
));
|
||||||
|
generic_paths.chain(std::iter::once(additional_path))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> {
|
||||||
let dir = tempfile::tempdir()?;
|
let dir = tempfile::tempdir()?;
|
||||||
let mut spirv = NamedTempFile::new_in(&dir)?;
|
let mut spirv = NamedTempFile::new_in(&dir)?;
|
||||||
let llvm = NamedTempFile::new_in(&dir)?;
|
let llvm = NamedTempFile::new_in(&dir)?;
|
||||||
spirv.write_all(spirv_il)?;
|
spirv.write_all(spirv_il)?;
|
||||||
let mut cmd = Command::new(Self::LLVM_SPIRV)
|
let to_llvm_cmd = Command::new(Self::LLVM_SPIRV)
|
||||||
.arg("-r")
|
.arg("-r")
|
||||||
.arg("-o")
|
.arg("-o")
|
||||||
.arg(llvm.path())
|
.arg(llvm.path())
|
||||||
.arg(spirv.path())
|
.arg(spirv.path())
|
||||||
.status()?;
|
.status()?;
|
||||||
assert!(cmd.success());
|
assert!(to_llvm_cmd.success());
|
||||||
|
let linked_binary = NamedTempFile::new_in(&dir)?;
|
||||||
|
let mut llvm_link = PathBuf::from(Self::AMDGPU);
|
||||||
|
llvm_link.push("bin");
|
||||||
|
llvm_link.push("llvm-link");
|
||||||
|
let mut linker_cmd = Command::new(&llvm_link);
|
||||||
|
linker_cmd
|
||||||
|
.arg("--only-needed")
|
||||||
|
.arg("-o")
|
||||||
|
.arg(linked_binary.path())
|
||||||
|
.arg(llvm.path())
|
||||||
|
.args(Self::get_bitcode_paths());
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
linker_cmd.arg("-v");
|
||||||
|
}
|
||||||
|
let status = linker_cmd.status()?;
|
||||||
|
assert!(status.success());
|
||||||
|
let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
|
||||||
|
let compiled_binary = NamedTempFile::new_in(&dir)?;
|
||||||
|
let mut cland_exe = PathBuf::from(Self::AMDGPU);
|
||||||
|
cland_exe.push("bin");
|
||||||
|
cland_exe.push("clang");
|
||||||
|
let mut compiler_cmd = Command::new(&cland_exe);
|
||||||
|
compiler_cmd
|
||||||
|
.arg(format!("-mcpu={}", Self::AMDGPU_DEVICE))
|
||||||
|
.arg("-O3")
|
||||||
|
.arg("-Xlinker")
|
||||||
|
.arg("--no-undefined")
|
||||||
|
.arg("-target")
|
||||||
|
.arg(Self::AMDGPU_TARGET)
|
||||||
|
.arg("-o")
|
||||||
|
.arg(compiled_binary.path())
|
||||||
|
.arg(linked_binary.path());
|
||||||
|
if let Some(bitcode) = ptx_lib {
|
||||||
|
ptx_lib_bitcode.write_all(bitcode)?;
|
||||||
|
compiler_cmd.arg(ptx_lib_bitcode.path());
|
||||||
|
};
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
compiler_cmd.arg("-v");
|
||||||
|
}
|
||||||
|
let status = compiler_cmd.status()?;
|
||||||
|
assert!(status.success());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,10 +195,10 @@ impl SpirvModule {
|
||||||
self.binaries.len() * mem::size_of::<u32>(),
|
self.binaries.len() * mem::size_of::<u32>(),
|
||||||
)
|
)
|
||||||
};
|
};
|
||||||
Self::compile_amd(byte_il).unwrap();
|
|
||||||
let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
|
let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
|
||||||
let main_module = match self.should_link_ptx_impl {
|
let main_module = match self.should_link_ptx_impl {
|
||||||
None => {
|
None => {
|
||||||
|
Self::compile_amd(byte_il, None).unwrap();
|
||||||
ocl_core::build_program(
|
ocl_core::build_program(
|
||||||
&main_module,
|
&main_module,
|
||||||
Some(&[dev]),
|
Some(&[dev]),
|
||||||
|
@ -145,8 +208,9 @@ impl SpirvModule {
|
||||||
)?;
|
)?;
|
||||||
main_module
|
main_module
|
||||||
}
|
}
|
||||||
Some(ptx_impl) => {
|
Some((ptx_impl_intel, ptx_impl_amd)) => {
|
||||||
let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
|
Self::compile_amd(byte_il, Some(ptx_impl_amd)).unwrap();
|
||||||
|
let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl_intel, None)?;
|
||||||
ocl_core::compile_program(
|
ocl_core::compile_program(
|
||||||
&main_module,
|
&main_module,
|
||||||
Some(&[dev]),
|
Some(&[dev]),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue