mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-08-01 05:48:37 +00:00
Make ptx unit tests run on AMD (except denormals)
This commit is contained in:
parent
82510ce8fd
commit
18245be7d5
5 changed files with 154 additions and 118 deletions
|
@ -2,6 +2,7 @@ use std::env::VarError;
|
|||
|
||||
fn main() -> Result<(), VarError> {
|
||||
println!("cargo:rustc-link-lib=dylib=amdhip64");
|
||||
println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
|
||||
//println!("cargo:rustc-link-search=native=/opt/rocm/lib/");
|
||||
println!("cargo:rustc-link-search=native=/home/ubuntu/hipamd/build/lib");
|
||||
Ok(())
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -1,6 +1,6 @@
|
|||
// Every time this file changes it must te rebuilt:
|
||||
// ocloc -file zluda_ptx_impl.cl -64 -options "-cl-std=CL2.0 -Dcl_intel_bit_instructions -DINTEL" -out_dir . -device kbl -output_no_suffix -spv_only
|
||||
// /opt/amdgpu-pro/bin/clang -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
|
||||
// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x cl -Xclang -finclude-default-header zluda_ptx_impl.cl -cl-std=CL2.0 -c -target amdgcn-amd-amdhsa -o zluda_ptx_impl.bc -emit-llvm
|
||||
// Additionally you should strip names:
|
||||
// spirv-opt --strip-debug zluda_ptx_impl.spv -o zluda_ptx_impl.spv --target-env=spv1.3
|
||||
|
||||
|
@ -132,99 +132,98 @@ atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relax
|
|||
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
||||
|
||||
// atom.add.f32
|
||||
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
// atom.add.f64
|
||||
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
#ifdef INTEL
|
||||
// atom.add.f32
|
||||
atomic_add(atom_relaxed_cta_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_generic_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_generic_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_global_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_global_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_cta_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_cta_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_gpu_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f32, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acquire_sys_shared_add_f32, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_release_sys_shared_add_f32, memory_order_release, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f32, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, float, atomic_uint, uint);
|
||||
|
||||
atomic_add(atom_relaxed_cta_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_generic_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_generic_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_generic_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_generic_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, , double, atomic_ulong, ulong);
|
||||
|
||||
// atom.add.f64
|
||||
atomic_add(atom_relaxed_cta_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_global_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_global_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_global_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_global_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_cta_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_cta_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_cta_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_cta_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_gpu_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_gpu_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_gpu_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_gpu_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
atomic_add(atom_relaxed_sys_shared_add_f64, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acquire_sys_shared_add_f64, memory_order_acquire, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_release_sys_shared_add_f64, memory_order_release, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local, double, atomic_ulong, ulong);
|
||||
|
||||
uint FUNC(bfe_u32)(uint base, uint pos, uint len) {
|
||||
return intel_ubfe(base, pos, len);
|
||||
}
|
||||
|
@ -261,17 +260,44 @@ atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acqui
|
|||
return amd_bfe(base, pos, len);
|
||||
}
|
||||
|
||||
extern __attribute__((const)) int __llvm_bitreverse_i32(int) __asm("llvm.bitreverse.i32");
|
||||
ulong FUNC(bfe_u64)(ulong base, uint pos, uint len) {
|
||||
return (base >> pos) & len;
|
||||
}
|
||||
|
||||
int FUNC(bfe_s32)(int base, uint pos, uint len) {
|
||||
return amd_bfe(base, pos, len);
|
||||
}
|
||||
|
||||
long FUNC(bfe_s64)(long base, uint pos, uint len) {
|
||||
return (base >> pos) & len;
|
||||
}
|
||||
|
||||
uint FUNC(bfi_b32)(uint insert, uint base, uint offset, uint count) {
|
||||
uint mask = amd_bfm(count, offset);
|
||||
return (~mask & base) | (mask & insert);
|
||||
}
|
||||
|
||||
ulong FUNC(bfi_b64)(ulong insert, ulong base, uint offset, uint count) {
|
||||
ulong mask = ((1UL << (count & 0x3f)) - 1UL) << (offset & 0x3f);
|
||||
return (~mask & base) | (mask & insert);
|
||||
}
|
||||
|
||||
extern __attribute__((const)) uint __llvm_bitreverse_i32(uint) __asm("llvm.bitreverse.i32");
|
||||
uint FUNC(brev_b32)(uint base) {
|
||||
return __llvm_bitreverse_i32(base);
|
||||
}
|
||||
|
||||
extern __attribute__((const)) ulong __llvm_bitreverse_i64(ulong) __asm("llvm.bitreverse.i64");
|
||||
ulong FUNC(brev_b64)(ulong base) {
|
||||
return __llvm_bitreverse_i64(base);
|
||||
}
|
||||
#endif
|
||||
|
||||
void FUNC(__assertfail)(
|
||||
__private ulong* message,
|
||||
__private ulong* file,
|
||||
__private uint* line,
|
||||
__private ulong* function,
|
||||
__private ulong* charSize
|
||||
__attribute__((unused)) __private ulong* message,
|
||||
__attribute__((unused)) __private ulong* file,
|
||||
__attribute__((unused)) __private uint* line,
|
||||
__attribute__((unused)) __private ulong* function,
|
||||
__attribute__((unused)) __private ulong* charSize
|
||||
) {
|
||||
}
|
||||
|
|
|
@ -269,11 +269,9 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
|
|||
hip_call! { hipStreamCreate(&mut stream) };
|
||||
let mut dev_props = unsafe { mem::zeroed() };
|
||||
hip_call! { hipGetDeviceProperties(&mut dev_props, dev) };
|
||||
let nul_terminator = dev_props.gcnArchName.iter().position(|&x| x == 0).unwrap();
|
||||
let gcn_arch_slice = unsafe {
|
||||
slice::from_raw_parts(
|
||||
dev_props.gcnArchName.as_ptr() as _,
|
||||
dev_props.gcnArchName.len(),
|
||||
)
|
||||
slice::from_raw_parts(dev_props.gcnArchName.as_ptr() as _, nul_terminator + 1)
|
||||
};
|
||||
let dev_name =
|
||||
if let Ok(Ok(name)) = CStr::from_bytes_with_nul(gcn_arch_slice).map(|x| x.to_str()) {
|
||||
|
@ -288,9 +286,9 @@ fn run_spirv<Input: From<u8> + Copy + Debug, Output: From<u8> + Copy + Debug + D
|
|||
let mut kernel = ptr::null_mut();
|
||||
hip_call! { hipModuleGetFunction(&mut kernel, module, name.as_ptr()) };
|
||||
let mut inp_b = ptr::null_mut();
|
||||
hip_call! { hipMalloc(&mut inp_b, input.len()) };
|
||||
hip_call! { hipMalloc(&mut inp_b, input.len() * mem::size_of::<Input>()) };
|
||||
let mut out_b = ptr::null_mut();
|
||||
hip_call! { hipMalloc(&mut out_b, output.len()) };
|
||||
hip_call! { hipMalloc(&mut out_b, output.len() * mem::size_of::<Output>()) };
|
||||
hip_call! { hipMemcpyWithStream(inp_b, input.as_ptr() as _, input.len() * mem::size_of::<Input>(), hipMemcpyKind::hipMemcpyHostToDevice, stream) };
|
||||
hip_call! { hipMemset(out_b, 0, output.len() * mem::size_of::<Output>()) };
|
||||
let mut args = [&inp_b, &out_b];
|
||||
|
@ -562,7 +560,7 @@ unsafe extern "C" fn parse_instruction_cb(
|
|||
}
|
||||
|
||||
const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
|
||||
const AMDGPU: &'static str = "/opt/amdgpu-pro/";
|
||||
const AMDGPU: &'static str = "/opt/rocm/";
|
||||
const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
|
||||
const AMDGPU_BITCODE: [&'static str; 8] = [
|
||||
"opencl.bc",
|
||||
|
@ -604,6 +602,7 @@ fn compile_amd(
|
|||
assert!(to_llvm_cmd.success());
|
||||
let linked_binary = NamedTempFile::new_in(&dir)?;
|
||||
let mut llvm_link = PathBuf::from(AMDGPU);
|
||||
llvm_link.push("llvm");
|
||||
llvm_link.push("bin");
|
||||
llvm_link.push("llvm-link");
|
||||
let mut linker_cmd = Command::new(&llvm_link);
|
||||
|
@ -620,10 +619,11 @@ fn compile_amd(
|
|||
assert!(status.success());
|
||||
let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
|
||||
let compiled_binary = NamedTempFile::new_in(&dir)?;
|
||||
let mut cland_exe = PathBuf::from(AMDGPU);
|
||||
cland_exe.push("bin");
|
||||
cland_exe.push("clang");
|
||||
let mut compiler_cmd = Command::new(&cland_exe);
|
||||
let mut clang_exe = PathBuf::from(AMDGPU);
|
||||
clang_exe.push("llvm");
|
||||
clang_exe.push("bin");
|
||||
clang_exe.push("clang");
|
||||
let mut compiler_cmd = Command::new(&clang_exe);
|
||||
compiler_cmd
|
||||
.arg(format!("-mcpu={}", device_name))
|
||||
.arg("-nogpulib")
|
||||
|
|
|
@ -2893,7 +2893,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(a.dst),
|
||||
opencl,
|
||||
spirv::CLOp::native_rsqrt as spirv::Word,
|
||||
spirv::CLOp::rsqrt as spirv::Word,
|
||||
[dr::Operand::IdRef(a.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2912,7 +2912,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::native_sin as u32,
|
||||
spirv::CLOp::sin as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2922,7 +2922,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::native_cos as u32,
|
||||
spirv::CLOp::cos as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2932,7 +2932,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::native_log2 as u32,
|
||||
spirv::CLOp::log2 as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -2942,7 +2942,7 @@ fn emit_function_body_ops(
|
|||
result_type,
|
||||
Some(arg.dst),
|
||||
opencl,
|
||||
spirv::CLOp::native_exp2 as u32,
|
||||
spirv::CLOp::exp2 as u32,
|
||||
[dr::Operand::IdRef(arg.src)].iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
@ -3124,7 +3124,7 @@ fn emit_sqrt(
|
|||
) -> Result<(), TranslateError> {
|
||||
let result_type = map.get_or_add_scalar(builder, details.typ.into());
|
||||
let (ocl_op, rounding) = match details.kind {
|
||||
ast::SqrtKind::Approx => (spirv::CLOp::native_sqrt, None),
|
||||
ast::SqrtKind::Approx => (spirv::CLOp::sqrt, None),
|
||||
ast::SqrtKind::Rounding(rnd) => (spirv::CLOp::sqrt, Some(rnd)),
|
||||
};
|
||||
builder.ext_inst(
|
||||
|
@ -4036,7 +4036,16 @@ fn emit_implicit_conversion(
|
|||
cv.to_space.to_spirv(),
|
||||
),
|
||||
);
|
||||
builder.bitcast(result_type, Some(cv.dst), cv.src)?;
|
||||
if cv.to_space == ast::StateSpace::Generic && cv.from_space != ast::StateSpace::Generic
|
||||
{
|
||||
builder.ptr_cast_to_generic(result_type, Some(cv.dst), cv.src)?;
|
||||
} else if cv.from_space == ast::StateSpace::Generic
|
||||
&& cv.to_space != ast::StateSpace::Generic
|
||||
{
|
||||
builder.generic_cast_to_ptr(result_type, Some(cv.dst), cv.src)?;
|
||||
} else {
|
||||
builder.bitcast(result_type, Some(cv.dst), cv.src)?;
|
||||
}
|
||||
}
|
||||
(_, _, &ConversionKind::AddressOf) => {
|
||||
let dst_type = map.get_or_add(builder, SpirvType::new(cv.to_type.clone()));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue