mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-10-04 23:30:18 +00:00
Re-enable all failing PTX tests (#277)
Additionally remove unused compilation paths
This commit is contained in:
parent
1a63ef62b7
commit
3870a96592
138 changed files with 3047 additions and 25651 deletions
151
ptx/lib/zluda_ptx_impl.cpp
Normal file
151
ptx/lib/zluda_ptx_impl.cpp
Normal file
|
@ -0,0 +1,151 @@
|
|||
// Every time this file changes it must te rebuilt, you need llvm-17:
|
||||
// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && llvm-dis-17 zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | llvm-as-17 - -o zluda_ptx_impl.bc && llvm-dis-17 zluda_ptx_impl.bc
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
|
||||
|
||||
extern "C"
|
||||
{
|
||||
uint32_t FUNC(activemask)()
|
||||
{
|
||||
return __builtin_amdgcn_read_exec_lo();
|
||||
}
|
||||
|
||||
size_t __ockl_get_local_id(uint32_t) __device__;
|
||||
uint32_t FUNC(sreg_tid)(uint8_t member)
|
||||
{
|
||||
return (uint32_t)__ockl_get_local_id(member);
|
||||
}
|
||||
|
||||
size_t __ockl_get_local_size(uint32_t) __device__;
|
||||
uint32_t FUNC(sreg_ntid)(uint8_t member)
|
||||
{
|
||||
return (uint32_t)__ockl_get_local_size(member);
|
||||
}
|
||||
|
||||
size_t __ockl_get_global_id(uint32_t) __device__;
|
||||
uint32_t FUNC(sreg_ctaid)(uint8_t member)
|
||||
{
|
||||
return (uint32_t)__ockl_get_global_id(member);
|
||||
}
|
||||
|
||||
size_t __ockl_get_global_size(uint32_t) __device__;
|
||||
uint32_t FUNC(sreg_nctaid)(uint8_t member)
|
||||
{
|
||||
return (uint32_t)__ockl_get_global_size(member);
|
||||
}
|
||||
|
||||
uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device));
|
||||
uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
uint32_t pos = pos_32 & 0xFFU;
|
||||
uint32_t len = len_32 & 0xFFU;
|
||||
if (pos >= 32)
|
||||
return 0;
|
||||
// V_BFE_U32 only uses bits [4:0] for len (max value is 31)
|
||||
if (len >= 32)
|
||||
return base >> pos;
|
||||
len = std::min(len, 31U);
|
||||
return __ockl_bfe_u32(base, pos, len);
|
||||
}
|
||||
|
||||
// LLVM contains mentions of llvm.amdgcn.ubfe.i64 and llvm.amdgcn.sbfe.i64,
|
||||
// but using it only leads to LLVM crashes on RDNA2
|
||||
uint64_t FUNC(bfe_u64)(uint64_t base, uint32_t pos, uint32_t len)
|
||||
{
|
||||
// NVIDIA docs are incorrect. In 64 bit `bfe` both `pos` and `len`
|
||||
// parameters use whole 32 bit number and not just bottom 8 bits
|
||||
if (pos >= 64)
|
||||
return 0;
|
||||
if (len >= 64)
|
||||
return base >> pos;
|
||||
len = std::min(len, 63U);
|
||||
return (base >> pos) & ((1UL << len) - 1UL);
|
||||
}
|
||||
|
||||
int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device));
|
||||
int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
uint32_t pos = pos_32 & 0xFFU;
|
||||
uint32_t len = len_32 & 0xFFU;
|
||||
if (len == 0)
|
||||
return 0;
|
||||
if (pos >= 32)
|
||||
return (base >> 31);
|
||||
// V_BFE_I32 only uses bits [4:0] for len (max value is 31)
|
||||
if (len >= 32)
|
||||
return base >> pos;
|
||||
len = std::min(len, 31U);
|
||||
return __ockl_bfe_i32(base, pos, len);
|
||||
}
|
||||
|
||||
static __device__ uint32_t add_sat(uint32_t x, uint32_t y)
|
||||
{
|
||||
uint32_t result;
|
||||
if (__builtin_add_overflow(x, y, &result))
|
||||
{
|
||||
return UINT32_MAX;
|
||||
}
|
||||
else
|
||||
{
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ uint32_t sub_sat(uint32_t x, uint32_t y)
|
||||
{
|
||||
uint32_t result;
|
||||
if (__builtin_sub_overflow(x, y, &result))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
int64_t FUNC(bfe_s64)(int64_t base, uint32_t pos, uint32_t len)
|
||||
{
|
||||
// NVIDIA docs are incorrect. In 64 bit `bfe` both `pos` and `len`
|
||||
// parameters use whole 32 bit number and not just bottom 8 bits
|
||||
if (len == 0)
|
||||
return 0;
|
||||
if (pos >= 64)
|
||||
return (base >> 63U);
|
||||
if (add_sat(pos, len) >= 64)
|
||||
len = sub_sat(64, pos);
|
||||
return (base << (64U - pos - len)) >> (64U - len);
|
||||
}
|
||||
|
||||
uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __attribute__((device));
|
||||
uint32_t FUNC(bfi_b32)(uint32_t insert, uint32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
uint32_t pos = pos_32 & 0xFFU;
|
||||
uint32_t len = len_32 & 0xFFU;
|
||||
if (pos >= 32)
|
||||
return base;
|
||||
uint32_t mask;
|
||||
if (len >= 32)
|
||||
mask = UINT32_MAX << pos;
|
||||
else
|
||||
mask = __ockl_bfm_u32(len, pos);
|
||||
return (~mask & base) | (mask & (insert << pos));
|
||||
}
|
||||
|
||||
uint64_t FUNC(bfi_b64)(uint64_t insert, uint64_t base, uint32_t pos, uint32_t len)
|
||||
{
|
||||
// NVIDIA docs are incorrect. In 64 bit `bfe` both `pos` and `len`
|
||||
// parameters use whole 32 bit number and not just bottom 8 bits
|
||||
if (pos >= 64)
|
||||
return base;
|
||||
uint64_t mask;
|
||||
if (len >= 64)
|
||||
mask = UINT64_MAX << pos;
|
||||
else
|
||||
mask = ((1UL << len) - 1UL) << (pos);
|
||||
return (~mask & base) | (mask & (insert << pos));
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue