mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-08-03 06:40:21 +00:00
Add correct 64 bit bfe
This commit is contained in:
parent
c59d0c4a92
commit
4f244c29a5
2 changed files with 45 additions and 12 deletions
Binary file not shown.
|
@ -33,6 +33,20 @@ extern "C"
|
||||||
return __ockl_bfe_u32(base, pos, len);
|
return __ockl_bfe_u32(base, pos, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LLVM contains mentions of llvm.amdgcn.ubfe.i64 and llvm.amdgcn.sbfe.i64,
|
||||||
|
// but using it only leads to LLVM crashes on RDNA2
|
||||||
|
uint64_t FUNC(bfe_u64)(uint64_t base, uint32_t pos, uint32_t len)
|
||||||
|
{
|
||||||
|
// NVIDIA docs are incorrect. In 64 bit `bfe` both `pos` and `len`
|
||||||
|
// parameters use whole 32 bit number and not just bottom 8 bits
|
||||||
|
if (pos >= 64)
|
||||||
|
return 0;
|
||||||
|
if (len >= 64)
|
||||||
|
return base >> pos;
|
||||||
|
len = std::min(len, 63U);
|
||||||
|
return (base >> pos) & ((1UL << len) - 1UL);
|
||||||
|
}
|
||||||
|
|
||||||
int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device));
|
int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device));
|
||||||
int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32)
|
int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32)
|
||||||
{
|
{
|
||||||
|
@ -49,23 +63,42 @@ extern "C"
|
||||||
return __ockl_bfe_i32(base, pos, len);
|
return __ockl_bfe_i32(base, pos, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLVM contains mentions of llvm.amdgcn.ubfe.i64 and llvm.amdgcn.sbfe.i64,
|
static __device__ uint32_t add_sat(uint32_t x, uint32_t y)
|
||||||
// but using it only leads to LLVM crashes on RDNA2
|
|
||||||
uint64_t FUNC(bfe_u64)(uint64_t base, uint32_t b, uint32_t c)
|
|
||||||
{
|
{
|
||||||
uint8_t pos = uint8_t(b);
|
uint32_t result;
|
||||||
uint8_t len = uint8_t(c);
|
if (__builtin_add_overflow(x, y, &result))
|
||||||
if (len == 0)
|
{
|
||||||
return 0;
|
return UINT32_MAX;
|
||||||
return (base >> pos) & ((1U << len) - 1U);
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t FUNC(bfe_s64)(int64_t base, uint32_t b, uint32_t c)
|
static __device__ uint32_t sub_sat(uint32_t x, uint32_t y)
|
||||||
{
|
{
|
||||||
uint8_t pos = uint8_t(b);
|
uint32_t result;
|
||||||
uint8_t len = uint8_t(c);
|
if (__builtin_sub_overflow(x, y, &result))
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t FUNC(bfe_s64)(int64_t base, uint32_t pos, uint32_t len)
|
||||||
|
{
|
||||||
|
// NVIDIA docs are incorrect. In 64 bit `bfe` both `pos` and `len`
|
||||||
|
// parameters use whole 32 bit number and not just bottom 8 bits
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
return 0;
|
return 0;
|
||||||
return (base >> pos) & ((1U << len) - 1U);
|
if (pos >= 64)
|
||||||
|
return (base >> 63U);
|
||||||
|
if (add_sat(pos, len) >= 64)
|
||||||
|
len = sub_sat(64, pos);
|
||||||
|
return (base << (64U - pos - len)) >> (64U - len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue