mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-09-27 19:59:08 +00:00
Various fixes
This commit is contained in:
parent
8e9d50de1a
commit
778c4efd2b
9 changed files with 143 additions and 28 deletions
|
@ -196,4 +196,24 @@ void LLVMZludaBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering,
|
|||
Name);
|
||||
}
|
||||
|
||||
void LLVMZludaSetAtomic(
|
||||
LLVMValueRef AtomicInst,
|
||||
LLVMAtomicOrdering Ordering,
|
||||
char * SSID)
|
||||
{
|
||||
auto inst = unwrap(AtomicInst);
|
||||
if (LoadInst *LI = dyn_cast<LoadInst>(inst))
|
||||
{
|
||||
LI->setAtomic(mapFromLLVMOrdering(Ordering), LI->getContext().getOrInsertSyncScopeID(SSID));
|
||||
}
|
||||
else if (StoreInst *SI = dyn_cast<StoreInst>(inst))
|
||||
{
|
||||
SI->setAtomic(mapFromLLVMOrdering(Ordering), SI->getContext().getOrInsertSyncScopeID(SSID));
|
||||
}
|
||||
else
|
||||
{
|
||||
llvm_unreachable("Invalid instruction type for LLVMZludaSetAtomic");
|
||||
}
|
||||
}
|
||||
|
||||
LLVM_C_EXTERN_C_END
|
|
@ -78,4 +78,10 @@ extern "C" {
|
|||
scope: *const i8,
|
||||
Name: *const i8,
|
||||
) -> LLVMValueRef;
|
||||
|
||||
pub fn LLVMZludaSetAtomic(
|
||||
AtomicInst: LLVMValueRef,
|
||||
Ordering: LLVMAtomicOrdering,
|
||||
SSID: *const i8,
|
||||
);
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -17,6 +17,7 @@ typedef _Float16 half16 __attribute__((ext_vector_type(16)));
|
|||
typedef float float8 __attribute__((ext_vector_type(8)));
|
||||
|
||||
#define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
|
||||
#define FUNC_CALL(NAME) __zluda_ptx_impl_##NAME
|
||||
#define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME
|
||||
#define DECLARE_ATTR(TYPE, NAME) \
|
||||
extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \
|
||||
|
@ -58,6 +59,18 @@ extern "C"
|
|||
return __lane_id();
|
||||
}
|
||||
|
||||
uint32_t FUNC(sreg_lanemask_lt)()
|
||||
{
|
||||
uint32_t lane_idx = FUNC_CALL(sreg_laneid)();
|
||||
return (1U << lane_idx) - 1U;
|
||||
}
|
||||
|
||||
uint32_t FUNC(sreg_lanemask_ge)()
|
||||
{
|
||||
uint32_t lane_idx = FUNC_CALL(sreg_laneid)();
|
||||
return (~0U) << lane_idx;
|
||||
}
|
||||
|
||||
uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__;
|
||||
uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
|
|
|
@ -542,17 +542,26 @@ impl<'a> MethodEmitContext<'a> {
|
|||
data: ast::LdDetails,
|
||||
arguments: ast::LdArgs<SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
if data.qualifier != ast::LdStQualifier::Weak {
|
||||
todo!()
|
||||
}
|
||||
let builder = self.builder;
|
||||
let type_ = get_type(self.context, &data.typ)?;
|
||||
let ptr = self.resolver.value(arguments.src)?;
|
||||
self.resolver.with_result(arguments.dst, |dst| {
|
||||
let load = unsafe { LLVMBuildLoad2(builder, type_, ptr, dst) };
|
||||
unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) };
|
||||
load
|
||||
});
|
||||
let needs_cast = !matches!(data.typ, ast::Type::Scalar(_))
|
||||
&& !matches!(data.qualifier, ast::LdStQualifier::Weak);
|
||||
let underlying_type = get_type(self.context, &data.typ)?;
|
||||
let op_type = if needs_cast {
|
||||
unsafe { LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8) }
|
||||
} else {
|
||||
underlying_type
|
||||
};
|
||||
let src = self.resolver.value(arguments.src)?;
|
||||
let load = unsafe { LLVMBuildLoad2(builder, op_type, src, LLVM_UNNAMED.as_ptr()) };
|
||||
apply_qualifier(load, data.qualifier)?;
|
||||
unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) };
|
||||
if needs_cast {
|
||||
self.resolver.with_result(arguments.dst, |dst| unsafe {
|
||||
LLVMBuildBitCast(builder, load, underlying_type, dst)
|
||||
});
|
||||
} else {
|
||||
self.resolver.register(arguments.dst, load);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -761,11 +770,21 @@ impl<'a> MethodEmitContext<'a> {
|
|||
arguments: ast::StArgs<SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
let ptr = self.resolver.value(arguments.src1)?;
|
||||
let value = self.resolver.value(arguments.src2)?;
|
||||
if data.qualifier != ast::LdStQualifier::Weak {
|
||||
todo!()
|
||||
let needs_cast = !matches!(data.typ, ast::Type::Scalar(_))
|
||||
&& !matches!(data.qualifier, ast::LdStQualifier::Weak);
|
||||
let mut value = self.resolver.value(arguments.src2)?;
|
||||
if needs_cast {
|
||||
value = unsafe {
|
||||
LLVMBuildBitCast(
|
||||
self.builder,
|
||||
value,
|
||||
LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8),
|
||||
LLVM_UNNAMED.as_ptr(),
|
||||
)
|
||||
};
|
||||
}
|
||||
let store = unsafe { LLVMBuildStore(self.builder, value, ptr) };
|
||||
apply_qualifier(store, data.qualifier)?;
|
||||
unsafe {
|
||||
LLVMSetAlignment(store, data.typ.layout().align() as u32);
|
||||
}
|
||||
|
@ -2237,7 +2256,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
}
|
||||
|
||||
fn emit_bar_warp(&mut self) -> Result<(), TranslateError> {
|
||||
self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?;
|
||||
self.emit_intrinsic(c"llvm.amdgcn.wave.barrier", None, None, vec![])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -2966,6 +2985,47 @@ impl<'a> MethodEmitContext<'a> {
|
|||
*/
|
||||
}
|
||||
|
||||
fn apply_qualifier(
|
||||
value: LLVMValueRef,
|
||||
qualifier: ptx_parser::LdStQualifier,
|
||||
) -> Result<(), TranslateError> {
|
||||
match qualifier {
|
||||
ptx_parser::LdStQualifier::Weak => {}
|
||||
ptx_parser::LdStQualifier::Volatile => unsafe {
|
||||
LLVMSetVolatile(value, 1);
|
||||
// The semantics of volatile operations are equivalent to a relaxed memory operation
|
||||
// with system-scope but with the following extra implementation-specific constraints...
|
||||
LLVMZludaSetAtomic(
|
||||
value,
|
||||
LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic,
|
||||
get_scope(ast::MemScope::Sys)?,
|
||||
);
|
||||
},
|
||||
ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe {
|
||||
LLVMZludaSetAtomic(
|
||||
value,
|
||||
LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic,
|
||||
get_scope(mem_scope)?,
|
||||
);
|
||||
},
|
||||
ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe {
|
||||
LLVMZludaSetAtomic(
|
||||
value,
|
||||
LLVMAtomicOrdering::LLVMAtomicOrderingAcquire,
|
||||
get_scope(mem_scope)?,
|
||||
);
|
||||
},
|
||||
ptx_parser::LdStQualifier::Release(mem_scope) => unsafe {
|
||||
LLVMZludaSetAtomic(
|
||||
value,
|
||||
LLVMAtomicOrdering::LLVMAtomicOrderingRelease,
|
||||
get_scope(mem_scope)?,
|
||||
);
|
||||
},
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_pointer_type<'ctx>(
|
||||
context: LLVMContextRef,
|
||||
to_space: ast::StateSpace,
|
||||
|
@ -2979,7 +3039,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> {
|
|||
ast::MemScope::Cta => c"workgroup-one-as",
|
||||
ast::MemScope::Gpu => c"agent-one-as",
|
||||
ast::MemScope::Sys => c"one-as",
|
||||
ast::MemScope::Cluster => todo!(),
|
||||
ast::MemScope::Cluster => return Err(error_todo()),
|
||||
}
|
||||
.as_ptr())
|
||||
}
|
||||
|
@ -2989,7 +3049,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> {
|
|||
ast::MemScope::Cta => c"workgroup",
|
||||
ast::MemScope::Gpu => c"agent",
|
||||
ast::MemScope::Sys => c"system",
|
||||
ast::MemScope::Cluster => todo!(),
|
||||
ast::MemScope::Cluster => return Err(error_todo()),
|
||||
}
|
||||
.as_ptr())
|
||||
}
|
||||
|
|
|
@ -139,6 +139,7 @@ enum PtxSpecialRegister {
|
|||
Nctaid,
|
||||
Clock,
|
||||
LanemaskLt,
|
||||
LanemaskGe,
|
||||
Laneid,
|
||||
}
|
||||
|
||||
|
@ -151,6 +152,7 @@ impl PtxSpecialRegister {
|
|||
Self::Nctaid => "%nctaid",
|
||||
Self::Clock => "%clock",
|
||||
Self::LanemaskLt => "%lanemask_lt",
|
||||
Self::LanemaskGe => "%lanemask_ge",
|
||||
Self::Laneid => "%laneid",
|
||||
}
|
||||
}
|
||||
|
@ -173,6 +175,7 @@ impl PtxSpecialRegister {
|
|||
PtxSpecialRegister::Nctaid => ast::ScalarType::U32,
|
||||
PtxSpecialRegister::Clock => ast::ScalarType::U32,
|
||||
PtxSpecialRegister::LanemaskLt => ast::ScalarType::U32,
|
||||
PtxSpecialRegister::LanemaskGe => ast::ScalarType::U32,
|
||||
PtxSpecialRegister::Laneid => ast::ScalarType::U32,
|
||||
}
|
||||
}
|
||||
|
@ -185,6 +188,7 @@ impl PtxSpecialRegister {
|
|||
| PtxSpecialRegister::Nctaid => Some(ast::ScalarType::U8),
|
||||
PtxSpecialRegister::Clock
|
||||
| PtxSpecialRegister::LanemaskLt
|
||||
| PtxSpecialRegister::LanemaskGe
|
||||
| PtxSpecialRegister::Laneid => None,
|
||||
}
|
||||
}
|
||||
|
@ -197,6 +201,7 @@ impl PtxSpecialRegister {
|
|||
PtxSpecialRegister::Nctaid => "sreg_nctaid",
|
||||
PtxSpecialRegister::Clock => "sreg_clock",
|
||||
PtxSpecialRegister::LanemaskLt => "sreg_lanemask_lt",
|
||||
PtxSpecialRegister::LanemaskGe => "sreg_lanemask_ge",
|
||||
PtxSpecialRegister::Laneid => "sreg_laneid",
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,22 +1,33 @@
|
|||
use cuda_types::cuda::CUfunction_attribute;
|
||||
use hip_runtime_sys::*;
|
||||
use std::mem;
|
||||
|
||||
pub(crate) fn get_attribute(
|
||||
pi: &mut i32,
|
||||
cu_attrib: hipFunction_attribute,
|
||||
cu_attrib: CUfunction_attribute,
|
||||
func: hipFunction_t,
|
||||
) -> hipError_t {
|
||||
// TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION
|
||||
// TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION
|
||||
match cu_attrib {
|
||||
hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION
|
||||
| hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => {
|
||||
CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => {
|
||||
*pi = 120;
|
||||
return Ok(());
|
||||
}
|
||||
CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => {
|
||||
*pi = 0;
|
||||
return Ok(());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
unsafe { hipFuncGetAttribute(pi, cu_attrib, func) }?;
|
||||
if cu_attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS {
|
||||
unsafe { hipFuncGetAttribute(pi, mem::transmute(cu_attrib), func) }?;
|
||||
if cu_attrib == CUfunction_attribute::CU_FUNC_ATTRIBUTE_NUM_REGS {
|
||||
*pi = (*pi).max(1);
|
||||
}
|
||||
Ok(())
|
||||
|
@ -55,12 +66,12 @@ pub(crate) fn launch_kernel(
|
|||
|
||||
pub(crate) unsafe fn set_attribute(
|
||||
func: hipFunction_t,
|
||||
attribute: hipFunction_attribute,
|
||||
attribute: CUfunction_attribute,
|
||||
value: i32,
|
||||
) -> hipError_t {
|
||||
match attribute {
|
||||
hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION
|
||||
| hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => {
|
||||
CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION
|
||||
| CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => {
|
||||
return hipError_t::ErrorNotSupported;
|
||||
}
|
||||
_ => {}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use cuda_types::cuda::CUresult;
|
||||
use cuda_types::cuda::{CUfunction_attribute, CUresult};
|
||||
use hip_runtime_sys::*;
|
||||
|
||||
use crate::r#impl::function;
|
||||
|
@ -9,7 +9,7 @@ pub(crate) unsafe fn get_function(func: &mut hipFunction_t, kernel: hipFunction_
|
|||
}
|
||||
|
||||
pub(crate) unsafe fn set_attribute(
|
||||
attrib: hipFunction_attribute,
|
||||
attrib: CUfunction_attribute,
|
||||
val: ::core::ffi::c_int,
|
||||
kernel: hipFunction_t,
|
||||
_dev: hipDevice_t,
|
||||
|
|
|
@ -173,12 +173,12 @@ from_cuda_nop!(
|
|||
cublasLtMatmulDescAttributes_t,
|
||||
CUmemAllocationGranularity_flags,
|
||||
CUmemAllocationProp,
|
||||
CUresult
|
||||
CUresult,
|
||||
CUfunction_attribute
|
||||
);
|
||||
from_cuda_transmute!(
|
||||
CUuuid => hipUUID,
|
||||
CUfunction => hipFunction_t,
|
||||
CUfunction_attribute => hipFunction_attribute,
|
||||
CUstream => hipStream_t,
|
||||
CUpointer_attribute => hipPointer_attribute,
|
||||
CUdeviceptr_v2 => hipDeviceptr_t,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue