diff --git a/llvm_zluda/src/lib.cpp b/llvm_zluda/src/lib.cpp
index c8ac2d7..1151330 100644
--- a/llvm_zluda/src/lib.cpp
+++ b/llvm_zluda/src/lib.cpp
@@ -196,4 +196,24 @@ void LLVMZludaBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering,
         Name);
 }
 
+void LLVMZludaSetAtomic(
+    LLVMValueRef AtomicInst,
+    LLVMAtomicOrdering Ordering,
+    char * SSID)
+{
+    auto inst = unwrap(AtomicInst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(inst))
+    {
+        LI->setAtomic(mapFromLLVMOrdering(Ordering), LI->getContext().getOrInsertSyncScopeID(SSID));
+    }
+    else if (StoreInst *SI = dyn_cast<StoreInst>(inst))
+    {
+        SI->setAtomic(mapFromLLVMOrdering(Ordering), SI->getContext().getOrInsertSyncScopeID(SSID));
+    }
+    else
+    {
+        llvm_unreachable("Invalid instruction type for LLVMZludaSetAtomic");
+    }
+}
+
 LLVM_C_EXTERN_C_END
\ No newline at end of file
diff --git a/llvm_zluda/src/lib.rs b/llvm_zluda/src/lib.rs
index 18046a5..37b1d97 100644
--- a/llvm_zluda/src/lib.rs
+++ b/llvm_zluda/src/lib.rs
@@ -78,4 +78,10 @@ extern "C" {
         scope: *const i8,
         Name: *const i8,
     ) -> LLVMValueRef;
+
+    pub fn LLVMZludaSetAtomic(
+        AtomicInst: LLVMValueRef,
+        Ordering: LLVMAtomicOrdering,
+        SSID: *const i8,
+    );
 }
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index bc375c3..1bc9856 100644
Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp
index 6174ec1..74384ac 100644
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@@ -17,6 +17,7 @@ typedef _Float16 half16 __attribute__((ext_vector_type(16)));
 typedef float float8 __attribute__((ext_vector_type(8)));
 
 #define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
+#define FUNC_CALL(NAME) __zluda_ptx_impl_##NAME
 #define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME
 #define DECLARE_ATTR(TYPE, NAME) \
     extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \
@@ -58,6 +59,18 @@ extern "C"
         return __lane_id();
     }
 
+    uint32_t FUNC(sreg_lanemask_lt)()
+    {
+        uint32_t lane_idx = FUNC_CALL(sreg_laneid)();
+        return (1U << lane_idx) - 1U;
+    }
+
+    uint32_t FUNC(sreg_lanemask_ge)()
+    {
+        uint32_t lane_idx = FUNC_CALL(sreg_laneid)();
+        return (~0U) << lane_idx;
+    }
+
     uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__;
     uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
     {
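A quick sanity check of the lanemask arithmetic above (editor's illustration, not part of the patch; lane 5 is an arbitrary example). Note that both helpers assume the 32-bit lane masks PTX defines, i.e. a wave32 execution mode; a 64-wide wavefront would need 64-bit shifts.

```rust
fn main() {
    let lane_idx: u32 = 5;
    let lt = (1u32 << lane_idx) - 1; // %lanemask_lt: lanes 0..=4 set
    let ge = !0u32 << lane_idx; // %lanemask_ge: lanes 5..=31 set
    assert_eq!(lt, 0x0000_001f);
    assert_eq!(ge, 0xffff_ffe0);
    assert_eq!(lt & ge, 0); // disjoint...
    assert_eq!(lt | ge, u32::MAX); // ...and together they cover every lane
}
```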
diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs
index c27f1aa..3d2c154 100644
--- a/ptx/src/pass/llvm/emit.rs
+++ b/ptx/src/pass/llvm/emit.rs
@@ -542,17 +542,26 @@ impl<'a> MethodEmitContext<'a> {
         data: ast::LdDetails,
         arguments: ast::LdArgs,
     ) -> Result<(), TranslateError> {
-        if data.qualifier != ast::LdStQualifier::Weak {
-            todo!()
-        }
         let builder = self.builder;
-        let type_ = get_type(self.context, &data.typ)?;
-        let ptr = self.resolver.value(arguments.src)?;
-        self.resolver.with_result(arguments.dst, |dst| {
-            let load = unsafe { LLVMBuildLoad2(builder, type_, ptr, dst) };
-            unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) };
-            load
-        });
+        let needs_cast = !matches!(data.typ, ast::Type::Scalar(_))
+            && !matches!(data.qualifier, ast::LdStQualifier::Weak);
+        let underlying_type = get_type(self.context, &data.typ)?;
+        let op_type = if needs_cast {
+            unsafe { LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8) }
+        } else {
+            underlying_type
+        };
+        let src = self.resolver.value(arguments.src)?;
+        let load = unsafe { LLVMBuildLoad2(builder, op_type, src, LLVM_UNNAMED.as_ptr()) };
+        apply_qualifier(load, data.qualifier)?;
+        unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) };
+        if needs_cast {
+            self.resolver.with_result(arguments.dst, |dst| unsafe {
+                LLVMBuildBitCast(builder, load, underlying_type, dst)
+            });
+        } else {
+            self.resolver.register(arguments.dst, load);
+        }
         Ok(())
     }
 
@@ -761,11 +770,21 @@ impl<'a> MethodEmitContext<'a> {
         arguments: ast::StArgs,
     ) -> Result<(), TranslateError> {
         let ptr = self.resolver.value(arguments.src1)?;
-        let value = self.resolver.value(arguments.src2)?;
-        if data.qualifier != ast::LdStQualifier::Weak {
-            todo!()
+        let needs_cast = !matches!(data.typ, ast::Type::Scalar(_))
+            && !matches!(data.qualifier, ast::LdStQualifier::Weak);
+        let mut value = self.resolver.value(arguments.src2)?;
+        if needs_cast {
+            value = unsafe {
+                LLVMBuildBitCast(
+                    self.builder,
+                    value,
+                    LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8),
+                    LLVM_UNNAMED.as_ptr(),
+                )
+            };
         }
         let store = unsafe { LLVMBuildStore(self.builder, value, ptr) };
+        apply_qualifier(store, data.qualifier)?;
         unsafe {
             LLVMSetAlignment(store, data.typ.layout().align() as u32);
         }
@@ -2237,7 +2256,7 @@ impl<'a> MethodEmitContext<'a> {
     }
 
     fn emit_bar_warp(&mut self) -> Result<(), TranslateError> {
-        self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?;
+        self.emit_intrinsic(c"llvm.amdgcn.wave.barrier", None, None, vec![])?;
         Ok(())
     }
 
@@ -2966,6 +2985,47 @@ impl<'a> MethodEmitContext<'a> {
     */
 }
 
+fn apply_qualifier(
+    value: LLVMValueRef,
+    qualifier: ptx_parser::LdStQualifier,
+) -> Result<(), TranslateError> {
+    match qualifier {
+        ptx_parser::LdStQualifier::Weak => {}
+        ptx_parser::LdStQualifier::Volatile => unsafe {
+            LLVMSetVolatile(value, 1);
+            // The semantics of volatile operations are equivalent to a relaxed memory operation
+            // with system-scope but with the following extra implementation-specific constraints...
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic,
+                get_scope(ast::MemScope::Sys)?,
+            );
+        },
+        ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe {
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic,
+                get_scope(mem_scope)?,
+            );
+        },
+        ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe {
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingAcquire,
+                get_scope(mem_scope)?,
+            );
+        },
+        ptx_parser::LdStQualifier::Release(mem_scope) => unsafe {
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingRelease,
+                get_scope(mem_scope)?,
+            );
+        },
+    }
+    Ok(())
+}
+
 fn get_pointer_type<'ctx>(
     context: LLVMContextRef,
     to_space: ast::StateSpace,
@@ -2979,7 +3039,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> {
         ast::MemScope::Cta => c"workgroup-one-as",
         ast::MemScope::Gpu => c"agent-one-as",
         ast::MemScope::Sys => c"one-as",
-        ast::MemScope::Cluster => todo!(),
+        ast::MemScope::Cluster => return Err(error_todo()),
     }
     .as_ptr())
 }
@@ -2989,7 +3049,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> {
         ast::MemScope::Cta => c"workgroup",
         ast::MemScope::Gpu => c"agent",
         ast::MemScope::Sys => c"system",
-        ast::MemScope::Cluster => todo!(),
+        ast::MemScope::Cluster => return Err(error_todo()),
     }
     .as_ptr())
 }
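Two editor's notes on the emit.rs changes. First, ld/st with a non-weak qualifier now lower to LLVM atomic loads and stores, and LLVM only permits atomic access on integer, float, and pointer types; that is what the `needs_cast` path is for: a non-scalar value such as `.v2.u32` is accessed as an integer of the same width and bitcast back. A host-side analogue of that round trip, illustration only:

```rust
fn main() {
    // Reinterpreting a two-lane vector as one 64-bit integer and back is
    // lossless, which is why emit_ld can load v2.u32 as i64 and bitcast.
    let v: [u32; 2] = [0xdead_beef, 0x0123_4567];
    let as_u64: u64 = unsafe { std::mem::transmute(v) };
    let back: [u32; 2] = unsafe { std::mem::transmute(as_u64) };
    assert_eq!(v, back);
}
```

Second, `apply_qualifier` maps PTX qualifiers onto LLVM orderings one-to-one: weak stays a plain access, volatile becomes a volatile monotonic system-scope access, and relaxed/acquire/release become monotonic/acquire/release at the requested scope.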
+ LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(ast::MemScope::Sys)?, + ); + }, + ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingAcquire, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Release(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingRelease, + get_scope(mem_scope)?, + ); + }, + } + Ok(()) +} + fn get_pointer_type<'ctx>( context: LLVMContextRef, to_space: ast::StateSpace, @@ -2979,7 +3039,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> { ast::MemScope::Cta => c"workgroup-one-as", ast::MemScope::Gpu => c"agent-one-as", ast::MemScope::Sys => c"one-as", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } @@ -2989,7 +3049,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> { ast::MemScope::Cta => c"workgroup", ast::MemScope::Gpu => c"agent", ast::MemScope::Sys => c"system", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index 2e88367..b0614d5 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -139,6 +139,7 @@ enum PtxSpecialRegister { Nctaid, Clock, LanemaskLt, + LanemaskGe, Laneid, } @@ -151,6 +152,7 @@ impl PtxSpecialRegister { Self::Nctaid => "%nctaid", Self::Clock => "%clock", Self::LanemaskLt => "%lanemask_lt", + Self::LanemaskGe => "%lanemask_ge", Self::Laneid => "%laneid", } } @@ -173,6 +175,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => ast::ScalarType::U32, PtxSpecialRegister::Clock => ast::ScalarType::U32, PtxSpecialRegister::LanemaskLt => ast::ScalarType::U32, + PtxSpecialRegister::LanemaskGe => ast::ScalarType::U32, PtxSpecialRegister::Laneid => ast::ScalarType::U32, } } @@ -185,6 +188,7 @@ impl PtxSpecialRegister { | PtxSpecialRegister::Nctaid => Some(ast::ScalarType::U8), PtxSpecialRegister::Clock | PtxSpecialRegister::LanemaskLt + | PtxSpecialRegister::LanemaskGe | PtxSpecialRegister::Laneid => None, } } @@ -197,6 +201,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => "sreg_nctaid", PtxSpecialRegister::Clock => "sreg_clock", PtxSpecialRegister::LanemaskLt => "sreg_lanemask_lt", + PtxSpecialRegister::LanemaskGe => "sreg_lanemask_ge", PtxSpecialRegister::Laneid => "sreg_laneid", } } diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 90afb51..ee1b557 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,22 +1,33 @@ +use cuda_types::cuda::CUfunction_attribute; use hip_runtime_sys::*; +use std::mem; pub(crate) fn get_attribute( pi: &mut i32, - cu_attrib: hipFunction_attribute, + cu_attrib: CUfunction_attribute, func: hipFunction_t, ) -> hipError_t { // TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION // TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION match cu_attrib { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION - | hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { *pi = 120; return Ok(()); } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + | 
diff --git a/zluda/src/impl/kernel.rs b/zluda/src/impl/kernel.rs
index ab45b04..e4c3404 100644
--- a/zluda/src/impl/kernel.rs
+++ b/zluda/src/impl/kernel.rs
@@ -1,4 +1,4 @@
-use cuda_types::cuda::CUresult;
+use cuda_types::cuda::{CUfunction_attribute, CUresult};
 use hip_runtime_sys::*;
 
 use crate::r#impl::function;
@@ -9,7 +9,7 @@ pub(crate) unsafe fn get_function(func: &mut hipFunction_t, kernel: hipFunction_
 }
 
 pub(crate) unsafe fn set_attribute(
-    attrib: hipFunction_attribute,
+    attrib: CUfunction_attribute,
     val: ::core::ffi::c_int,
     kernel: hipFunction_t,
     _dev: hipDevice_t,
diff --git a/zluda_common/src/lib.rs b/zluda_common/src/lib.rs
index 4f8aef7..4c76ef1 100644
--- a/zluda_common/src/lib.rs
+++ b/zluda_common/src/lib.rs
@@ -173,12 +173,12 @@ from_cuda_nop!(
     cublasLtMatmulDescAttributes_t,
     CUmemAllocationGranularity_flags,
     CUmemAllocationProp,
-    CUresult
+    CUresult,
+    CUfunction_attribute
 );
 from_cuda_transmute!(
     CUuuid => hipUUID,
     CUfunction => hipFunction_t,
-    CUfunction_attribute => hipFunction_attribute,
     CUstream => hipStream_t,
     CUpointer_attribute => hipPointer_attribute,
     CUdeviceptr_v2 => hipDeviceptr_t,
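With `CUfunction_attribute` moved from `from_cuda_transmute!` to `from_cuda_nop!`, the attribute now crosses the API boundary untouched and the one remaining CUDA-to-HIP conversion lives inside `function::get_attribute`. A caller-side sketch of the new signature (hypothetical in-crate helper; assumes ZLUDA's `Result`-style `hipError_t`, so `?` propagates errors):

```rust
use cuda_types::cuda::CUfunction_attribute;
use hip_runtime_sys::*;

use crate::r#impl::function;

// Hypothetical helper: query the register count of a compiled kernel.
fn num_regs(func: hipFunction_t, regs: &mut i32) -> hipError_t {
    function::get_attribute(
        regs,
        CUfunction_attribute::CU_FUNC_ATTRIBUTE_NUM_REGS,
        func,
    )?;
    Ok(()) // *regs is now at least 1: get_attribute clamps NUM_REGS
}
```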