diff --git a/.github/workflows/pr_master.yml b/.github/workflows/pr_master.yml index 8787c81..c2ceb6d 100644 --- a/.github/workflows/pr_master.yml +++ b/.github/workflows/pr_master.yml @@ -24,15 +24,7 @@ jobs: name: Build (Linux) runs-on: ubuntu-22.04 steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 with: submodules: true @@ -79,15 +71,7 @@ jobs: outputs: test_package: ${{ steps.upload_artifacts.outputs.artifact-id }} steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/push_master.yml b/.github/workflows/push_master.yml index 378fefe..1fdee90 100644 --- a/.github/workflows/push_master.yml +++ b/.github/workflows/push_master.yml @@ -18,15 +18,7 @@ jobs: permissions: contents: write steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 # fetch-depth and fetch-tags are required to properly tag pre-release builds with: @@ -117,15 +109,7 @@ jobs: outputs: test_package: ${{ steps.upload_artifacts.outputs.artifact-id }} steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 with: submodules: true diff --git a/Cargo.lock b/Cargo.lock index cfe4cff..78ed7bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -420,7 +420,7 @@ version = "0.0.0" dependencies = [ "proc-macro2", "quote", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "syn 2.0.89", ] @@ -3706,7 +3706,7 @@ dependencies = [ "paste", "ptx", "ptx_parser", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "serde", "serde_json", "tempfile", @@ -3726,7 +3726,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "syn 2.0.89", ] @@ -3854,7 +3854,7 @@ dependencies = [ "ptx", "ptx_parser", "regex", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "unwrap_or", "wchar", "winapi", diff --git a/comgr/src/lib.rs b/comgr/src/lib.rs index 8546203..9e36ab6 100644 --- a/comgr/src/lib.rs +++ b/comgr/src/lib.rs @@ -219,6 +219,12 @@ pub fn compile_bitcode( compile_to_exec.set_isa_name(gcn_arch)?; compile_to_exec.set_language(Language::LlvmIr)?; let common_options = [ + // Uncomment for LLVM debug + //c"-mllvm", + //c"-debug", + // Uncomment to save passes + // c"-mllvm", + // c"-print-before-all", c"-mllvm", c"-ignore-tti-inline-compatible", // c"-mllvm", diff --git a/cuda_macros/Cargo.toml b/cuda_macros/Cargo.toml index cfefc62..aa4e377 100644 --- a/cuda_macros/Cargo.toml +++ b/cuda_macros/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" quote = "1.0" syn = { version = "2.0", features = ["full", "visit-mut", "extra-traits"] } proc-macro2 = "1.0" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" [lib] proc-macro = true diff --git a/llvm_zluda/src/lib.cpp b/llvm_zluda/src/lib.cpp index c8ac2d7..1151330 100644 --- a/llvm_zluda/src/lib.cpp +++ b/llvm_zluda/src/lib.cpp @@ -196,4 +196,24 @@ void LLVMZludaBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering, Name); } +void LLVMZludaSetAtomic( + LLVMValueRef AtomicInst, + LLVMAtomicOrdering Ordering, + char * SSID) +{ + auto inst = unwrap(AtomicInst); + if (LoadInst *LI = dyn_cast(inst)) + { + LI->setAtomic(mapFromLLVMOrdering(Ordering), LI->getContext().getOrInsertSyncScopeID(SSID)); + } + else if (StoreInst *SI = dyn_cast(inst)) + { + SI->setAtomic(mapFromLLVMOrdering(Ordering), SI->getContext().getOrInsertSyncScopeID(SSID)); + } + else + { + llvm_unreachable("Invalid instruction type for LLVMZludaSetAtomic"); + } +} + LLVM_C_EXTERN_C_END \ No newline at end of file diff --git a/llvm_zluda/src/lib.rs b/llvm_zluda/src/lib.rs index 18046a5..37b1d97 100644 --- a/llvm_zluda/src/lib.rs +++ b/llvm_zluda/src/lib.rs @@ -78,4 +78,10 @@ extern "C" { scope: *const i8, Name: *const i8, ) -> LLVMValueRef; + + pub fn LLVMZludaSetAtomic( + AtomicInst: LLVMValueRef, + Ordering: LLVMAtomicOrdering, + SSID: *const i8, + ); } diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index afc9c2c..fef0853 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index f247f45..c378d78 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -17,6 +17,7 @@ typedef _Float16 half16 __attribute__((ext_vector_type(16))); typedef float float8 __attribute__((ext_vector_type(8))); #define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME +#define FUNC_CALL(NAME) __zluda_ptx_impl_##NAME #define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME #define DECLARE_ATTR(TYPE, NAME) \ extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \ @@ -58,6 +59,18 @@ extern "C" return __lane_id(); } + uint32_t FUNC(sreg_lanemask_lt)() + { + uint32_t lane_idx = FUNC_CALL(sreg_laneid)(); + return (1U << lane_idx) - 1U; + } + + uint32_t FUNC(sreg_lanemask_ge)() + { + uint32_t lane_idx = FUNC_CALL(sreg_laneid)(); + return (~0U) << lane_idx; + } + uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__; uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32) { diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 0a68f8b..2484440 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -539,17 +539,25 @@ impl<'a> MethodEmitContext<'a> { data: ast::LdDetails, arguments: ast::LdArgs, ) -> Result<(), TranslateError> { - if data.qualifier != ast::LdStQualifier::Weak { - todo!() - } let builder = self.builder; - let type_ = get_type(self.context, &data.typ)?; - let ptr = self.resolver.value(arguments.src)?; - self.resolver.with_result(arguments.dst, |dst| { - let load = unsafe { LLVMBuildLoad2(builder, type_, ptr, dst) }; - unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) }; - load - }); + let underlying_type = get_type(self.context, &data.typ)?; + let needs_cast = not_supported_by_atomics(data.qualifier, underlying_type); + let op_type = if needs_cast { + unsafe { LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8) } + } else { + underlying_type + }; + let src = self.resolver.value(arguments.src)?; + let load = unsafe { LLVMBuildLoad2(builder, op_type, src, LLVM_UNNAMED.as_ptr()) }; + apply_qualifier(load, data.qualifier)?; + unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) }; + if needs_cast { + self.resolver.with_result(arguments.dst, |dst| unsafe { + LLVMBuildBitCast(builder, load, underlying_type, dst) + }); + } else { + self.resolver.register(arguments.dst, load); + } Ok(()) } @@ -758,11 +766,21 @@ impl<'a> MethodEmitContext<'a> { arguments: ast::StArgs, ) -> Result<(), TranslateError> { let ptr = self.resolver.value(arguments.src1)?; - let value = self.resolver.value(arguments.src2)?; - if data.qualifier != ast::LdStQualifier::Weak { - todo!() + let underlying_type = get_type(self.context, &data.typ)?; + let needs_cast = not_supported_by_atomics(data.qualifier, underlying_type); + let mut value = self.resolver.value(arguments.src2)?; + if needs_cast { + value = unsafe { + LLVMBuildBitCast( + self.builder, + value, + LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8), + LLVM_UNNAMED.as_ptr(), + ) + }; } let store = unsafe { LLVMBuildStore(self.builder, value, ptr) }; + apply_qualifier(store, data.qualifier)?; unsafe { LLVMSetAlignment(store, data.typ.layout().align() as u32); } @@ -1653,25 +1671,23 @@ impl<'a> MethodEmitContext<'a> { .ok_or_else(|| error_mismatched_type())?, ); let src2 = self.resolver.value(src2)?; - self.resolver.with_result(arguments.dst, |dst| { - let vec = unsafe { - LLVMBuildInsertElement( - self.builder, - LLVMGetPoison(dst_type), - llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), - LLVM_UNNAMED.as_ptr(), - ) - }; - unsafe { - LLVMBuildInsertElement( - self.builder, - vec, - llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), - dst, - ) - } + let vec = unsafe { + LLVMBuildInsertElement( + self.builder, + LLVMGetPoison(dst_type), + llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), + LLVM_UNNAMED.as_ptr(), + ) + }; + self.resolver.with_result(arguments.dst, |dst| unsafe { + LLVMBuildInsertElement( + self.builder, + vec, + llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), + dst, + ) }) } else { self.resolver.with_result(arguments.dst, |dst| unsafe { @@ -2197,7 +2213,7 @@ impl<'a> MethodEmitContext<'a> { Some(&ast::ScalarType::F32.into()), vec![( self.resolver.value(arguments.src)?, - get_scalar_type(self.context, ast::ScalarType::F32.into()), + get_scalar_type(self.context, ast::ScalarType::F32), )], )?; Ok(()) @@ -2236,7 +2252,7 @@ impl<'a> MethodEmitContext<'a> { } fn emit_bar_warp(&mut self) -> Result<(), TranslateError> { - self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?; + self.emit_intrinsic(c"llvm.amdgcn.wave.barrier", None, None, vec![])?; Ok(()) } @@ -2658,14 +2674,14 @@ impl<'a> MethodEmitContext<'a> { let load = unsafe { LLVMBuildLoad2(self.builder, from_type, from, LLVM_UNNAMED.as_ptr()) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(load, cp_size.as_u64() as u32); } let extended = unsafe { LLVMBuildZExt(self.builder, load, to_type, LLVM_UNNAMED.as_ptr()) }; - unsafe { LLVMBuildStore(self.builder, extended, to) }; + let store = unsafe { LLVMBuildStore(self.builder, extended, to) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(store, cp_size.as_u64() as u32); } Ok(()) } @@ -2923,6 +2939,61 @@ impl<'a> MethodEmitContext<'a> { */ } +fn not_supported_by_atomics(qualifier: ast::LdStQualifier, underlying_type: *mut LLVMType) -> bool { + // This is not meant to be 100% accurate, just a best-effort guess for atomics + fn is_non_scalar_type(type_: LLVMTypeRef) -> bool { + let kind = unsafe { LLVMGetTypeKind(type_) }; + matches!( + kind, + LLVMTypeKind::LLVMArrayTypeKind + | LLVMTypeKind::LLVMVectorTypeKind + | LLVMTypeKind::LLVMStructTypeKind + ) + } + !matches!(qualifier, ast::LdStQualifier::Weak) && is_non_scalar_type(underlying_type) +} + +fn apply_qualifier( + value: LLVMValueRef, + qualifier: ptx_parser::LdStQualifier, +) -> Result<(), TranslateError> { + match qualifier { + ptx_parser::LdStQualifier::Weak => {} + ptx_parser::LdStQualifier::Volatile => unsafe { + LLVMSetVolatile(value, 1); + // The semantics of volatile operations are equivalent to a relaxed memory operation + // with system-scope but with the following extra implementation-specific constraints... + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(ast::MemScope::Sys)?, + ); + }, + ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingAcquire, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Release(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingRelease, + get_scope(mem_scope)?, + ); + }, + } + Ok(()) +} + fn get_pointer_type<'ctx>( context: LLVMContextRef, to_space: ast::StateSpace, @@ -2936,7 +3007,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> { ast::MemScope::Cta => c"workgroup-one-as", ast::MemScope::Gpu => c"agent-one-as", ast::MemScope::Sys => c"one-as", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } @@ -2945,8 +3016,9 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> { Ok(match scope { ast::MemScope::Cta => c"workgroup", ast::MemScope::Gpu => c"agent", + // Don't change to "system", this is the same as __threadfence_system, AMDPGU LLVM expects "" here ast::MemScope::Sys => c"", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index b14903d..e4b5b27 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -136,6 +136,7 @@ enum PtxSpecialRegister { Nctaid, Clock, LanemaskLt, + LanemaskGe, Laneid, } @@ -148,6 +149,7 @@ impl PtxSpecialRegister { Self::Nctaid => "%nctaid", Self::Clock => "%clock", Self::LanemaskLt => "%lanemask_lt", + Self::LanemaskGe => "%lanemask_ge", Self::Laneid => "%laneid", } } @@ -170,6 +172,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => ast::ScalarType::U32, PtxSpecialRegister::Clock => ast::ScalarType::U32, PtxSpecialRegister::LanemaskLt => ast::ScalarType::U32, + PtxSpecialRegister::LanemaskGe => ast::ScalarType::U32, PtxSpecialRegister::Laneid => ast::ScalarType::U32, } } @@ -182,6 +185,7 @@ impl PtxSpecialRegister { | PtxSpecialRegister::Nctaid => Some(ast::ScalarType::U8), PtxSpecialRegister::Clock | PtxSpecialRegister::LanemaskLt + | PtxSpecialRegister::LanemaskGe | PtxSpecialRegister::Laneid => None, } } @@ -194,6 +198,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => "sreg_nctaid", PtxSpecialRegister::Clock => "sreg_clock", PtxSpecialRegister::LanemaskLt => "sreg_lanemask_lt", + PtxSpecialRegister::LanemaskGe => "sreg_lanemask_ge", PtxSpecialRegister::Laneid => "sreg_laneid", } } diff --git a/ptx/src/test/spirv_run/atomics_128.ptx b/ptx/src/test/spirv_run/atomics_128.ptx new file mode 100644 index 0000000..147d350 --- /dev/null +++ b/ptx/src/test/spirv_run/atomics_128.ptx @@ -0,0 +1,24 @@ +.version 7.0 +.target sm_80 +.address_size 64 + +.visible .entry atomics_128( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp1; + .reg .u64 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.acquire.gpu.v2.u64 {temp1, temp2}, [in_addr]; + add.u64 temp1, temp1, 1; + add.u64 temp2, temp2, 1; + st.release.gpu.v2.u64 [out_addr], {temp1, temp2}; + + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index c24ca1a..a7f1989 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -352,6 +352,12 @@ test_ptx!( [613065134u32] ); test_ptx!(param_is_addressable, [0xDEAD], [0u64]); +// TODO: re-enable when we have a patched LLVM +//test_ptx!( +// atomics_128, +// [0xce16728dead1ceb0u64, 0xe7728e3c390b7fb7], +// [0xce16728dead1ceb1u64, 0xe7728e3c390b7fb8] +//); test_ptx!(assertfail); // TODO: not yet supported diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 26ae5e9..ead37d4 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -226,8 +226,9 @@ fn int_immediate<'a, 'input>(input: &mut PtxParser<'a, 'input>) -> PResult Ok(ast::ImmediateValue::S64(-x)), + let full_number = format!("-{num}"); + match i64::from_str_radix(&full_number, radix) { + Ok(x) => Ok(ast::ImmediateValue::S64(x)), Err(err) => Err((ast::ImmediateValue::S64(0), PtxError::from(err))), } } else if is_unsigned { diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml index d0a65f4..1060e2b 100644 --- a/zluda/Cargo.toml +++ b/zluda/Cargo.toml @@ -22,7 +22,7 @@ num_enum = "0.4" lz4-sys = "1.9" tempfile = "3" paste = "1.0" -rustc-hash = "1.1" +rustc-hash = "2.0.0" zluda_common = { path = "../zluda_common" } blake3 = "1.8.2" serde = "1.0.219" diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 90afb51..ee1b557 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,22 +1,33 @@ +use cuda_types::cuda::CUfunction_attribute; use hip_runtime_sys::*; +use std::mem; pub(crate) fn get_attribute( pi: &mut i32, - cu_attrib: hipFunction_attribute, + cu_attrib: CUfunction_attribute, func: hipFunction_t, ) -> hipError_t { // TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION // TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION match cu_attrib { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION - | hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { *pi = 120; return Ok(()); } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => { + *pi = 0; + return Ok(()); + } _ => {} } - unsafe { hipFuncGetAttribute(pi, cu_attrib, func) }?; - if cu_attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS { + unsafe { hipFuncGetAttribute(pi, mem::transmute(cu_attrib), func) }?; + if cu_attrib == CUfunction_attribute::CU_FUNC_ATTRIBUTE_NUM_REGS { *pi = (*pi).max(1); } Ok(()) @@ -55,12 +66,12 @@ pub(crate) fn launch_kernel( pub(crate) unsafe fn set_attribute( func: hipFunction_t, - attribute: hipFunction_attribute, + attribute: CUfunction_attribute, value: i32, ) -> hipError_t { match attribute { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION - | hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { return hipError_t::ErrorNotSupported; } _ => {} diff --git a/zluda/src/impl/kernel.rs b/zluda/src/impl/kernel.rs index ab45b04..e4c3404 100644 --- a/zluda/src/impl/kernel.rs +++ b/zluda/src/impl/kernel.rs @@ -1,4 +1,4 @@ -use cuda_types::cuda::CUresult; +use cuda_types::cuda::{CUfunction_attribute, CUresult}; use hip_runtime_sys::*; use crate::r#impl::function; @@ -9,7 +9,7 @@ pub(crate) unsafe fn get_function(func: &mut hipFunction_t, kernel: hipFunction_ } pub(crate) unsafe fn set_attribute( - attrib: hipFunction_attribute, + attrib: CUfunction_attribute, val: ::core::ffi::c_int, kernel: hipFunction_t, _dev: hipDevice_t, diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 70395ed..4b33460 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,16 +1,18 @@ -use std::ptr; - +use crate::r#impl::{context, driver}; use cuda_types::cuda::{CUerror, CUresult, CUresultConsts}; use hip_runtime_sys::*; +use std::{mem, ptr}; -use crate::r#impl::{context, driver}; - -pub(crate) fn alloc_v2(dptr: &mut hipDeviceptr_t, bytesize: usize) -> CUresult { +pub(crate) unsafe fn alloc_v2(dptr: &mut hipDeviceptr_t, bytesize: usize) -> CUresult { let context = context::get_current_context()?; - unsafe { hipMalloc(ptr::from_mut(dptr).cast(), bytesize) }?; + hipMalloc(ptr::from_mut(dptr).cast(), bytesize)?; add_allocation(dptr.0, bytesize, context)?; + let mut status = mem::zeroed(); + hipStreamIsCapturing(hipStream_t(ptr::null_mut()), &mut status)?; // TODO: parametrize for non-Geekbench - unsafe { hipMemsetD8(*dptr, 0, bytesize) }?; + if status != hipStreamCaptureStatus::hipStreamCaptureStatusNone { + hipMemsetD8(*dptr, 0, bytesize)?; + } Ok(()) } diff --git a/zluda_bindgen/Cargo.toml b/zluda_bindgen/Cargo.toml index 5753307..8e7bb4d 100644 --- a/zluda_bindgen/Cargo.toml +++ b/zluda_bindgen/Cargo.toml @@ -9,6 +9,6 @@ syn = { version = "2.0", features = ["full", "visit-mut"] } proc-macro2 = "1.0.89" quote = "1.0" prettyplease = "0.2.25" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" libloading = "0.8" cuda_types = { path = "../cuda_types" } diff --git a/zluda_common/src/lib.rs b/zluda_common/src/lib.rs index 4f8aef7..4c76ef1 100644 --- a/zluda_common/src/lib.rs +++ b/zluda_common/src/lib.rs @@ -173,12 +173,12 @@ from_cuda_nop!( cublasLtMatmulDescAttributes_t, CUmemAllocationGranularity_flags, CUmemAllocationProp, - CUresult + CUresult, + CUfunction_attribute ); from_cuda_transmute!( CUuuid => hipUUID, CUfunction => hipFunction_t, - CUfunction_attribute => hipFunction_attribute, CUstream => hipStream_t, CUpointer_attribute => hipPointer_attribute, CUdeviceptr_v2 => hipDeviceptr_t, diff --git a/zluda_trace/Cargo.toml b/zluda_trace/Cargo.toml index a6c4120..0925c1a 100644 --- a/zluda_trace/Cargo.toml +++ b/zluda_trace/Cargo.toml @@ -24,7 +24,7 @@ paste = "1.0" cuda_macros = { path = "../cuda_macros" } cuda_types = { path = "../cuda_types" } parking_lot = "0.12.3" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" cglue = "0.3.5" zstd-safe = { version = "7.2.4", features = ["std"] } unwrap_or = "1.0.1"