diff --git a/llvm_zluda/src/lib.cpp b/llvm_zluda/src/lib.cpp
index c8ac2d7..1151330 100644
--- a/llvm_zluda/src/lib.cpp
+++ b/llvm_zluda/src/lib.cpp
@@ -196,4 +196,24 @@ void LLVMZludaBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering,
         Name);
 }
 
+void LLVMZludaSetAtomic(
+    LLVMValueRef AtomicInst,
+    LLVMAtomicOrdering Ordering,
+    char * SSID)
+{
+    auto inst = unwrap(AtomicInst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(inst))
+    {
+        LI->setAtomic(mapFromLLVMOrdering(Ordering), LI->getContext().getOrInsertSyncScopeID(SSID));
+    }
+    else if (StoreInst *SI = dyn_cast<StoreInst>(inst))
+    {
+        SI->setAtomic(mapFromLLVMOrdering(Ordering), SI->getContext().getOrInsertSyncScopeID(SSID));
+    }
+    else
+    {
+        llvm_unreachable("Invalid instruction type for LLVMZludaSetAtomic");
+    }
+}
+
 LLVM_C_EXTERN_C_END
\ No newline at end of file
diff --git a/llvm_zluda/src/lib.rs b/llvm_zluda/src/lib.rs
index 18046a5..37b1d97 100644
--- a/llvm_zluda/src/lib.rs
+++ b/llvm_zluda/src/lib.rs
@@ -78,4 +78,10 @@ extern "C" {
         scope: *const i8,
         Name: *const i8,
     ) -> LLVMValueRef;
+
+    pub fn LLVMZludaSetAtomic(
+        AtomicInst: LLVMValueRef,
+        Ordering: LLVMAtomicOrdering,
+        SSID: *const i8,
+    );
 }
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index bc375c3..1bc9856 100644
Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp
index 6174ec1..74384ac 100644
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@@ -17,6 +17,7 @@ typedef _Float16 half16 __attribute__((ext_vector_type(16)));
 typedef float float8 __attribute__((ext_vector_type(8)));
 
 #define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
+#define FUNC_CALL(NAME) __zluda_ptx_impl_##NAME
 #define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME
 #define DECLARE_ATTR(TYPE, NAME) \
     extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \
@@ -58,6 +59,18 @@ extern "C"
         return __lane_id();
     }
 
+    uint32_t FUNC(sreg_lanemask_lt)()
+    {
+        uint32_t lane_idx = FUNC_CALL(sreg_laneid)();
+        return (1U << lane_idx) - 1U;
+    }
+
+    uint32_t FUNC(sreg_lanemask_ge)()
+    {
+        uint32_t lane_idx = FUNC_CALL(sreg_laneid)();
+        return (~0U) << lane_idx;
+    }
+
     uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__;
     uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
     {
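A quick sanity check of the lanemask arithmetic above (editor's illustration, not part of the patch; lane 5 is an arbitrary example). Note that both helpers assume the 32-bit lane masks PTX defines, i.e. a wave32 execution mode; a 64-wide wavefront would need 64-bit shifts.

```rust
fn main() {
    let lane_idx: u32 = 5;
    let lt = (1u32 << lane_idx) - 1; // %lanemask_lt: lanes 0..=4 set
    let ge = !0u32 << lane_idx; // %lanemask_ge: lanes 5..=31 set
    assert_eq!(lt, 0x0000_001f);
    assert_eq!(ge, 0xffff_ffe0);
    assert_eq!(lt & ge, 0); // disjoint...
    assert_eq!(lt | ge, u32::MAX); // ...and together they cover every lane
}
```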
diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs
index c27f1aa..3d2c154 100644
--- a/ptx/src/pass/llvm/emit.rs
+++ b/ptx/src/pass/llvm/emit.rs
@@ -542,17 +542,26 @@ impl<'a> MethodEmitContext<'a> {
         data: ast::LdDetails,
         arguments: ast::LdArgs,
     ) -> Result<(), TranslateError> {
-        if data.qualifier != ast::LdStQualifier::Weak {
-            todo!()
-        }
         let builder = self.builder;
-        let type_ = get_type(self.context, &data.typ)?;
-        let ptr = self.resolver.value(arguments.src)?;
-        self.resolver.with_result(arguments.dst, |dst| {
-            let load = unsafe { LLVMBuildLoad2(builder, type_, ptr, dst) };
-            unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) };
-            load
-        });
+        let needs_cast = !matches!(data.typ, ast::Type::Scalar(_))
+            && !matches!(data.qualifier, ast::LdStQualifier::Weak);
+        let underlying_type = get_type(self.context, &data.typ)?;
+        let op_type = if needs_cast {
+            unsafe { LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8) }
+        } else {
+            underlying_type
+        };
+        let src = self.resolver.value(arguments.src)?;
+        let load = unsafe { LLVMBuildLoad2(builder, op_type, src, LLVM_UNNAMED.as_ptr()) };
+        apply_qualifier(load, data.qualifier)?;
+        unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) };
+        if needs_cast {
+            self.resolver.with_result(arguments.dst, |dst| unsafe {
+                LLVMBuildBitCast(builder, load, underlying_type, dst)
+            });
+        } else {
+            self.resolver.register(arguments.dst, load);
+        }
         Ok(())
     }
 
@@ -761,11 +770,21 @@ impl<'a> MethodEmitContext<'a> {
         arguments: ast::StArgs,
     ) -> Result<(), TranslateError> {
         let ptr = self.resolver.value(arguments.src1)?;
-        let value = self.resolver.value(arguments.src2)?;
-        if data.qualifier != ast::LdStQualifier::Weak {
-            todo!()
+        let needs_cast = !matches!(data.typ, ast::Type::Scalar(_))
+            && !matches!(data.qualifier, ast::LdStQualifier::Weak);
+        let mut value = self.resolver.value(arguments.src2)?;
+        if needs_cast {
+            value = unsafe {
+                LLVMBuildBitCast(
+                    self.builder,
+                    value,
+                    LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8),
+                    LLVM_UNNAMED.as_ptr(),
+                )
+            };
         }
         let store = unsafe { LLVMBuildStore(self.builder, value, ptr) };
+        apply_qualifier(store, data.qualifier)?;
         unsafe {
             LLVMSetAlignment(store, data.typ.layout().align() as u32);
         }
@@ -2237,7 +2256,7 @@ impl<'a> MethodEmitContext<'a> {
     }
 
     fn emit_bar_warp(&mut self) -> Result<(), TranslateError> {
-        self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?;
+        self.emit_intrinsic(c"llvm.amdgcn.wave.barrier", None, None, vec![])?;
         Ok(())
     }
 
@@ -2966,6 +2985,47 @@ impl<'a> MethodEmitContext<'a> {
     */
 }
 
+fn apply_qualifier(
+    value: LLVMValueRef,
+    qualifier: ptx_parser::LdStQualifier,
+) -> Result<(), TranslateError> {
+    match qualifier {
+        ptx_parser::LdStQualifier::Weak => {}
+        ptx_parser::LdStQualifier::Volatile => unsafe {
+            LLVMSetVolatile(value, 1);
+            // The semantics of volatile operations are equivalent to a relaxed memory operation
+            // with system-scope but with the following extra implementation-specific constraints...
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic,
+                get_scope(ast::MemScope::Sys)?,
+            );
+        },
+        ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe {
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic,
+                get_scope(mem_scope)?,
+            );
+        },
+        ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe {
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingAcquire,
+                get_scope(mem_scope)?,
+            );
+        },
+        ptx_parser::LdStQualifier::Release(mem_scope) => unsafe {
+            LLVMZludaSetAtomic(
+                value,
+                LLVMAtomicOrdering::LLVMAtomicOrderingRelease,
+                get_scope(mem_scope)?,
+            );
+        },
+    }
+    Ok(())
+}
+
 fn get_pointer_type<'ctx>(
     context: LLVMContextRef,
     to_space: ast::StateSpace,
@@ -2979,7 +3039,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> {
         ast::MemScope::Cta => c"workgroup-one-as",
         ast::MemScope::Gpu => c"agent-one-as",
         ast::MemScope::Sys => c"one-as",
-        ast::MemScope::Cluster => todo!(),
+        ast::MemScope::Cluster => return Err(error_todo()),
     }
     .as_ptr())
 }
@@ -2989,7 +3049,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> {
         ast::MemScope::Cta => c"workgroup",
         ast::MemScope::Gpu => c"agent",
         ast::MemScope::Sys => c"system",
-        ast::MemScope::Cluster => todo!(),
+        ast::MemScope::Cluster => return Err(error_todo()),
     }
     .as_ptr())
 }
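Two editor's notes on the emit.rs changes. First, ld/st with a non-weak qualifier now lower to LLVM atomic loads and stores, and LLVM only permits atomic access on integer, float, and pointer types; that is what the `needs_cast` path is for: a non-scalar value such as `.v2.u32` is accessed as an integer of the same width and bitcast back. A host-side analogue of that round trip, illustration only:

```rust
fn main() {
    // Reinterpreting a two-lane vector as one 64-bit integer and back is
    // lossless, which is why emit_ld can load v2.u32 as i64 and bitcast.
    let v: [u32; 2] = [0xdead_beef, 0x0123_4567];
    let as_u64: u64 = unsafe { std::mem::transmute(v) };
    let back: [u32; 2] = unsafe { std::mem::transmute(as_u64) };
    assert_eq!(v, back);
}
```

Second, `apply_qualifier` maps PTX qualifiers onto LLVM orderings one-to-one: weak stays a plain access, volatile becomes a volatile monotonic system-scope access, and relaxed/acquire/release become monotonic/acquire/release at the requested scope.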
+ LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(ast::MemScope::Sys)?, + ); + }, + ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingAcquire, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Release(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingRelease, + get_scope(mem_scope)?, + ); + }, + } + Ok(()) +} + fn get_pointer_type<'ctx>( context: LLVMContextRef, to_space: ast::StateSpace, @@ -2979,7 +3039,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> { ast::MemScope::Cta => c"workgroup-one-as", ast::MemScope::Gpu => c"agent-one-as", ast::MemScope::Sys => c"one-as", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } @@ -2989,7 +3049,7 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> { ast::MemScope::Cta => c"workgroup", ast::MemScope::Gpu => c"agent", ast::MemScope::Sys => c"system", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index 2e88367..b0614d5 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -139,6 +139,7 @@ enum PtxSpecialRegister { Nctaid, Clock, LanemaskLt, + LanemaskGe, Laneid, } @@ -151,6 +152,7 @@ impl PtxSpecialRegister { Self::Nctaid => "%nctaid", Self::Clock => "%clock", Self::LanemaskLt => "%lanemask_lt", + Self::LanemaskGe => "%lanemask_ge", Self::Laneid => "%laneid", } } @@ -173,6 +175,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => ast::ScalarType::U32, PtxSpecialRegister::Clock => ast::ScalarType::U32, PtxSpecialRegister::LanemaskLt => ast::ScalarType::U32, + PtxSpecialRegister::LanemaskGe => ast::ScalarType::U32, PtxSpecialRegister::Laneid => ast::ScalarType::U32, } } @@ -185,6 +188,7 @@ impl PtxSpecialRegister { | PtxSpecialRegister::Nctaid => Some(ast::ScalarType::U8), PtxSpecialRegister::Clock | PtxSpecialRegister::LanemaskLt + | PtxSpecialRegister::LanemaskGe | PtxSpecialRegister::Laneid => None, } } @@ -197,6 +201,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => "sreg_nctaid", PtxSpecialRegister::Clock => "sreg_clock", PtxSpecialRegister::LanemaskLt => "sreg_lanemask_lt", + PtxSpecialRegister::LanemaskGe => "sreg_lanemask_ge", PtxSpecialRegister::Laneid => "sreg_laneid", } } diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 90afb51..ee1b557 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,22 +1,33 @@ +use cuda_types::cuda::CUfunction_attribute; use hip_runtime_sys::*; +use std::mem; pub(crate) fn get_attribute( pi: &mut i32, - cu_attrib: hipFunction_attribute, + cu_attrib: CUfunction_attribute, func: hipFunction_t, ) -> hipError_t { // TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION // TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION match cu_attrib { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION - | hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { *pi = 120; return Ok(()); } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + | 
diff --git a/zluda/src/impl/kernel.rs b/zluda/src/impl/kernel.rs
index ab45b04..e4c3404 100644
--- a/zluda/src/impl/kernel.rs
+++ b/zluda/src/impl/kernel.rs
@@ -1,4 +1,4 @@
-use cuda_types::cuda::CUresult;
+use cuda_types::cuda::{CUfunction_attribute, CUresult};
 use hip_runtime_sys::*;
 
 use crate::r#impl::function;
@@ -9,7 +9,7 @@ pub(crate) unsafe fn get_function(func: &mut hipFunction_t, kernel: hipFunction_
 }
 
 pub(crate) unsafe fn set_attribute(
-    attrib: hipFunction_attribute,
+    attrib: CUfunction_attribute,
     val: ::core::ffi::c_int,
     kernel: hipFunction_t,
     _dev: hipDevice_t,
diff --git a/zluda_common/src/lib.rs b/zluda_common/src/lib.rs
index 4f8aef7..4c76ef1 100644
--- a/zluda_common/src/lib.rs
+++ b/zluda_common/src/lib.rs
@@ -173,12 +173,12 @@ from_cuda_nop!(
     cublasLtMatmulDescAttributes_t,
     CUmemAllocationGranularity_flags,
     CUmemAllocationProp,
-    CUresult
+    CUresult,
+    CUfunction_attribute
 );
 from_cuda_transmute!(
     CUuuid => hipUUID,
     CUfunction => hipFunction_t,
-    CUfunction_attribute => hipFunction_attribute,
     CUstream => hipStream_t,
     CUpointer_attribute => hipPointer_attribute,
     CUdeviceptr_v2 => hipDeviceptr_t,
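With `CUfunction_attribute` moved from `from_cuda_transmute!` to `from_cuda_nop!`, the attribute now crosses the API boundary untouched and the one remaining CUDA-to-HIP conversion lives inside `function::get_attribute`. A caller-side sketch of the new signature (hypothetical in-crate helper; assumes ZLUDA's `Result`-style `hipError_t`, so `?` propagates errors):

```rust
use cuda_types::cuda::CUfunction_attribute;
use hip_runtime_sys::*;

use crate::r#impl::function;

// Hypothetical helper: query the register count of a compiled kernel.
fn num_regs(func: hipFunction_t, regs: &mut i32) -> hipError_t {
    function::get_attribute(
        regs,
        CUfunction_attribute::CU_FUNC_ATTRIBUTE_NUM_REGS,
        func,
    )?;
    Ok(()) // *regs is now at least 1: get_attribute clamps NUM_REGS
}
```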