diff --git a/Utilities/BEType.h b/Utilities/BEType.h index 79f09e2495..14fc479a04 100644 --- a/Utilities/BEType.h +++ b/Utilities/BEType.h @@ -165,13 +165,6 @@ union u128 } _bit; - //operator u64() const { return _u64[0]; } - //operator u32() const { return _u32[0]; } - //operator u16() const { return _u16[0]; } - //operator u8() const { return _u8[0]; } - - //operator bool() const { return _u64[0] != 0 || _u64[1] != 0; } - static u128 from64(u64 _0, u64 _1 = 0) { u128 ret; @@ -443,7 +436,7 @@ static force_inline u128 sync_fetch_and_xor(volatile u128* dest, u128 value) } } -template struct se_t; +template struct se_t; template struct se_t { @@ -501,16 +494,13 @@ template struct se_t } }; -template struct const_se_t; - -template struct const_se_t -{ - static const u8 value = _value; -}; +template struct const_se_t; template struct const_se_t { - static const u16 value = ((_value >> 8) & 0xff) | ((_value << 8) & 0xff00); + static const u16 value = + ((_value >> 8) & 0x00ff) | + ((_value << 8) & 0xff00); }; template struct const_se_t @@ -600,9 +590,9 @@ public: using stype = be_storage_t>; #ifdef IS_LE_MACHINE - stype m_data; + stype m_data; // don't access directly #else - type m_data; + type m_data; // don't access directly #endif static_assert(!std::is_class::value, "be_t<> error: invalid type (class or structure)"); @@ -695,41 +685,41 @@ public: be_t& operator --() { *this -= 1; return *this; } }; -template inline std::enable_if_t::value, bool> operator ==(const be_t& left, const be_t& right) +template inline std::enable_if_t::value && std::is_integral::value, bool> operator ==(const be_t& left, const be_t& right) { return left.data() == right.data(); } -template inline std::enable_if_t::value, bool> operator !=(const be_t& left, const be_t& right) +template inline std::enable_if_t::value && std::is_integral::value, bool> operator !=(const be_t& left, const be_t& right) { return left.data() != right.data(); } -template inline std::enable_if_t::value, be_t> operator &(const be_t& left, const be_t& right) +template inline std::enable_if_t::value && std::is_integral::value, be_t> operator &(const be_t& left, const be_t& right) { be_t result; result.m_data = left.data() & right.data(); return result; } -template inline std::enable_if_t::value, be_t> operator |(const be_t& left, const be_t& right) +template inline std::enable_if_t::value && std::is_integral::value, be_t> operator |(const be_t& left, const be_t& right) { be_t result; result.m_data = left.data() | right.data(); return result; } -template inline std::enable_if_t::value, be_t> operator ^(const be_t& left, const be_t& right) +template inline std::enable_if_t::value && std::is_integral::value, be_t> operator ^(const be_t& left, const be_t& right) { be_t result; result.m_data = left.data() ^ right.data(); return result; } -template inline std::enable_if_t> operator ~(const be_t& other) +template inline std::enable_if_t::value, be_t> operator ~(const be_t& arg) { be_t result; - result.m_data = ~other.data(); + result.m_data = ~arg.data(); return result; } @@ -782,7 +772,7 @@ public: using type = std::remove_cv_t; using stype = be_storage_t>; - type m_data; + type m_data; // don't access directly static_assert(!std::is_class::value, "le_t<> error: invalid type (class or structure)"); static_assert(!std::is_union::value || std::is_same::value, "le_t<> error: invalid type (union)"); diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index d160a46198..4449cbb12a 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -749,7 
+749,7 @@ size_t get_x64_access_size(x64_context* context, x64_op_t op, x64_reg_t reg, siz
 	if (op == X64OP_CMPXCHG)
 	{
 		// detect whether this instruction can't actually modify memory to avoid breaking reservation;
-		// this may theoretically cause an endless loop, but it shouldn't be a problem if only read_sync() generates such an instruction
+		// this may theoretically cause an endless loop, but it shouldn't be a problem if only load_sync() generates such an instruction
 		u64 cmp, exch;
 		if (!get_x64_reg_value(context, reg, d_size, i_size, cmp) || !get_x64_reg_value(context, X64R_RAX, d_size, i_size, exch))
 		{
@@ -1480,16 +1480,22 @@ bool thread_t::joinable() const
 	return m_state == TS_JOINABLE;
 }
 
-bool waiter_map_t::is_stopped(u64 signal_id)
+bool waiter_map_t::is_stopped(u32 addr)
 {
 	if (Emu.IsStopped())
 	{
-		LOG_WARNING(Log::HLE, "%s: waiter_op() aborted (signal_id=0x%llx)", name.c_str(), signal_id);
+		LOG_WARNING(Log::HLE, "%s: waiter_op() aborted (addr=0x%x)", name.c_str(), addr);
 		return true;
 	}
 
 	return false;
 }
 
+void waiter_map_t::notify(u32 addr)
+{
+	// signal appropriate condition variable
+	cv[get_hash(addr)].notify_all();
+}
+
 const std::function<bool()> SQUEUE_ALWAYS_EXIT = [](){ return true; };
 const std::function<bool()> SQUEUE_NEVER_EXIT = [](){ return false; };
diff --git a/Utilities/Thread.h b/Utilities/Thread.h
index 783f66176c..fbc528b58d 100644
--- a/Utilities/Thread.h
+++ b/Utilities/Thread.h
@@ -103,7 +103,7 @@ class slw_shared_mutex_t
 
 struct waiter_map_t
 {
-	static const size_t size = 32;
+	static const size_t size = 16;
 
 	std::array<std::mutex, size> mutex;
 	std::array<std::condition_variable, size> cv;
@@ -115,40 +115,43 @@ struct waiter_map_t
 	{
 	}
 
-	bool is_stopped(u64 signal_id);
+	// generate simple "hash" for mutex/cv distribution
+	u32 get_hash(u32 addr)
+	{
+		addr ^= addr >> 16;
+		addr ^= addr >> 24;
+		addr ^= addr >> 28;
+		return addr % size;
+	}
+
+	// check emu status
+	bool is_stopped(u32 addr);
 
 	// wait until pred() returns true; addr is an arbitrary number used to pick a mutex/cv pair
-	template<typename S, typename WT> force_inline safe_buffers void wait_op(const S& signal_id, const WT waiter_func)
+	template<typename F, typename... Args> safe_buffers auto wait_op(u32 addr, F pred, Args&&... args) -> decltype(static_cast<void>(pred(args...)))
 	{
-		// generate hash
-		const auto hash = std::hash<S>()(signal_id) % size;
+		const u32 hash = get_hash(addr);
 
 		// set mutex locker
-		std::unique_lock<std::mutex> locker(mutex[hash], std::defer_lock);
+		std::unique_lock<std::mutex> lock(mutex[hash], std::defer_lock);
 
-		// check the condition or if the emulator is stopped
-		while (!waiter_func() && !is_stopped(signal_id))
+		while (true)
 		{
+			// check the condition
+			if (pred(args...)) return;
+
 			// lock the mutex and initialize waiter (only once)
-			if (!locker.owns_lock())
-			{
-				locker.lock();
-			}
+			if (!lock) lock.lock();
 
 			// wait on appropriate condition variable for 1 ms or until signal arrived
-			cv[hash].wait_for(locker, std::chrono::milliseconds(1));
+			cv[hash].wait_for(lock, std::chrono::milliseconds(1));
+
+			if (is_stopped(addr)) return;
 		}
 	}
 
 	// signal all threads waiting on waiter_op() with the same addr (signaling only hints to those threads that the corresponding conditions are *probably* met)
-	template<typename S> force_inline void notify(const S& signal_id)
-	{
-		// generate hash
-		const auto hash = std::hash<S>()(signal_id) % size;
-
-		// signal appropriate condition variable
-		cv[hash].notify_all();
-	}
+	void notify(u32 addr);
 };
 
 extern const std::function<bool()> SQUEUE_ALWAYS_EXIT;
@@ -209,7 +212,7 @@ public:
 	{
 		u32 pos = 0;
 
-		while (u32 res = m_sync.atomic_op_sync(SQSVR_OK, [&pos](squeue_sync_var_t& sync) -> u32
+		while (u32 res = m_sync.atomic_op([&pos](squeue_sync_var_t& sync) -> u32
 		{
 			assert(sync.count <= sq_size);
 			assert(sync.position < sq_size);
@@ -272,7 +275,7 @@ public:
 	{
 		u32 pos = 0;
 
-		while (u32 res = m_sync.atomic_op_sync(SQSVR_OK, [&pos](squeue_sync_var_t& sync) -> u32
+		while (u32 res = m_sync.atomic_op([&pos](squeue_sync_var_t& sync) -> u32
 		{
 			assert(sync.count <= sq_size);
 			assert(sync.position < sq_size);
@@ -341,7 +344,7 @@ public:
 		assert(start_pos < sq_size);
 		u32 pos = 0;
 
-		while (u32 res = m_sync.atomic_op_sync(SQSVR_OK, [&pos, start_pos](squeue_sync_var_t& sync) -> u32
+		while (u32 res = m_sync.atomic_op([&pos, start_pos](squeue_sync_var_t& sync) -> u32
 		{
 			assert(sync.count <= sq_size);
 			assert(sync.position < sq_size);
@@ -425,7 +428,7 @@ public:
 	{
 		u32 pos, count;
 
-		while (m_sync.atomic_op_sync(SQSVR_OK, [&pos, &count](squeue_sync_var_t& sync) -> u32
+		while (m_sync.atomic_op([&pos, &count](squeue_sync_var_t& sync) -> u32
 		{
 			assert(sync.count <= sq_size);
 			assert(sync.position < sq_size);
@@ -463,7 +466,7 @@ public:
 
 	void clear()
 	{
-		while (m_sync.atomic_op_sync(SQSVR_OK, [](squeue_sync_var_t& sync) -> u32
+		while (m_sync.atomic_op([](squeue_sync_var_t& sync) -> u32
 		{
 			assert(sync.count <= sq_size);
 			assert(sync.position < sq_size);
diff --git a/rpcs3/Emu/Cell/RawSPUThread.cpp b/rpcs3/Emu/Cell/RawSPUThread.cpp
index 52c0442956..0ca0f45ca1 100644
--- a/rpcs3/Emu/Cell/RawSPUThread.cpp
+++ b/rpcs3/Emu/Cell/RawSPUThread.cpp
@@ -76,7 +76,7 @@ bool RawSPUThread::ReadReg(const u32 addr, u32& value)
 
 	case SPU_Status_offs:
 	{
-		value = status.read_relaxed();
+		value = status.load();
 		return true;
 	}
 	}
@@ -185,7 +185,7 @@ bool RawSPUThread::WriteReg(const u32 addr, const u32 value)
 			break;
 		}
 
-		run_ctrl.write_relaxed(value);
+		run_ctrl.store(value);
 		return true;
 	}
 
@@ -196,7 +196,7 @@
 			break;
 		}
 
-		npc.write_relaxed(value);
+		npc.store(value);
 		return true;
 	}
 
@@ -223,5 +223,5 @@ void RawSPUThread::Task()
 
 	SPUThread::Task();
 
-	npc.write_relaxed(PC | 1);
+	npc.store(PC | 1);
 }
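For illustration only, a minimal sketch (not part of the patch) of how the reworked waiter_map_t above is meant to be used: the pred callable replaces the old waiter_func/signal_id pair, and notify() is now an out-of-line function keyed by address. It assumes the atomic<> alias from the atomic.h changes further below; g_example_wm, wait_for_flag and set_flag are hypothetical names.

#include "Utilities/Thread.h"  // waiter_map_t, as patched above
#include "Emu/Memory/atomic.h" // atomic<>, as patched below

waiter_map_t g_example_wm("example_wm"); // the name only appears in abort logging

void wait_for_flag(atomic<u32>& flag, u32 addr)
{
	// sleeps up to 1 ms per iteration on cv[get_hash(addr)] until pred returns true
	g_example_wm.wait_op(addr, [&]{ return flag.load() != 0; });
}

void set_flag(atomic<u32>& flag, u32 addr)
{
	flag.store(1);

	// only a hint that the condition for addr is *probably* met now
	g_example_wm.notify(addr);
}

diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 7b4a7476fa..dde01fde9c 100644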
--- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -495,7 +495,7 @@ u32 SPUThread::get_ch_count(u32 ch) case SPU_RdSigNotify1: return ch_snr1.get_count(); break; case SPU_RdSigNotify2: return ch_snr2.get_count(); break; case MFC_RdAtomicStat: return ch_atomic_stat.get_count(); break; - case SPU_RdEventStat: return ch_event_stat.read_relaxed() & ch_event_mask ? 1 : 0; break; + case SPU_RdEventStat: return ch_event_stat.load() & ch_event_mask ? 1 : 0; break; } LOG_ERROR(SPU, "get_ch_count(ch=%d [%s]): unknown/illegal channel", ch, ch < 128 ? spu_ch_name[ch] : "???"); @@ -603,7 +603,7 @@ u32 SPUThread::get_ch_value(u32 ch) case SPU_RdEventStat: { u32 result; - while (!(result = ch_event_stat.read_relaxed() & ch_event_mask) && !Emu.IsStopped()) + while (!(result = ch_event_stat.load() & ch_event_mask) && !Emu.IsStopped()) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); // hack } diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index b06963e24c..07035add3c 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -199,7 +199,7 @@ public: void set_value(u32 value, u32 count = 1) { - sync_var.write_relaxed({ count, value }); + sync_var.store({ count, value }); } u32 get_value() volatile @@ -254,7 +254,7 @@ public: { bool out_result; - const u32 last_value = value3.read_sync(); + const u32 last_value = value3.load_sync(); sync_var.atomic_op([&out_result, &out_value, &out_count, last_value](sync_var_t& data) { @@ -292,7 +292,7 @@ public: void set(u64 ints) { // leave only enabled interrupts - ints &= mask.read_relaxed(); + ints &= mask.load(); if (ints && ~stat._or(ints) & ints) { diff --git a/rpcs3/Emu/Memory/atomic.h b/rpcs3/Emu/Memory/atomic.h index 6eea856d7c..86bd6cc741 100644 --- a/rpcs3/Emu/Memory/atomic.h +++ b/rpcs3/Emu/Memory/atomic.h @@ -32,6 +32,43 @@ template struct _to_atomic_subtype template using atomic_subtype_t = typename _to_atomic_subtype::type; +// result wrapper to deal with void result type +template struct atomic_op_result_t +{ + RT result; + + template inline atomic_op_result_t(T func, Args&&... args) + : result(std::move(func(std::forward(args)...))) + { + } + + inline RT move() + { + return std::move(result); + } +}; + +// void specialization +template<> struct atomic_op_result_t +{ + template inline atomic_op_result_t(T func, Args&&... 
args) + { + func(std::forward(args)...); + } + + inline void move() + { + } +}; + +struct break_never_t +{ + template inline bool operator()(const atomic_op_result_t&) const + { + return false; + } +}; + template union _atomic_base { using type = std::remove_cv_t; @@ -55,6 +92,27 @@ template union _atomic_base return reinterpret_cast(value); } +private: + template force_inline static void write_relaxed(volatile T2& data, const T2& value) + { + data = value; + } + + force_inline static void write_relaxed(volatile u128& data, const u128& value) + { + sync_lock_test_and_set(&data, value); + } + + template force_inline static T2 read_relaxed(const volatile T2& data) + { + return data; + } + + force_inline static u128 read_relaxed(const volatile u128& value) + { + return sync_val_compare_and_swap(const_cast(&value), {}, {}); + } + public: // atomically compare data with cmp, replace with exch if equal, return previous data value anyway force_inline const type compare_and_swap(const type& cmp, const type& exch) volatile @@ -69,7 +127,7 @@ public: } // read data with memory barrier - force_inline const type read_sync() const volatile + force_inline const type load_sync() const volatile { const subtype zero = {}; return from_subtype(sync_val_compare_and_swap(const_cast(&sub_data), zero, zero)); @@ -81,73 +139,42 @@ public: return from_subtype(sync_lock_test_and_set(&sub_data, to_subtype(exch))); } - // read data without memory barrier - force_inline const type read_relaxed() const volatile + // read data without memory barrier (works as load_sync() for 128 bit) + force_inline const type load() const volatile { - const subtype value = const_cast(sub_data); - return from_subtype(value); + return from_subtype(read_relaxed(sub_data)); } - // write data without memory barrier - force_inline void write_relaxed(const type& value) volatile + // write data without memory barrier (works as exchange() for 128 bit, discarding result) + force_inline void store(const type& value) volatile { - const_cast(sub_data) = to_subtype(value); + write_relaxed(sub_data, to_subtype(value)); } - // perform atomic operation on data - template force_inline void atomic_op(const FT atomic_proc) volatile + // perform an atomic operation on data (callable object version, first arg is a reference to atomic type) + template auto atomic_op(F func, Args&&... args) volatile -> decltype(func(std::declval(), args...)) { while (true) { - const subtype old = const_cast(sub_data); + // read the old value from memory + const subtype old = read_relaxed(sub_data); + + // copy the old value subtype _new = old; - atomic_proc(to_type(_new)); // function should accept reference to T type - if (sync_bool_compare_and_swap(&sub_data, old, _new)) return; + + // call atomic op for the local copy of the old value and save the return value of the function + atomic_op_result_t> result(func, to_type(_new), args...); + + // 1) check return value using callable object of Break_if type, return if condition met + // 2) atomically compare value with `old`, replace with `_new` and return on success + if (Break_if()(result) || sync_bool_compare_and_swap(&sub_data, old, _new)) return result.move(); } } - // perform atomic operation on data with special exit condition (if intermediate result != proceed_value) - template force_inline RT atomic_op(const RT proceed_value, const FT atomic_proc) volatile + // perform an atomic operation on data (member function version) + template::value>> auto atomic_op(RT(CT::* func)(FArgs...), Args&&... 
args) volatile -> decltype((std::declval().*func)(args...)) { - while (true) - { - const subtype old = const_cast(sub_data); - subtype _new = old; - auto res = static_cast(atomic_proc(to_type(_new))); // function should accept reference to T type and return some value - if (res != proceed_value) return res; - if (sync_bool_compare_and_swap(&sub_data, old, _new)) return proceed_value; - } - } - - // perform atomic operation on data with additional memory barrier - template force_inline void atomic_op_sync(const FT atomic_proc) volatile - { - const subtype zero = {}; - subtype old = sync_val_compare_and_swap(&sub_data, zero, zero); - while (true) - { - subtype _new = old; - atomic_proc(to_type(_new)); // function should accept reference to T type - const subtype val = sync_val_compare_and_swap(&sub_data, old, _new); - if (val == old) return; - old = val; - } - } - - // perform atomic operation on data with additional memory barrier and special exit condition (if intermediate result != proceed_value) - template force_inline RT atomic_op_sync(const RT proceed_value, const FT atomic_proc) volatile - { - const subtype zero = {}; - subtype old = sync_val_compare_and_swap(&sub_data, zero, zero); - while (true) - { - subtype _new = old; - auto res = static_cast(atomic_proc(to_type(_new))); // function should accept reference to T type and return some value - if (res != proceed_value) return res; - const subtype val = sync_val_compare_and_swap(&sub_data, old, _new); - if (val == old) return proceed_value; - old = val; - } + return atomic_op(std::mem_fn(func), std::forward(args)...); } // atomic bitwise OR, returns previous data @@ -174,17 +201,17 @@ public: return from_subtype(sync_fetch_and_xor(&sub_data, to_subtype(right))); } - force_inline const type operator |= (const type& right) volatile + force_inline const type operator |=(const type& right) volatile { return from_subtype(sync_fetch_and_or(&sub_data, to_subtype(right)) | to_subtype(right)); } - force_inline const type operator &= (const type& right) volatile + force_inline const type operator &=(const type& right) volatile { return from_subtype(sync_fetch_and_and(&sub_data, to_subtype(right)) & to_subtype(right)); } - force_inline const type operator ^= (const type& right) volatile + force_inline const type operator ^=(const type& right) volatile { return from_subtype(sync_fetch_and_xor(&sub_data, to_subtype(right)) ^ to_subtype(right)); } @@ -225,74 +252,50 @@ template inline if_integral_le_t operator -=(_at template inline if_integral_be_t operator ++(_atomic_base>& left) { - be_t result; - - left.atomic_op([&result](be_t& value) + return left.atomic_op([](be_t& value) -> be_t { - result = ++value; + return ++value; }); - - return result; } template inline if_integral_be_t operator --(_atomic_base>& left) { - be_t result; - - left.atomic_op([&result](be_t& value) + return left.atomic_op([](be_t& value) -> be_t { - result = --value; + return --value; }); - - return result; } template inline if_integral_be_t operator ++(_atomic_base>& left, int) { - be_t result; - - left.atomic_op([&result](be_t& value) + return left.atomic_op([](be_t& value) -> be_t { - result = value++; + return value++; }); - - return result; } template inline if_integral_be_t operator --(_atomic_base>& left, int) { - be_t result; - - left.atomic_op([&result](be_t& value) + return left.atomic_op([](be_t& value) -> be_t { - result = value--; + return value--; }); - - return result; } template inline if_integral_be_t operator +=(_atomic_base>& left, T2 right) { - be_t 
result; - - left.atomic_op([&result, right](be_t& value) + return left.atomic_op([right](be_t& value) -> be_t { - result = (value += right); + return value += right; }); - - return result; } template inline if_integral_be_t operator -=(_atomic_base>& left, T2 right) { - be_t result; - - left.atomic_op([&result, right](be_t& value) + return left.atomic_op([right](be_t& value) -> be_t { - result = (value -= right); + return value -= right; }); - - return result; } template using atomic = _atomic_base; // Atomic Type with native endianness (for emulator memory) diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 769e63dacd..0774c3a5cf 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -208,7 +208,7 @@ namespace vm { std::lock_guard lock(g_reservation_mutex); - u8 flags = g_page_info[addr >> 12].read_relaxed(); + u8 flags = g_page_info[addr >> 12].load(); if (!(flags & page_writable) || !(flags & page_allocated) || (flags & page_no_reservations)) { throw fmt::format("vm::reservation_acquire(addr=0x%x, size=0x%x) failed (invalid page flags: 0x%x)", addr, size, flags); @@ -355,7 +355,7 @@ namespace vm for (u32 i = addr / 4096; i < addr / 4096 + size / 4096; i++) { - if (g_page_info[i].read_relaxed()) + if (g_page_info[i].load()) { throw fmt::format("vm::page_map(addr=0x%x, size=0x%x, flags=0x%x) failed (already mapped at 0x%x)", addr, size, flags, i * 4096); } @@ -398,7 +398,7 @@ namespace vm for (u32 i = addr / 4096; i < addr / 4096 + size / 4096; i++) { - if ((g_page_info[i].read_relaxed() & flags_test) != (flags_test | page_allocated)) + if ((g_page_info[i].load() & flags_test) != (flags_test | page_allocated)) { return false; } @@ -447,7 +447,7 @@ namespace vm for (u32 i = addr / 4096; i < addr / 4096 + size / 4096; i++) { - if (!(g_page_info[i].read_relaxed() & page_allocated)) + if (!(g_page_info[i].load() & page_allocated)) { throw fmt::format("vm::page_unmap(addr=0x%x, size=0x%x) failed (not mapped at 0x%x)", addr, size, i * 4096); } @@ -491,7 +491,7 @@ namespace vm for (u32 i = addr / 4096; i <= (addr + size - 1) / 4096; i++) { - if ((g_page_info[i].read_sync() & page_allocated) != page_allocated) + if ((g_page_info[i].load() & page_allocated) != page_allocated) { return false; } diff --git a/rpcs3/Emu/Memory/vm_ref.h b/rpcs3/Emu/Memory/vm_ref.h index 47ce252b32..8ab50c30df 100644 --- a/rpcs3/Emu/Memory/vm_ref.h +++ b/rpcs3/Emu/Memory/vm_ref.h @@ -5,7 +5,7 @@ namespace vm template struct _ref_base { - AT m_addr; + AT m_addr; // don't access directly static_assert(!std::is_pointer::value, "vm::_ref_base<> error: invalid type (pointer)"); static_assert(!std::is_reference::value, "vm::_ref_base<> error: invalid type (reference)"); diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 07da68791d..a4557b57b7 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2494,8 +2494,8 @@ void RSXThread::Task() inc = 1; - u32 get = m_ctrl->get.read_sync(); - u32 put = m_ctrl->put.read_sync(); + u32 put = m_ctrl->put.load(); + u32 get = m_ctrl->get.load(); if (put == get || !Emu.IsRunning()) { diff --git a/rpcs3/Emu/SysCalls/Modules/cellAudio.cpp b/rpcs3/Emu/SysCalls/Modules/cellAudio.cpp index dd03c80d11..84c3fe857b 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellAudio.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellAudio.cpp @@ -31,7 +31,7 @@ s32 cellAudioInit() // clear ports for (auto& port : g_audio.ports) { - port.state.write_relaxed(AUDIO_PORT_STATE_CLOSED); + port.state.store(AUDIO_PORT_STATE_CLOSED); } // reset variables 
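For illustration only, a minimal sketch (not part of the patch) of the renamed atomic interface that call sites like the ones in this file now use, assuming the atomic<> alias from the atomic.h hunks above; g_example_state and example() are hypothetical, and the AUDIO_STATE_* constants are treated as plain u32 values here.

#include "Emu/Memory/atomic.h" // atomic<>, as patched above

atomic<u32> g_example_state;

u32 example()
{
	g_example_state.store(AUDIO_STATE_INITIALIZED); // replaces write_relaxed()

	if (g_example_state.load() != AUDIO_STATE_INITIALIZED) // replaces read_relaxed()
	{
		return 0;
	}

	// atomic_op() now returns whatever the callable returns, so a
	// read-modify-write that yields the old value needs no out-parameter
	return g_example_state.atomic_op([](u32& value) -> u32
	{
		const u32 prev = value;
		value = AUDIO_STATE_NOT_INITIALIZED;
		return prev;
	});
}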
@@ -82,7 +82,7 @@ s32 cellAudioInit() bool opened = false; float* buffer; - while (out_queue.pop(buffer, [](){ return g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED; })) + while (out_queue.pop(buffer, [](){ return g_audio.state.load() != AUDIO_STATE_INITIALIZED; })) { if (use_u16) { @@ -146,7 +146,7 @@ s32 cellAudioInit() } }); - while (g_audio.state.read_relaxed() == AUDIO_STATE_INITIALIZED && !Emu.IsStopped()) + while (g_audio.state.load() == AUDIO_STATE_INITIALIZED && !Emu.IsStopped()) { if (Emu.IsPaused()) { @@ -193,7 +193,7 @@ s32 cellAudioInit() // mixing: for (auto& port : g_audio.ports) { - if (port.state.read_relaxed() != AUDIO_PORT_STATE_STARTED) continue; + if (port.state.load() != AUDIO_PORT_STATE_STARTED) continue; const u32 block_size = port.channel * AUDIO_SAMPLES; const u32 position = port.tag % port.block; // old value @@ -206,7 +206,7 @@ s32 cellAudioInit() auto step_volume = [](AudioPortConfig& port) // part of cellAudioSetPortLevel functionality { - const auto param = port.level_set.read_sync(); + const auto param = port.level_set.load(); if (param.inc != 0.0f) { @@ -357,7 +357,7 @@ s32 cellAudioInit() memset(out_buffer[out_pos].get(), 0, out_buffer_size * sizeof(float)); } - if (!out_queue.push(out_buffer[out_pos].get(), [](){ return g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED; })) + if (!out_queue.push(out_buffer[out_pos].get(), [](){ return g_audio.state.load() != AUDIO_STATE_INITIALIZED; })) { break; } @@ -375,7 +375,7 @@ s32 cellAudioInit() { AudioPortConfig& port = g_audio.ports[i]; - if (port.state.read_relaxed() != AUDIO_PORT_STATE_STARTED) continue; + if (port.state.load() != AUDIO_PORT_STATE_STARTED) continue; u32 position = port.tag % port.block; // old value port.counter = g_audio.counter; @@ -447,7 +447,7 @@ s32 cellAudioPortOpen(vm::ptr audioParam, vm::ptr portN { cellAudio.Warning("cellAudioPortOpen(audioParam=*0x%x, portNum=*0x%x)", audioParam, portNum); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -551,7 +551,7 @@ s32 cellAudioGetPortConfig(u32 portNum, vm::ptr portConfig) { cellAudio.Warning("cellAudioGetPortConfig(portNum=%d, portConfig=*0x%x)", portNum, portConfig); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -565,7 +565,7 @@ s32 cellAudioGetPortConfig(u32 portNum, vm::ptr portConfig) portConfig->readIndexAddr = port.read_index_addr; - switch (auto state = port.state.read_sync()) + switch (auto state = port.state.load()) { case AUDIO_PORT_STATE_CLOSED: portConfig->status = CELL_AUDIO_STATUS_CLOSE; break; case AUDIO_PORT_STATE_OPENED: portConfig->status = CELL_AUDIO_STATUS_READY; break; @@ -584,7 +584,7 @@ s32 cellAudioPortStart(u32 portNum) { cellAudio.Warning("cellAudioPortStart(portNum=%d)", portNum); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -607,7 +607,7 @@ s32 cellAudioPortClose(u32 portNum) { cellAudio.Warning("cellAudioPortClose(portNum=%d)", portNum); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -630,7 +630,7 @@ s32 cellAudioPortStop(u32 portNum) { cellAudio.Warning("cellAudioPortStop(portNum=%d)", portNum); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) 
+ if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -653,7 +653,7 @@ s32 cellAudioGetPortTimestamp(u32 portNum, u64 tag, vm::ptr stamp) { cellAudio.Log("cellAudioGetPortTimestamp(portNum=%d, tag=0x%llx, stamp=*0x%x)", portNum, tag, stamp); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -665,7 +665,7 @@ s32 cellAudioGetPortTimestamp(u32 portNum, u64 tag, vm::ptr stamp) AudioPortConfig& port = g_audio.ports[portNum]; - if (port.state.read_relaxed() == AUDIO_PORT_STATE_CLOSED) + if (port.state.load() == AUDIO_PORT_STATE_CLOSED) { return CELL_AUDIO_ERROR_PORT_NOT_OPEN; } @@ -683,7 +683,7 @@ s32 cellAudioGetPortBlockTag(u32 portNum, u64 blockNo, vm::ptr tag) { cellAudio.Log("cellAudioGetPortBlockTag(portNum=%d, blockNo=0x%llx, tag=*0x%x)", portNum, blockNo, tag); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -695,7 +695,7 @@ s32 cellAudioGetPortBlockTag(u32 portNum, u64 blockNo, vm::ptr tag) AudioPortConfig& port = g_audio.ports[portNum]; - if (port.state.read_relaxed() == AUDIO_PORT_STATE_CLOSED) + if (port.state.load() == AUDIO_PORT_STATE_CLOSED) { return CELL_AUDIO_ERROR_PORT_NOT_OPEN; } @@ -726,7 +726,7 @@ s32 cellAudioSetPortLevel(u32 portNum, float level) { cellAudio.Log("cellAudioSetPortLevel(portNum=%d, level=%f)", portNum, level); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -738,7 +738,7 @@ s32 cellAudioSetPortLevel(u32 portNum, float level) AudioPortConfig& port = g_audio.ports[portNum]; - if (port.state.read_relaxed() == AUDIO_PORT_STATE_CLOSED) + if (port.state.load() == AUDIO_PORT_STATE_CLOSED) { return CELL_AUDIO_ERROR_PORT_NOT_OPEN; } @@ -796,7 +796,7 @@ s32 cellAudioSetNotifyEventQueue(u64 key) { cellAudio.Warning("cellAudioSetNotifyEventQueue(key=0x%llx)", key); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -829,7 +829,7 @@ s32 cellAudioRemoveNotifyEventQueue(u64 key) { cellAudio.Warning("cellAudioRemoveNotifyEventQueue(key=0x%llx)", key); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -862,7 +862,7 @@ s32 cellAudioAddData(u32 portNum, vm::ptr src, u32 samples, float volume) { cellAudio.Log("cellAudioAddData(portNum=%d, src=*0x%x, samples=%d, volume=%f)", portNum, src, samples, volume); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -895,7 +895,7 @@ s32 cellAudioAdd2chData(u32 portNum, vm::ptr src, u32 samples, float volu { cellAudio.Log("cellAudioAdd2chData(portNum=%d, src=*0x%x, samples=%d, volume=%f)", portNum, src, samples, volume); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -958,7 +958,7 @@ s32 cellAudioAdd6chData(u32 portNum, vm::ptr src, float volume) { cellAudio.Log("cellAudioAdd6chData(portNum=%d, src=*0x%x, volume=%f)", portNum, src, volume); - if (g_audio.state.read_relaxed() != AUDIO_STATE_INITIALIZED) + if (g_audio.state.load() != 
AUDIO_STATE_INITIALIZED) { return CELL_AUDIO_ERROR_NOT_INIT; } @@ -1024,7 +1024,7 @@ s32 cellAudioUnsetPersonalDevice(s32 iPersonalStream) Module cellAudio("cellAudio", []() { - g_audio.state.write_relaxed(AUDIO_STATE_NOT_INITIALIZED); + g_audio.state.store(AUDIO_STATE_NOT_INITIALIZED); g_audio.buffer = 0; g_audio.indexes = 0; diff --git a/rpcs3/Emu/SysCalls/Modules/cellFs.cpp b/rpcs3/Emu/SysCalls/Modules/cellFs.cpp index c0703e216c..8fa066ec55 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellFs.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellFs.cpp @@ -398,7 +398,7 @@ s32 cellFsStReadGetRingBuf(u32 fd, vm::ptr ringbuf) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED) + if (file->st_status.load() == SSS_NOT_INITIALIZED) { return CELL_FS_ENXIO; } @@ -422,7 +422,7 @@ s32 cellFsStReadGetStatus(u32 fd, vm::ptr status) return CELL_FS_EBADF; } - switch (file->st_status.read_sync()) + switch (file->st_status.load()) { case SSS_INITIALIZED: case SSS_STOPPED: @@ -456,7 +456,7 @@ s32 cellFsStReadGetRegid(u32 fd, vm::ptr regid) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED) + if (file->st_status.load() == SSS_NOT_INITIALIZED) { return CELL_FS_ENXIO; } @@ -500,7 +500,7 @@ s32 cellFsStReadStart(u32 fd, u64 offset, u64 size) { std::unique_lock lock(file->mutex); - while (file->st_status.read_relaxed() == SSS_STARTED && !Emu.IsStopped()) + while (file->st_status.load() == SSS_STARTED && !Emu.IsStopped()) { // check free space in buffer and available data in stream if (file->st_total_read - file->st_copied <= file->st_ringbuf_size - file->st_block_size && file->st_total_read < file->st_read_size) @@ -590,7 +590,7 @@ s32 cellFsStRead(u32 fd, vm::ptr buf, u64 size, vm::ptr rsize) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED || file->st_copyless) + if (file->st_status.load() == SSS_NOT_INITIALIZED || file->st_copyless) { return CELL_FS_ENXIO; } @@ -624,7 +624,7 @@ s32 cellFsStReadGetCurrentAddr(u32 fd, vm::ptr addr, vm::ptr size) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED || !file->st_copyless) + if (file->st_status.load() == SSS_NOT_INITIALIZED || !file->st_copyless) { return CELL_FS_ENXIO; } @@ -657,7 +657,7 @@ s32 cellFsStReadPutCurrentAddr(u32 fd, vm::ptr addr, u64 size) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED || !file->st_copyless) + if (file->st_status.load() == SSS_NOT_INITIALIZED || !file->st_copyless) { return CELL_FS_ENXIO; } @@ -684,7 +684,7 @@ s32 cellFsStReadWait(u32 fd, u64 size) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED) + if (file->st_status.load() == SSS_NOT_INITIALIZED) { return CELL_FS_ENXIO; } @@ -718,7 +718,7 @@ s32 cellFsStReadWaitCallback(u32 fd, u64 size, fs_st_cb_t func) return CELL_FS_EBADF; } - if (file->st_status.read_sync() == SSS_NOT_INITIALIZED) + if (file->st_status.load() == SSS_NOT_INITIALIZED) { return CELL_FS_ENXIO; } diff --git a/rpcs3/Emu/SysCalls/Modules/cellGcmSys.cpp b/rpcs3/Emu/SysCalls/Modules/cellGcmSys.cpp index 607c81fbc8..82f5efce57 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellGcmSys.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellGcmSys.cpp @@ -379,9 +379,9 @@ s32 _cellGcmInitBody(vm::ptr context, u32 cmdSize, u32 ioSiz vm::write32(context.addr(), gcm_info.context_addr); auto& ctrl = vm::get_ref(gcm_info.control_addr); - ctrl.put.write_relaxed(0); - ctrl.get.write_relaxed(0); - ctrl.ref.write_relaxed(-1); + ctrl.put.store(0); + ctrl.get.store(0); + 
ctrl.ref.store(-1); auto& render = Emu.GetGSManager().GetRender(); render.m_ctxt_addr = context.addr(); @@ -1220,7 +1220,7 @@ s32 cellGcmCallback(vm::ptr context, u32 count) // Wait for rsx to "release" the new command buffer while (!Emu.IsStopped()) { - u32 getPos = ctrl.get.read_sync().value(); + u32 getPos = ctrl.get.load().value(); if (isInCommandBufferExcept(getPos, newCommandBuffer.first, newCommandBuffer.second)) break; std::chrono::time_point waitPoint = std::chrono::system_clock::now(); @@ -1235,7 +1235,7 @@ s32 cellGcmCallback(vm::ptr context, u32 count) //if (0) //{ // auto& ctrl = vm::get_ref(gcm_info.control_addr); - // be_t res = context->current - context->begin - ctrl.put.read_relaxed(); + // be_t res = context->current - context->begin - ctrl.put.load(); // if (res != 0) // { @@ -1245,8 +1245,8 @@ s32 cellGcmCallback(vm::ptr context, u32 count) // memmove(vm::get_ptr(context->begin), vm::get_ptr(context->current - res), res); // context->current = context->begin + res; - // ctrl.put.write_relaxed(res); - // ctrl.get.write_relaxed(0); + // ctrl.put.store(res); + // ctrl.get.store(0); // return CELL_OK; //} diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp index 791c1de37f..d5b875f4ea 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp @@ -411,7 +411,7 @@ s32 spursDetachLv2EventQueue(vm::ptr spurs, u8 spuPort, bool spursCre auto mask = 1ull << spuPort; if (sdkVer >= 0x180000) { - if ((spurs->spuPortBits.read_relaxed() & mask) == 0) + if ((spurs->spuPortBits.load() & mask) == 0) { return CELL_SPURS_CORE_ERROR_SRCH; } @@ -438,7 +438,7 @@ void spursHandlerWaitReady(PPUThread& CPU, vm::ptr spurs) spursPpuThreadExit(CPU, 0); } - if (spurs->handlerExiting.read_relaxed()) + if (spurs->handlerExiting.load()) { if (s32 rc = sys_lwmutex_unlock(CPU, spurs.of(&CellSpurs::mutex))) { @@ -449,20 +449,20 @@ void spursHandlerWaitReady(PPUThread& CPU, vm::ptr spurs) } // Find a runnable workload - spurs->handlerDirty.write_relaxed(0); + spurs->handlerDirty.store(0); if (spurs->exception == 0) { bool foundRunnableWorkload = false; for (u32 i = 0; i < 16; i++) { - if (spurs->wklState1[i].read_relaxed() == SPURS_WKL_STATE_RUNNABLE && + if (spurs->wklState1[i].load() == SPURS_WKL_STATE_RUNNABLE && *((u64*)spurs->wklInfo1[i].priority) != 0 && - spurs->wklMaxContention[i].read_relaxed() & 0x0F) + spurs->wklMaxContention[i].load() & 0x0F) { - if (spurs->wklReadyCount1[i].read_relaxed() || - spurs->wklSignal1.read_relaxed() & (0x8000u >> i) || - (spurs->wklFlag.flag.read_relaxed() == 0 && - spurs->wklFlagReceiver.read_relaxed() == (u8)i)) + if (spurs->wklReadyCount1[i].load() || + spurs->wklSignal1.load() & (0x8000u >> i) || + (spurs->wklFlag.flag.load() == 0 && + spurs->wklFlagReceiver.load() == (u8)i)) { foundRunnableWorkload = true; break; @@ -474,14 +474,14 @@ void spursHandlerWaitReady(PPUThread& CPU, vm::ptr spurs) { for (u32 i = 0; i < 16; i++) { - if (spurs->wklState2[i].read_relaxed() == SPURS_WKL_STATE_RUNNABLE && + if (spurs->wklState2[i].load() == SPURS_WKL_STATE_RUNNABLE && *((u64*)spurs->wklInfo2[i].priority) != 0 && - spurs->wklMaxContention[i].read_relaxed() & 0xF0) + spurs->wklMaxContention[i].load() & 0xF0) { - if (spurs->wklIdleSpuCountOrReadyCount2[i].read_relaxed() || - spurs->wklSignal2.read_relaxed() & (0x8000u >> i) || - (spurs->wklFlag.flag.read_relaxed() == 0 && - spurs->wklFlagReceiver.read_relaxed() == (u8)i + 0x10)) + if (spurs->wklIdleSpuCountOrReadyCount2[i].load() || + 
spurs->wklSignal2.load() & (0x8000u >> i) || + (spurs->wklFlag.flag.load() == 0 && + spurs->wklFlagReceiver.load() == (u8)i + 0x10)) { foundRunnableWorkload = true; break; @@ -497,8 +497,8 @@ void spursHandlerWaitReady(PPUThread& CPU, vm::ptr spurs) // If we reach it means there are no runnable workloads in this SPURS instance. // Wait until some workload becomes ready. - spurs->handlerWaiting.write_relaxed(1); - if (spurs->handlerDirty.read_relaxed() == 0) + spurs->handlerWaiting.store(1); + if (spurs->handlerDirty.load() == 0) { if (s32 rc = sys_lwcond_wait(CPU, spurs.of(&CellSpurs::cond), 0)) { @@ -506,7 +506,7 @@ void spursHandlerWaitReady(PPUThread& CPU, vm::ptr spurs) } } - spurs->handlerWaiting.write_relaxed(0); + spurs->handlerWaiting.store(0); } // If we reach here then a runnable workload was found @@ -557,7 +557,7 @@ void spursHandlerEntry(PPUThread& CPU) if ((spurs->flags1 & SF1_EXIT_IF_NO_WORK) == 0) { - assert(spurs->handlerExiting.read_relaxed() == 1 || Emu.IsStopped()); + assert(spurs->handlerExiting.load() == 1 || Emu.IsStopped()); spursPpuThreadExit(CPU, 0); } } @@ -609,12 +609,12 @@ s32 spursWakeUpShutdownCompletionWaiter(PPUThread& CPU, vm::ptr spurs return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_POLICY_MODULE_ERROR_SRCH; } - const u8 wklState = wid < CELL_SPURS_MAX_WORKLOAD ? spurs->wklState1[wid].read_relaxed() : spurs->wklState2[wid & 0x0F].read_relaxed(); + const u8 wklState = wid < CELL_SPURS_MAX_WORKLOAD ? spurs->wklState1[wid].load() : spurs->wklState2[wid & 0x0F].load(); if (wklState != SPURS_WKL_STATE_REMOVABLE) { @@ -628,14 +628,14 @@ s32 spursWakeUpShutdownCompletionWaiter(PPUThread& CPU, vm::ptr spurs { wklF.hook(CPU, spurs, wid, wklF.hookArg); - assert(wklEvent.read_relaxed() & 0x01); - assert(wklEvent.read_relaxed() & 0x02); - assert((wklEvent.read_relaxed() & 0x20) == 0); + assert(wklEvent.load() & 0x01); + assert(wklEvent.load() & 0x02); + assert((wklEvent.load() & 0x20) == 0); wklEvent |= 0x20; } s32 rc = CELL_OK; - if (!wklF.hook || wklEvent.read_relaxed() & 0x10) + if (!wklF.hook || wklEvent.load() & 0x10) { assert(wklF.x28 == 2); rc = sys_semaphore_post((u32)wklF.sem, 1); @@ -1028,7 +1028,7 @@ s32 spursInit( if (!isSecond) { - spurs->wklEnabled.write_relaxed(0xffff); + spurs->wklEnabled.store(0xffff); } // Initialise trace @@ -1043,7 +1043,7 @@ s32 spursInit( spurs->wklInfoSysSrv.addr.set(SPURS_IMG_ADDR_SYS_SRV_WORKLOAD); spurs->wklInfoSysSrv.size = 0x2200; spurs->wklInfoSysSrv.arg = 0; - spurs->wklInfoSysSrv.uniqueId.write_relaxed(0xff); + spurs->wklInfoSysSrv.uniqueId.store(0xff); auto sys_semaphore_attribute_initialize = [](vm::ptr attr) { @@ -1221,11 +1221,11 @@ s32 spursInit( } spurs->flags1 = (flags & SAF_EXIT_IF_NO_WORK ? SF1_EXIT_IF_NO_WORK : 0) | (isSecond ? 
SF1_32_WORKLOADS : 0); - spurs->wklFlagReceiver.write_relaxed(0xff); - spurs->wklFlag.flag.write_relaxed(-1); - spurs->handlerDirty.write_relaxed(0); - spurs->handlerWaiting.write_relaxed(0); - spurs->handlerExiting.write_relaxed(0); + spurs->wklFlagReceiver.store(0xff); + spurs->wklFlag.flag.store(-1); + spurs->handlerDirty.store(0); + spurs->handlerWaiting.store(0); + spurs->handlerExiting.store(0); spurs->ppuPriority = ppuPriority; // Create the SPURS event helper thread @@ -1586,12 +1586,12 @@ s32 cellSpursFinalize(vm::ptr spurs) return CELL_SPURS_CORE_ERROR_ALIGN; } - if (spurs->handlerExiting.read_relaxed()) + if (spurs->handlerExiting.load()) { return CELL_SPURS_CORE_ERROR_STAT; } - u32 wklEnabled = spurs->wklEnabled.read_relaxed(); + u32 wklEnabled = spurs->wklEnabled.load(); if (spurs->flags1 & SF1_32_WORKLOADS) { @@ -1690,7 +1690,7 @@ s32 cellSpursSetMaxContention(vm::ptr spurs, u32 wid, u32 maxContenti return CELL_SPURS_CORE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_CORE_ERROR_SRCH; } @@ -1734,7 +1734,7 @@ s32 cellSpursSetPriorities(vm::ptr spurs, u32 wid, vm::cptr prior return CELL_SPURS_CORE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_CORE_ERROR_SRCH; } @@ -1764,8 +1764,8 @@ s32 cellSpursSetPriorities(vm::ptr spurs, u32 wid, vm::cptr prior auto& wklInfo = wid < CELL_SPURS_MAX_WORKLOAD ? spurs->wklInfo1[wid] : spurs->wklInfo2[wid]; *((be_t*)wklInfo.priority) = prio; - spurs->sysSrvMsgUpdateWorkload.write_relaxed(0xFF); - spurs->sysSrvMessage.write_relaxed(0xFF); + spurs->sysSrvMsgUpdateWorkload.store(0xFF); + spurs->sysSrvMessage.store(0xFF); return CELL_OK; } @@ -1907,7 +1907,7 @@ void spursTraceStatusUpdate(vm::ptr spurs) if (init) { - spurs->sysSrvMessage.write_relaxed(0xFF); + spurs->sysSrvMessage.store(0xFF); if (s32 rc = sys_semaphore_wait((u32)spurs->semPrv, 0)) { @@ -2241,9 +2241,9 @@ s32 spursAddWorkload( { assert((spurs->wklCurrentContention[wnum] & 0xf) == 0); assert((spurs->wklPendingContention[wnum] & 0xf) == 0); - spurs->wklState1[wnum].write_relaxed(1); + spurs->wklState1[wnum].store(1); spurs->wklStatus1[wnum] = 0; - spurs->wklEvent1[wnum].write_relaxed(0); + spurs->wklEvent1[wnum].store(0); spurs->wklInfo1[wnum].addr = pm; spurs->wklInfo1[wnum].arg = data; spurs->wklInfo1[wnum].size = size; @@ -2267,19 +2267,19 @@ s32 spursAddWorkload( if ((spurs->flags1 & SF1_32_WORKLOADS) == 0) { - spurs->wklIdleSpuCountOrReadyCount2[wnum].write_relaxed(0); + spurs->wklIdleSpuCountOrReadyCount2[wnum].store(0); spurs->wklMinContention[wnum] = minContention > 8 ? 
8 : minContention; } - spurs->wklReadyCount1[wnum].write_relaxed(0); + spurs->wklReadyCount1[wnum].store(0); } else { assert((spurs->wklCurrentContention[index] & 0xf0) == 0); assert((spurs->wklPendingContention[index] & 0xf0) == 0); - spurs->wklState2[index].write_relaxed(1); + spurs->wklState2[index].store(1); spurs->wklStatus2[index] = 0; - spurs->wklEvent2[index].write_relaxed(0); + spurs->wklEvent2[index].store(0); spurs->wklInfo2[index].addr = pm; spurs->wklInfo2[index].arg = data; spurs->wklInfo2[index].size = size; @@ -2301,7 +2301,7 @@ s32 spursAddWorkload( spurs->wklEvent2[index] |= 2; } - spurs->wklIdleSpuCountOrReadyCount2[wnum].write_relaxed(0); + spurs->wklIdleSpuCountOrReadyCount2[wnum].store(0); } if (wnum <= 15) @@ -2327,7 +2327,7 @@ s32 spursAddWorkload( u32 res_wkl; CellSpurs::WorkloadInfo& wkl = wnum <= 15 ? spurs->wklInfo1[wnum] : spurs->wklInfo2[wnum & 0xf]; - spurs->wklMskB.atomic_op_sync([spurs, &wkl, wnum, &res_wkl](be_t& v) + spurs->wklMskB.atomic_op([spurs, &wkl, wnum, &res_wkl](be_t& v) { const u32 mask = v & ~(0x80000000u >> wnum); res_wkl = 0; @@ -2340,12 +2340,12 @@ s32 spursAddWorkload( if (current.addr == wkl.addr) { // if a workload with identical policy module found - res_wkl = current.uniqueId.read_relaxed(); + res_wkl = current.uniqueId.load(); break; } else { - k |= 0x80000000 >> current.uniqueId.read_relaxed(); + k |= 0x80000000 >> current.uniqueId.load(); res_wkl = cntlz32(~k); } } @@ -2437,7 +2437,7 @@ s32 cellSpursWakeUp(PPUThread& CPU, vm::ptr spurs) spurs->handlerDirty.exchange(1); - if (spurs->handlerWaiting.read_sync()) + if (spurs->handlerWaiting.load()) { spursSignalToHandlerThread(CPU, spurs); } @@ -2465,7 +2465,7 @@ s32 cellSpursSendWorkloadSignal(vm::ptr spurs, u32 wid) return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_POLICY_MODULE_ERROR_SRCH; } @@ -2475,7 +2475,7 @@ s32 cellSpursSendWorkloadSignal(vm::ptr spurs, u32 wid) return CELL_SPURS_POLICY_MODULE_ERROR_STAT; } - if (spurs->wklState(wid).read_relaxed() != SPURS_WKL_STATE_RUNNABLE) + if (spurs->wklState(wid).load() != SPURS_WKL_STATE_RUNNABLE) { return CELL_SPURS_POLICY_MODULE_ERROR_STAT; } @@ -2531,12 +2531,12 @@ s32 cellSpursReadyCountStore(vm::ptr spurs, u32 wid, u32 value) return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_POLICY_MODULE_ERROR_SRCH; } - if (spurs->exception.data() || spurs->wklState(wid).read_relaxed() != 2) + if (spurs->exception.data() || spurs->wklState(wid).load() != 2) { return CELL_SPURS_POLICY_MODULE_ERROR_STAT; } @@ -2594,7 +2594,7 @@ s32 cellSpursGetWorkloadData(vm::ptr spurs, vm::ptr data, u32 wi return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_POLICY_MODULE_ERROR_SRCH; } @@ -2657,7 +2657,7 @@ s32 _cellSpursWorkloadFlagReceiver(vm::ptr spurs, u32 wid, u32 is_set return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } - if ((spurs->wklEnabled.read_relaxed() & (0x80000000u >> wid)) == 0) + if ((spurs->wklEnabled.load() & (0x80000000u >> wid)) == 0) { return CELL_SPURS_POLICY_MODULE_ERROR_SRCH; } @@ -2667,18 +2667,20 @@ s32 _cellSpursWorkloadFlagReceiver(vm::ptr spurs, u32 wid, u32 is_set return 
CELL_SPURS_POLICY_MODULE_ERROR_STAT;
 	}
 
-	if (s32 res = spurs->wklFlag.flag.atomic_op_sync(0, [spurs, wid, is_set](be_t<u32>& flag) -> s32
+	_mm_mfence();
+
+	if (s32 res = spurs->wklFlag.flag.atomic_op([spurs, wid, is_set](be_t<u32>& flag) -> s32
 	{
 		if (is_set)
 		{
-			if (spurs->wklFlagReceiver.read_relaxed() != 0xff)
+			if (spurs->wklFlagReceiver.load() != 0xff)
 			{
 				return CELL_SPURS_POLICY_MODULE_ERROR_BUSY;
 			}
 		}
 		else
 		{
-			if (spurs->wklFlagReceiver.read_relaxed() != wid)
+			if (spurs->wklFlagReceiver.load() != wid)
 			{
 				return CELL_SPURS_POLICY_MODULE_ERROR_PERM;
 			}
diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp b/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp
index 323da36754..970ecd6099 100644
--- a/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp
+++ b/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp
@@ -194,21 +194,21 @@ bool spursKernel1SelectWorkload(SPUThread & spu) {
     // The system service has the highest priority. Select the system service if
     // the system service message bit for this SPU is set.
-    if (spurs->sysSrvMessage.read_relaxed() & (1 << ctxt->spuNum)) {
+    if (spurs->sysSrvMessage.load() & (1 << ctxt->spuNum)) {
         ctxt->spuIdling = 0;
         if (!isPoll || ctxt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
             // Clear the message bit
-            spurs->sysSrvMessage.write_relaxed(spurs->sysSrvMessage.read_relaxed() & ~(1 << ctxt->spuNum));
+            spurs->sysSrvMessage.store(spurs->sysSrvMessage.load() & ~(1 << ctxt->spuNum));
         }
     } else {
         // Calculate the scheduling weight for each workload
         u16 maxWeight = 0;
         for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
             u16 runnable = ctxt->wklRunnable1 & (0x8000 >> i);
-            u16 wklSignal = spurs->wklSignal1.read_relaxed() & (0x8000 >> i);
-            u8 wklFlag = spurs->wklFlag.flag.read_relaxed() == 0 ? spurs->wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0;
-            u8 readyCount = spurs->wklReadyCount1[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklReadyCount1[i].read_relaxed();
-            u8 idleSpuCount = spurs->wklIdleSpuCountOrReadyCount2[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklIdleSpuCountOrReadyCount2[i].read_relaxed();
+            u16 wklSignal = spurs->wklSignal1.load() & (0x8000 >> i);
+            u8 wklFlag = spurs->wklFlag.flag.load() == 0 ? spurs->wklFlagReceiver.load() == i ? 1 : 0 : 0;
+            u8 readyCount = spurs->wklReadyCount1[i].load() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklReadyCount1[i].load();
+            u8 idleSpuCount = spurs->wklIdleSpuCountOrReadyCount2[i].load() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklIdleSpuCountOrReadyCount2[i].load();
             u8 requestCount = readyCount + idleSpuCount;
 
             // For a workload to be considered for scheduling:
@@ -218,7 +218,7 @@ bool spursKernel1SelectWorkload(SPUThread & spu) {
             // 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
             //    OR the workload must be signalled
             //    OR the workload flag is 0 and the workload is configured as the workload flag receiver
-            if (runnable && ctxt->priority[i] != 0 && spurs->wklMaxContention[i].read_relaxed() > contention[i]) {
+            if (runnable && ctxt->priority[i] != 0 && spurs->wklMaxContention[i].load() > contention[i]) {
                 if (wklFlag || wklSignal || (readyCount != 0 && requestCount > contention[i])) {
                     // The scheduling weight of the workload is formed from the following parameters in decreasing order of priority:
                     // 1. Workload signal set or workload flag or ready count > contention
@@ -253,12 +253,12 @@ bool spursKernel1SelectWorkload(SPUThread & spu) {
         if (!isPoll || wklSelectedId == ctxt->wklCurrentId) {
             // Clear workload signal for the selected workload
-            spurs->wklSignal1.write_relaxed(spurs->wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId));
-            spurs->wklSignal2.write_relaxed(spurs->wklSignal1.read_relaxed() & ~(0x80000000u >> wklSelectedId));
+            spurs->wklSignal1.store(spurs->wklSignal1.load() & ~(0x8000 >> wklSelectedId));
+            spurs->wklSignal2.store(spurs->wklSignal1.load() & ~(0x80000000u >> wklSelectedId));
 
             // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
-            if (wklSelectedId == spurs->wklFlagReceiver.read_relaxed()) {
-                spurs->wklFlag.flag.write_relaxed(0xFFFFFFFF);
+            if (wklSelectedId == spurs->wklFlagReceiver.load()) {
+                spurs->wklFlag.flag.store(0xFFFFFFFF);
             }
         }
     }
@@ -353,12 +353,12 @@ bool spursKernel2SelectWorkload(SPUThread & spu) {
     // The system service has the highest priority. Select the system service if
     // the system service message bit for this SPU is set.
-    if (spurs->sysSrvMessage.read_relaxed() & (1 << ctxt->spuNum)) {
+    if (spurs->sysSrvMessage.load() & (1 << ctxt->spuNum)) {
         // Not sure what this does. Possibly marks the SPU as in use.
         ctxt->spuIdling = 0;
         if (!isPoll || ctxt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
             // Clear the message bit
-            spurs->sysSrvMessage.write_relaxed(spurs->sysSrvMessage.read_relaxed() & ~(1 << ctxt->spuNum));
+            spurs->sysSrvMessage.store(spurs->sysSrvMessage.load() & ~(1 << ctxt->spuNum));
         }
     } else {
         // Calculate the scheduling weight for each workload
@@ -367,10 +367,10 @@ bool spursKernel2SelectWorkload(SPUThread & spu) {
             auto j = i & 0x0F;
             u16 runnable = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->wklRunnable1 & (0x8000 >> j) : ctxt->wklRunnable2 & (0x8000 >> j);
             u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->priority[j] & 0x0F : ctxt->priority[j] >> 4;
-            u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklMaxContention[j].read_relaxed() & 0x0F : spurs->wklMaxContention[j].read_relaxed() >> 4;
-            u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklSignal1.read_relaxed() & (0x8000 >> j) : spurs->wklSignal2.read_relaxed() & (0x8000 >> j);
-            u8 wklFlag = spurs->wklFlag.flag.read_relaxed() == 0 ? spurs->wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0;
-            u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[j].read_relaxed() : spurs->wklIdleSpuCountOrReadyCount2[j].read_relaxed();
+            u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklMaxContention[j].load() & 0x0F : spurs->wklMaxContention[j].load() >> 4;
+            u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklSignal1.load() & (0x8000 >> j) : spurs->wklSignal2.load() & (0x8000 >> j);
+            u8 wklFlag = spurs->wklFlag.flag.load() == 0 ? spurs->wklFlagReceiver.load() == i ? 1 : 0 : 0;
+            u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[j].load() : spurs->wklIdleSpuCountOrReadyCount2[j].load();
 
             // For a workload to be considered for scheduling:
             // 1. 
Its priority must be greater than 0 @@ -405,12 +405,12 @@ bool spursKernel2SelectWorkload(SPUThread & spu) { if (!isPoll || wklSelectedId == ctxt->wklCurrentId) { // Clear workload signal for the selected workload - spurs->wklSignal1.write_relaxed(spurs->wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId)); - spurs->wklSignal2.write_relaxed(spurs->wklSignal1.read_relaxed() & ~(0x80000000u >> wklSelectedId)); + spurs->wklSignal1.store(spurs->wklSignal1.load() & ~(0x8000 >> wklSelectedId)); + spurs->wklSignal2.store(spurs->wklSignal1.load() & ~(0x80000000u >> wklSelectedId)); // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s - if (wklSelectedId == spurs->wklFlagReceiver.read_relaxed()) { - spurs->wklFlag.flag.write_relaxed(0xFFFFFFFF); + if (wklSelectedId == spurs->wklFlagReceiver.load()) { + spurs->wklFlag.flag.store(0xFFFFFFFF); } } } @@ -492,7 +492,7 @@ void spursKernelDispatchWorkload(SPUThread & spu, u64 widAndPollStatus) { } ctxt->wklCurrentAddr = wklInfo->addr; - ctxt->wklCurrentUniqueId = wklInfo->uniqueId.read_relaxed(); + ctxt->wklCurrentUniqueId = wklInfo->uniqueId.load(); } if (!isKernel2) { @@ -624,7 +624,7 @@ void spursSysServiceIdleHandler(SPUThread & spu, SpursKernelContext * ctxt) { // Check if any workloads can be scheduled bool foundReadyWorkload = false; - if (spurs->sysSrvMessage.read_relaxed() & (1 << ctxt->spuNum)) { + if (spurs->sysSrvMessage.load() & (1 << ctxt->spuNum)) { foundReadyWorkload = true; } else { if (spurs->flags1 & SF1_32_WORKLOADS) { @@ -632,11 +632,11 @@ void spursSysServiceIdleHandler(SPUThread & spu, SpursKernelContext * ctxt) { u32 j = i & 0x0F; u16 runnable = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->wklRunnable1 & (0x8000 >> j) : ctxt->wklRunnable2 & (0x8000 >> j); u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? ctxt->priority[j] & 0x0F : ctxt->priority[j] >> 4; - u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklMaxContention[j].read_relaxed() & 0x0F : spurs->wklMaxContention[j].read_relaxed() >> 4; + u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklMaxContention[j].load() & 0x0F : spurs->wklMaxContention[j].load() >> 4; u8 contention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklCurrentContention[j] & 0x0F : spurs->wklCurrentContention[j] >> 4; - u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklSignal1.read_relaxed() & (0x8000 >> j) : spurs->wklSignal2.read_relaxed() & (0x8000 >> j); - u8 wklFlag = spurs->wklFlag.flag.read_relaxed() == 0 ? spurs->wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0; - u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[j].read_relaxed() : spurs->wklIdleSpuCountOrReadyCount2[j].read_relaxed(); + u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklSignal1.load() & (0x8000 >> j) : spurs->wklSignal2.load() & (0x8000 >> j); + u8 wklFlag = spurs->wklFlag.flag.load() == 0 ? spurs->wklFlagReceiver.load() == i ? 1 : 0 : 0; + u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[j].load() : spurs->wklIdleSpuCountOrReadyCount2[j].load(); if (runnable && priority > 0 && maxContention > contention) { if (wklFlag || wklSignal || readyCount > contention) { @@ -648,13 +648,13 @@ void spursSysServiceIdleHandler(SPUThread & spu, SpursKernelContext * ctxt) { } else { for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) { u16 runnable = ctxt->wklRunnable1 & (0x8000 >> i); - u16 wklSignal = spurs->wklSignal1.read_relaxed() & (0x8000 >> i); - u8 wklFlag = spurs->wklFlag.flag.read_relaxed() == 0 ? spurs->wklFlagReceiver.read_relaxed() == i ? 
1 : 0 : 0; - u8 readyCount = spurs->wklReadyCount1[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklReadyCount1[i].read_relaxed(); - u8 idleSpuCount = spurs->wklIdleSpuCountOrReadyCount2[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklIdleSpuCountOrReadyCount2[i].read_relaxed(); + u16 wklSignal = spurs->wklSignal1.load() & (0x8000 >> i); + u8 wklFlag = spurs->wklFlag.flag.load() == 0 ? spurs->wklFlagReceiver.load() == i ? 1 : 0 : 0; + u8 readyCount = spurs->wklReadyCount1[i].load() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklReadyCount1[i].load(); + u8 idleSpuCount = spurs->wklIdleSpuCountOrReadyCount2[i].load() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->wklIdleSpuCountOrReadyCount2[i].load(); u8 requestCount = readyCount + idleSpuCount; - if (runnable && ctxt->priority[i] != 0 && spurs->wklMaxContention[i].read_relaxed() > spurs->wklCurrentContention[i]) { + if (runnable && ctxt->priority[i] != 0 && spurs->wklMaxContention[i].load() > spurs->wklCurrentContention[i]) { if (wklFlag || wklSignal || (readyCount != 0 && requestCount > spurs->wklCurrentContention[i])) { foundReadyWorkload = true; break; @@ -802,7 +802,7 @@ void spursSysServiceProcessRequests(SPUThread & spu, SpursKernelContext * ctxt) } // Update workload message - if (spurs->sysSrvMsgUpdateWorkload.read_relaxed() & (1 << ctxt->spuNum)) { + if (spurs->sysSrvMsgUpdateWorkload.load() & (1 << ctxt->spuNum)) { spurs->sysSrvMsgUpdateWorkload &= ~(1 << ctxt->spuNum); updateWorkload = true; } @@ -847,7 +847,7 @@ void spursSysServiceActivateWorkload(SPUThread & spu, SpursKernelContext * ctxt) // Copy the priority of the workload for this SPU and its unique id to the LS ctxt->priority[i] = wklInfo1[i].priority[ctxt->spuNum] == 0 ? 0 : 0x10 - wklInfo1[i].priority[ctxt->spuNum]; - ctxt->wklUniqueId[i] = wklInfo1[i].uniqueId.read_relaxed(); + ctxt->wklUniqueId[i] = wklInfo1[i].uniqueId.load(); if (spurs->flags1 & SF1_32_WORKLOADS) { auto wklInfo2 = vm::get_ptr(spu.offset + 0x30200); @@ -865,7 +865,7 @@ void spursSysServiceActivateWorkload(SPUThread & spu, SpursKernelContext * ctxt) for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) { // Update workload status and runnable flag based on the workload state auto wklStatus = spurs->wklStatus1[i]; - if (spurs->wklState1[i].read_relaxed() == SPURS_WKL_STATE_RUNNABLE) { + if (spurs->wklState1[i].load() == SPURS_WKL_STATE_RUNNABLE) { spurs->wklStatus1[i] |= 1 << ctxt->spuNum; ctxt->wklRunnable1 |= 0x8000 >> i; } else { @@ -874,9 +874,9 @@ void spursSysServiceActivateWorkload(SPUThread & spu, SpursKernelContext * ctxt) // If the workload is shutting down and if this is the last SPU from which it is being removed then // add it to the shutdown bit set - if (spurs->wklState1[i].read_relaxed() == SPURS_WKL_STATE_SHUTTING_DOWN) { + if (spurs->wklState1[i].load() == SPURS_WKL_STATE_SHUTTING_DOWN) { if (((wklStatus & (1 << ctxt->spuNum)) != 0) && (spurs->wklStatus1[i] == 0)) { - spurs->wklState1[i].write_relaxed(SPURS_WKL_STATE_REMOVABLE); + spurs->wklState1[i].store(SPURS_WKL_STATE_REMOVABLE); wklShutdownBitSet |= 0x80000000u >> i; } } @@ -884,7 +884,7 @@ void spursSysServiceActivateWorkload(SPUThread & spu, SpursKernelContext * ctxt) if (spurs->flags1 & SF1_32_WORKLOADS) { // Update workload status and runnable flag based on the workload state wklStatus = spurs->wklStatus2[i]; - if (spurs->wklState2[i].read_relaxed() == SPURS_WKL_STATE_RUNNABLE) { + if (spurs->wklState2[i].load() == SPURS_WKL_STATE_RUNNABLE) { spurs->wklStatus2[i] |= 1 << 
ctxt->spuNum; ctxt->wklRunnable2 |= 0x8000 >> i; } else { @@ -893,9 +893,9 @@ void spursSysServiceActivateWorkload(SPUThread & spu, SpursKernelContext * ctxt) // If the workload is shutting down and if this is the last SPU from which it is being removed then // add it to the shutdown bit set - if (spurs->wklState2[i].read_relaxed() == SPURS_WKL_STATE_SHUTTING_DOWN) { + if (spurs->wklState2[i].load() == SPURS_WKL_STATE_SHUTTING_DOWN) { if (((wklStatus & (1 << ctxt->spuNum)) != 0) && (spurs->wklStatus2[i] == 0)) { - spurs->wklState2[i].write_relaxed(SPURS_WKL_STATE_REMOVABLE); + spurs->wklState2[i].store(SPURS_WKL_STATE_REMOVABLE); wklShutdownBitSet |= 0x8000 >> i; } } @@ -924,14 +924,14 @@ void spursSysServiceUpdateShutdownCompletionEvents(SPUThread & spu, SpursKernelC for (u32 i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) { if (wklShutdownBitSet & (0x80000000u >> i)) { spurs->wklEvent1[i] |= 0x01; - if (spurs->wklEvent1[i].read_relaxed() & 0x02 || spurs->wklEvent1[i].read_relaxed() & 0x10) { + if (spurs->wklEvent1[i].load() & 0x02 || spurs->wklEvent1[i].load() & 0x10) { wklNotifyBitSet |= 0x80000000u >> i; } } if (wklShutdownBitSet & (0x8000 >> i)) { spurs->wklEvent2[i] |= 0x01; - if (spurs->wklEvent2[i].read_relaxed() & 0x02 || spurs->wklEvent2[i].read_relaxed() & 0x10) { + if (spurs->wklEvent2[i].load() & 0x02 || spurs->wklEvent2[i].load() & 0x10) { wklNotifyBitSet |= 0x8000 >> i; } } @@ -1035,10 +1035,10 @@ void spursSysServiceCleanupAfterSystemWorkload(SPUThread & spu, SpursKernelConte if (wklId >= CELL_SPURS_MAX_WORKLOAD) { spurs->wklCurrentContention[wklId & 0x0F] -= 0x10; - spurs->wklReadyCount1[wklId & 0x0F].write_relaxed(spurs->wklReadyCount1[wklId & 0x0F].read_relaxed() - 1); + spurs->wklReadyCount1[wklId & 0x0F].store(spurs->wklReadyCount1[wklId & 0x0F].load() - 1); } else { spurs->wklCurrentContention[wklId & 0x0F] -= 0x01; - spurs->wklIdleSpuCountOrReadyCount2[wklId & 0x0F].write_relaxed(spurs->wklIdleSpuCountOrReadyCount2[wklId & 0x0F].read_relaxed() - 1); + spurs->wklIdleSpuCountOrReadyCount2[wklId & 0x0F].store(spurs->wklIdleSpuCountOrReadyCount2[wklId & 0x0F].load() - 1); } memcpy(vm::get_ptr(spu.offset + 0x100), spurs, 128); @@ -1317,14 +1317,14 @@ s32 spursTasksetProcessRequest(SPUThread & spu, s32 request, u32 * taskId, u32 * vm::reservation_op(vm::cast(kernelCtxt->spurs.addr()), 128, [&]() { auto spurs = kernelCtxt->spurs.priv_ptr(); - s32 readyCount = kernelCtxt->wklCurrentId < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[kernelCtxt->wklCurrentId].read_relaxed() : spurs->wklIdleSpuCountOrReadyCount2[kernelCtxt->wklCurrentId & 0x0F].read_relaxed(); + s32 readyCount = kernelCtxt->wklCurrentId < CELL_SPURS_MAX_WORKLOAD ? spurs->wklReadyCount1[kernelCtxt->wklCurrentId].load() : spurs->wklIdleSpuCountOrReadyCount2[kernelCtxt->wklCurrentId & 0x0F].load(); readyCount += numNewlyReadyTasks; readyCount = readyCount < 0 ? 0 : readyCount > 0xFF ? 
0xFF : readyCount; if (kernelCtxt->wklCurrentId < CELL_SPURS_MAX_WORKLOAD) { - spurs->wklReadyCount1[kernelCtxt->wklCurrentId].write_relaxed(readyCount); + spurs->wklReadyCount1[kernelCtxt->wklCurrentId].store(readyCount); } else { - spurs->wklIdleSpuCountOrReadyCount2[kernelCtxt->wklCurrentId & 0x0F].write_relaxed(readyCount); + spurs->wklIdleSpuCountOrReadyCount2[kernelCtxt->wklCurrentId & 0x0F].store(readyCount); } memcpy(vm::get_ptr(spu.offset + 0x100), spurs, 128); diff --git a/rpcs3/Emu/SysCalls/Modules/cellSync.cpp b/rpcs3/Emu/SysCalls/Modules/cellSync.cpp index ab17450231..c424fe59b2 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSync.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellSync.cpp @@ -19,8 +19,10 @@ waiter_map_t g_sync_rwm_read_wm("sync_rwm_read_wm"); waiter_map_t g_sync_rwm_write_wm("sync_rwm_write_wm"); waiter_map_t g_sync_queue_wm("sync_queue_wm"); -s32 syncMutexInitialize(vm::ptr mutex) +s32 cellSyncMutexInitialize(vm::ptr mutex) { + cellSync.Log("cellSyncMutexInitialize(mutex=*0x%x)", mutex); + if (!mutex) { return CELL_SYNC_ERROR_NULL_POINTER; @@ -31,18 +33,11 @@ s32 syncMutexInitialize(vm::ptr mutex) return CELL_SYNC_ERROR_ALIGN; } - mutex->sync_var.exchange({}); + mutex->exchange({}); return CELL_OK; } -s32 cellSyncMutexInitialize(vm::ptr mutex) -{ - cellSync.Log("cellSyncMutexInitialize(mutex=*0x%x)", mutex); - - return syncMutexInitialize(mutex); -} - s32 cellSyncMutexLock(vm::ptr mutex) { cellSync.Log("cellSyncMutexLock(mutex=*0x%x)", mutex); @@ -57,16 +52,13 @@ s32 cellSyncMutexLock(vm::ptr mutex) return CELL_SYNC_ERROR_ALIGN; } - // prx: increase acquire_count and remember its old value - const auto order = mutex->cnt.acq++; + // increase acq value and remember its old value + const auto order = mutex->atomic_op(&sync_mutex_t::acquire); - // prx: wait until release_count is equal to old acquire_count - g_sync_mutex_wm.wait_op(mutex.addr(), [mutex, order]() - { - return order == mutex->cnt.rel.read_relaxed(); - }); + // wait until rel value is equal to old acq value + g_sync_mutex_wm.wait_op(mutex.addr(), WRAP_EXPR(mutex->load().rel == order)); - mutex->sync_var.read_sync(); + _mm_mfence(); return CELL_OK; } @@ -85,11 +77,12 @@ s32 cellSyncMutexTryLock(vm::ptr mutex) return CELL_SYNC_ERROR_ALIGN; } - // prx: lock only if acquire_count and release_count are equal - return mutex->sync_var.atomic_op(CELL_OK, [](CellSyncMutex::sync_t& mutex) -> s32 + if (!mutex->atomic_op(&sync_mutex_t::try_lock)) { - return (mutex.cnt_acq++ != mutex.cnt_rel) ? 
CELL_SYNC_ERROR_BUSY : CELL_OK; - }); + return CELL_SYNC_ERROR_BUSY; + } + + return CELL_OK; } s32 cellSyncMutexUnlock(vm::ptr mutex) @@ -100,66 +93,43 @@ s32 cellSyncMutexUnlock(vm::ptr mutex) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!mutex.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - // prx: increase release count - mutex->cnt.rel++; + mutex->atomic_op(&sync_mutex_t::unlock); g_sync_mutex_wm.notify(mutex.addr()); return CELL_OK; } -s32 syncBarrierInitialize(vm::ptr barrier, u16 total_count) -{ - if (!barrier) - { - return CELL_SYNC_ERROR_NULL_POINTER; - } - if (!barrier.aligned()) - { - return CELL_SYNC_ERROR_ALIGN; - } - if (!total_count || total_count > 32767) - { - return CELL_SYNC_ERROR_INVAL; - } - - // prx: zeroize first u16, write total_count in second u16 and sync - barrier->data.exchange({ 0, total_count }); - - return CELL_OK; -} - s32 cellSyncBarrierInitialize(vm::ptr barrier, u16 total_count) { cellSync.Log("cellSyncBarrierInitialize(barrier=*0x%x, total_count=%d)", barrier, total_count); - return syncBarrierInitialize(barrier, total_count); -} - -s32 syncBarrierTryNotifyOp(CellSyncBarrier::data_t& barrier) -{ - // prx: extract m_value (repeat if < 0), increase, compare with second s16, set sign bit if equal, insert it back - s16 value = barrier.m_value; - - if (value < 0) + if (!barrier) { - return CELL_SYNC_ERROR_BUSY; + return CELL_SYNC_ERROR_NULL_POINTER; } - if (++value == barrier.m_count) + if (!barrier.aligned()) { - value |= 0x8000; + return CELL_SYNC_ERROR_ALIGN; } - barrier.m_value = value; + if (!total_count || total_count > 32767) + { + return CELL_SYNC_ERROR_INVAL; + } + + // clear current value, write total_count and sync + barrier->exchange({ 0, total_count }); return CELL_OK; -}; +} s32 cellSyncBarrierNotify(vm::ptr barrier) { @@ -169,15 +139,13 @@ s32 cellSyncBarrierNotify(vm::ptr barrier) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!barrier.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - g_sync_barrier_notify_wm.wait_op(barrier.addr(), [barrier]() - { - return barrier->data.atomic_op_sync(CELL_OK, syncBarrierTryNotifyOp) == CELL_OK; - }); + g_sync_barrier_notify_wm.wait_op(barrier.addr(), WRAP_EXPR(barrier->atomic_op(&sync_barrier_t::try_notify))); g_sync_barrier_wait_wm.notify(barrier.addr()); @@ -192,37 +160,20 @@ s32 cellSyncBarrierTryNotify(vm::ptr barrier) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!barrier.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - if (s32 res = barrier->data.atomic_op_sync(CELL_OK, syncBarrierTryNotifyOp)) - { - return res; - } + _mm_mfence(); - g_sync_barrier_wait_wm.notify(barrier.addr()); - - return CELL_OK; -} - -s32 syncBarrierTryWaitOp(CellSyncBarrier::data_t& barrier) -{ - // prx: extract m_value (repeat if >= 0), decrease it, set 0 if == 0x8000, insert it back - s16 value = barrier.m_value; - - if (value >= 0) + if (!barrier->atomic_op(&sync_barrier_t::try_notify)) { return CELL_SYNC_ERROR_BUSY; } - if (--value == -0x8000) - { - value = 0; - } - - barrier.m_value = value; + g_sync_barrier_wait_wm.notify(barrier.addr()); return CELL_OK; } @@ -235,15 +186,15 @@ s32 cellSyncBarrierWait(vm::ptr barrier) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!barrier.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - g_sync_barrier_wait_wm.wait_op(barrier.addr(), [barrier]() - { - return barrier->data.atomic_op_sync(CELL_OK, syncBarrierTryWaitOp) == CELL_OK; - }); + _mm_mfence(); + + g_sync_barrier_wait_wm.wait_op(barrier.addr(), WRAP_EXPR(barrier->atomic_op(&sync_barrier_t::try_wait))); 
g_sync_barrier_notify_wm.notify(barrier.addr()); @@ -258,14 +209,17 @@ s32 cellSyncBarrierTryWait(vm::ptr barrier) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!barrier.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - if (s32 res = barrier->data.atomic_op_sync(CELL_OK, syncBarrierTryWaitOp)) + _mm_mfence(); + + if (!barrier->atomic_op(&sync_barrier_t::try_wait)) { - return res; + return CELL_SYNC_ERROR_BUSY; } g_sync_barrier_notify_wm.notify(barrier.addr()); @@ -273,56 +227,31 @@ s32 cellSyncBarrierTryWait(vm::ptr barrier) return CELL_OK; } -s32 syncRwmInitialize(vm::ptr rwm, vm::ptr buffer, u32 buffer_size) -{ - if (!rwm || !buffer) - { - return CELL_SYNC_ERROR_NULL_POINTER; - } - if (!rwm.aligned() || buffer % 128) - { - return CELL_SYNC_ERROR_ALIGN; - } - if (buffer_size % 128 || buffer_size > 0x4000) - { - return CELL_SYNC_ERROR_INVAL; - } - - // prx: zeroize first u16 and second u16, write buffer_size in second u32, write buffer_addr in second u64 and sync - rwm->m_size = buffer_size; - rwm->m_buffer = buffer; - rwm->data.exchange({}); - - return CELL_OK; -} - s32 cellSyncRwmInitialize(vm::ptr rwm, vm::ptr buffer, u32 buffer_size) { cellSync.Log("cellSyncRwmInitialize(rwm=*0x%x, buffer=*0x%x, buffer_size=0x%x)", rwm, buffer, buffer_size); - return syncRwmInitialize(rwm, buffer, buffer_size); -} - -s32 syncRwmTryReadBeginOp(CellSyncRwm::data_t& rwm) -{ - if (rwm.m_writers.data()) + if (!rwm || !buffer) { - return CELL_SYNC_ERROR_BUSY; + return CELL_SYNC_ERROR_NULL_POINTER; } - rwm.m_readers++; - - return CELL_OK; -} - -s32 syncRwmReadEndOp(CellSyncRwm::data_t& rwm) -{ - if (!rwm.m_readers.data()) + if (!rwm.aligned() || buffer % 128) { - return CELL_SYNC_ERROR_ABORT; + return CELL_SYNC_ERROR_ALIGN; } - rwm.m_readers--; + if (buffer_size % 128 || buffer_size > 0x4000) + { + return CELL_SYNC_ERROR_INVAL; + } + + // clear readers and writers, write buffer_size, buffer addr and sync + rwm->ctrl.store({}); + rwm->size = buffer_size; + rwm->buffer = buffer; + + _mm_mfence(); return CELL_OK; } @@ -335,25 +264,22 @@ s32 cellSyncRwmRead(vm::ptr rwm, vm::ptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!rwm.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - // prx: increase m_readers, wait until m_writers is zero - g_sync_rwm_read_wm.wait_op(rwm.addr(), [rwm]() - { - return rwm->data.atomic_op(CELL_OK, syncRwmTryReadBeginOp) == CELL_OK; - }); + // wait until `writers` is zero, increase `readers` + g_sync_rwm_read_wm.wait_op(rwm.addr(), WRAP_EXPR(rwm->ctrl.atomic_op(&sync_rwm_t::try_read_begin))); - // copy data to buffer_addr - memcpy(buffer.get_ptr(), rwm->m_buffer.get_ptr(), rwm->m_size); + // copy data to buffer + std::memcpy(buffer.get_ptr(), rwm->buffer.get_ptr(), rwm->size); - // prx: decrease m_readers (return 0x8041010C if already zero) - if (s32 res = rwm->data.atomic_op(CELL_OK, syncRwmReadEndOp)) + // decrease `readers`, return error if already zero + if (!rwm->ctrl.atomic_op(&sync_rwm_t::try_read_end)) { - cellSync.Error("syncRwmReadEndOp(rwm=0x%x) failed: m_readers == 0", rwm); - return res; + return CELL_SYNC_ERROR_ABORT; } g_sync_rwm_write_wm.notify(rwm.addr()); @@ -369,36 +295,28 @@ s32 cellSyncRwmTryRead(vm::ptr rwm, vm::ptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!rwm.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - if (s32 res = rwm->data.atomic_op(CELL_OK, syncRwmTryReadBeginOp)) - { - return res; - } - - memcpy(buffer.get_ptr(), rwm->m_buffer.get_ptr(), rwm->m_size); - - if (s32 res = rwm->data.atomic_op(CELL_OK, syncRwmReadEndOp)) - { - return res; - } - 
- g_sync_rwm_write_wm.notify(rwm.addr()); - - return CELL_OK; -} - -s32 syncRwmTryWriteBeginOp(CellSyncRwm::data_t& rwm) -{ - if (rwm.m_writers.data()) + // increase `readers` if `writers` is zero + if (!rwm->ctrl.atomic_op(&sync_rwm_t::try_read_begin)) { return CELL_SYNC_ERROR_BUSY; } - rwm.m_writers = 1; + // copy data to buffer + std::memcpy(buffer.get_ptr(), rwm->buffer.get_ptr(), rwm->size); + + // decrease `readers`, return error if already zero + if (!rwm->ctrl.atomic_op(&sync_rwm_t::try_read_end)) + { + return CELL_SYNC_ERROR_ABORT; + } + + g_sync_rwm_write_wm.notify(rwm.addr()); return CELL_OK; } @@ -411,27 +329,23 @@ s32 cellSyncRwmWrite(vm::ptr rwm, vm::cptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!rwm.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - g_sync_rwm_read_wm.wait_op(rwm.addr(), [rwm]() - { - return rwm->data.atomic_op(CELL_OK, syncRwmTryWriteBeginOp) == CELL_OK; - }); + // wait until `writers` is zero, set to 1 + g_sync_rwm_read_wm.wait_op(rwm.addr(), WRAP_EXPR(rwm->ctrl.atomic_op(&sync_rwm_t::try_write_begin))); - // prx: wait until m_readers == 0 - g_sync_rwm_write_wm.wait_op(rwm.addr(), [rwm]() - { - return rwm->data.read_relaxed().m_readers.data() == 0; - }); + // wait until `readers` is zero + g_sync_rwm_write_wm.wait_op(rwm.addr(), WRAP_EXPR(!rwm->ctrl.load().readers.data())); - // prx: copy data from buffer_addr - memcpy(rwm->m_buffer.get_ptr(), buffer.get_ptr(), rwm->m_size); + // copy data from buffer + std::memcpy(rwm->buffer.get_ptr(), buffer.get_ptr(), rwm->size); - // prx: sync and zeroize m_readers and m_writers - rwm->data.exchange({}); + // sync and clear `readers` and `writers` + rwm->ctrl.exchange({}); g_sync_rwm_read_wm.notify(rwm.addr()); @@ -446,81 +360,60 @@ s32 cellSyncRwmTryWrite(vm::ptr rwm, vm::cptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!rwm.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - // prx: compare m_readers | m_writers with 0, return if not zero, set m_writers to 1 - if (!rwm->data.compare_and_swap_test({ 0, 0 }, { 0, 1 })) + // set `writers` to 1 if `readers` and `writers` are zero + if (!rwm->ctrl.compare_and_swap_test({ 0, 0 }, { 0, 1 })) { return CELL_SYNC_ERROR_BUSY; } - // prx: copy data from buffer_addr - memcpy(rwm->m_buffer.get_ptr(), buffer.get_ptr(), rwm->m_size); + // copy data from buffer + std::memcpy(rwm->buffer.get_ptr(), buffer.get_ptr(), rwm->size); - // prx: sync and zeroize m_readers and m_writers - rwm->data.exchange({}); + // sync and clear `readers` and `writers` + rwm->ctrl.exchange({}); g_sync_rwm_read_wm.notify(rwm.addr()); return CELL_OK; } -s32 syncQueueInitialize(vm::ptr queue, vm::ptr buffer, u32 size, u32 depth) -{ - if (!queue) - { - return CELL_SYNC_ERROR_NULL_POINTER; - } - if (size && !buffer) - { - return CELL_SYNC_ERROR_NULL_POINTER; - } - if (!queue.aligned() || buffer % 16) - { - return CELL_SYNC_ERROR_ALIGN; - } - if (!depth || size % 16) - { - return CELL_SYNC_ERROR_INVAL; - } - - // prx: zeroize first u64, write size in third u32, write depth in fourth u32, write address in third u64 and sync - queue->m_size = size; - queue->m_depth = depth; - queue->m_buffer = buffer; - queue->data.exchange({}); - - return CELL_OK; -} - s32 cellSyncQueueInitialize(vm::ptr queue, vm::ptr buffer, u32 size, u32 depth) { cellSync.Log("cellSyncQueueInitialize(queue=*0x%x, buffer=*0x%x, size=0x%x, depth=0x%x)", queue, buffer, size, depth); - return syncQueueInitialize(queue, buffer, size, depth); -} - -s32 syncQueueTryPushOp(CellSyncQueue::data_t& queue, u32 depth, u32& position) -{ - 
const u32 v1 = queue.m_v1; - const u32 v2 = queue.m_v2; - - // prx: compare 5th u8 with zero (break if not zero) - // prx: compare (second u32 (u24) + first u8) with depth (break if greater or equal) - if ((v2 >> 24) || ((v2 & 0xffffff) + (v1 >> 24)) >= depth) + if (!queue) { - return CELL_SYNC_ERROR_BUSY; + return CELL_SYNC_ERROR_NULL_POINTER; } - // prx: extract first u32 (u24) (-> position), calculate (position + 1) % depth, insert it back - // prx: insert 1 in 5th u8 - // prx: extract second u32 (u24), increase it, insert it back - position = (v1 & 0xffffff); - queue.m_v1 = (v1 & 0xff000000) | ((position + 1) % depth); - queue.m_v2 = (1 << 24) | ((v2 & 0xffffff) + 1); + if (size && !buffer) + { + return CELL_SYNC_ERROR_NULL_POINTER; + } + + if (!queue.aligned() || buffer % 16) + { + return CELL_SYNC_ERROR_ALIGN; + } + + if (!depth || size % 16) + { + return CELL_SYNC_ERROR_INVAL; + } + + // clear sync var, write size, depth, buffer addr and sync + queue->ctrl.store({}); + queue->size = size; + queue->depth = depth; + queue->buffer = buffer; + + _mm_mfence(); return CELL_OK; } @@ -533,30 +426,23 @@ s32 cellSyncQueuePush(vm::ptr queue, vm::cptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const u32 size = queue->m_size; - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); u32 position; - g_sync_queue_wm.wait_op(queue.addr(), [queue, depth, &position]() - { - return CELL_OK == queue->data.atomic_op(CELL_OK, [depth, &position](CellSyncQueue::data_t& queue) -> s32 - { - return syncQueueTryPushOp(queue, depth, position); - }); - }); - // prx: memcpy(position * m_size + m_addr, buffer_addr, m_size), sync - memcpy(&queue->m_buffer[position * size], buffer.get_ptr(), size); + g_sync_queue_wm.wait_op(queue.addr(), WRAP_EXPR(queue->ctrl.atomic_op(&sync_queue_t::try_push, depth, position))); - // prx: atomically insert 0 in 5th u8 - queue->data &= { 0xffffffffu, 0x00ffffff }; + // copy data from the buffer into the queue at the acquired position + std::memcpy(&queue->buffer[position * queue->size], buffer.get_ptr(), queue->size); + + // clear 5th byte + queue->ctrl &= { 0xffffffff, 0x00ffffff }; g_sync_queue_wm.notify(queue.addr()); @@ -571,53 +457,28 @@ s32 cellSyncQueueTryPush(vm::ptr queue, vm::cptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const u32 size = queue->m_size; - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); u32 position; - s32 res = queue->data.atomic_op(CELL_OK, [depth, &position](CellSyncQueue::data_t& queue) -> s32 - { - return syncQueueTryPushOp(queue, depth, position); - }); - if (res) - { - return res; - } - memcpy(&queue->m_buffer[position * size], buffer.get_ptr(), size); - - queue->data &= { 0xffffffffu, 0x00ffffff }; - - g_sync_queue_wm.notify(queue.addr()); - - return CELL_OK; - } - -s32 syncQueueTryPopOp(CellSyncQueue::data_t& queue, u32 depth, u32& position) -{ - const u32 v1 = queue.m_v1; - const u32 v2 = queue.m_v2; - - // prx: extract first u8, repeat if not zero - // prx: extract second u32 (u24), subtract 5th u8, compare with zero, repeat if less or equal - if ((v1 >> 24) || ((v2 & 0xffffff) <= (v2 >> 24))) + if (!queue->ctrl.atomic_op(&sync_queue_t::try_push,
depth, position)) { return CELL_SYNC_ERROR_BUSY; } - // prx: insert 1 in first u8 - // prx: extract first u32 (u24), add depth, subtract second u32 (u24), calculate (% depth), save to position - // prx: extract second u32 (u24), decrease it, insert it back - queue.m_v1 = 0x1000000 | v1; - position = ((v1 & 0xffffff) + depth - (v2 & 0xffffff)) % depth; - queue.m_v2 = (v2 & 0xff000000) | ((v2 & 0xffffff) - 1); + // copy data from the buffer into the queue at the acquired position + std::memcpy(&queue->buffer[position * queue->size], buffer.get_ptr(), queue->size); + + // clear 5th byte + queue->ctrl &= { 0xffffffff, 0x00ffffff }; + + g_sync_queue_wm.notify(queue.addr()); return CELL_OK; } @@ -630,30 +491,23 @@ s32 cellSyncQueuePop(vm::ptr queue, vm::ptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const u32 size = queue->m_size; - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); u32 position; - g_sync_queue_wm.wait_op(queue.addr(), [queue, depth, &position]() - { - return CELL_OK == queue->data.atomic_op(CELL_OK, [depth, &position](CellSyncQueue::data_t& queue) -> s32 - { - return syncQueueTryPopOp(queue, depth, position); - }); - }); - // prx: (sync), memcpy(buffer_addr, position * m_size + m_addr, m_size) - memcpy(buffer.get_ptr(), &queue->m_buffer[position * size], size); + g_sync_queue_wm.wait_op(queue.addr(), WRAP_EXPR(queue->ctrl.atomic_op(&sync_queue_t::try_pop, depth, position))); - // prx: atomically insert 0 in first u8 - queue->data &= { 0x00ffffff, 0xffffffffu }; + // copy data at the position to the buffer + std::memcpy(buffer.get_ptr(), &queue->buffer[position * queue->size], queue->size); + + // clear first byte + queue->ctrl &= { 0x00ffffff, 0xffffffffu }; g_sync_queue_wm.notify(queue.addr()); @@ -668,47 +522,28 @@ s32 cellSyncQueueTryPop(vm::ptr queue, vm::ptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const u32 size = queue->m_size; - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); u32 position; - s32 res = queue->data.atomic_op(CELL_OK, [depth, &position](CellSyncQueue::data_t& queue) -> s32 - { - return syncQueueTryPopOp(queue, depth, position); - }); - if (res) - { - return res; - } - - memcpy(buffer.get_ptr(), &queue->m_buffer[position * size], size); - - queue->data &= { 0x00ffffff, 0xffffffffu }; - - g_sync_queue_wm.notify(queue.addr()); - - return CELL_OK; - } - -s32 syncQueueTryPeekOp(CellSyncQueue::data_t& queue, u32 depth, u32& position) -{ - const u32 v1 = queue.m_v1; - const u32 v2 = queue.m_v2; - - if ((v1 >> 24) || ((v2 & 0xffffff) <= (v2 >> 24))) + + if (!queue->ctrl.atomic_op(&sync_queue_t::try_pop, depth, position)) { return CELL_SYNC_ERROR_BUSY; } - queue.m_v1 = 0x1000000 | v1; - position = ((v1 & 0xffffff) + depth - (v2 & 0xffffff)) % depth; + // copy data at the position to the buffer + std::memcpy(buffer.get_ptr(), &queue->buffer[position * queue->size], queue->size); + + // clear first byte + queue->ctrl &= { 0x00ffffff, 0xffffffffu }; + + g_sync_queue_wm.notify(queue.addr()); return CELL_OK; } @@ -721,28 +556,23 @@ s32 cellSyncQueuePeek(vm::ptr queue, vm::ptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return
CELL_SYNC_ERROR_ALIGN; } - const u32 size = queue->m_size; - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); u32 position; - g_sync_queue_wm.wait_op(queue.addr(), [queue, depth, &position]() - { - return CELL_OK == queue->data.atomic_op(CELL_OK, [depth, &position](CellSyncQueue::data_t& queue) -> s32 - { - return syncQueueTryPeekOp(queue, depth, position); - }); - }); - memcpy(buffer.get_ptr(), &queue->m_buffer[position * size], size); + g_sync_queue_wm.wait_op(queue.addr(), WRAP_EXPR(queue->ctrl.atomic_op(&sync_queue_t::try_peek, depth, position))); - queue->data &= { 0x00ffffff, 0xffffffffu }; + // copy data at the position to the buffer + std::memcpy(buffer.get_ptr(), &queue->buffer[position * queue->size], queue->size); + + // clear first byte + queue->ctrl &= { 0x00ffffff, 0xffffffffu }; g_sync_queue_wm.notify(queue.addr()); @@ -757,29 +587,26 @@ s32 cellSyncQueueTryPeek(vm::ptr queue, vm::ptr buffer) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const u32 size = queue->m_size; - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); u32 position; - s32 res = queue->data.atomic_op(CELL_OK, [depth, &position](CellSyncQueue::data_t& queue) -> s32 + + if (!queue->ctrl.atomic_op(&sync_queue_t::try_peek, depth, position)) { - return syncQueueTryPeekOp(queue, depth, position); - }); - if (res) - { - return res; + return CELL_SYNC_ERROR_BUSY; } - memcpy(buffer.get_ptr(), &queue->m_buffer[position * size], size); + // copy data at the position to the buffer + std::memcpy(buffer.get_ptr(), &queue->buffer[position * queue->size], queue->size); - queue->data &= { 0x00ffffff, 0xffffffffu }; + // clear first byte + queue->ctrl &= { 0x00ffffff, 0xffffffffu }; g_sync_queue_wm.notify(queue.addr()); @@ -794,17 +621,15 @@ s32 cellSyncQueueSize(vm::ptr queue) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const auto data = queue->data.read_relaxed(); - const u32 count = data.m_v2 & 0xffffff; - const u32 depth = queue->m_depth; - assert((data.m_v1 & 0xffffff) <= depth && count <= depth); + queue->check_depth(); - return count; + return queue->ctrl.load().m_v2 & 0xffffff; } s32 cellSyncQueueClear(vm::ptr queue) @@ -815,23 +640,22 @@ s32 cellSyncQueueClear(vm::ptr queue) { return CELL_SYNC_ERROR_NULL_POINTER; } + if (!queue.aligned()) { return CELL_SYNC_ERROR_ALIGN; } - const u32 depth = queue->m_depth; - const auto data = queue->data.read_relaxed(); - assert((data.m_v1 & 0xffffff) <= depth && (data.m_v2 & 0xffffff) <= depth); + const u32 depth = queue->check_depth(); // TODO: optimize if possible g_sync_queue_wm.wait_op(queue.addr(), [queue, depth]() { - return CELL_OK == queue->data.atomic_op(CELL_OK, [depth](CellSyncQueue::data_t& queue) -> s32 + return CELL_OK == queue->ctrl.atomic_op([depth](sync_queue_t& queue) -> s32 { const u32 v1 = queue.m_v1; - // prx: extract first u8, repeat if not zero, insert 1 + // extract first byte, repeat if not zero, insert 1 if (v1 >> 24) { return CELL_SYNC_ERROR_BUSY; @@ -845,11 +669,11 @@ s32 cellSyncQueueClear(vm::ptr queue) g_sync_queue_wm.wait_op(queue.addr(), [queue, depth]() { - return CELL_OK == queue->data.atomic_op(CELL_OK, [depth](CellSyncQueue::data_t& 
queue) -> s32 + return CELL_OK == queue->ctrl.atomic_op([depth](sync_queue_t& queue) -> s32 { const u32 v2 = queue.m_v2; - // prx: extract 5th u8, repeat if not zero, insert 1 + // extract 5th byte, repeat if not zero, insert 1 if (v2 >> 24) { return CELL_SYNC_ERROR_BUSY; @@ -861,7 +685,7 @@ s32 cellSyncQueueClear(vm::ptr queue) }); }); - queue->data.exchange({}); + queue->ctrl.exchange({}); g_sync_queue_wm.notify(queue.addr()); @@ -895,8 +719,8 @@ void syncLFQueueInit(vm::ptr queue, vm::ptr buffer, u32 siz } else { - queue->pop1 = { { 0, 0, queue->pop1.read_relaxed().m_h3, 0 } }; - queue->push1 = { { 0, 0, queue->push1.read_relaxed().m_h7, 0 } }; + queue->pop1 = { { 0, 0, queue->pop1.load().m_h3, 0 } }; + queue->push1 = { { 0, 0, queue->push1.load().m_h7, 0 } }; queue->m_bs[0] = -1; // written as u32 queue->m_bs[1] = -1; queue->m_bs[2] = -1; @@ -952,7 +776,7 @@ s32 syncLFQueueInitialize(vm::ptr queue, vm::ptr buffer, u3 u32 old_value; while (true) { - const auto old = queue->init.read_relaxed(); + const auto old = queue->init.load(); auto init = old; if (old.data()) @@ -1002,12 +826,12 @@ s32 syncLFQueueInitialize(vm::ptr queue, vm::ptr buffer, u3 // prx: call internal function with same arguments syncLFQueueInit(queue, buffer, size, depth, direction, eaSignal); - // prx: sync, zeroize u32 at 0x2c offset + // prx: sync, clear u32 at 0x2c offset queue->init.exchange({}); } // prx: sync - queue->init.read_sync(); + _mm_mfence(); return CELL_OK; } @@ -1038,7 +862,7 @@ s32 syncLFQueueGetPushPointer(PPUThread& CPU, vm::ptr queue, s3 return -1; } - const auto old = queue->push1.read_sync(); + const auto old = queue->push1.load_sync(); auto push = old; if (var1) @@ -1058,7 +882,7 @@ s32 syncLFQueueGetPushPointer(PPUThread& CPU, vm::ptr queue, s3 } else { - var2 -= (s32)(u16)queue->pop1.read_relaxed().m_h1; + var2 -= (s32)(u16)queue->pop1.load().m_h1; if (var2 < 0) { var2 += depth * 2; @@ -1155,10 +979,10 @@ s32 syncLFQueueCompletePushPointer(PPUThread& CPU, vm::ptr queu while (true) { - const auto old = queue->push2.read_sync(); + const auto old = queue->push2.load_sync(); auto push2 = old; - const auto old2 = queue->push3.read_relaxed(); + const auto old2 = queue->push3.load(); auto push3 = old2; s32 var1 = pointer - (u16)push3.m_h5; @@ -1167,7 +991,7 @@ s32 syncLFQueueCompletePushPointer(PPUThread& CPU, vm::ptr queu var1 += depth * 2; } - s32 var2 = (s32)(s16)queue->pop1.read_relaxed().m_h4 - (s32)(u16)queue->pop1.read_relaxed().m_h1; + s32 var2 = (s32)(s16)queue->pop1.load().m_h4 - (s32)(u16)queue->pop1.load().m_h1; if (var2 < 0) { var2 += depth * 2; @@ -1265,7 +1089,7 @@ s32 syncLFQueueCompletePushPointer(PPUThread& CPU, vm::ptr queu } else { - pack = queue->push2.read_relaxed().pack; + pack = queue->push2.load().pack; if ((pack & 0x1f) == ((pack >> 10) & 0x1f)) { if (queue->push3.compare_and_swap_test(old2, push3)) @@ -1384,7 +1208,7 @@ s32 syncLFQueueGetPopPointer(PPUThread& CPU, vm::ptr queue, s32 return -1; } - const auto old = queue->pop1.read_sync(); + const auto old = queue->pop1.load_sync(); auto pop = old; if (var1) @@ -1404,7 +1228,7 @@ s32 syncLFQueueGetPopPointer(PPUThread& CPU, vm::ptr queue, s32 } else { - var2 = (s32)(u16)queue->push1.read_relaxed().m_h5 - var2; + var2 = (s32)(u16)queue->push1.load().m_h5 - var2; if (var2 < 0) { var2 += depth * 2; @@ -1501,10 +1325,10 @@ s32 syncLFQueueCompletePopPointer(PPUThread& CPU, vm::ptr queue while (true) { - const auto old = queue->pop2.read_sync(); + const auto old = queue->pop2.load_sync(); auto pop2 = old; - const auto old2 = 
queue->pop3.read_relaxed(); + const auto old2 = queue->pop3.load(); auto pop3 = old2; s32 var1 = pointer - (u16)pop3.m_h1; @@ -1513,7 +1337,7 @@ s32 syncLFQueueCompletePopPointer(PPUThread& CPU, vm::ptr queue var1 += depth * 2; } - s32 var2 = (s32)(s16)queue->push1.read_relaxed().m_h8 - (s32)(u16)queue->push1.read_relaxed().m_h5; + s32 var2 = (s32)(s16)queue->push1.load().m_h8 - (s32)(u16)queue->push1.load().m_h5; if (var2 < 0) { var2 += depth * 2; @@ -1610,7 +1434,7 @@ s32 syncLFQueueCompletePopPointer(PPUThread& CPU, vm::ptr queue } else { - pack = queue->pop2.read_relaxed().pack; + pack = queue->pop2.load().pack; if ((pack & 0x1f) == ((pack >> 10) & 0x1f)) { if (queue->pop3.compare_and_swap_test(old2, pop3)) @@ -1724,15 +1548,15 @@ s32 cellSyncLFQueueClear(vm::ptr queue) while (true) { - const auto old = queue->pop1.read_sync(); + const auto old = queue->pop1.load_sync(); auto pop = old; - const auto push = queue->push1.read_relaxed(); + const auto push = queue->push1.load(); s32 var1, var2; if (queue->m_direction != CELL_SYNC_QUEUE_ANY2ANY) { - var1 = var2 = (u16)queue->pop2.read_relaxed().pack; + var1 = var2 = (u16)queue->pop2.load().pack; } else { @@ -1774,10 +1598,10 @@ s32 cellSyncLFQueueSize(vm::ptr queue, vm::ptr size) while (true) { - const auto old = queue->pop3.read_sync(); + const auto old = queue->pop3.load_sync(); - u32 var1 = (u16)queue->pop1.read_relaxed().m_h1; - u32 var2 = (u16)queue->push1.read_relaxed().m_h5; + u32 var1 = (u16)queue->pop1.load().m_h1; + u32 var2 = (u16)queue->push1.load().m_h5; if (queue->pop3.compare_and_swap_test(old, old)) { diff --git a/rpcs3/Emu/SysCalls/Modules/cellSync.h b/rpcs3/Emu/SysCalls/Modules/cellSync.h index 858f3cb6d7..124d8cfd39 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSync.h +++ b/rpcs3/Emu/SysCalls/Modules/cellSync.h @@ -31,67 +31,216 @@ enum CELL_SYNC_ERROR_NO_SPU_CONTEXT_STORAGE = 0x80410114, // ??? 
}; -union set_alignment(4) CellSyncMutex +struct set_alignment(4) sync_mutex_t // CellSyncMutex sync var { - struct sync_t - { - be_t cnt_rel; // increased when mutex is unlocked - be_t cnt_acq; // increased when mutex is locked - }; + be_t rel; + be_t acq; - struct + be_t acquire() { - atomic_be_t rel; - atomic_be_t acq; + return acq++; } - cnt; - atomic_be_t sync_var; + bool try_lock() + { + return acq++ == rel; + } + + void unlock() + { + rel++; + } }; +using CellSyncMutex = atomic_be_t; + CHECK_SIZE_ALIGN(CellSyncMutex, 4, 4); -struct set_alignment(4) CellSyncBarrier +struct set_alignment(4) sync_barrier_t // CellSyncBarrier sync var { - struct data_t + be_t value; + be_t count; + + bool try_notify() { - be_t m_value; - be_t m_count; + // extract value (repeat if < 0), increase, compare with count, set sign bit if equal, insert it back + s16 v = value; + + if (v < 0) + { + return false; + } + + if (++v == count) + { + v |= 0x8000; + } + + value = v; + + return true; }; - atomic_be_t data; + bool try_wait() + { + // extract value (repeat if >= 0), decrease it, set 0 if it reaches -0x8000, insert it back + s16 v = value; + + if (v >= 0) + { + return false; + } + + if (--v == -0x8000) + { + v = 0; + } + + value = v; + + return true; + } }; +using CellSyncBarrier = atomic_be_t; + CHECK_SIZE_ALIGN(CellSyncBarrier, 4, 4); +struct sync_rwm_t // CellSyncRwm sync var +{ + be_t readers; + be_t writers; + + bool try_read_begin() + { + if (writers.data()) + { + return false; + } + + readers++; + return true; + } + + bool try_read_end() + { + if (!readers.data()) + { + return false; + } + + readers--; + return true; + } + + bool try_write_begin() + { + if (writers.data()) + { + return false; + } + + writers = 1; + return true; + } +}; + struct set_alignment(16) CellSyncRwm { - struct data_t - { - be_t m_readers; - be_t m_writers; - }; + atomic_be_t ctrl; // sync var - atomic_be_t data; - be_t m_size; - vm::bptr m_buffer; + be_t size; + vm::bptr buffer; }; CHECK_SIZE_ALIGN(CellSyncRwm, 16, 16); +struct sync_queue_t // CellSyncQueue sync var +{ + be_t m_v1; + be_t m_v2; + + bool try_push(u32 depth, u32& position) + { + const u32 v1 = m_v1; + const u32 v2 = m_v2; + + // compare 5th byte with zero (break if not zero) + // compare (second u32 (u24) + first byte) with depth (break if greater or equal) + if ((v2 >> 24) || ((v2 & 0xffffff) + (v1 >> 24)) >= depth) + { + return false; + } + + // extract first u32 (u24) (-> position), calculate (position + 1) % depth, insert it back + // insert 1 in 5th u8 + // extract second u32 (u24), increase it, insert it back + position = (v1 & 0xffffff); + m_v1 = (v1 & 0xff000000) | ((position + 1) % depth); + m_v2 = (1 << 24) | ((v2 & 0xffffff) + 1); + + return true; + } + + bool try_pop(u32 depth, u32& position) + { + const u32 v1 = m_v1; + const u32 v2 = m_v2; + + // extract first u8, repeat if not zero + // extract second u32 (u24), subtract 5th u8, compare with zero, repeat if less or equal + if ((v1 >> 24) || ((v2 & 0xffffff) <= (v2 >> 24))) + { + return false; + } + + // insert 1 in first u8 + // extract first u32 (u24), add depth, subtract second u32 (u24), calculate (% depth), save to position + // extract second u32 (u24), decrease it, insert it back + m_v1 = 0x1000000 | v1; + position = ((v1 & 0xffffff) + depth - (v2 & 0xffffff)) % depth; + m_v2 = (v2 & 0xff000000) | ((v2 & 0xffffff) - 1); + + return true; + } + + bool try_peek(u32 depth, u32& position) + { + const u32 v1 = m_v1; + const u32 v2 = m_v2; + + if ((v1 >> 24) || ((v2 & 0xffffff) <= (v2 >> 24))) +
{ + return false; + } + + m_v1 = 0x1000000 | v1; + position = ((v1 & 0xffffff) + depth - (v2 & 0xffffff)) % depth; + + return true; + } +}; + struct set_alignment(32) CellSyncQueue { - struct data_t - { - be_t m_v1; - be_t m_v2; - }; + atomic_be_t ctrl; - atomic_be_t data; - be_t m_size; - be_t m_depth; - vm::bptr m_buffer; + be_t size; + be_t depth; + vm::bptr buffer; be_t reserved; + + u32 check_depth() + { + const auto data = ctrl.load(); + + if ((data.m_v1 & 0xffffff) > depth || (data.m_v2 & 0xffffff) > depth) + { + throw __FUNCTION__; + } + + return depth; + } }; CHECK_SIZE_ALIGN(CellSyncQueue, 32, 32); @@ -191,14 +340,6 @@ struct set_alignment(128) CellSyncLFQueue CHECK_SIZE_ALIGN(CellSyncLFQueue, 128, 128); -s32 syncMutexInitialize(vm::ptr mutex); - -s32 syncBarrierInitialize(vm::ptr barrier, u16 total_count); - -s32 syncRwmInitialize(vm::ptr rwm, vm::ptr buffer, u32 buffer_size); - -s32 syncQueueInitialize(vm::ptr queue, vm::ptr buffer, u32 size, u32 depth); - s32 syncLFQueueInitialize(vm::ptr queue, vm::ptr buffer, u32 size, u32 depth, CellSyncQueueDirection direction, vm::ptr eaSignal); s32 syncLFQueueGetPushPointer(PPUThread& CPU, vm::ptr queue, s32& pointer, u32 isBlocking, u32 useEventQueue); s32 syncLFQueueGetPushPointer2(PPUThread& CPU, vm::ptr queue, s32& pointer, u32 isBlocking, u32 useEventQueue); diff --git a/rpcs3/Emu/SysCalls/Modules/libmixer.cpp b/rpcs3/Emu/SysCalls/Modules/libmixer.cpp index dac5fbe273..a9ef23eca8 100644 --- a/rpcs3/Emu/SysCalls/Modules/libmixer.cpp +++ b/rpcs3/Emu/SysCalls/Modules/libmixer.cpp @@ -343,7 +343,7 @@ int cellSurMixerCreate(vm::cptr config) ppu.InitRegs(); ppu.DoRun(); - while (port.state.read_relaxed() != AUDIO_PORT_STATE_CLOSED && !Emu.IsStopped()) + while (port.state.load() != AUDIO_PORT_STATE_CLOSED && !Emu.IsStopped()) { if (mixcount > (port.tag + 0)) // adding positive value (1-15): preemptive buffer filling (hack) { @@ -351,7 +351,7 @@ int cellSurMixerCreate(vm::cptr config) continue; } - if (port.state.read_relaxed() == AUDIO_PORT_STATE_STARTED) + if (port.state.load() == AUDIO_PORT_STATE_STARTED) { //u64 stamp0 = get_system_time(); diff --git a/rpcs3/Emu/SysCalls/Modules/sysPrxForUser.cpp b/rpcs3/Emu/SysCalls/Modules/sysPrxForUser.cpp index 091dcf4e60..8feea2d7dc 100644 --- a/rpcs3/Emu/SysCalls/Modules/sysPrxForUser.cpp +++ b/rpcs3/Emu/SysCalls/Modules/sysPrxForUser.cpp @@ -125,7 +125,7 @@ s32 sys_lwmutex_destroy(PPUThread& CPU, vm::ptr lwmutex) sysPrxForUser.Log("sys_lwmutex_destroy(lwmutex=*0x%x)", lwmutex); // check to prevent recursive locking in the next call - if (lwmutex->vars.owner.read_relaxed() == CPU.GetId()) + if (lwmutex->vars.owner.load() == CPU.GetId()) { return CELL_EBUSY; } @@ -184,7 +184,7 @@ s32 sys_lwmutex_lock(PPUThread& CPU, vm::ptr lwmutex, u64 timeout // recursive locking succeeded lwmutex->recursive_count++; - lwmutex->lock_var.read_sync(); + _mm_mfence(); return CELL_OK; } @@ -197,7 +197,7 @@ s32 sys_lwmutex_lock(PPUThread& CPU, vm::ptr lwmutex, u64 timeout for (u32 i = 0; i < 300; i++) { - if (lwmutex->vars.owner.read_relaxed() == lwmutex_free) + if (lwmutex->vars.owner.load() == lwmutex_free) { if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid)) { @@ -278,7 +278,7 @@ s32 sys_lwmutex_trylock(PPUThread& CPU, vm::ptr lwmutex) // recursive locking succeeded lwmutex->recursive_count++; - lwmutex->lock_var.read_sync(); + _mm_mfence(); return CELL_OK; } @@ -319,7 +319,7 @@ s32 sys_lwmutex_unlock(PPUThread& CPU, vm::ptr lwmutex) const be_t tid = CPU.GetId(); // check owner - if 
(lwmutex->vars.owner.read_relaxed() != tid) + if (lwmutex->vars.owner.load() != tid) { return CELL_EPERM; } @@ -392,7 +392,7 @@ s32 sys_lwcond_signal(PPUThread& CPU, vm::ptr lwcond) //return _sys_lwcond_signal(lwcond->lwcond_queue, 0, -1, 2); } - if (lwmutex->vars.owner.read_relaxed() == CPU.GetId()) + if (lwmutex->vars.owner.load() == CPU.GetId()) { // if owns the mutex lwmutex->all_info++; @@ -450,7 +450,7 @@ s32 sys_lwcond_signal_all(PPUThread& CPU, vm::ptr lwcond) //return _sys_lwcond_signal_all(lwcond->lwcond_queue, lwmutex->sleep_queue, 2); } - if (lwmutex->vars.owner.read_relaxed() == CPU.GetId()) + if (lwmutex->vars.owner.load() == CPU.GetId()) { // if owns the mutex, call the syscall const s32 res = _sys_lwcond_signal_all(lwcond->lwcond_queue, lwmutex->sleep_queue, 1); @@ -507,7 +507,7 @@ s32 sys_lwcond_signal_to(PPUThread& CPU, vm::ptr lwcond, u32 ppu_t //return _sys_lwcond_signal(lwcond->lwcond_queue, 0, ppu_thread_id, 2); } - if (lwmutex->vars.owner.read_relaxed() == CPU.GetId()) + if (lwmutex->vars.owner.load() == CPU.GetId()) { // if owns the mutex lwmutex->all_info++; @@ -561,7 +561,7 @@ s32 sys_lwcond_wait(PPUThread& CPU, vm::ptr lwcond, u64 timeout) const vm::ptr lwmutex = lwcond->lwmutex; - if (lwmutex->vars.owner.read_relaxed() != tid) + if (lwmutex->vars.owner.load() != tid) { // if not owner of the mutex return CELL_EPERM; @@ -1189,7 +1189,7 @@ void sys_spinlock_lock(vm::ptr> lock) // prx: exchange with 0xabadcafe, repeat until exchanged with 0 while (lock->exchange(0xabadcafe).data()) { - g_sys_spinlock_wm.wait_op(lock.addr(), [lock](){ return lock->read_relaxed().data() == 0; }); + g_sys_spinlock_wm.wait_op(lock.addr(), [lock](){ return lock->load().data() == 0; }); if (Emu.IsStopped()) { diff --git a/rpcs3/Emu/SysCalls/lv2/sys_interrupt.cpp b/rpcs3/Emu/SysCalls/lv2/sys_interrupt.cpp index 7ce6a93aef..711a8ddcbc 100644 --- a/rpcs3/Emu/SysCalls/lv2/sys_interrupt.cpp +++ b/rpcs3/Emu/SysCalls/lv2/sys_interrupt.cpp @@ -88,7 +88,7 @@ s32 sys_interrupt_thread_establish(vm::ptr ih, u32 intrtag, u64 intrthread, return CELL_EAGAIN; } - if (s32 res = tag.assigned.atomic_op(CELL_OK, [](s32& value) -> s32 + if (s32 res = tag.assigned.atomic_op([](s32& value) -> s32 { if (value < 0) { @@ -113,7 +113,7 @@ s32 sys_interrupt_thread_establish(vm::ptr ih, u32 intrtag, u64 intrthread, while (!Emu.IsStopped()) { // call interrupt handler until int status is clear - if (tag.stat.read_relaxed()) + if (tag.stat.load()) { //func(CPU, arg); CPU.GPR[3] = arg; diff --git a/rpcs3/Emu/SysCalls/lv2/sys_mutex.cpp b/rpcs3/Emu/SysCalls/lv2/sys_mutex.cpp index 6616f0dd71..8f70bedc3d 100644 --- a/rpcs3/Emu/SysCalls/lv2/sys_mutex.cpp +++ b/rpcs3/Emu/SysCalls/lv2/sys_mutex.cpp @@ -33,7 +33,7 @@ s32 sys_mutex_create(vm::ptr mutex_id, vm::ptr attr) const bool recursive = attr->recursive == SYS_SYNC_RECURSIVE; - if ((!recursive && attr->recursive != SYS_SYNC_NOT_RECURSIVE) || attr->pshared.data() != SYS_SYNC_NOT_PROCESS_SHARED || attr->adaptive != SYS_SYNC_NOT_ADAPTIVE || attr->ipc_key.data() || attr->flags.data()) + if ((!recursive && attr->recursive != SYS_SYNC_NOT_RECURSIVE) || attr->pshared != SYS_SYNC_NOT_PROCESS_SHARED || attr->adaptive != SYS_SYNC_NOT_ADAPTIVE || attr->ipc_key.data() || attr->flags.data()) { sys_mutex.Error("sys_mutex_create(): unknown attributes (recursive=0x%x, pshared=0x%x, adaptive=0x%x, ipc_key=0x%llx, flags=0x%x)", attr->recursive, attr->pshared, attr->adaptive, attr->ipc_key, attr->flags); diff --git a/rpcs3/Emu/SysCalls/lv2/sys_spu.cpp b/rpcs3/Emu/SysCalls/lv2/sys_spu.cpp 
index 68e9ca41c5..d540b167d8 100644 --- a/rpcs3/Emu/SysCalls/lv2/sys_spu.cpp +++ b/rpcs3/Emu/SysCalls/lv2/sys_spu.cpp @@ -566,7 +566,7 @@ s32 sys_spu_thread_group_join(u32 id, vm::ptr cause, vm::ptr status) { auto& spu = static_cast(*t); - if (!(spu.status.read_relaxed() & SPU_STATUS_STOPPED_BY_STOP)) + if (!(spu.status.load() & SPU_STATUS_STOPPED_BY_STOP)) { stopped = false; break; @@ -1253,7 +1253,7 @@ s32 sys_raw_spu_get_int_mask(u32 id, u32 class_id, vm::ptr mask) auto& spu = static_cast(*t); - *mask = (class_id ? spu.int2 : spu.int0).mask.read_sync(); + *mask = (class_id ? spu.int2 : spu.int0).mask.load(); return CELL_OK; } @@ -1299,7 +1299,7 @@ s32 sys_raw_spu_get_int_stat(u32 id, u32 class_id, vm::ptr stat) auto& spu = static_cast(*t); - *stat = (class_id ? spu.int2 : spu.int0).stat.read_sync(); + *stat = (class_id ? spu.int2 : spu.int0).stat.load(); return CELL_OK; } diff --git a/rpcs3/Gui/RSXDebugger.cpp b/rpcs3/Gui/RSXDebugger.cpp index 3102cbf738..7a34aa581c 100644 --- a/rpcs3/Gui/RSXDebugger.cpp +++ b/rpcs3/Gui/RSXDebugger.cpp @@ -333,7 +333,7 @@ void RSXDebugger::GoToGet(wxCommandEvent& event) if (!RSXReady()) return; auto ctrl = vm::get_ptr(Emu.GetGSManager().GetRender().m_ctrlAddress); u32 realAddr; - if (Memory.RSXIOMem.getRealAddr(ctrl->get.read_relaxed(), realAddr)) { + if (Memory.RSXIOMem.getRealAddr(ctrl->get.load(), realAddr)) { m_addr = realAddr; t_addr->SetValue(wxString::Format("%08x", m_addr)); UpdateInformation(); @@ -347,7 +347,7 @@ void RSXDebugger::GoToPut(wxCommandEvent& event) if (!RSXReady()) return; auto ctrl = vm::get_ptr(Emu.GetGSManager().GetRender().m_ctrlAddress); u32 realAddr; - if (Memory.RSXIOMem.getRealAddr(ctrl->put.read_relaxed(), realAddr)) { + if (Memory.RSXIOMem.getRealAddr(ctrl->put.load(), realAddr)) { m_addr = realAddr; t_addr->SetValue(wxString::Format("%08x", m_addr)); UpdateInformation(); diff --git a/rpcs3/stdafx.h b/rpcs3/stdafx.h index 98b7a0a845..82fa7215b2 100644 --- a/rpcs3/stdafx.h +++ b/rpcs3/stdafx.h @@ -108,6 +108,8 @@ template struct ID_type; #define CHECK_MAX_SIZE(type, size) static_assert(sizeof(type) <= size, #type " type size is too big") #define CHECK_SIZE_ALIGN(type, size, align) CHECK_SIZE(type, size); CHECK_ALIGN(type, align) +#define WRAP_EXPR(expr) [&]{ return (expr); } + #define _PRGNAME_ "RPCS3" #define _PRGVER_ "0.0.0.5"
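Note on sync_mutex_t (cellSync.h): the acq/rel pair implements a ticket lock. cellSyncMutexLock takes a ticket from acq via atomic_op(&sync_mutex_t::acquire), waits on the waiter map until rel catches up to that ticket, and cellSyncMutexUnlock advances rel. A minimal host-side sketch of the same protocol using std::atomic instead of the emulator's atomic_be_t/atomic_op machinery (the ticket_mutex name and this exact shape are illustrative, not part of the patch):

#include <atomic>
#include <cstdint>
#include <thread>

// Illustrative ticket lock mirroring sync_mutex_t's acq/rel counters.
struct ticket_mutex
{
    std::atomic<std::uint16_t> rel{0}; // advanced by unlock()
    std::atomic<std::uint16_t> acq{0}; // advanced when a ticket is taken

    void lock()
    {
        // take a ticket (the old acq value), then wait until rel reaches it;
        // cellSyncMutexLock performs the same wait through g_sync_mutex_wm
        const std::uint16_t order = acq.fetch_add(1);

        while (rel.load() != order)
        {
            std::this_thread::yield();
        }
    }

    bool try_lock()
    {
        // succeed only when no ticket is outstanding; note that the emulated
        // sync_mutex_t::try_lock increments acq even on failure (mirroring the
        // firmware behaviour), while this conventional CAS variant does not
        std::uint16_t expected = rel.load();
        return acq.compare_exchange_strong(expected, static_cast<std::uint16_t>(expected + 1));
    }

    void unlock()
    {
        rel.fetch_add(1); // hand the lock to the next ticket holder
    }
};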
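Note on sync_barrier_t: `value` counts arrivals while non-negative and flips into a release phase through the sign bit. With count = 3, three try_notify calls take value through 1 and 2 to 3 == count, at which point it becomes 3 | 0x8000 (negative as s16) and further notifies return false. Each try_wait then requires a negative value and decrements it, 0x8003 -> 0x8002 -> 0x8001; the third wait reaches -0x8000 and resets value to 0, reopening the barrier for the next round.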
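Note on sync_queue_t: the two u32 control words each pack a busy byte over a 24-bit field, m_v1 = [pop-in-progress : 8 | next push position : 24] and m_v2 = [push-in-progress : 8 | element count : 24]. The busy byte keeps the queue reserved while the non-atomic memcpy runs and is cleared afterwards by the `ctrl &= { ... }` masks. A host-endian, single-threaded sketch of the same field manipulation (plain integers; in the patch these bodies run inside atomic_be_t::atomic_op, which commits them as one atomic read-modify-write):

#include <cstdint>

// Illustrative view of sync_queue_t's packed control words.
// v1 = [pop-in-progress : 8][next push position : 24]
// v2 = [push-in-progress: 8][element count      : 24]
struct queue_ctrl
{
    std::uint32_t v1 = 0;
    std::uint32_t v2 = 0;

    bool try_push(std::uint32_t depth, std::uint32_t& position)
    {
        // busy if another push is in flight, or the queue is full
        // (an in-flight pop still reserves its slot via the v1 busy byte)
        if ((v2 >> 24) || ((v2 & 0xffffff) + (v1 >> 24)) >= depth)
        {
            return false;
        }

        position = v1 & 0xffffff;                          // slot to write
        v1 = (v1 & 0xff000000) | ((position + 1) % depth); // advance write index
        v2 = (1u << 24) | ((v2 & 0xffffff) + 1);           // set push flag, count++
        return true;
    }

    bool try_pop(std::uint32_t depth, std::uint32_t& position)
    {
        // busy if another pop is in flight, or no fully pushed element exists
        if ((v1 >> 24) || ((v2 & 0xffffff) <= (v2 >> 24)))
        {
            return false;
        }

        v1 |= 1u << 24;                                                 // set pop flag
        position = ((v1 & 0xffffff) + depth - (v2 & 0xffffff)) % depth; // oldest slot
        v2 = (v2 & 0xff000000) | ((v2 & 0xffffff) - 1);                 // count--
        return true;
    }
};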
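Note on WRAP_EXPR (stdafx.h): it only wraps an expression in a zero-argument lambda capturing by reference, so waiter_map_t::wait_op can re-evaluate the condition every time it polls. The call in cellSyncMutexLock, for example, is shorthand for writing the lambda out by hand:

// WRAP_EXPR(expr) expands to [&]{ return (expr); }, so
//   g_sync_mutex_wm.wait_op(mutex.addr(), WRAP_EXPR(mutex->load().rel == order));
// is equivalent to:
//   g_sync_mutex_wm.wait_op(mutex.addr(), [&]{ return mutex->load().rel == order; });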
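Note on the _mm_mfence() calls: where the old code issued read_sync() purely for its ordering side effect, the patch now emits a full x86 load/store fence (_mm_mfence, declared in <emmintrin.h>) instead; loads whose value is actually consumed become load() or, where acquire semantics are still needed, load_sync().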