SPU/PPU atomics performance and LR event fixes (#5435)

* Fix SPU LR event setting in atomic commands according to hw test
* MFC: increment timestamp for PUT cmd in non-TSX path
* MFC: fix reservation-lost test on non-TSX path with regard to the lock bit
* Reservation notification moved out of writer_lock scope to reduce the lock's lifetime
* Use passive_lock/unlock in PPU atomic instructions to reduce redundancy
* Lock only once for DMA transfers (non-TSX)
* Don't use RDTSC in reservation update logic
* Remove MFC cmd args passing to process_mfc_cmd
* Reorder check_state cpu_flag::memory check for faster unlocking
* Specialization for 128-byte data copy in SPU DMA transfers
* Implement memory range locks and isolate PPU and SPU passive lock logic
elad 2019-01-15 17:31:21 +02:00 committed by Ivan
parent f19fd23227
commit fc92ae4085
9 changed files with 344 additions and 235 deletions
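
Most of the hunks below share one convention for the 64-bit reservation value: bit 0 is a lock flag set by vm::reservation_lock, and the remaining bits hold the timestamp, which now advances by 2 per successful store instead of being reloaded from RDTSC. Readers therefore mask the low bit before comparing (the new "& ~1ull" checks), and writers either do "res++" from the locked value or add 2 inside the TSX transaction. A minimal sketch of that convention, with illustrative bodies only (the real helpers live in vm.h/vm.cpp):

// --- illustrative sketch, not part of the diff ---
#include <atomic>
#include <cstdint>

// Model of the reservation word as used throughout this commit:
// bit 0 = lock flag, bits 1.. = timestamp (advances by 2 per store).
struct reservation_model
{
	std::atomic<std::uint64_t> word{0};

	// What callers of vm::reservation_acquire now compare against rtime.
	std::uint64_t time() const { return word.load() & ~1ull; }

	// Non-TSX writer: set the lock bit if the timestamp still matches.
	bool try_lock(std::uint64_t expected) { return word.compare_exchange_strong(expected, expected | 1); }

	// Successful store: from the locked (odd) value, +1 lands on the next
	// even timestamp - a net +2, matching "res++" and the "add ..., 2" stubs.
	void commit() { word.fetch_add(1); }

	// Failed store: drop the lock bit, timestamp unchanged ("res &= ~1ull").
	void release() { word.fetch_and(~1ull); }
};
// --- end sketch ---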

View file

@ -119,15 +119,16 @@ bool cpu_thread::check_state()
while (true)
{
if (state & cpu_flag::memory && state.test_and_reset(cpu_flag::memory))
if (state & cpu_flag::memory)
{
cpu_flag_memory = true;
if (auto& ptr = vm::g_tls_locked)
{
ptr->compare_and_swap(this, nullptr);
ptr = nullptr;
}
cpu_flag_memory = true;
state -= cpu_flag::memory;
}
if (state & cpu_flag::exit + cpu_flag::dbg_global_stop)

View file

@ -977,7 +977,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}
vm::temporary_unlock(ppu);
vm::passive_unlock(ppu);
for (u64 i = 0;; i++)
{
@ -1003,8 +1003,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}
ppu.cpu_mem();
vm::passive_lock(ppu);
return static_cast<T>(ppu.rdata << data_off >> size_off);
}
@ -1044,7 +1043,7 @@ const auto ppu_stwcx_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, u64 r
c.cmp(x86::dword_ptr(x86::r11), args[2].r32());
c.jne(fail);
c.mov(x86::dword_ptr(x86::r11), args[3].r32());
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.mov(x86::eax, 1);
c.ret();
@ -1070,7 +1069,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
auto& data = vm::_ref<atomic_be_t<u32>>(addr & -4);
const u32 old_data = static_cast<u32>(ppu.rdata << ((addr & 7) * 8) >> 32);
if (ppu.raddr != addr || addr & 3 || old_data != data.load() || ppu.rtime != vm::reservation_acquire(addr, sizeof(u32)))
if (ppu.raddr != addr || addr & 3 || old_data != data.load() || ppu.rtime != (vm::reservation_acquire(addr, sizeof(u32)) & ~1ull))
{
ppu.raddr = 0;
return false;
@ -1090,7 +1089,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
return false;
}
vm::temporary_unlock(ppu);
vm::passive_unlock(ppu);
auto& res = vm::reservation_lock(addr, sizeof(u32));
@ -1098,7 +1097,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
if (result)
{
vm::reservation_update(addr, sizeof(u32));
res++;
vm::reservation_notifier(addr, sizeof(u32)).notify_all();
}
else
@ -1106,7 +1105,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
res &= ~1ull;
}
ppu.cpu_mem();
vm::passive_lock(ppu);
ppu.raddr = 0;
return result;
}
@ -1137,7 +1136,7 @@ const auto ppu_stdcx_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, u64 r
c.cmp(x86::qword_ptr(x86::r11), args[2]);
c.jne(fail);
c.mov(x86::qword_ptr(x86::r11), args[3]);
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.mov(x86::eax, 1);
c.ret();
@ -1163,7 +1162,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
auto& data = vm::_ref<atomic_be_t<u64>>(addr & -8);
const u64 old_data = ppu.rdata << ((addr & 7) * 8);
if (ppu.raddr != addr || addr & 7 || old_data != data.load() || ppu.rtime != vm::reservation_acquire(addr, sizeof(u64)))
if (ppu.raddr != addr || addr & 7 || old_data != data.load() || ppu.rtime != (vm::reservation_acquire(addr, sizeof(u64)) & ~1ull))
{
ppu.raddr = 0;
return false;
@ -1183,7 +1182,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
return false;
}
vm::temporary_unlock(ppu);
vm::passive_unlock(ppu);
auto& res = vm::reservation_lock(addr, sizeof(u64));
@ -1191,7 +1190,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
if (result)
{
vm::reservation_update(addr, sizeof(u64));
res++;
vm::reservation_notifier(addr, sizeof(u64)).notify_all();
}
else
@ -1199,7 +1198,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
res &= ~1ull;
}
ppu.cpu_mem();
vm::passive_lock(ppu);
ppu.raddr = 0;
return result;
}
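
Taken together, the PPUThread.cpp hunks give the non-TSX store-conditional path the shape sketched below. The compare-and-swap of the data word itself sits outside the shown context, so it is reconstructed here as an assumption; the rest mirrors the lines above.

// --- illustrative sketch, not part of the diff ---
// Reworked non-TSX ppu_stwcx flow (ppu_stdcx is analogous with u64).
bool stwcx_fallback_sketch(ppu_thread& ppu, u32 addr, u32 old_data, u32 reg_value)
{
	vm::passive_unlock(ppu);                              // leave the passive reader set

	auto& res = vm::reservation_lock(addr, sizeof(u32));  // sets the lock bit (odd timestamp)
	auto& data = vm::_ref<atomic_be_t<u32>>(addr & -4);

	// Assumed step: conditional store of the new value (elided from the hunk).
	const bool result = data.compare_and_swap_test(old_data, reg_value);

	if (result)
	{
		res++;                                            // net +2: unlock and advance timestamp
		vm::reservation_notifier(addr, sizeof(u32)).notify_all();
	}
	else
	{
		res &= ~1ull;                                     // just drop the lock bit
	}

	vm::passive_lock(ppu);                                // rejoin the passive reader set
	ppu.raddr = 0;
	return result;
}
// --- end sketch ---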

View file

@ -1436,6 +1436,7 @@ void spu_recompiler::get_events()
c->mov(*qw0, imm_ptr(vm::g_reservations));
c->shr(qw1->r32(), 4);
c->mov(*qw0, x86::qword_ptr(*qw0, *qw1));
c->and_(qw0->r64(), (u64)(~1ull));
c->cmp(*qw0, SPU_OFF_64(rtime));
c->jne(fail);
c->mov(*qw0, imm_ptr(vm::g_base_addr));
@ -2596,7 +2597,7 @@ static void spu_wrch(spu_thread* _spu, u32 ch, u32 value, spu_function_t _ret)
static void spu_wrch_mfc(spu_thread* _spu, spu_function_t _ret)
{
if (!_spu->process_mfc_cmd(_spu->ch_mfc_cmd))
if (!_spu->process_mfc_cmd())
{
_ret = &spu_wrch_ret;
}

View file

@ -3362,7 +3362,7 @@ public:
static bool exec_mfc_cmd(spu_thread* _spu)
{
return _spu->process_mfc_cmd(_spu->ch_mfc_cmd);
return _spu->process_mfc_cmd();
}
void WRCH(spu_opcode_t op) //
@ -3541,9 +3541,9 @@ public:
csize = ci->getZExtValue();
}
if (cmd >= MFC_SNDSIG_CMD)
if (cmd >= MFC_SNDSIG_CMD && csize != 4)
{
csize = 4;
csize = -1;
}
llvm::Value* src = m_ir->CreateGEP(m_lsptr, zext<u64>(lsa).value);

View file

@ -40,6 +40,34 @@ bool operator ==(const u128& lhs, const u128& rhs)
}
#endif
static FORCE_INLINE void mov_rdata(u128* const dst, const u128* const src)
{
{
const u128 data0 = src[0];
const u128 data1 = src[1];
const u128 data2 = src[2];
dst[0] = data0;
dst[1] = data1;
dst[2] = data2;
}
{
const u128 data0 = src[3];
const u128 data1 = src[4];
const u128 data2 = src[5];
dst[3] = data0;
dst[4] = data1;
dst[5] = data2;
}
{
const u128 data0 = src[6];
const u128 data1 = src[7];
dst[6] = data0;
dst[7] = data1;
}
};
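
mov_rdata copies one 128-byte reservation line in three load/store groups (3+3+2 u128 values), so each group's loads are issued before its stores. The DMA paths below use it for whole lines and fall back to plain u128 moves for any 16-byte tail, roughly as in this sketch:

// --- illustrative sketch, not part of the diff ---
// Copy pattern built on mov_rdata: whole 128-byte lines through mov_rdata,
// 16-byte-aligned remainder as plain u128 moves (size assumed multiple of 16).
static void dma_copy_sketch(u8* dst, const u8* src, u32 size)
{
	while (size >= 128)
	{
		mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
		dst += 128; src += 128; size -= 128;
	}
	while (size)
	{
		*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
		dst += 16; src += 16; size -= 16;
	}
}
// --- end sketch ---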
extern u64 get_timebased_time();
extern u64 get_system_time();
@ -158,12 +186,13 @@ namespace spu
}
}
const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
Label fall = c.newLabel();
Label fail = c.newLabel();
Label retry = c.newLabel();
// Prepare registers
c.mov(x86::rax, imm_ptr(&vm::g_reservations));
@ -216,7 +245,7 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm8);
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm9);
#endif
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.vzeroupper();
c.mov(x86::eax, 1);
@ -224,10 +253,10 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
// Touch memory after transaction failure
c.bind(fall);
c.sub(args[0].r32(), 1);
c.jz(fail);
c.sar(x86::eax, 24);
c.js(fail);
c.sub(args[0].r32(), 1);
c.jz(retry);
c.lock().add(x86::qword_ptr(x86::r11), 0);
c.lock().add(x86::qword_ptr(x86::r10), 0);
#ifdef _WIN32
@ -240,9 +269,12 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
build_transaction_abort(c, 0xff);
c.xor_(x86::eax, x86::eax);
c.ret();
c.bind(retry);
c.mov(x86::eax, 2);
c.ret();
});
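
With the return type widened to u32, the stub above distinguishes three outcomes, which the PUTLLC handler later in this diff interprets as follows; a caller-side sketch (values taken from the assembly above, helper name hypothetical):

// --- illustrative sketch, not part of the diff ---
// spu_putllc_tx outcomes:
//   0 - reservation mismatch, the conditional store failed
//   1 - the 128-byte line was stored and the timestamp advanced
//   2 - the transaction kept aborting without a verdict: back off and retry
u32 putllc_rtm_sketch(u32 addr, u64 rtime, const void* old_data, const void* new_data)
{
	u32 result;
	while ((result = spu_putllc_tx(addr, rtime, old_data, new_data)) == 2)
	{
		std::this_thread::yield(); // no fallback to the heavy lock, just retry
	}
	return result; // 0 -> MFC_PUTLLC_FAILURE, 1 -> MFC_PUTLLC_SUCCESS
}
// --- end sketch ---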
const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64* out_rtime)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -271,8 +303,6 @@ const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64
c.vmovups(x86::yword_ptr(args[1], 64), x86::ymm2);
c.vmovups(x86::yword_ptr(args[1], 96), x86::ymm3);
c.vzeroupper();
c.mov(x86::qword_ptr(args[2]), x86::rax);
c.mov(x86::eax, 1);
c.ret();
// Touch memory after transaction failure
@ -282,7 +312,7 @@ const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.sub(args[0], 1);
c.jnz(begin);
c.xor_(x86::eax, x86::eax);
c.mov(x86::eax, 1);
c.ret();
});
@ -314,7 +344,7 @@ const auto spu_putlluc_tx = build_function_asm<bool(*)(u32 raddr, const void* rd
c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
c.add(x86::qword_ptr(x86::r10), 1);
c.add(x86::qword_ptr(x86::r10), 2);
c.xend();
c.vzeroupper();
c.mov(x86::eax, 1);
@ -767,8 +797,8 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
void* dst = vm::base(eal);
void* src = vm::base(offset + lsa);
u8* dst = (u8*)vm::base(eal);
u8* src = (u8*)vm::base(offset + lsa);
if (UNLIKELY(!is_get && !g_use_rtm))
{
@ -777,65 +807,72 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
case 1:
{
auto& res = vm::reservation_lock(eal, 1);
*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
res &= ~1ull;
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res++;
break;
}
case 2:
{
auto& res = vm::reservation_lock(eal, 2);
*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
res &= ~1ull;
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res++;
break;
}
case 4:
{
auto& res = vm::reservation_lock(eal, 4);
*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
res &= ~1ull;
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res++;
break;
}
case 8:
{
auto& res = vm::reservation_lock(eal, 8);
*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
res &= ~1ull;
break;
}
case 16:
{
auto& res = vm::reservation_lock(eal, 16);
_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
res &= ~1ull;
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res++;
break;
}
default:
{
auto* res = &vm::reservation_lock(eal, 16);
auto vdst = static_cast<__m128i*>(dst);
auto vsrc = static_cast<const __m128i*>(src);
for (u32 addr = eal, end = eal + size;; vdst++, vsrc++)
if (((eal & 127) + size) <= 128)
{
_mm_store_si128(vdst, _mm_load_si128(vsrc));
// Lock one cache line
auto& res = vm::reservation_lock(eal, 128);
addr += 16;
if (addr == end)
while (size)
{
break;
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
dst += 16;
src += 16;
size -= 16;
}
if (addr % 128)
{
continue;
}
res->fetch_and(~1ull);
res = &vm::reservation_lock(addr, 16);
res++;
break;
}
res->fetch_and(~1ull);
auto lock = vm::passive_lock(eal & -128u, ::align(eal + size, 128));
while (size >= 128)
{
mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
dst += 128;
src += 128;
size -= 128;
}
while (size)
{
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
dst += 16;
src += 16;
size -= 16;
}
*lock = 0;
break;
}
}
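
The new default case picks between two locking strategies: if ((eal & 127) + size) <= 128 the whole transfer sits inside one reservation line and a single vm::reservation_lock suffices; otherwise it registers a [begin, end) range via the new vm::passive_lock and releases it by zeroing the returned slot. A quick check of the single-line predicate (addresses chosen for illustration):

// --- illustrative sketch, not part of the diff ---
// A transfer stays inside one 128-byte reservation line iff its offset
// within the line plus its size does not exceed 128.
static_assert(((0x10000 & 127) + 128) <= 128, "a full aligned line is a single-line put");
static_assert(!(((0x10070 & 127) + 0x20) <= 128), "0x20 bytes at offset 0x70 spill into the next line");
// --- end sketch ---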
@ -852,67 +889,44 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
case 1:
{
*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
break;
}
case 16:
{
_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
auto vdst = static_cast<__m128i*>(dst);
auto vsrc = static_cast<const __m128i*>(src);
auto vcnt = size / sizeof(__m128i);
while (vcnt >= 8)
while (size >= 128)
{
const __m128i data[]
{
_mm_load_si128(vsrc + 0),
_mm_load_si128(vsrc + 1),
_mm_load_si128(vsrc + 2),
_mm_load_si128(vsrc + 3),
_mm_load_si128(vsrc + 4),
_mm_load_si128(vsrc + 5),
_mm_load_si128(vsrc + 6),
_mm_load_si128(vsrc + 7),
};
mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
_mm_store_si128(vdst + 0, data[0]);
_mm_store_si128(vdst + 1, data[1]);
_mm_store_si128(vdst + 2, data[2]);
_mm_store_si128(vdst + 3, data[3]);
_mm_store_si128(vdst + 4, data[4]);
_mm_store_si128(vdst + 5, data[5]);
_mm_store_si128(vdst + 6, data[6]);
_mm_store_si128(vdst + 7, data[7]);
vcnt -= 8;
vsrc += 8;
vdst += 8;
dst += 128;
src += 128;
size -= 128;
}
while (vcnt--)
while (size)
{
_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
dst += 16;
src += 16;
size -= 16;
}
break;
}
}
@ -1030,7 +1044,12 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
if (raddr && addr == raddr)
{
ch_event_stat |= SPU_EVENT_LR;
// Last check for event before we clear the reservation
if ((vm::reservation_acquire(addr, 128) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(addr))
{
ch_event_stat |= SPU_EVENT_LR;
}
raddr = 0;
}
@ -1057,20 +1076,20 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
auto& data = vm::_ref<decltype(rdata)>(addr);
auto& res = vm::reservation_lock(addr, 128);
vm::_ref<atomic_t<u32>>(addr) += 0;
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
if (g_cfg.core.spu_accurate_putlluc)
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
data = to_write;
vm::reservation_update(addr, 128);
vm::writer_lock lock(addr);
mov_rdata(data.data(), to_write.data());
res++;
}
else
{
data = to_write;
vm::reservation_update(addr, 128);
mov_rdata(data.data(), to_write.data());
res++;
}
}
@ -1140,11 +1159,7 @@ void spu_thread::do_mfc(bool wait)
return false;
}
if (args.size)
{
do_dma_transfer(args);
}
else if (args.cmd == MFC_PUTQLLUC_CMD)
if (args.cmd == MFC_PUTQLLUC_CMD)
{
if (fence & mask)
{
@ -1153,6 +1168,10 @@ void spu_thread::do_mfc(bool wait)
do_putlluc(args);
}
else if (args.size)
{
do_dma_transfer(args);
}
removed++;
return true;
@ -1184,7 +1203,7 @@ u32 spu_thread::get_mfc_completed()
return ch_tag_mask & ~mfc_fence;
}
bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
bool spu_thread::process_mfc_cmd()
{
// Stall infinitely if MFC queue is full
while (UNLIKELY(mfc_size >= 16))
@ -1198,29 +1217,24 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
spu::scheduler::concurrent_execution_watchdog watchdog(*this);
LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", args.cmd, args.lsa, args.eal, args.tag, args.size);
LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
switch (args.cmd)
switch (ch_mfc_cmd.cmd)
{
case MFC_GETLLAR_CMD:
{
const u32 addr = args.eal & -128u;
const u32 addr = ch_mfc_cmd.eal & -128u;
auto& data = vm::_ref<decltype(rdata)>(addr);
if (raddr && raddr != addr)
{
ch_event_stat |= SPU_EVENT_LR;
}
raddr = addr;
auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
u64 ntime;
const bool is_polling = false; // TODO
if (is_polling)
{
rtime = vm::reservation_acquire(raddr, 128);
rtime = vm::reservation_acquire(addr, 128);
while (rdata == data && vm::reservation_acquire(raddr, 128) == rtime)
while (rdata == data && vm::reservation_acquire(addr, 128) == rtime)
{
if (is_stopped())
{
@ -1235,57 +1249,78 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
{
u64 count = 1;
while (g_cfg.core.spu_accurate_getllar && !spu_getll_tx(raddr, rdata.data(), &rtime))
if (g_cfg.core.spu_accurate_getllar)
{
std::this_thread::yield();
count += 2;
while ((ntime = spu_getll_tx(addr, dst.data())) & 1)
{
std::this_thread::yield();
count += 2;
}
}
if (!g_cfg.core.spu_accurate_getllar)
else
{
for (;; count++, busy_wait(300))
{
rtime = vm::reservation_acquire(raddr, 128);
rdata = data;
ntime = vm::reservation_acquire(addr, 128);
dst = data;
if (LIKELY(vm::reservation_acquire(raddr, 128) == rtime))
if (LIKELY(vm::reservation_acquire(addr, 128) == ntime))
{
break;
}
}
}
if (count > 9)
if (count > 15)
{
LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count);
}
}
else
{
auto& res = vm::reservation_lock(raddr, 128);
auto& res = vm::reservation_lock(addr, 128);
if (g_cfg.core.spu_accurate_getllar)
{
vm::_ref<atomic_t<u32>>(raddr) += 0;
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
vm::writer_lock lock(addr);
rtime = res & ~1ull;
rdata = data;
ntime = res & ~1ull;
mov_rdata(dst.data(), data.data());
res &= ~1ull;
}
else
{
rtime = res & ~1ull;
rdata = data;
ntime = res & ~1ull;
mov_rdata(dst.data(), data.data());
res &= ~1ull;
}
}
// Copy to LS
_ref<decltype(rdata)>(args.lsa & 0x3ff80) = rdata;
if (const u32 _addr = raddr)
{
// Last check for event before we replace the reservation with a new one
if ((vm::reservation_acquire(_addr, 128) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(_addr))
{
ch_event_stat |= SPU_EVENT_LR;
if (_addr == addr)
{
// Lost current reservation
raddr = 0;
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
}
}
}
raddr = addr;
rtime = ntime;
mov_rdata(rdata.data(), dst.data());
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
return true;
}
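
Per the hw-test bullet in the commit message, GETLLAR no longer raises SPU_EVENT_LR just because a new reservation replaces an old one; the event fires only when the previously reserved line actually changed (masked timestamp or data), and a lost same-address reservation is simply dropped. The same test reappears in the PUTLLC and PUTLLUC paths; a hedged helper-style sketch of it (name and template are illustrative):

// --- illustrative sketch, not part of the diff ---
// True if the reservation captured as (raddr, rtime, rdata) was lost, i.e.
// the line's masked timestamp or its 128-byte payload no longer match.
template <typename RData>
bool reservation_lost_sketch(u32 raddr, u64 rtime, const RData& rdata)
{
	return (vm::reservation_acquire(raddr, 128) & ~1ull) != rtime
		|| rdata != vm::_ref<RData>(raddr);
}
// --- end sketch ---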
@ -1293,40 +1328,50 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_PUTLLC_CMD:
{
// Store conditionally
const u32 addr = args.eal & -128u;
const u32 addr = ch_mfc_cmd.eal & -128u;
u32 result = 0;
bool result = false;
if (raddr == addr && rtime == vm::reservation_acquire(raddr, 128))
if (raddr == addr && rtime == (vm::reservation_acquire(raddr, 128) & ~1ull))
{
const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);
const auto& to_write = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
if (LIKELY(g_use_rtm))
{
if (spu_putllc_tx(raddr, rtime, rdata.data(), to_write.data()))
while (true)
{
vm::reservation_notifier(raddr, 128).notify_all();
result = true;
}
result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
if (result < 2)
{
break;
}
// Don't fallback to heavyweight lock, just give up
// Retry
std::this_thread::yield();
}
}
else if (auto& data = vm::_ref<decltype(rdata)>(addr); rdata == data)
{
auto& res = vm::reservation_lock(raddr, 128);
vm::_ref<atomic_t<u32>>(raddr) += 0;
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(1);
if (rtime == (res & ~1ull) && rdata == data)
if (rtime == (res & ~1ull))
{
data = to_write;
vm::reservation_update(raddr, 128);
vm::reservation_notifier(raddr, 128).notify_all();
result = true;
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
if (rdata == data)
{
mov_rdata(data.data(), to_write.data());
res++;
result = 1;
}
else
{
res &= ~1ull;
}
}
else
{
@ -1337,16 +1382,21 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
if (result)
{
vm::reservation_notifier(addr, 128).notify_all();
ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
}
else
{
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
}
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & ~1ull) || rdata != vm::_ref<decltype(rdata)>(raddr))
{
ch_event_stat |= SPU_EVENT_LR;
}
}
if (raddr && !result)
{
ch_event_stat |= SPU_EVENT_LR;
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
}
raddr = 0;
@ -1354,23 +1404,22 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
case MFC_PUTLLUC_CMD:
{
do_putlluc(args);
do_putlluc(ch_mfc_cmd);
ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
return true;
}
case MFC_PUTQLLUC_CMD:
{
const u32 mask = utils::rol32(1, args.tag);
const u32 mask = utils::rol32(1, ch_mfc_cmd.tag);
if (UNLIKELY((mfc_barrier | mfc_fence) & mask))
{
args.size = 0;
mfc_queue[mfc_size++] = args;
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_fence |= mask;
}
else
{
do_putlluc(args);
do_putlluc(ch_mfc_cmd);
}
return true;
@ -1379,7 +1428,11 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_SNDSIGB_CMD:
case MFC_SNDSIGF_CMD:
{
args.size = 4;
if (ch_mfc_cmd.size != 4)
{
break;
}
// Fallthrough
}
case MFC_PUT_CMD:
@ -1392,24 +1445,24 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_GETB_CMD:
case MFC_GETF_CMD:
{
if (LIKELY(args.size <= 0x4000))
if (LIKELY(ch_mfc_cmd.size <= 0x4000))
{
if (LIKELY(do_dma_check(args)))
if (LIKELY(do_dma_check(ch_mfc_cmd)))
{
if (LIKELY(args.size))
if (ch_mfc_cmd.size)
{
do_dma_transfer(args);
do_dma_transfer(ch_mfc_cmd);
}
return true;
}
mfc_queue[mfc_size++] = args;
mfc_fence |= utils::rol32(1, args.tag);
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
if (args.cmd & MFC_BARRIER_MASK)
if (ch_mfc_cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, args.tag);
mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag);
}
return true;
@ -1427,22 +1480,25 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
case MFC_GETLB_CMD:
case MFC_GETLF_CMD:
{
if (LIKELY(args.size <= 0x4000))
if (LIKELY(ch_mfc_cmd.size <= 0x4000))
{
if (LIKELY(do_dma_check(args)))
auto& cmd = mfc_queue[mfc_size];
cmd = ch_mfc_cmd;
if (LIKELY(do_dma_check(cmd)))
{
if (LIKELY(do_list_transfer(args)))
if (LIKELY(do_list_transfer(cmd)))
{
return true;
}
}
mfc_queue[mfc_size++] = args;
mfc_fence |= utils::rol32(1, args.tag);
mfc_size++;
mfc_fence |= utils::rol32(1, cmd.tag);
if (args.cmd & MFC_BARRIER_MASK)
if (cmd.cmd & MFC_BARRIER_MASK)
{
mfc_barrier |= utils::rol32(1, args.tag);
mfc_barrier |= utils::rol32(1, cmd.tag);
}
return true;
@ -1460,7 +1516,7 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
else
{
mfc_queue[mfc_size++] = args;
mfc_queue[mfc_size++] = ch_mfc_cmd;
mfc_barrier |= -1;
}
@ -1473,7 +1529,7 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
}
fmt::throw_exception("Unknown command (cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE,
args.cmd, args.lsa, args.eal, args.tag, args.size);
ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
}
u32 spu_thread::get_events(bool waiting)
@ -1486,7 +1542,7 @@ u32 spu_thread::get_events(bool waiting)
}
// Check reservation status and set SPU_EVENT_LR if lost
if (raddr && (vm::reservation_acquire(raddr, sizeof(rdata)) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
{
ch_event_stat |= SPU_EVENT_LR;
raddr = 0;
@ -2026,7 +2082,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
case MFC_Cmd:
{
ch_mfc_cmd.cmd = MFC(value & 0xff);
return process_mfc_cmd(ch_mfc_cmd);
return process_mfc_cmd();
}
case MFC_WrListStallAck:

View file

@ -1,8 +1,9 @@
#pragma once
#pragma once
#include "Emu/Cell/Common.h"
#include "Emu/CPU/CPUThread.h"
#include "Emu/Cell/SPUInterpreter.h"
#include "Emu/Memory/vm.h"
#include "MFC.h"
#include <map>
@ -595,7 +596,7 @@ public:
void do_mfc(bool wait = true);
u32 get_mfc_completed();
bool process_mfc_cmd(spu_mfc_cmd args);
bool process_mfc_cmd();
u32 get_events(bool waiting = false);
void set_events(u32 mask);
void set_interrupt_status(bool enable);

View file

@ -57,8 +57,12 @@ namespace vm
// Memory mutex acknowledgement
thread_local atomic_t<cpu_thread*>* g_tls_locked = nullptr;
// Currently locked address
atomic_t<u32> g_addr_lock = 0;
// Memory mutex: passive locks
std::array<atomic_t<cpu_thread*>, 32> g_locks;
std::array<atomic_t<cpu_thread*>, 4> g_locks{};
std::array<atomic_t<u64>, 6> g_range_locks{};
static void _register_lock(cpu_thread* _cpu)
{
@ -72,11 +76,25 @@ namespace vm
}
}
bool passive_lock(cpu_thread& cpu, bool wait)
static atomic_t<u64>* _register_range_lock(const u64 lock_info)
{
while (true)
{
for (auto& lock : g_range_locks)
{
if (!lock && lock.compare_and_swap_test(0, lock_info))
{
return &lock;
}
}
}
}
void passive_lock(cpu_thread& cpu)
{
if (UNLIKELY(g_tls_locked && *g_tls_locked == &cpu))
{
return true;
return;
}
if (LIKELY(g_mutex.is_lockable()))
@ -84,31 +102,46 @@ namespace vm
// Optimistic path (hope that mutex is not exclusively locked)
_register_lock(&cpu);
if (UNLIKELY(!g_mutex.is_lockable()))
if (LIKELY(g_mutex.is_lockable()))
{
passive_unlock(cpu);
if (!wait)
{
return false;
}
::reader_lock lock(g_mutex);
_register_lock(&cpu);
return;
}
passive_unlock(cpu);
}
else
::reader_lock lock(g_mutex);
_register_lock(&cpu);
}
atomic_t<u64>* passive_lock(const u32 addr, const u32 end)
{
static const auto test_addr = [](const u32 target, const u32 addr, const u32 end)
{
if (!wait)
return addr > target || end <= target;
};
atomic_t<u64>* _ret;
if (LIKELY(test_addr(g_addr_lock.load(), addr, end)))
{
// Optimistic path (hope that address range is not locked)
_ret = _register_range_lock((u64)end << 32 | addr);
if (LIKELY(test_addr(g_addr_lock.load(), addr, end)))
{
return false;
return _ret;
}
::reader_lock lock(g_mutex);
_register_lock(&cpu);
*_ret = 0;
}
return true;
{
::reader_lock lock(g_mutex);
_ret = _register_range_lock((u64)end << 32 | addr);
}
return _ret;
}
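
The range variant of passive_lock follows the same optimistic double-check as the per-thread variant: publish (end << 32 | begin) in a free slot of g_range_locks, re-test against g_addr_lock, and retract and fall back to the reader lock if a writer appeared in between. A usage sketch for callers (addresses illustrative):

// --- illustrative sketch, not part of the diff ---
// Protect a multi-line store against vm::writer_lock(addr) holders.
void write_span_sketch(u32 begin, u32 size)
{
	atomic_t<u64>* lock = vm::passive_lock(begin & -128u, ::align(begin + size, 128));

	// ... perform the stores into [begin, begin + size) here ...

	*lock = 0; // clear the slot so pending writers stop waiting on this range
}
// --- end sketch ---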
void passive_unlock(cpu_thread& cpu)
@ -194,8 +227,7 @@ namespace vm
m_upgraded = true;
}
writer_lock::writer_lock(int full)
: locked(true)
writer_lock::writer_lock(u32 addr)
{
auto cpu = get_current_cpu_thread();
@ -206,7 +238,7 @@ namespace vm
g_mutex.lock();
if (full)
if (addr)
{
for (auto& lock : g_locks)
{
@ -216,6 +248,30 @@ namespace vm
}
}
g_addr_lock = addr;
for (auto& lock : g_range_locks)
{
while (true)
{
const u64 value = lock;
// Test beginning address
if (static_cast<u32>(value) > addr)
{
break;
}
// Test end address
if (static_cast<u32>(value >> 32) <= addr)
{
break;
}
_mm_pause();
}
}
for (auto& lock : g_locks)
{
while (cpu_thread* ptr = lock)
@ -225,7 +281,7 @@ namespace vm
break;
}
busy_wait();
_mm_pause();
}
}
}
@ -239,10 +295,8 @@ namespace vm
writer_lock::~writer_lock()
{
if (locked)
{
g_mutex.unlock();
}
g_addr_lock.raw() = 0;
g_mutex.unlock();
}
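
On the writer side, writer_lock(addr) now publishes the address in g_addr_lock and then drains both the range locks and the per-thread passive locks; its wait condition is the mirror image of test_addr in passive_lock, so reader and writer agree on one overlap predicate:

// --- illustrative sketch, not part of the diff ---
// Shared predicate: a range [begin, end) conflicts with a locked address
// iff begin <= target < end. passive_lock proceeds when it does not hold;
// writer_lock spins on a slot while it does.
constexpr bool conflicts(u32 begin, u32 end, u32 target)
{
	return begin <= target && target < end;
}

static_assert(!conflicts(0x10000, 0x10080, 0x0ff80), "writer below the range: no conflict");
static_assert(conflicts(0x10000, 0x10080, 0x10000), "writer inside the range: must wait");
// --- end sketch ---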
void reservation_lock_internal(atomic_t<u64>& res)

View file

@ -53,7 +53,8 @@ namespace vm
extern thread_local atomic_t<cpu_thread*>* g_tls_locked;
// Register reader
bool passive_lock(cpu_thread& cpu, bool wait = true);
void passive_lock(cpu_thread& cpu);
atomic_t<u64>* passive_lock(const u32 begin, const u32 end);
// Unregister reader
void passive_unlock(cpu_thread& cpu);
@ -80,14 +81,10 @@ namespace vm
struct writer_lock final
{
const bool locked;
writer_lock(const writer_lock&) = delete;
writer_lock& operator=(const writer_lock&) = delete;
writer_lock(int full);
writer_lock(u32 addr = 0);
~writer_lock();
explicit operator bool() const { return locked; }
};
// Get reservation status for further atomic update: last update timestamp
@ -101,7 +98,7 @@ namespace vm
inline void reservation_update(u32 addr, u32 size, bool lsb = false)
{
// Update reservation info with new timestamp
reservation_acquire(addr, size) = (__rdtsc() << 1) | u64{lsb};
reservation_acquire(addr, size) += 2;
}
// Get reservation sync variable

View file

@ -354,7 +354,7 @@ struct cfg_root : cfg::node
node_core(cfg::node* _this) : cfg::node(_this, "Core") {}
cfg::_enum<ppu_decoder_type> ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm};
cfg::_int<1, 16> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2)
cfg::_int<1, 4> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2)
cfg::_bool ppu_debug{this, "PPU Debug"};
cfg::_bool llvm_logs{this, "Save LLVM logs"};
cfg::string llvm_cpu{this, "Use LLVM CPU"};