From 86fc842c89565b696263353785f9f5db35b9000d Mon Sep 17 00:00:00 2001
From: Nekotekina <nekotekina@gmail.com>
Date: Sat, 31 Oct 2020 01:52:24 +0300
Subject: [PATCH] TSX: new fallback method (time-based)

Basically, use the timestamp counter (RDTSC) to bound transaction retries by elapsed time instead of by attempt count.
Rewrite vm::reservation_op using the same principle.
Rewrite another transaction helper (utils::tx_start).
Add two new settings for configuring fallbacks.
Two limits are specified in nanoseconds (first and second).
Fix PUTLLC reload logic (prevent reusing garbage).
---
 Utilities/JIT.h                         |  53 ++++++++--
 Utilities/asm.h                         |   9 +-
 rpcs3/Emu/CPU/CPUThread.h               |   5 +
 rpcs3/Emu/Cell/Modules/cellSpurs.cpp    |  26 ++---
 rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp |   2 +-
 rpcs3/Emu/Cell/PPUThread.cpp            |  72 ++++++++++---
 rpcs3/Emu/Cell/SPUThread.cpp            | 129 ++++++++++++++++++------
 rpcs3/Emu/Memory/vm.cpp                 |   8 +-
 rpcs3/Emu/Memory/vm_reservation.h       |  22 ++--
 rpcs3/Emu/System.cpp                    |  12 ++-
 rpcs3/Emu/System.h                      |   2 +
 rpcs3/Emu/system_config.h               |   2 +
 12 files changed, 263 insertions(+), 79 deletions(-)
diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index 17be332d07..7a33d8f975 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -57,31 +57,68 @@ namespace asmjit
 
 	// Emit xbegin and adjacent loop, return label at xbegin (don't use xabort please)
 	template <typename F>
-	[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, const asmjit::X86Gp& ctr, uint less_than, F func)
+	[[nodiscard]] inline asmjit::Label build_transaction_enter(asmjit::X86Assembler& c, asmjit::Label fallback, F func)
 	{
 		Label fall = c.newLabel();
 		Label begin = c.newLabel();
 		c.jmp(begin);
 		c.bind(fall);
 
-		// First invoked after failure
-		func();
-
-		c.add(ctr, 1);
-
 		// Don't repeat on zero status (may indicate syscall or interrupt)
 		c.test(x86::eax, x86::eax);
 		c.jz(fallback);
 
+		// Caller's retry-path code, emitted after the zero-status check (it can fall through to retry the transaction, or jump anywhere else)
+		func();
+
 		// Other bad statuses are ignored regardless of repeat flag (TODO)
-		c.cmp(ctr, less_than);
-		c.jae(fallback);
 		c.align(kAlignCode, 16);
 		c.bind(begin);
 		return fall;
 
 		// xbegin should be issued manually, allows to add more check before entering transaction
 	}
+
+	// Helper to spill RDX (EDX) for RDTSC: emits an xchg between `with` and one argument register, then records `with` as that argument in `args`
+	inline void build_swap_rdx_with(asmjit::X86Assembler& c, std::array<X86Gp, 4>& args, const asmjit::X86Gp& with)
+	{
+#ifdef _WIN32
+		c.xchg(args[1], with); // presumably args[1] is RDX under the Windows x64 convention - TODO confirm against build_function_asm
+		args[1] = with; // later uses of args[1] by the caller now emit `with`
+#else
+		c.xchg(args[2], with); // presumably args[2] is RDX under the System V convention - TODO confirm against build_function_asm
+		args[2] = with; // later uses of args[2] by the caller now emit `with`
+#endif
+	}
+
+	// Emit code loading the full 64-bit RDTSC value into `to` (to == rax or rdx: both rax and rdx are clobbered; any other target: rax is preserved, rdx is still clobbered)
+	inline void build_get_tsc(asmjit::X86Assembler& c, const asmjit::X86Gp& to = asmjit::x86::rax)
+	{
+		if (&to != &x86::rax && &to != &x86::rdx) // NOTE(review): registers are compared by object address, so a copy of x86::rax/x86::rdx takes the generic path
+		{
+			// Swap to save its contents (rax is parked in `to` while RDTSC overwrites rax)
+			c.xchg(x86::rax, to);
+		}
+
+		c.rdtsc(); // timestamp is returned in EDX:EAX
+		c.shl(x86::rdx, 32); // move the high half into bits 63:32 of rdx
+
+		if (&to == &x86::rax)
+		{
+			c.or_(x86::rax, x86::rdx); // rax = high | low
+		}
+		else if (&to == &x86::rdx)
+		{
+			c.or_(x86::rdx, x86::rax); // rdx = high | low
+		}
+		else
+		{
+			// Swap back (restores rax, leaves the low half in `to`), maybe there is a more efficient way to do it
+			c.xchg(x86::rax, to);
+			c.mov(to.r32(), to.r32()); // 32-bit self-move zero-extends, clearing bits 63:32 of `to` before the merge
+			c.or_(to.r64(), x86::rdx); // to = high | low
+		}
+	}
 }
 
 // Build runtime function with asmjit::X86Assembler
diff --git a/Utilities/asm.h b/Utilities/asm.h
index 7a93c211ed..e56a2a121c 100644
--- a/Utilities/asm.h
+++ b/Utilities/asm.h
@@ -2,15 +2,18 @@
 
 #include "types.h"
 
+extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit1;
+
 namespace utils
 {
-	// Transaction helper (Max = max attempts) (result = pair of success and op result)
-	template <uint Max = 10, typename F, typename R = std::invoke_result_t<F>>
+	// Transaction helper (result = pair of success and op result, or just bool)
+	template <typename F, typename R = std::invoke_result_t<F>>
 	inline auto tx_start(F op)
 	{
 		uint status = -1;
 
-		for (uint i = 0; i < Max; i++)
+		for (auto stamp0 = __rdtsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = __rdtsc())
 		{
 #ifndef _MSC_VER
 			__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h
index b6e44aa891..044021d1cb 100644
--- a/rpcs3/Emu/CPU/CPUThread.h
+++ b/rpcs3/Emu/CPU/CPUThread.h
@@ -73,6 +73,11 @@ public:
 		return !!(state & (cpu_flag::suspend + cpu_flag::dbg_global_pause + cpu_flag::dbg_pause));
 	}
 
+	bool has_pause_flag() const // True when cpu_flag::pause is set in state
+	{
+		return !!(state & cpu_flag::pause);
+	}
+
 	// Check thread type
 	u32 id_type() const
 	{
diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
index 56e49ac3db..abca9012e9 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
@@ -292,7 +292,7 @@ namespace _spurs
 namespace _spurs
 {
 	// Add workload
-	s32 add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
+	s32 add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg);
 }
 
 //s32 _cellSpursWorkloadAttributeInitialize(vm::ptr<CellSpursWorkloadAttribute> attr, u32 revision, u32 sdkVersion, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt);
@@ -2295,7 +2295,7 @@ s32 cellSpursWorkloadAttributeSetShutdownCompletionEventHook(vm::ptr<CellSpursWo
 	return CELL_OK;
 }
 
-s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
+s32 _spurs::add_workload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, const u8(&priorityTable)[8], u32 minContention, u32 maxContention, vm::cptr<char> nameClass, vm::cptr<char> nameInstance, vm::ptr<CellSpursShutdownCompletionEventHook> hook, vm::ptr<void> hookArg)
 {
 	if (!spurs || !wid || !pm)
 	{
@@ -2420,7 +2420,7 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
 
 	u32 res_wkl;
 	const auto wkl = &spurs->wklInfo(wnum);
-	vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 	{
 		const u32 mask = op.wklMskB & ~(0x80000000u >> wnum);
 		res_wkl = 0;
@@ -2456,12 +2456,12 @@ s32 _spurs::add_workload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<vo
 }
 
 /// Add workload
-s32 cellSpursAddWorkload(vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
+s32 cellSpursAddWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, vm::ptr<u32> wid, vm::cptr<void> pm, u32 size, u64 data, vm::cptr<u8[8]> priority, u32 minCnt, u32 maxCnt)
 {
 	cellSpurs.warning("cellSpursAddWorkload(spurs=*0x%x, wid=*0x%x, pm=*0x%x, size=0x%x, data=0x%llx, priority=*0x%x, minCnt=0x%x, maxCnt=0x%x)",
 		spurs, wid, pm, size, data, priority, minCnt, maxCnt);
 
-	return _spurs::add_workload(spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null);
+	return _spurs::add_workload(ppu, spurs, wid, pm, size, data, *priority, minCnt, maxCnt, vm::null, vm::null, vm::null, vm::null); // forwards the calling PPU thread; nameClass, nameInstance, hook and hookArg are passed as null
 }
 
 /// Add workload
@@ -2484,7 +2484,7 @@ s32 cellSpursAddWorkloadWithAttribute(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
 		return CELL_SPURS_POLICY_MODULE_ERROR_INVAL;
 	}
 
-	return _spurs::add_workload(spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
+	return _spurs::add_workload(ppu, spurs, wid, attr->pm, attr->size, attr->data, attr->priority, attr->minContention, attr->maxContention, attr->nameClass, attr->nameInstance, attr->hook, attr->hookArg);
 }
 
 /// Request workload shutdown
@@ -2506,7 +2506,7 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
 
 	bool send_event;
 	s32 rc, old_state;
-	if (!vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+	if (!vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 	{
 		auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
 
@@ -2663,7 +2663,7 @@ s32 cellSpursRemoveWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid)
 	}
 
 	s32 rc;
-	vm::reservation_op(vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_wkl_state_op>(spurs.ptr(&CellSpurs::wklState1)), [&](spurs_wkl_state_op& op)
 	{
 		auto& state = wid < CELL_SPURS_MAX_WORKLOAD ? op.wklState1[wid] : op.wklState2[wid % 16];
 
@@ -3040,7 +3040,7 @@ s32 _cellSpursWorkloadFlagReceiver(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u3
 	};
 
 	s32 res;
-	vm::reservation_op(vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<wklFlagOp>(spurs), [&](wklFlagOp& val)
 	{
 		if (is_set)
 		{
@@ -3189,7 +3189,7 @@ s32 cellSpursEventFlagSet(ppu_thread& ppu, vm::ptr<CellSpursEventFlag> eventFlag
 	u16  pendingRecv;
 	u16  pendingRecvTaskEvents[16];
 
-	vm::reservation_op(vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursEventFlag_x00>(eventFlag), [bits, &send, &ppuWaitSlot, &ppuEvents, &pendingRecv, &pendingRecvTaskEvents](CellSpursEventFlag_x00& eventFlag)
 	{
 		send        = false;
 		ppuWaitSlot = 0;
@@ -4081,7 +4081,7 @@ s32 _cellSpursSendSignal(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32
 
 	int signal;
 
-	vm::reservation_op(vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<spurs_taskset_signal_op>(taskset), [&](spurs_taskset_signal_op& op)
 	{
 		const u32 signalled = op.signalled[taskId / 32];
 		const u32 running = op.running[taskId / 32];
@@ -4972,7 +4972,7 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
 	u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
 	u32 old = 0;
 
-	const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
+	const bool ok = vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
 	{
 		allow_jobchain_run = jg.zero;
 		old = jg.ncount0;
@@ -5136,7 +5136,7 @@ s32 cellSpursAddUrgentCommand(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobCha
 
 	s32 result = CELL_OK;
 
-	vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
+	vm::reservation_op(ppu, vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
 	{
 		for (auto& cmd : jch.urgentCmds)
 		{
diff --git a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
index 75cc6a0eb6..3083ff0725 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
@@ -2074,7 +2074,7 @@ void spursJobchainPopUrgentCommand(spu_thread& spu)
 	const auto jc = vm::unsafe_ptr_cast<CellSpursJobChain_x00>(+ctxt->jobChain);
 
 	const bool alterQueue = ctxt->unkFlag0;
-	vm::reservation_op(jc, [&](CellSpursJobChain_x00& op)
+	vm::reservation_op(spu, jc, [&](CellSpursJobChain_x00& op)
 	{
 		const auto ls = reinterpret_cast<CellSpursJobChain_x00*>(ctxt->tempAreaJobChain);
 
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 7e5844add7..0569ff44a1 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1216,6 +1216,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		{
 			ppu.rtime = ppu.last_ftime;
 			ppu.raddr = ppu.last_faddr;
+			ppu.last_ftime = 0;
 			return static_cast<T>(rdata << data_off >> size_off);
 		}
 
@@ -1261,7 +1262,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
 	return ppu_load_acquire_reservation<u64>(ppu, addr);
 }
 
-const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -1282,6 +1283,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	c.push(x86::r13);
 	c.push(x86::r12);
 	c.push(x86::rbx);
+	c.push(x86::r14);
+	c.push(x86::r15);
 	c.sub(x86::rsp, 40);
 #ifdef _WIN32
 	if (!s_tsx_avx)
@@ -1292,6 +1295,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 #endif
 
 	// Prepare registers
+	build_swap_rdx_with(c, args, x86::r12);
 	c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 	c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -1305,7 +1309,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	c.and_(x86::rbx, -128 / 2);
 	c.prefetchw(x86::byte_ptr(x86::rbx));
 	c.and_(args[0].r32(), 63);
-	c.mov(x86::r12d, 1);
 	c.mov(x86::r13, args[1]);
 
 	// Prepare data
@@ -1328,8 +1331,20 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 		c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
 	}
 
+	// Alloc r14 to stamp0
+	const auto stamp0 = x86::r14;
+	const auto stamp1 = x86::r15;
+	build_get_tsc(c, stamp0);
+
 	// Begin transaction
-	Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4, []{});
+	Label tx0 = build_transaction_enter(c, fall, [&]()
+	{
+		build_get_tsc(c, stamp1);
+		c.sub(stamp1, stamp0);
+		c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
+		c.xor_(x86::eax, x86::eax);
+		c.jae(fall);
+	});
 	c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.mov(x86::eax, _XABORT_EXPLICIT);
 	c.jc(fall);
@@ -1380,7 +1395,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	// Update reservation
 	c.sub(x86::qword_ptr(x86::rbx), -128);
 	c.xend();
-	c.mov(x86::eax, x86::r12d);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 	c.jmp(_ret);
 
 	// XABORT is expensive so finish with xend instead
@@ -1411,6 +1427,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 
 	c.bind(skip);
 	c.xend();
+	build_get_tsc(c, stamp1);
 	c.mov(x86::eax, _XABORT_EXPLICIT);
 	//c.jmp(fall);
 
@@ -1436,11 +1453,28 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	c.test(x86::eax, vm::rsrv_unique_lock);
 	c.jnz(fail2);
 
-	// Allow only first shared lock to proceed
+	// Check if already updated
+	c.and_(x86::rax, -128);
 	c.cmp(x86::rax, x86::r13);
 	c.jne(fail2);
 
-	Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, []{});
+	// Exclude some time spent on touching memory: stamp1 contains last success or failure
+	c.mov(x86::rax, stamp1);
+	c.sub(x86::rax, stamp0);
+	c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+	c.jae(fall2);
+	build_get_tsc(c, stamp1);
+	c.sub(stamp1, x86::rax);
+
+	Label tx1 = build_transaction_enter(c, fall2, [&]()
+	{
+		build_get_tsc(c);
+		c.sub(x86::rax, stamp1);
+		c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+		c.jae(fall2);
+		c.test(x86::qword_ptr(x86::rbx), 127 - 1);
+		c.jnz(fall2);
+	});
 	c.prefetchw(x86::byte_ptr(x86::rbp, 0));
 	c.prefetchw(x86::byte_ptr(x86::rbp, 64));
 
@@ -1448,8 +1482,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.jc(fall2);
 	c.mov(x86::rax, x86::qword_ptr(x86::rbx));
-	c.test(x86::rax, 127 - 1);
-	c.jnz(fall2);
 	c.and_(x86::rax, -128);
 	c.cmp(x86::rax, x86::r13);
 	c.jne(fail2);
@@ -1493,7 +1525,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 
 	c.xend();
 	c.lock().add(x86::qword_ptr(x86::rbx), 127);
-	c.mov(x86::eax, x86::r12d);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 	c.jmp(_ret);
 
 	// XABORT is expensive so try to finish with xend instead
@@ -1523,7 +1556,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	c.jmp(fail2);
 
 	c.bind(fall2);
-	c.mov(x86::eax, -1);
+	c.mov(x86::rax, -1);
 	c.jmp(_ret);
 
 	c.bind(fail2);
@@ -1550,6 +1583,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 		c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
 	}
 
+	c.mov(x86::rax, -1);
+	c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
 	c.xor_(x86::eax, x86::eax);
 	//c.jmp(_ret);
 
@@ -1569,6 +1604,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
 	}
 
 	c.add(x86::rsp, 40);
+	c.pop(x86::r15);
+	c.pop(x86::r14);
 	c.pop(x86::rbx);
 	c.pop(x86::r12);
 	c.pop(x86::r13);
@@ -1634,9 +1671,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 		{
 			if (g_use_rtm) [[likely]]
 			{
-				switch (u32 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
+				switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
 				{
-				case UINT32_MAX:
+				case UINT64_MAX:
 				{
 					auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
 					auto& sdata = *vm::get_super_ptr<atomic_be_t<u64>>(addr & -8);
@@ -1660,6 +1697,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 						break;
 					}
 
+					ppu.last_ftime = -1;
 					[[fallthrough]];
 				}
 				case 0:
@@ -1669,6 +1707,12 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 						ppu.last_fail++;
 					}
 
+					if (ppu.last_ftime != umax)
+					{
+						ppu.last_faddr = 0;
+						return false;
+					}
+
 					_m_prefetchw(ppu.rdata);
 					_m_prefetchw(ppu.rdata + 64);
 					ppu.last_faddr = addr;
@@ -1678,9 +1722,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 				}
 				default:
 				{
-					if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
+					if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
 					{
-						perf_log.warning("STCX: took too long: %u", count);
+						perf_log.warning(u8"STCX: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
 					}
 
 					break;
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 7c1dc1b762..b1d110f844 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -371,7 +371,7 @@ namespace spu
 	}
 }
 
-const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -415,6 +415,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 #endif
 
 	// Prepare registers
+	build_swap_rdx_with(c, args, x86::r12);
 	c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 	c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -425,7 +426,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.shr(args[0].r32(), 1);
 	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
 	c.prefetchw(x86::byte_ptr(x86::rbx));
-	c.mov(x86::r12d, 1);
 	c.mov(x86::r13, args[1]);
 
 	// Prepare data
@@ -460,10 +460,20 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 		c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112));
 	}
 
+	// Alloc args[0] to stamp0
+	const auto stamp0 = args[0];
+	const auto stamp1 = args[1];
+	build_get_tsc(c, stamp0);
+
 	// Begin transaction
-	Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4, [&]()
+	Label tx0 = build_transaction_enter(c, fall, [&]()
 	{
 		c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
+		build_get_tsc(c, stamp1);
+		c.sub(stamp1, stamp0);
+		c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
+		c.xor_(x86::eax, x86::eax);
+		c.jae(fall);
 	});
 	c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.mov(x86::eax, _XABORT_EXPLICIT);
@@ -531,7 +541,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.sub(x86::qword_ptr(x86::rbx), -128);
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
-	c.mov(x86::eax, x86::r12d);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 	c.jmp(_ret);
 
 	// XABORT is expensive so finish with xend instead
@@ -564,6 +575,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.bind(skip);
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
+	build_get_tsc(c, stamp1);
 	c.mov(x86::eax, _XABORT_EXPLICIT);
 	//c.jmp(fall);
 
@@ -589,13 +601,28 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.test(x86::eax, vm::rsrv_unique_lock);
 	c.jnz(fail2);
 
-	// Allow only first shared lock to proceed
+	// Check if already updated
+	c.and_(x86::rax, -128);
 	c.cmp(x86::rax, x86::r13);
 	c.jne(fail2);
 
-	Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, [&]()
+	// Exclude some time spent on touching memory: stamp1 contains last success or failure
+	c.mov(x86::rax, stamp1);
+	c.sub(x86::rax, stamp0);
+	build_get_tsc(c, stamp1);
+	c.sub(stamp1, x86::rax);
+	c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+	c.jae(fall2);
+
+	Label tx1 = build_transaction_enter(c, fall2, [&]()
 	{
 		c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
+		build_get_tsc(c);
+		c.sub(x86::rax, stamp1);
+		c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+		c.jae(fall2);
+		c.test(x86::qword_ptr(x86::rbx), 127 - 1);
+		c.jnz(fall2);
 	});
 	c.prefetchw(x86::byte_ptr(x86::rbp, 0));
 	c.prefetchw(x86::byte_ptr(x86::rbp, 64));
@@ -604,8 +631,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
 	c.jc(fall2);
 	c.mov(x86::rax, x86::qword_ptr(x86::rbx));
-	c.test(x86::rax, 127 - 1);
-	c.jnz(fall2);
 	c.and_(x86::rax, -128);
 	c.cmp(x86::rax, x86::r13);
 	c.jne(fail2);
@@ -666,7 +691,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
 	c.lock().add(x86::qword_ptr(x86::rbx), 127);
-	c.mov(x86::eax, x86::r12d);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 	c.jmp(_ret);
 
 	// XABORT is expensive so try to finish with xend instead
@@ -697,7 +723,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.jmp(fail2);
 
 	c.bind(fall2);
-	c.mov(x86::eax, -1);
+	c.mov(x86::rax, -1);
 	c.jmp(_ret);
 
 	c.bind(fail2);
@@ -724,6 +750,8 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 		c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
 	}
 
+	c.mov(x86::rax, -1);
+	c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
 	c.xor_(x86::eax, x86::eax);
 	//c.jmp(_ret);
 
@@ -763,7 +791,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, void*
 	c.ret();
 });
 
-const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -792,6 +820,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 #endif
 
 	// Prepare registers
+	build_swap_rdx_with(c, args, x86::r12);
 	c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 	c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -802,7 +831,6 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 	c.shr(args[0].r32(), 1);
 	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
 	c.prefetchw(x86::byte_ptr(x86::rbx));
-	c.mov(x86::r12d, 1);
 	c.mov(x86::r13, args[1]);
 
 	// Prepare data
@@ -825,10 +853,20 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 		c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
 	}
 
+	// Alloc args[0] to stamp0
+	const auto stamp0 = args[0];
+	const auto stamp1 = args[1];
+	build_get_tsc(c, stamp0);
+
 	// Begin transaction
-	Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8, [&]()
+	Label tx0 = build_transaction_enter(c, fall, [&]()
 	{
 		c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
+		build_get_tsc(c, stamp1);
+		c.sub(stamp1, stamp0);
+		c.cmp(stamp1, imm_ptr(&g_rtm_tx_limit1));
+		c.xor_(x86::eax, x86::eax);
+		c.jae(fall);
 	});
 	c.xbegin(tx0);
 	c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock);
@@ -856,12 +894,15 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 	c.sub(x86::qword_ptr(x86::rbx), -128);
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
-	c.mov(x86::eax, 1);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 	c.jmp(_ret);
 
 	c.bind(skip);
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
+	build_get_tsc(c, stamp1);
+	c.mov(x86::eax, _XABORT_EXPLICIT);
 	//c.jmp(fall);
 
 	c.bind(fall);
@@ -881,12 +922,24 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 	// Lock reservation
 	c.mov(x86::eax, 1);
 	c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
-	c.test(x86::eax, vm::rsrv_unique_lock);
+	c.test(x86::eax, 127 - 1);
 	c.jnz(fall2);
 
-	Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666, [&]()
+	// Exclude some time spent on touching memory: stamp1 contains last success or failure
+	c.mov(x86::rax, stamp1);
+	c.sub(x86::rax, stamp0);
+	c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+	c.jae(fall2);
+	build_get_tsc(c, stamp1);
+	c.sub(stamp1, x86::rax);
+
+	Label tx1 = build_transaction_enter(c, fall2, [&]()
 	{
 		c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
+		build_get_tsc(c);
+		c.sub(x86::rax, stamp1);
+		c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit2));
+		c.jae(fall2);
 	});
 
 	c.prefetchw(x86::byte_ptr(x86::rbp, 0));
@@ -922,7 +975,8 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
 	c.lock().add(x86::qword_ptr(x86::rbx), 127);
-	c.mov(x86::eax, x86::r12d);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 	c.jmp(_ret);
 
 	c.bind(fall2);
@@ -952,7 +1006,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 	c.ret();
 });
 
-const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
+const extern auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -979,6 +1033,7 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 #endif
 
 	// Prepare registers
+	build_swap_rdx_with(c, args, x86::r12);
 	c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
 	c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
 	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
@@ -986,13 +1041,20 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 	c.and_(args[0].r32(), 0xff80);
 	c.shr(args[0].r32(), 1);
 	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
-	c.mov(x86::r12d, 1);
 	c.mov(x86::r13, args[1]);
 
+	// Alloc args[0] to stamp0
+	const auto stamp0 = args[0];
+	build_get_tsc(c, stamp0);
+
 	// Begin transaction
-	Label tx0 = build_transaction_enter(c, fall, x86::r12d, 8, [&]()
+	Label tx0 = build_transaction_enter(c, fall, [&]()
 	{
 		c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
+		build_get_tsc(c);
+		c.sub(x86::rax, stamp0);
+		c.cmp(x86::rax, imm_ptr(&g_rtm_tx_limit1));
+		c.jae(fall);
 	});
 
 	// Check pause flag
@@ -1026,6 +1088,8 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 
 	c.xend();
 	c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
+	build_get_tsc(c);
+	c.sub(x86::rax, stamp0);
 
 	// Store data
 	if (s_tsx_avx)
@@ -1047,9 +1111,7 @@ const extern auto spu_getllar_tx = build_function_asm<u32(*)(u32 raddr, void* rd
 		c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
 	}
 
-	c.mov(x86::eax, 1);
 	c.jmp(_ret);
-
 	c.bind(fall);
 	c.xor_(x86::eax, x86::eax);
 	//c.jmp(_ret);
@@ -1546,6 +1608,7 @@ void spu_thread::push_snr(u32 number, u32 value)
 	const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
 	const u32 bitor_bit = (snr_config >> number) & 1;
 
+	// Redundant, g_use_rtm is checked inside tx_start now.
 	if (g_use_rtm)
 	{
 		bool channel_notify = false;
@@ -2422,9 +2485,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 
 		if (g_use_rtm) [[likely]]
 		{
-			switch (u32 count = spu_putllc_tx(addr, rtime, rdata, to_write))
+			switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
 			{
-			case UINT32_MAX:
+			case UINT64_MAX:
 			{
 				auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
 
@@ -2451,6 +2514,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 					break;
 				}
 
+				last_ftime = -1;
 				[[fallthrough]];
 			}
 			case 0:
@@ -2460,6 +2524,12 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 					last_fail++;
 				}
 
+				if (last_ftime != umax)
+				{
+					last_faddr = 0;
+					return false;
+				}
+
 				_m_prefetchw(rdata);
 				_m_prefetchw(rdata + 64);
 				last_faddr = addr;
@@ -2469,9 +2539,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 			}
 			default:
 			{
-				if (count > 60 && g_cfg.core.perf_report) [[unlikely]]
+				if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
 				{
-					perf_log.warning("PUTLLC: took too long: %u", count);
+					perf_log.warning(u8"PUTLLC: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
 				}
 
 				break;
@@ -2566,7 +2636,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 
 	if (g_use_rtm) [[likely]]
 	{
-		const u32 result = spu_putlluc_tx(addr, to_write, cpu);
+		const u64 result = spu_putlluc_tx(addr, to_write, cpu);
 
 		if (result == 0)
 		{
@@ -2579,9 +2649,9 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 				res += 127;
 			});
 		}
-		else if (result > 60 && g_cfg.core.perf_report) [[unlikely]]
+		else if (result > 20000 && g_cfg.core.perf_report) [[unlikely]]
 		{
-			perf_log.warning("STORE128: took too long: %u", result);
+			perf_log.warning(u8"STORE128: took too long: %.3fµs (%u c)", result / (utils::get_tsc_freq() / 1000'000.), result);
 		}
 
 		static_cast<void>(cpu->test_stopped());
@@ -2796,6 +2866,7 @@ bool spu_thread::process_mfc_cmd()
 		{
 			rtime = last_ftime;
 			raddr = last_faddr;
+			last_ftime = 0;
 			mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
 
 			ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index e28053e676..6fdd29072b 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -550,17 +550,19 @@ namespace vm
 
 	void reservation_op_internal(u32 addr, std::function<bool()> func)
 	{
-		auto& res = vm::reservation_acquire(addr, 128);
+		auto& res = vm::reservation_acquire(addr, 1);
+		auto* ptr = vm::get_super_ptr(addr & -128);
 
-		cpu_thread::suspend_all(get_current_cpu_thread(), {&res}, [&]
+		cpu_thread::suspend_all(get_current_cpu_thread(), {ptr, ptr + 64, &res}, [&]
 		{
 			if (func())
 			{
-				// Success, release all locks if necessary
+				// Success, release the lock and progress
 				res += 127;
 			}
 			else
 			{
+				// Only release the lock on failure
 				res -= 1;
 			}
 		});
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index 1e4b586763..be3903421c 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -7,6 +7,7 @@
 #include <functional>
 
 extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit2;
 
 namespace vm
 {
@@ -70,8 +71,8 @@ namespace vm
 	// TODO: remove and make it external
 	void reservation_op_internal(u32 addr, std::function<bool()> func);
 
-	template <bool Ack = false, typename T, typename AT = u32, typename F>
-	SAFE_BUFFERS inline auto reservation_op(_ptr_base<T, AT> ptr, F op)
+	template <bool Ack = false, typename CPU, typename T, typename AT = u32, typename F>
+	SAFE_BUFFERS inline auto reservation_op(CPU& cpu, _ptr_base<T, AT> ptr, F op)
 	{
 		// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
 		static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type");
@@ -94,9 +95,10 @@ namespace vm
 		{
 			// Stage 1: single optimistic transaction attempt
 			unsigned status = _XBEGIN_STARTED;
-			unsigned count = 0;
 			u64 _old = 0;
 
+			auto stamp0 = __rdtsc(), stamp1 = stamp0, stamp2 = stamp0;
+
 #ifndef _MSC_VER
 			__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
 #else
@@ -157,6 +159,7 @@ namespace vm
 #ifndef _MSC_VER
 			__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
 #endif
+			stamp1 = __rdtsc();
 
 			// Touch memory if transaction failed with status 0
 			if (!status)
@@ -167,12 +170,17 @@ namespace vm
 			// Stage 2: try to lock reservation first
 			_old = res.fetch_add(1);
 
-			// Also identify atomic op
-			count = 1;
+			// Discount stage 1 duration: the first limit check below only sees the time elapsed since stamp1 (memory touch included)
+			stamp2 = __rdtsc() - (stamp1 - stamp0);
 
-			// Start lightened transaction (TODO: tweaking)
-			for (; !(_old & rsrv_unique_lock) && count < 60; count++)
+			// Start lightened transaction
+			for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = __rdtsc())
 			{
+				if (cpu.has_pause_flag())
+				{
+					break;
+				}
+
 #ifndef _MSC_VER
 				__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
 #else
diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp
index a71dbb51c1..521ec42b28 100644
--- a/rpcs3/Emu/System.cpp
+++ b/rpcs3/Emu/System.cpp
@@ -55,7 +55,9 @@ LOG_CHANNEL(sys_log, "SYS");
 
 stx::manual_fixed_typemap<void> g_fixed_typemap;
 
-bool g_use_rtm;
+bool g_use_rtm = false;
+u64 g_rtm_tx_limit1 = 0;
+u64 g_rtm_tx_limit2 = 0;
 
 std::string g_cfg_defaults;
 
@@ -1019,6 +1021,14 @@ game_boot_result Emulator::Load(const std::string& title_id, bool add_only, bool
 			}
 		}
 
+		if (g_use_rtm)
+		{
+			// Convert the configured TSX transaction limits from nanoseconds to TSC ticks
+			const f64 _1ns = utils::get_tsc_freq() / 1000'000'000.;
+			g_rtm_tx_limit1 = g_cfg.core.tx_limit1_ns * _1ns;
+			g_rtm_tx_limit2 = g_cfg.core.tx_limit2_ns * _1ns;
+		}
+
 		// Load patches from different locations
 		g_fxo->get<patch_engine>()->append_title_patches(m_title_id);
 
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index 2661d58f56..88d97d4115 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -240,3 +240,5 @@ private:
 extern Emulator Emu;
 
 extern bool g_use_rtm;
+extern u64 g_rtm_tx_limit1;
+extern u64 g_rtm_tx_limit2;
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index ca5f74145e..7be132b865 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -66,6 +66,8 @@ struct cfg_root : cfg::node
 		cfg::_bool hle_lwmutex{ this, "HLE lwmutex" }; // Force alternative lwmutex/lwcond implementation
 		cfg::uint64 spu_llvm_lower_bound{ this, "SPU LLVM Lower Bound" };
 		cfg::uint64 spu_llvm_upper_bound{ this, "SPU LLVM Upper Bound", 0xffffffffffffffff };
+		cfg::uint64 tx_limit1_ns{this, "TSX Transaction First Limit", 800}; // In nanoseconds
+		cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
 
 		cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100, true }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
 		cfg::_enum<sleep_timers_accuracy_level> sleep_timers_accuracy{ this, "Sleep Timers Accuracy",