From 89f124814089981aeedcf1d1edab987d9ba31c88 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Wed, 7 Oct 2020 01:14:35 +0300
Subject: [PATCH] Implement vm::reservation_op

Implement vm::reservation_peek (memory load)
Implement vm::unsafe_ptr_cast helper
Example use in cellSpurs.cpp
Fix dma_lockb value and description
---
 rpcs3/Emu/Cell/Modules/cellSpurs.cpp |  86 ++------
 rpcs3/Emu/Cell/Modules/cellSpurs.h   |  23 ++-
 rpcs3/Emu/Memory/vm.cpp              |  47 +++++
 rpcs3/Emu/Memory/vm_ptr.h            |   7 +
 rpcs3/Emu/Memory/vm_reservation.h    | 296 ++++++++++++++++++++++++++-
 5 files changed, 388 insertions(+), 71 deletions(-)

diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
index 1df6704a0c..f9a34c4a4e 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
@@ -160,32 +160,6 @@ extern u32 ppu_lwarx(ppu_thread&, u32);
 extern bool ppu_stwcx(ppu_thread&, u32, u32);
 extern bool ppu_stdcx(ppu_thread&, u32, u64);
 
-bool do_atomic_128_load(cpu_thread& cpu, u32 addr, void* dst)
-{
-	verify(HERE), (addr % 128) == 0;
-
-	while (!cpu.test_stopped())
-	{
-		const u64 rtime = vm::reservation_acquire(addr, 128);
-
-		if (rtime % 128)
-		{
-			continue;
-		}
-
-		std::memcpy(dst, vm::base(addr), 128);
-
-		if (rtime != vm::reservation_acquire(addr, 128))
-		{
-			continue;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
 error_code sys_spu_image_close(ppu_thread&, vm::ptr<sys_spu_image> img);
 
 //----------------------------------------------------------------------------
@@ -2516,7 +2490,7 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
 
 	if (wid >= (spurs->flags1 & SF1_32_WORKLOADS ? CELL_SPURS_MAX_WORKLOAD2 : CELL_SPURS_MAX_WORKLOAD))
 		return CELL_SPURS_POLICY_MODULE_ERROR_INVAL;
-
+
 	if (spurs->exception)
 		return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
 
@@ -4393,7 +4367,7 @@ s32 _spurs::check_job_chain_attribute(u32 sdkVer, vm::cptr<u64> jcEntry, u16 siz
 {
 	if (!jcEntry)
 		return CELL_SPURS_JOB_ERROR_NULL_POINTER;
-
+
 	if (!jcEntry.aligned())
 		return CELL_SPURS_JOB_ERROR_ALIGN;
 
@@ -4592,13 +4566,12 @@ s32 cellSpursGetJobChainInfo(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobChai
 		return err;
 	}
 
-	CellSpursJobChain data;
-
 	// Read the commands queue atomically
-	if (!do_atomic_128_load(ppu, jobChain.addr(), &data))
+	CellSpursJobChain data;
+	vm::reservation_peek(ppu, vm::unsafe_ptr_cast<const CellSpursJobChain_x00>(jobChain), [&](const CellSpursJobChain_x00& jch)
 	{
-		return 0;
-	}
+		std::memcpy(&data, &jch, sizeof(jch));
+	});
 
 	info->linkRegister[0] = +data.linkRegister[0];
 	info->linkRegister[1] = +data.linkRegister[1];
@@ -4896,48 +4869,25 @@ s32 cellSpursAddUrgentCommand(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobCha
 	if (jobChain->workloadId >= CELL_SPURS_MAX_WORKLOAD2)
 		return CELL_SPURS_JOB_ERROR_INVAL;
 
-	for (u32 i = 0;;)
+	s32 result = CELL_OK;
+
+	vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobChain_x00>(jobChain), [&](CellSpursJobChain_x00& jch)
 	{
-		if (i >= std::size(jobChain->urgentCmds))
+		for (auto& cmd : jch.urgentCmds)
 		{
-			// Exausted all slots
-			return CELL_SPURS_JOB_ERROR_BUSY;
-		}
-
-		u64 currCmd = ppu_ldarx(ppu, jobChain.ptr(&CellSpursJobChain::urgentCmds, i).addr());
-		std::atomic_thread_fence(std::memory_order_acq_rel);
-
-		bool found = false;
-		bool reset = false;
-
-		if (!currCmd)
-		{
-			if (i != 0 && !jobChain->urgentCmds[i - 1])
+			if (!cmd)
 			{
-				// Restart search, someone emptied out the previous one
-				reset = true;
-			}
-			else
-			{
-				found = true;
-				currCmd = newCmd;
+				cmd = newCmd;
+				return true;
 			}
 		}
 
-		if (reset || !ppu_stdcx(ppu, jobChain.ptr(&CellSpursJobChain::urgentCmds, i).addr(), currCmd))
-		{
-			// Someone modified the job chain or the previous slot is empty, restart search
-			i = 0;
-			continue;
-		}
+		// Considered unlikely so unoptimized
+		result = CELL_SPURS_JOB_ERROR_BUSY;
+		return false;
+	});
 
-		if (found)
-			break;
-
-		i++;
-	}
-
-	return CELL_OK;
+	return result;
 }
 
 s32 cellSpursAddUrgentCall(ppu_thread& ppu, vm::ptr<CellSpursJobChain> jobChain, vm::ptr<u64> commandList)
diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.h b/rpcs3/Emu/Cell/Modules/cellSpurs.h
index 6485eac0d2..4adfff87ae 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.h
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.h
@@ -465,6 +465,25 @@ struct alignas(128) CellSpursJobChain
 	u8 unk5[0x100 - 0xA8];
 };
 
+struct alignas(128) CellSpursJobChain_x00
+{
+	vm::bcptr<u64, u64> pc;              // 0x00
+	vm::bcptr<u64, u64> linkRegister[3]; // 0x08
+	u8 unk0[0x3];                        // 0x20
+	b8 isHalted;                         // 0x23
+	b8 autoReadyCount;                   // 0x24
+	u8 unk1[0x7];                        // 0x25
+	u8 val2C;                            // 0x2C
+	u8 val2D;                            // 0x2D
+	u8 val2E;                            // 0x2E
+	u8 val2F;                            // 0x2F
+	be_t<u64> urgentCmds[4];             // 0x30
+	u8 unk2[0x22];                       // 0x50
+	be_t<u16> maxGrabbedJob;             // 0x72
+	be_t<u32> workloadId;                // 0x74
+	vm::bptr<CellSpurs, u64> spurs;      // 0x78
+};
+
 struct CellSpursJobChainInfo
 {
 	be_t<u64> urgentCommandSlot[4]; // 0x00
@@ -494,7 +513,7 @@ struct alignas(8) CellSpursJobChainAttribute
 	be_t<u16> maxGrabbedJob; // 0x0E
 	u8 priorities[8];        // 0x10
 	be_t<u32> maxContention; // 0x18
-	b8 autoSpuCount; // 0x1C
+	b8 autoSpuCount;         // 0x1C
 	u8 padding[3];           // 0x1D
 	be_t<u32> tag1;          // 0x20
 	be_t<u32> tag2;          // 0x24
@@ -1031,7 +1050,7 @@ struct alignas(16) CellSpursTaskBinInfo
 
 struct alignas(128) CellSpursBarrier
 {
-	be_t<u32> zero; // 0x00
+	be_t<u32> zero;     // 0x00
 	be_t<u32> remained; // 0x04
 	u8 unk0[0x34 - 0x8];
 	vm::bptr<CellSpursTaskset> taskset; // 0x34
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index d8ba1ef808..0251f28dc8 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -13,6 +13,7 @@
 #include "Emu/CPU/CPUThread.h"
 #include "Emu/Cell/lv2/sys_memory.h"
 #include "Emu/RSX/GSRender.h"
+#include "Emu/Cell/SPURecompiler.h"
 #include
 #include
 #include
@@ -470,6 +471,52 @@
 		}
 	}
 
+	void reservation_op_internal(u32 addr, std::function<bool()> func)
+	{
+		const auto _cpu = get_current_cpu_thread();
+
+		// Acknowledge contender if necessary (TODO: check)
+		_cpu->state += cpu_flag::wait;
+
+		{
+			cpu_thread::suspend_all cpu_lock(_cpu);
+
+			// Wait to acquire PUTLLUC lock
+			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u64>(vm::putlluc_lockb)))
+			{
+				busy_wait(100);
+			}
+
+			if (func())
+			{
+				// Success, release PUTLLUC and PUTLLC locks if necessary
+				vm::reservation_acquire(addr, 128) += 63;
+			}
+			else
+			{
+				// Fake update (TODO)
+				vm::reservation_acquire(addr, 128) += 63;
+			}
+		}
+
+		vm::reservation_notifier(addr, 128).notify_all();
+	}
+
+	void reservation_escape_internal()
+	{
+		const auto _cpu = get_current_cpu_thread();
+
+		if (_cpu && _cpu->id_type() == 1)
+		{
+			thread_ctrl::emergency_exit("vm::reservation_escape");
+		}
+
+		if (_cpu && _cpu->id_type() == 2)
+		{
+			spu_runtime::g_escape(static_cast<spu_thread*>(_cpu));
+		}
+	}
+
 	static void _page_map(u32 addr, u8 flags, u32 size, utils::shm* shm)
 	{
 		if (!size || (size | addr) % 4096 || flags & page_allocated)
diff --git a/rpcs3/Emu/Memory/vm_ptr.h b/rpcs3/Emu/Memory/vm_ptr.h
index d9f71532c4..dd77c453a1 100644
--- a/rpcs3/Emu/Memory/vm_ptr.h
+++ b/rpcs3/Emu/Memory/vm_ptr.h
@@ -323,6 +323,13 @@
 	{
 		return vm::cast(other.addr(), HERE);
 	}
+
+	// Perform reinterpret cast
+	template <typename CT, typename T, typename AT, typename = decltype(reinterpret_cast<to_be_t<CT>*>(std::declval<T*>()))>
+	inline _ptr_base<to_be_t<CT>, u32> unsafe_ptr_cast(const _ptr_base<T, AT>& other)
+	{
+		return vm::cast(other.addr(), HERE);
+	}
 }
 
 struct null_t
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index 6320c91e7d..fd593c28db 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -1,15 +1,19 @@
 #pragma once
 
 #include "vm.h"
+#include "vm_locking.h"
 #include "Utilities/cond.h"
 #include "util/atomic.hpp"
+#include <functional>
+
+extern bool g_use_rtm;
 
 namespace vm
 {
 	enum reservation_lock_bit : u64
 	{
 		stcx_lockb = 1 << 0, // Exclusive conditional reservation lock
-		dma_lockb = 1 << 1, // Inexclusive unconditional reservation lock
+		dma_lockb = 1 << 5, // Exclusive unconditional reservation lock
 		putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock
 	};
@@ -69,4 +73,294 @@
 		return {*res, rtime};
 	}
+
+	void reservation_op_internal(u32 addr, std::function<bool()> func);
+
+	template <typename T, typename AT, typename F>
+	SAFE_BUFFERS inline auto reservation_op(_ptr_base<T, AT> ptr, F op)
+	{
+		// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
+		static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type");
+		static_assert(std::is_trivially_copyable_v<T>, "vm::reservation_op: not triv copyable (optimization)");
+
+		// Use "super" pointer to prevent access violation handling during atomic op
+		const auto sptr = vm::get_super_ptr<T>(static_cast<u32>(ptr.addr()));
+
+		// Use 128-byte aligned addr
+		const u32 addr = static_cast<u32>(ptr.addr()) & -128;
+
+		if (g_use_rtm)
+		{
+			auto& res = vm::reservation_acquire(addr, 128);
+
+			// Stage 1: single optimistic transaction attempt
+			unsigned status = _XBEGIN_STARTED;
+
+#ifndef _MSC_VER
+			__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
+#else
+			status = _xbegin();
+			if (status == _XBEGIN_STARTED)
+#endif
+			{
+				if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
+				{
+					res += 128;
+					std::invoke(op, *sptr);
+#ifndef _MSC_VER
+					__asm__ volatile ("xend;" ::: "memory");
+#else
+					_xend();
+#endif
+					res.notify_all();
+					return;
+				}
+				else
+				{
+					if (auto result = std::invoke(op, *sptr))
+					{
+						res += 128;
+#ifndef _MSC_VER
+						__asm__ volatile ("xend;" ::: "memory");
+#else
+						_xend();
+#endif
+						res.notify_all();
+						return result;
+					}
+					else
+					{
+#ifndef _MSC_VER
+						__asm__ volatile ("xabort $1;" ::: "memory");
+#else
+						_xabort(1);
+#endif
+						// Unreachable code
+						return std::invoke_result_t<F, T&>();
+					}
+				}
+			}
+
+			stage2:
+#ifndef _MSC_VER
+			__asm__ volatile ("movl %%eax, %0;" : "=r" (status) :: "memory");
+#endif
+			if constexpr (!std::is_void_v<std::invoke_result_t<F, T&>>)
+			{
+				if (_XABORT_CODE(status))
+				{
+					// Unfortunately, actual function result is not recoverable in this case
+					return std::invoke_result_t<F, T&>();
+				}
+			}
+
+			// Touch memory if transaction failed without RETRY flag on the first attempt (TODO)
+			if (!(status & _XABORT_RETRY))
+			{
+				reinterpret_cast<atomic_t<u8>*>(sptr)->fetch_add(0);
+			}
+
+			// Stage 2: try to lock reservation first
+			res += stcx_lockb;
+
+			// Start lightened transaction (TODO: tweaking)
+			while (true)
+			{
+#ifndef _MSC_VER
+				__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
+#else
+				status = _xbegin();
+
+				if (status != _XBEGIN_STARTED) [[unlikely]]
+				{
+					goto retry;
+				}
+#endif
+				if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
+				{
+					std::invoke(op, *sptr);
+#ifndef _MSC_VER
+					__asm__ volatile ("xend;" ::: "memory");
+#else
+					_xend();
+#endif
+					res += 127;
+					res.notify_all();
+					return;
+				}
+				else
+				{
+					if (auto result = std::invoke(op, *sptr))
+					{
+#ifndef _MSC_VER
+						__asm__ volatile ("xend;" ::: "memory");
+#else
+						_xend();
+#endif
+						res += 127;
+						res.notify_all();
+						return result;
+					}
+					else
+					{
+#ifndef _MSC_VER
+						__asm__ volatile ("xabort $1;" ::: "memory");
+#else
+						_xabort(1);
+#endif
+						return std::invoke_result_t<F, T&>();
+					}
+				}
+
+				retry:
+#ifndef _MSC_VER
+				__asm__ volatile ("movl %%eax, %0;" : "=r" (status) :: "memory");
+#endif
+				if (!(status & _XABORT_RETRY)) [[unlikely]]
+				{
+					if constexpr (!std::is_void_v<std::invoke_result_t<F, T&>>)
+					{
+						if (_XABORT_CODE(status))
+						{
+							res -= 1;
+							return std::invoke_result_t<F, T&>();
+						}
+					}
+
+					break;
+				}
+			}
+
+			// Stage 3: all failed, heavyweight fallback (see comments at the bottom)
+			if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
+			{
+				return vm::reservation_op_internal(addr, [&]
+				{
+					std::invoke(op, *sptr);
+					return true;
+				});
+			}
+			else
+			{
+				auto result = std::invoke_result_t<F, T&>();
+
+				vm::reservation_op_internal(addr, [&]
+				{
+					T buf = *sptr;
+
+					if ((result = std::invoke(op, buf)))
+					{
+						*sptr = buf;
+						return true;
+					}
+					else
+					{
+						return false;
+					}
+				});
+
+				return result;
+			}
+		}
+
+		// Perform under heavyweight lock
+		auto& res = vm::reservation_acquire(addr, 128);
+
+		res += stcx_lockb;
+
+		// Write directly if the op cannot fail
+		if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
+		{
+			{
+				vm::writer_lock lock(addr);
+				std::invoke(op, *sptr);
+				res += 127;
+			}
+
+			res.notify_all();
+			return;
+		}
+		else
+		{
+			// Make an operational copy of data (TODO: volatile storage?)
+			auto result = std::invoke_result_t<F, T&>();
+
+			{
+				vm::writer_lock lock(addr);
+				T buf = *sptr;
+
+				if ((result = std::invoke(op, buf)))
+				{
+					// If operation succeeds, write the data back
+					*sptr = buf;
+					res += 127;
+				}
+				else
+				{
+					// Operation failed, no memory has been modified
+					res -= 1;
+					return std::invoke_result_t<F, T&>();
+				}
+			}
+
+			res.notify_all();
+			return result;
+		}
+	}
+
+	// For internal usage
+	void reservation_escape_internal();
+
+	// Read memory value in pseudo-atomic manner
+	template <typename CPU, typename T, typename AT, typename F>
+	SAFE_BUFFERS inline auto reservation_peek(CPU&& cpu, _ptr_base<T, AT> ptr, F op)
+	{
+		// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
+		static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_peek: unsupported type");
+
+		// Use "super" pointer to prevent access violation handling during atomic op
+		const auto sptr = vm::get_super_ptr<const T>(static_cast<u32>(ptr.addr()));
+
+		// Use 128-byte aligned addr
+		const u32 addr = static_cast<u32>(ptr.addr()) & -128;
+
+		while (true)
+		{
+			if constexpr (std::is_class_v<std::remove_cvref_t<CPU>>)
+			{
+				if (cpu.test_stopped())
+				{
+					reservation_escape_internal();
+				}
+			}
+
+			const u64 rtime = vm::reservation_acquire(addr, 128);
+
+			if (rtime & 127)
+			{
+				continue;
+			}
+
+			// Observe data non-atomically and make sure no reservation updates were made
+			if constexpr (std::is_void_v<std::invoke_result_t<F, const T&>>)
+			{
+				std::invoke(op, *sptr);
+
+				if (rtime == vm::reservation_acquire(addr, 128))
+				{
+					return;
+				}
+			}
+			else
+			{
+				auto res = std::invoke(op, *sptr);
+
+				if (rtime == vm::reservation_acquire(addr, 128))
+				{
+					return res;
+				}
+			}
+		}
+	}
 
 } // namespace vm
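
Usage sketch (reviewer note, not part of the patch): a minimal example of how the two new helpers are meant to be called, under the constraints the header asserts (the guest type must be trivially copyable, at most 128 bytes, and aligned to its own size). Counter128, try_increment and read_value below are hypothetical names invented purely for illustration; vm::reservation_op, vm::reservation_peek, vm::ptr, be_t and ppu_thread are the real RPCS3 entities this patch relies on.

// Hypothetical 128-byte guest structure used only for this sketch.
struct alignas(128) Counter128
{
	be_t<u64> value;      // 0x00
	be_t<u64> fail_count; // 0x08
	u8 pad[0x70];         // 0x10
};

// Conditional atomic update of a whole 128-byte block: returning false from
// the op aborts without modifying guest memory (transaction rollback on the
// TSX paths, discarded copy on the locked fallback paths).
bool try_increment(vm::ptr<Counter128> ctr)
{
	return vm::reservation_op(ctr, [](Counter128& data)
	{
		if (data.value >= 1000)
		{
			return false; // Reject the update, nothing is written back
		}

		data.value = data.value + 1;
		return true;
	});
}

// Pseudo-atomic read: the op may observe torn data, but its result is only
// accepted once the reservation stamp is verified unchanged across the read.
u64 read_value(ppu_thread& ppu, vm::ptr<Counter128> ctr)
{
	return vm::reservation_peek(ppu, ctr, [](const Counter128& data)
	{
		return static_cast<u64>(data.value);
	});
}

As in the cellSpursAddUrgentCommand change above, the op's return value doubles as the success flag, so callers never have to re-inspect guest memory after a failed attempt.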