Commit ddf9c672ed by Vestral, 2025-04-19 15:52:03 +00:00 (committed by GitHub)
18 changed files with 317 additions and 214 deletions

View file

@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.28)
project(rpcs3 LANGUAGES C CXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11)

View file

@ -344,15 +344,7 @@ jit_runtime_base& asmjit::get_global_runtime()
{
custom_runtime() noexcept
{
// Search starting in first 2 GiB of memory
for (u64 addr = size;; addr += size)
{
if (auto ptr = utils::memory_reserve(size, reinterpret_cast<void*>(addr)))
{
m_pos.raw() = static_cast<uchar*>(ptr);
break;
}
}
ensure(m_pos.raw() = static_cast<uchar*>(utils::memory_reserve(size)));
// Initialize "end" pointer
m_max = m_pos + size;

View file

@ -2490,7 +2490,7 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
if (alert)
{
list.set<0>(_this->m_sync, 0);
list.set<1>(utils::bless<atomic_t<u32>>(&_this->m_taskq)[1], 0);
list.template set<1>(_this->m_taskq);
}
else
{

View file

@ -49,7 +49,7 @@ public:
if (!next)
{
// Do not allow access beyond many elements more at a time
ensure(!installed && index - i < N * 2);
installed = true;
@ -384,17 +384,26 @@ public:
template <typename T>
class lf_queue final
{
atomic_t<u64> m_head{0};
lf_queue_item<T>* load(u64 value) const noexcept
public:
struct fat_ptr
{
return reinterpret_cast<lf_queue_item<T>*>(value >> 16);
u64 ptr{};
u32 is_non_null{};
u32 reserved{};
};
private:
atomic_t<fat_ptr> m_head{fat_ptr{}};
lf_queue_item<T>* load(fat_ptr value) const noexcept
{
return reinterpret_cast<lf_queue_item<T>*>(value.ptr);
}
// Extract all elements and reverse element order (FILO to FIFO)
lf_queue_item<T>* reverse() noexcept
{
if (auto* head = load(m_head) ? load(m_head.exchange(0)) : nullptr)
if (auto* head = load(m_head) ? load(m_head.exchange(fat_ptr{})) : nullptr)
{
if (auto* prev = head->m_link)
{
@ -420,7 +429,7 @@ public:
lf_queue(lf_queue&& other) noexcept
{
m_head.release(other.m_head.exchange(0));
m_head.release(other.m_head.exchange(fat_ptr{}));
}
lf_queue& operator=(lf_queue&& other) noexcept
@ -431,7 +440,7 @@ public:
}
delete load(m_head);
m_head.release(other.m_head.exchange(0));
m_head.release(other.m_head.exchange(fat_ptr{}));
return *this;
}
@ -442,9 +451,9 @@ public:
void wait(std::nullptr_t /*null*/ = nullptr) noexcept
{
if (m_head == 0)
if (!operator bool())
{
utils::bless<atomic_t<u32>>(&m_head)[1].wait(0);
utils::bless<atomic_t<u32>>(&m_head.raw().is_non_null)->wait(0);
}
}
@ -455,7 +464,7 @@ public:
explicit operator bool() const noexcept
{
return m_head != 0;
return observe() != nullptr;
}
template <bool Notify = true, typename... Args>
@ -464,25 +473,25 @@ public:
auto oldv = m_head.load();
auto item = new lf_queue_item<T>(load(oldv), std::forward<Args>(args)...);
while (!m_head.compare_exchange(oldv, reinterpret_cast<u64>(item) << 16))
while (!m_head.compare_exchange(oldv, fat_ptr{reinterpret_cast<u64>(item), item != nullptr, 0}))
{
item->m_link = load(oldv);
}
if (!oldv && Notify)
if (!oldv.ptr && Notify)
{
// Notify only if queue was empty
notify(true);
}
return !oldv;
return !oldv.ptr;
}
void notify(bool force = false)
{
if (force || operator bool())
{
utils::bless<atomic_t<u32>>(&m_head)[1].notify_one();
utils::bless<atomic_t<u32>>(&m_head.raw().is_non_null)->notify_one();
}
}
@ -498,7 +507,7 @@ public:
lf_queue_slice<T> pop_all_reversed()
{
lf_queue_slice<T> result;
result.m_head = load(m_head.exchange(0));
result.m_head = load(m_head.exchange(fat_ptr{}));
return result;
}
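
The head of lf_queue changes from a single 64-bit word holding the pointer shifted left by 16 bits to a 16-byte value, so the pointer is stored verbatim and a dedicated 32-bit flag becomes the word that wait() and notify() operate on. A minimal standalone sketch of that layout, with field names taken from the hunk above (an illustration, not a copy of the repository header):

#include <cstdint>

struct fat_ptr
{
    std::uint64_t ptr{};         // full pointer value, no longer shifted by 16 bits
    std::uint32_t is_non_null{}; // set to 1 when ptr != 0; waiters sleep on this 32-bit word
    std::uint32_t reserved{};
};

static_assert(sizeof(fat_ptr) == 16, "pointer word plus packed 32-bit flag and padding");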

View file

@ -5,13 +5,12 @@ if(MSVC)
add_compile_definitions(
_CRT_SECURE_NO_DEPRECATE=1 _CRT_NON_CONFORMING_SWPRINTFS=1 _SCL_SECURE_NO_WARNINGS=1
NOMINMAX _ENABLE_EXTENDED_ALIGNED_STORAGE=1 _HAS_EXCEPTIONS=0)
add_link_options(/DYNAMICBASE:NO /BASE:0x10000 /FIXED)
add_link_options(/DYNAMICBASE:YES)
#TODO: Some of these could be cleaned up
add_compile_options(/wd4805) # Comparing boolean and int
add_compile_options(/wd4804) # Using integer operators with booleans
add_compile_options(/wd4200) # Zero-sized array in struct/union
add_link_options(/ignore:4281) # Undesirable base address 0x10000
# MSVC 2017 uses iterator as base class internally, causing a lot of warning spam
add_compile_definitions(_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING=1)
@ -19,8 +18,6 @@ if(MSVC)
# Increase stack limit to 8 MB
add_link_options(/STACK:8388608,1048576)
else()
# Some distros have the compilers set to use PIE by default, but RPCS3 doesn't work with PIE, so we need to disable it.
check_cxx_compiler_flag("-no-pie" HAS_NO_PIE)
check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
check_cxx_compiler_flag("-msse -msse2 -mcx16" COMPILER_X86)
if (APPLE)
@ -96,15 +93,6 @@ else()
if(NOT APPLE AND NOT WIN32)
# This hides our LLVM from mesa's LLVM, otherwise we get some unresolvable conflicts.
add_link_options(-Wl,--exclude-libs,ALL)
if(HAS_NO_PIE)
add_link_options(-no-pie)
endif()
elseif(APPLE)
if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
add_link_options(-Wl,-image_base,0x10000 -Wl,-pagezero_size,0x10000)
add_link_options(-Wl,-no_pie)
endif()
elseif(WIN32)
add_compile_definitions(__STDC_FORMAT_MACROS=1)
@ -113,11 +101,6 @@ else()
# Increase stack limit to 8 MB
add_link_options(-Wl,--stack -Wl,8388608)
# For arm64 windows, the image base cannot be below 4GB or the OS rejects the binary without much explanation.
if(COMPILER_X86)
add_link_options(-Wl,--image-base,0x10000)
endif()
endif()
# Specify C++ library to use as standard C++ when using clang (not required on linux due to GNU)

View file

@ -1902,8 +1902,9 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target)
// Take second ghc arg
c.mov(args[0], x86::rbp);
c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)));
c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.jmp(fn_target);
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_base_addr));
c.add(args[2], x86::qword_ptr(args[1]));
c.jmp(Imm(fn_target));
};
}
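
The movabs pattern above recurs throughout this commit: with the fixed 0x10000 image base gone (see the CMake and linker changes), globals such as vm::g_base_addr can end up anywhere in the 64-bit address space, while an x86 memory operand only carries a 32-bit displacement. A hedged asmjit sketch of the replacement idiom as it would appear inside a build_function_asm body like the one above:

// Old form, valid only when the global is reachable through a 32-bit displacement:
//   c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
// ASLR-safe form: materialize the 64-bit address first, then dereference it.
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_base_addr)); // 64-bit immediate load
c.add(args[2], x86::qword_ptr(args[1]));                    // read through the register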

View file

@ -220,19 +220,21 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);
// Initialize args
c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.movabs(x86::r13, reinterpret_cast<u64>(&vm::g_exec_addr));
c.mov(x86::r13, x86::qword_ptr(x86::r13));
c.mov(x86::rbp, args[0]);
c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC
c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::rdx, 1, 0)); // Load call target
c.movabs(x86::r12, vm::g_exec_addr_seg_offset);
c.add(x86::r12, x86::r13);
c.shr(x86::edx, 1);
c.mov(x86::edx, x86::word_ptr(x86::r12, x86::edx)); // Load relocation base
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.mov(x86::r12d, x86::edx); // Set relocation base
c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&vm::g_base_addr));
c.mov(x86::rbx, x86::qword_ptr(x86::rbx));
c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers
c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1)));
c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2)));
@ -346,14 +348,11 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.ldr(call_target, arm::Mem(a64::x19, pc));
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.mov(reg_hp, Imm(vm::g_exec_addr_seg_offset));
c.add(reg_hp, reg_hp, pc, arm::Shift(arm::ShiftOp::kLSR, 2));
c.ldrh(reg_hp.w(), arm::Mem(a64::x19, reg_hp));
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, Imm(16));
c.lsr(call_target, call_target, Imm(16));
// Load registers
c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
c.ldr(a64::x22, arm::Mem(a64::x22));
@ -473,6 +472,11 @@ static inline u8* ppu_ptr(u32 addr)
return vm::g_exec_addr + u64{addr} * 2;
}
static inline u8* ppu_seg_ptr(u32 addr)
{
return vm::g_exec_addr + vm::g_exec_addr_seg_offset + (addr >> 1);
}
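
The relocation base is no longer packed into the top 16 bits of each jump-table entry; it now lives g_exec_addr_seg_offset bytes into g_exec_addr, one u16 per 4-byte PPU instruction, and is shifted left by 13 when consumed (see the gateway code earlier). A small illustrative helper, assuming read_from_ptr<u16> behaves like the read_from_ptr<ppu_intrp_func_t> call just below (the helper itself is hypothetical, not part of the commit):

static inline u32 ppu_seg_base(u32 addr)
{
    // Mirror what the generated gateway code does: movzx/ldrh the u16 entry, then shift left by 13
    return u32{read_from_ptr<u16>(ppu_seg_ptr(addr))} << 13;
}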
static inline ppu_intrp_func_t ppu_read(u32 addr)
{
return read_from_ptr<ppu_intrp_func_t>(ppu_ptr(addr));
@ -518,7 +522,7 @@ void ppu_recompiler_fallback(ppu_thread& ppu)
while (true)
{
if (uptr func = uptr(ppu_read(ppu.cia)); (func << 16 >> 16) != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
if (uptr func = uptr(ppu_read(ppu.cia)); func != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
{
// We found a recompiler function at cia, return
break;
@ -773,6 +777,9 @@ extern void ppu_register_range(u32 addr, u32 size)
utils::memory_commit(ppu_ptr(addr), u64{size} * 2, utils::protection::rw);
ensure(vm::page_protect(addr, size, 0, vm::page_executable));
// Segment data
utils::memory_commit(ppu_seg_ptr(addr), size >> 1, utils::protection::rw);
if (g_cfg.core.ppu_debug)
{
utils::memory_commit(vm::g_stat_addr + addr, size);
@ -785,12 +792,13 @@ extern void ppu_register_range(u32 addr, u32 size)
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
{
// Assume addr is the start of first segment of PRX
const uptr entry_value = reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3));
write_to_ptr<uptr>(ppu_ptr(addr), entry_value);
write_to_ptr<uptr>(ppu_ptr(addr), std::bit_cast<uptr>(ppu_recompiler_fallback_ghc));
write_to_ptr<u16>(ppu_seg_ptr(addr), static_cast<u16>(seg_base >> 13));
}
else
{
write_to_ptr<ppu_intrp_func_t>(ppu_ptr(addr), ppu_fallback);
write_to_ptr<u16>(ppu_seg_ptr(addr), 0);
}
addr += 4;
@ -805,7 +813,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr =
// Initialize specific function
if (ptr)
{
write_to_ptr<uptr>(ppu_ptr(addr), (reinterpret_cast<uptr>(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_read(addr)) & ~0xffff'ffff'ffffu));
write_to_ptr<uptr>(ppu_ptr(addr), std::bit_cast<uptr>(ptr));
return;
}
@ -3164,8 +3172,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r14);
c.sub(x86::rsp, 40);
c.sub(x86::rsp, 48);
#ifdef _WIN32
if (!s_tsx_avx)
{
@ -3176,14 +3185,16 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(x86::rbp, -128);
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
c.movzx(args[0].r32(), args[0].r16());
c.shr(args[0].r32(), 1);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.and_(x86::r11, -128 / 2);
c.and_(args[0].r32(), 63);
@ -3217,7 +3228,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
{
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.movabs(x86::r13, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::r13));
c.jae(fall);
});
@ -3342,8 +3354,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.add(x86::rsp, 48);
c.pop(x86::r14);
c.pop(x86::r13);
c.pop(x86::rbp);
maybe_flush_lbr(c);
@ -4179,7 +4192,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
// 2 7MB overlay files -> 14GB
// The growth in memory requirements of LLVM is not linear with file size of course
// But these estimates should hopefully protect RPCS3 in the coming years
// Especially when thread count is on the rise with each CPU generation
atomic_t<u32> file_size_limit = static_cast<u32>(std::clamp<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536, u32{umax}));
const u32 software_thread_limit = std::min<u32>(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue));
@ -4301,8 +4314,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
if (!src && !Emu.klic.empty() && src.open(path))
{
src = decrypt_self(src, reinterpret_cast<u8*>(&Emu.klic[0]));
if (src)
{
ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path);
@ -4333,7 +4346,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
{
if (value)
{
// Allow at least one file, make 0 the "memory unavailable" sign value for atomic waiting efficiency
const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, file_size));
restore_mem = value - new_val;
value = new_val;
@ -4506,8 +4519,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
if (!src && !Emu.klic.empty() && src.open(path))
{
src = decrypt_self(src, reinterpret_cast<u8*>(&Emu.klic[0]));
if (src)
{
ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path);
}
@ -5079,17 +5092,18 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
code_size_until_jump = buf_end - buf_start;
c.add(x86::edx, seg0);
c.mov(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.movabs(x86::rax, reinterpret_cast<u64>(&vm::g_exec_addr));
c.mov(x86::rax, x86::qword_ptr(x86::rax));
c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx);
c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.mov(x86::rcx, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target
c.movabs(x86::r12, vm::g_exec_addr_seg_offset);
c.add(x86::rax, x86::r12);
c.shr(x86::edx, 1);
c.mov(x86::edx, x86::word_ptr(x86::rax, x86::edx)); // Load relocation base
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.jmp(x86::rax);
c.mov(x86::r12d, x86::edx); // Set relocation base
c.jmp(x86::rcx);
#else
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
// X19 contains vm::g_exec_addr
@ -5125,14 +5139,11 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.mov(reg_hp, Imm(vm::g_exec_addr_seg_offset));
c.add(reg_hp, reg_hp, pc, arm::Shift(arm::ShiftOp::kLSR, 2));
c.ldrh(reg_hp.w(), arm::Mem(exec_addr, reg_hp));
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, 16);
c.lsr(call_target, call_target, 16);
// Execute LLE call
c.br(call_target);
#endif
@ -5340,7 +5351,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
}
part.jit_bounds = std::move(local_jit_bounds);
local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
}
@ -5400,7 +5411,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose
// Write version, hash, CPU, settings
fmt::append(obj_name, "v6-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
fmt::append(obj_name, "v7-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
}
if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
@ -5712,7 +5723,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
for (u32 addr = info.segs[0].addr; addr < info.segs[0].addr + info.segs[0].size; addr += 4, inst_ptr++)
{
if (*inst_ptr == ppu_instructions::BLR() && (reinterpret_cast<uptr>(ppu_read(addr)) << 16 >> 16) == reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
if (*inst_ptr == ppu_instructions::BLR() && reinterpret_cast<uptr>(ppu_read(addr)) == reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
{
write_to_ptr<ppu_intrp_func_t>(ppu_ptr(addr), BLR_func);
}

View file

@ -411,12 +411,19 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
const auto faddr = m_ir->CreateLoad(ptr_inst->getResultElementType(), ptr_inst);
const auto faddr_int = m_ir->CreatePtrToInt(faddr, get_type<uptr>());
const auto fval = m_ir->CreateOr(m_ir->CreateShl(m_seg0, 32 + 3), faddr_int);
const auto pos = m_ir->CreateShl(m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc, 1);
const auto pos_32 = m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc;
const auto pos = m_ir->CreateShl(pos_32, 1);
const auto ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), m_exec, pos));
const auto seg_base_ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(
m_ir->CreatePtrToInt(m_exec, get_type<u64>()), m_ir->getInt64(vm::g_exec_addr_seg_offset)), m_exec->getType());
const auto seg_pos = m_ir->CreateLShr(pos_32, 1);
const auto seg_ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), seg_base_ptr, seg_pos));
const auto seg_val = m_ir->CreateTrunc(m_ir->CreateLShr(m_seg0, 13), get_type<u16>());
// Store to jumptable
m_ir->CreateStore(fval, ptr);
m_ir->CreateStore(faddr_int, ptr);
m_ir->CreateStore(seg_val, seg_ptr);
// Increment index and branch back to loop
const auto post_add = m_ir->CreateAdd(index_value, m_ir->getInt64(1));
@ -605,10 +612,15 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
const auto pos = m_ir->CreateShl(indirect, 1);
const auto ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), m_exec, pos));
const auto val = m_ir->CreateLoad(get_type<u64>(), ptr);
callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0xffff'ffff'ffff), type->getPointerTo()));
callee = FunctionCallee(type, m_ir->CreateIntToPtr(val, type->getPointerTo()));
// Load new segment address
seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 48), 13);
const auto seg_base_ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(
m_ir->CreatePtrToInt(m_exec, get_type<u64>()), m_ir->getInt64(vm::g_exec_addr_seg_offset)), m_exec->getType());
const auto seg_pos = m_ir->CreateLShr(indirect, 1);
const auto seg_ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), seg_base_ptr, seg_pos));
const auto seg_val = m_ir->CreateZExt(m_ir->CreateLoad(get_type<u16>(), seg_ptr), get_type<u64>());
seg0 = m_ir->CreateShl(seg_val, 13);
}
m_ir->SetInsertPoint(block);
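
In plain C++ terms, the IR built above stores and reloads the relocation base roughly as follows; this is a sketch of the semantics with hypothetical helper names, assuming write_to_ptr/read_from_ptr behave as in the PPU thread code earlier in this commit:

// Store side (GetSymbolResolver): one u16 per instruction, holding seg0 >> 13
static inline void store_seg_base(u8* exec, u64 pos_32, u64 seg0)
{
    write_to_ptr<u16>(exec + vm::g_exec_addr_seg_offset + (pos_32 >> 1), static_cast<u16>(seg0 >> 13));
}

// Load side (CallFunction): recover the base for an indirect call target
static inline u64 load_seg_base(const u8* exec, u64 indirect)
{
    return u64{read_from_ptr<u16>(exec + vm::g_exec_addr_seg_offset + (indirect >> 1))} << 13;
}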

View file

@ -2770,14 +2770,17 @@ void spu_recompiler::FREST(spu_opcode_t op)
const u64 fraction_lut_addr = reinterpret_cast<u64>(spu_frest_fraction_lut);
const u64 exponent_lut_addr = reinterpret_cast<u64>(spu_frest_exponent_lut);
c->movabs(*arg0, fraction_lut_addr);
c->movabs(*arg1, exponent_lut_addr);
for (u32 index = 0; index < 4; index++)
{
c->pextrd(*qw0, v_fraction, index);
c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2));
c->pinsrd(v_fraction, *qw1, index);
c->pextrd(*qw0, v_exponent, index);
c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2));
c->pinsrd(v_exponent, *qw1, index);
}
@ -2810,14 +2813,17 @@ void spu_recompiler::FRSQEST(spu_opcode_t op)
const u64 fraction_lut_addr = reinterpret_cast<u64>(spu_frsqest_fraction_lut);
const u64 exponent_lut_addr = reinterpret_cast<u64>(spu_frsqest_exponent_lut);
c->movabs(*arg0, fraction_lut_addr);
c->movabs(*arg1, exponent_lut_addr);
for (u32 index = 0; index < 4; index++)
{
c->pextrd(*qw0, v_fraction, index);
c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2));
c->pinsrd(v_fraction, *qw1, index);
c->pextrd(*qw0, v_exponent, index);
c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2));
c->pinsrd(v_exponent, *qw1, index);
}

View file

@ -842,6 +842,7 @@ void spu_cache::initialize(bool build_existing_cache)
// Initialize compiler instances for parallel compilation
std::unique_ptr<spu_recompiler_base> compiler;
#if defined(ARCH_X64)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
compiler = spu_recompiler_base::make_asmjit_recompiler();
@ -850,6 +851,22 @@ void spu_cache::initialize(bool build_existing_cache)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#elif defined(ARCH_ARM64)
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#else
#error "Unimplemented"
#endif
compiler->init();
@ -2545,7 +2562,7 @@ bool reg_state_t::is_const() const
bool reg_state_t::compare_tags(const reg_state_t& rhs) const
{
// Compare by tag, address of instruction origin
return tag == rhs.tag && origin == rhs.origin && is_instruction == rhs.is_instruction;
}
@ -6066,7 +6083,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
else if (atomic16->ls_offs.compare_with_mask_indifference(atomic16->lsa, SPU_LS_MASK_128) && atomic16->ls.is_less_than(128 - (atomic16->ls_offs.value & 127)))
{
// Relative memory access with offset less than 128 bytes
// Common around SPU utilities which have less strict restrictions about memory alignment
ok = true;
}
}
@ -6340,7 +6357,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
{
atomic16->mem_count++;
// Do not clear lower 16 bytes addressing because the program can move on 4-byte basis
const u32 offs = spu_branch_target(pos - result.lower_bound, op.si16);
if (atomic16->lsa.is_const() && [&]()
@ -8142,7 +8159,7 @@ std::array<reg_state_t, s_reg_max>& block_reg_info::evaluate_start_state(const s
// Check if the node is resolved
if (!node->has_true_state)
{
// Assume this block cannot be resolved at the moment
is_all_resolved = false;
break;
}

View file

@ -628,6 +628,8 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::rbx);
#ifdef _WIN32
c.sub(x86::rsp, 168);
if (s_tsx_avx)
@ -648,17 +650,21 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14);
c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15);
}
#else
c.sub(x86::rsp, 40);
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(args[1], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(args[1], x86::qword_ptr(args[1]));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
c.prefetchw(x86::byte_ptr(args[1], 0));
c.prefetchw(x86::byte_ptr(args[1], 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
// Prepare data
if (s_tsx_avx)
@ -703,7 +709,8 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
@ -853,8 +860,13 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
}
c.add(x86::rsp, 168);
#else
c.add(x86::rsp, 40);
#endif
c.pop(x86::rbx);
c.pop(x86::rbp);
if (s_tsx_avx)
{
c.vzeroupper();
@ -884,8 +896,10 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
#ifdef _WIN32
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
@ -894,7 +908,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(x86::r11, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::r11, x86::qword_ptr(x86::r11));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.prefetchw(x86::byte_ptr(x86::r11, 0));
c.prefetchw(x86::byte_ptr(x86::r11, 64));
@ -921,7 +936,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(args[1], x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(args[1], reinterpret_cast<u64>(+vm::g_reservations));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
@ -933,7 +949,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.add(x86::qword_ptr(args[3]), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
@ -986,6 +1003,10 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::rbp);
maybe_flush_lbr(c);
c.ret();
#else
@ -1023,11 +1044,13 @@ const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cp
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
@ -1039,7 +1062,8 @@ const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cp
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit1));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
@ -2118,20 +2142,31 @@ spu_thread::spu_thread(lv2_spu_group* group, u32 index, std::string_view name, u
, lv2_id(lv2_id)
, spu_tname(make_single<std::string>(name))
{
#if defined(ARCH_X64)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
jit = spu_recompiler_base::make_asmjit_recompiler();
}
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
#if defined(ARCH_X64)
jit = spu_recompiler_base::make_fast_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#elif defined(ARCH_ARM64)
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
jit = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#else
#error "Unimplemented"
#endif
}
if (g_cfg.core.mfc_debug)
{
@ -2193,20 +2228,31 @@ spu_thread::spu_thread(utils::serial& ar, lv2_spu_group* group)
, lv2_id(ar)
, spu_tname(make_single<std::string>(ar.operator std::string()))
{
#if defined(ARCH_X64)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
jit = spu_recompiler_base::make_asmjit_recompiler();
}
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
#if defined(ARCH_X64)
jit = spu_recompiler_base::make_fast_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#elif defined(ARCH_ARM64)
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
jit = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#else
#error "Unimplemented"
#endif
}
if (g_cfg.core.mfc_debug)
{
@ -4445,7 +4491,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
// Detect "invalid" relative branches
// Branch offsets that, although are the only way to get X code address using relative address
// Rely on overflow/underflow of SPU memory bounds
// Thus they would behave differently if SPU LS memory size was to increase (evolving the CELL architecture was the original plan)
// Making them highly unlikely to be valid code
if (rel < 0)
@ -4666,7 +4712,7 @@ bool spu_thread::process_mfc_cmd()
// Add to chance if previous wait was long enough
const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
: 0;
@ -5004,7 +5050,7 @@ bool spu_thread::process_mfc_cmd()
if (group->spurs_running == max_run - 1)
{
// Try to let another thread slip in and take over execution
thread_ctrl::wait_for(300);
// Update value
@ -5029,7 +5075,7 @@ bool spu_thread::process_mfc_cmd()
if (spurs_last_task_timestamp)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
spurs_last_task_timestamp = 0;
@ -5050,7 +5096,7 @@ bool spu_thread::process_mfc_cmd()
}
max_run = group->max_run;
prev_running = group->spurs_running.fetch_op([max_run](u32& x)
{
if (x < max_run)
@ -5115,7 +5161,7 @@ bool spu_thread::process_mfc_cmd()
if (spurs_last_task_timestamp)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
spurs_last_task_timestamp = 0;

View file

@ -47,7 +47,7 @@ namespace vm
u8* const g_sudo_addr = g_base_addr + 0x1'0000'0000;
// Auxiliary virtual memory for executable areas
u8* const g_exec_addr = memory_reserve_4GiB(g_sudo_addr, 0x200000000);
u8* const g_exec_addr = memory_reserve_4GiB(g_sudo_addr, 0x300000000);
// Hooks for memory R/W interception (default: zero offset to some function with only ret instructions)
u8* const g_hook_addr = memory_reserve_4GiB(g_exec_addr, 0x800000000);

View file

@ -34,6 +34,8 @@ namespace vm
extern u8* const g_free_addr;
extern u8 g_reservations[65536 / 128 * 64];
static constexpr u64 g_exec_addr_seg_offset = 0x2'0000'0000ULL;
struct writer_lock;
enum memory_location_t : uint
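
The 0x2'0000'0000 offset and the enlarged g_exec_addr reservation a few hunks above (0x200000000 growing to 0x300000000) fit together as follows; the per-entry sizes are inferred from ppu_ptr and ppu_seg_ptr earlier in the commit, and the sketch is only a consistency check, not repository code:

// Jump table: 8 bytes per 4-byte PPU instruction over a 4 GiB guest space = 8 GiB
constexpr unsigned long long jump_table_bytes = 0x1'0000'0000ULL * 2;
// Segment table: 2 bytes per instruction, placed right after the jump table
constexpr unsigned long long seg_table_bytes = 0x1'0000'0000ULL / 2;
static_assert(jump_table_bytes == 0x2'0000'0000ULL);                   // matches g_exec_addr_seg_offset
static_assert(0x2'0000'0000ULL + seg_table_bytes <= 0x3'0000'0000ULL); // fits the new 12 GiB reservation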

View file

@ -97,10 +97,9 @@
<IgnoreImportLibrary>true</IgnoreImportLibrary>
<LinkIncremental>false</LinkIncremental>
<OutputFile>$(OutDir)\rpcs3.exe</OutputFile>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<RandomizedBaseAddress>true</RandomizedBaseAddress>
<SubSystem>Windows</SubSystem>
<SuppressStartupBanner>true</SuppressStartupBanner>
<BaseAddress>0x10000</BaseAddress>
<EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
</Link>
<Midl>
@ -148,10 +147,11 @@
<GenerateDebugInformation>Debug</GenerateDebugInformation>
<IgnoreImportLibrary>true</IgnoreImportLibrary>
<OutputFile>$(OutDir)\rpcs3d.exe</OutputFile>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<RandomizedBaseAddress>true</RandomizedBaseAddress>
<SubSystem>Windows</SubSystem>
<SuppressStartupBanner>true</SuppressStartupBanner>
<BaseAddress>0x10000</BaseAddress>
<BaseAddress>
</BaseAddress>
<EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
</Link>
<Midl>
@ -2123,4 +2123,4 @@
<UserProperties MocDir=".\QTGeneratedFiles\$(ConfigurationName)" Qt5Version_x0020_x64="$(DefaultQtVersion)" RccDir=".\QTGeneratedFiles" UicDir=".\QTGeneratedFiles" />
</VisualStudio>
</ProjectExtensions>
</Project>

View file

@ -398,6 +398,10 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
spu_bg->addButton(ui->spu_asmjit, static_cast<int>(spu_decoder_type::asmjit));
spu_bg->addButton(ui->spu_llvm, static_cast<int>(spu_decoder_type::llvm));
#ifndef ARCH_X64
ui->spu_asmjit->setEnabled(false);
#endif
connect(spu_bg, &QButtonGroup::idToggled, [this](int id, bool checked)
{
if (!checked) return;

View file

@ -57,8 +57,8 @@ static bool has_waitv()
// Total number of entries.
static constexpr usz s_hashtable_size = 1u << 17;
// Reference counter combined with shifted pointer (which is assumed to be 48 bit)
static constexpr uptr s_ref_mask = 0xffff;
// Reference counter mask
static constexpr uptr s_ref_mask = 0xffff'ffff;
// Fix for silly on-first-use initializer
static bool s_null_wait_cb(const void*, u64, u64){ return true; };
@ -153,8 +153,16 @@ namespace
// Essentially a fat semaphore
struct alignas(64) cond_handle
{
// Combined pointer (most significant 48 bits) and ref counter (16 least significant bits)
atomic_t<u64> ptr_ref;
struct fat_ptr
{
u64 ptr{};
u32 reserved{};
u32 ref_ctr{};
auto operator<=>(const fat_ptr& other) const = default;
};
atomic_t<fat_ptr> ptr_ref;
u64 tid;
u32 oldv;
@ -183,7 +191,7 @@ namespace
mtx.init(mtx);
#endif
ensure(!ptr_ref.exchange((iptr << 16) | 1));
ensure(ptr_ref.exchange(fat_ptr{iptr, 0, 1}) == fat_ptr{});
}
void destroy()
@ -370,7 +378,7 @@ namespace
if (cond_id)
{
// Set fake refctr
s_cond_list[cond_id].ptr_ref.release(1);
s_cond_list[cond_id].ptr_ref.release(cond_handle::fat_ptr{0, 0, 1});
cond_free(cond_id, -1);
}
}
@ -390,7 +398,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
{
// Fast reinitialize
const u32 id = std::exchange(*ptls, 0);
s_cond_list[id].ptr_ref.release((iptr << 16) | 1);
s_cond_list[id].ptr_ref.release(cond_handle::fat_ptr{iptr, 0, 1});
return id;
}
@ -461,15 +469,15 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
const auto cond = s_cond_list + cond_id;
// Dereference, destroy on last ref
const bool last = cond->ptr_ref.atomic_op([](u64& val)
const bool last = cond->ptr_ref.atomic_op([](cond_handle::fat_ptr& val)
{
ensure(val & s_ref_mask);
ensure(val.ref_ctr);
val--;
val.ref_ctr--;
if ((val & s_ref_mask) == 0)
if (val.ref_ctr == 0)
{
val = 0;
val = cond_handle::fat_ptr{};
return true;
}
@ -525,15 +533,15 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
while (true)
{
const auto [old, ok] = cond->ptr_ref.fetch_op([&](u64& val)
const auto [old, ok] = cond->ptr_ref.fetch_op([&](cond_handle::fat_ptr& val)
{
if (!val || (val & s_ref_mask) == s_ref_mask)
if (val == cond_handle::fat_ptr{} || val.ref_ctr == s_ref_mask)
{
// Don't reference already deallocated semaphore
return false;
}
if (iptr && (val >> 16) != iptr)
if (iptr && val.ptr != iptr)
{
// Pointer mismatch
return false;
@ -548,7 +556,7 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
if (!did_ref)
{
val++;
val.ref_ctr++;
}
return true;
@ -566,7 +574,7 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
return cond;
}
if ((old & s_ref_mask) == s_ref_mask)
if (old.ref_ctr == s_ref_mask)
{
fmt::throw_exception("Reference count limit (%u) reached in an atomic notifier.", s_ref_mask);
}
@ -589,12 +597,14 @@ namespace
u64 maxc: 5; // Collision counter
u64 maxd: 11; // Distance counter
u64 bits: 24; // Allocated bits
u64 prio: 24; // Reserved
u64 prio: 8; // Reserved
u64 ref : 16; // Ref counter
u64 iptr: 48; // First pointer to use slot (to count used slots)
u64 iptr: 64; // First pointer to use slot (to count used slots)
};
static_assert(sizeof(slot_allocator) == 16);
// Need to spare 16 bits for ref counter
static constexpr u64 max_threads = 24;
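
A quick bit-budget check of the reshuffled fields above; this is only arithmetic on the diff, not code from the repository:

// New first word: 5 (maxc) + 11 (maxd) + 24 (bits) + 8 (prio) + 16 (ref) = 64 bits
// Second word:    iptr widened from 48 to 64 bits, so the full pointer is kept untruncated
static_assert(5 + 11 + 24 + 8 + 16 == 64);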
@ -935,7 +945,7 @@ atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wa
const auto stamp0 = utils::get_unique_tsc();
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
const uptr iptr = reinterpret_cast<uptr>(data);
uptr iptr_ext[atomic_wait::max_list - 1]{};
@ -956,7 +966,7 @@ atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wa
}
}
iptr_ext[ext_size] = reinterpret_cast<uptr>(e->data) & (~s_ref_mask >> 16);
iptr_ext[ext_size] = reinterpret_cast<uptr>(e->data);
ext_size++;
}
}
@ -1266,7 +1276,7 @@ void atomic_wait_engine::notify_one(const void* data)
return;
}
#endif
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
const uptr iptr = reinterpret_cast<uptr>(data);
root_info::slot_search(iptr, [&](u32 cond_id)
{
@ -1289,7 +1299,7 @@ atomic_wait_engine::notify_all(const void* data)
return;
}
#endif
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
const uptr iptr = reinterpret_cast<uptr>(data);
// Array count for batch notification
u32 count = 0;

View file

@ -205,9 +205,9 @@ namespace atomic_wait
constexpr void set(lf_queue<T2>& var, std::nullptr_t = nullptr)
{
static_assert(Index < Max);
static_assert(sizeof(var) == sizeof(uptr));
static_assert(sizeof(var) == sizeof(uptr) * 2);
m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
m_info[Index].data = reinterpret_cast<char*>(&var) + offsetof(typename lf_queue<T2>::fat_ptr, is_non_null);
m_info[Index].old = 0;
}
@ -215,9 +215,9 @@ namespace atomic_wait
constexpr void set(stx::atomic_ptr<T2>& var, std::nullptr_t = nullptr)
{
static_assert(Index < Max);
static_assert(sizeof(var) == sizeof(uptr));
static_assert(sizeof(var) == sizeof(uptr) * 2);
m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
m_info[Index].data = reinterpret_cast<char*>(&var) + offsetof(typename stx::atomic_ptr<T2>::fat_ptr, is_non_null);
m_info[Index].old = 0;
}
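
Both set() overloads now derive the waitable address from the struct layout instead of a hard-coded +sizeof(u32): with the old packed 8-byte word the interesting 32 bits were its upper half, whereas with the 16-byte fat_ptr they are a named member. A self-contained sketch of that offset, using an illustrative struct that mirrors the fat_ptr definitions elsewhere in this commit:

#include <cstddef>
#include <cstdint>

struct fat_ptr // illustrative copy, not the repository type
{
    std::uint64_t ptr;
    std::uint32_t is_non_null;
    std::uint32_t reserved;
};

// The wait entry now points 8 bytes into the object, at the 32-bit flag:
static_assert(offsetof(fat_ptr, is_non_null) == sizeof(std::uint64_t));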

View file

@ -19,14 +19,8 @@ namespace stx
template <typename T>
class atomic_ptr;
// Basic assumption of userspace pointer size
constexpr uint c_ptr_size = 48;
// Use lower 16 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted)
constexpr uint c_ref_mask = 0xffff, c_ref_size = 16;
// Remaining pointer bits
constexpr uptr c_ptr_mask = static_cast<uptr>(-1) << c_ref_size;
// Use 16 bits as atomic_ptr internal counter of borrowed refs
constexpr uint c_ref_mask = 0xffff;
struct shared_counter
{
@ -574,7 +568,6 @@ namespace stx
}
// Random checks which may fail on invalid pointer
ensure((reinterpret_cast<u64>(r.d()->destroy.load()) - 0x10000) >> 47 == 0);
ensure((r.d()->refs++ - 1) >> 58 == 0);
return r;
}
@ -583,11 +576,21 @@ namespace stx
template <typename T>
class atomic_ptr
{
mutable atomic_t<uptr> m_val{0};
static shared_counter* d(uptr val) noexcept
public:
struct fat_ptr
{
return std::launder(reinterpret_cast<shared_counter*>((val >> c_ref_size) - sizeof(shared_counter)));
uptr ptr{};
u32 is_non_null{};
u32 ref_ctr{};
};
private:
mutable atomic_t<fat_ptr> m_val{fat_ptr{}};
static shared_counter* d(fat_ptr val) noexcept
{
return std::launder(reinterpret_cast<shared_counter*>(val.ptr - sizeof(shared_counter)));
}
shared_counter* d() const noexcept
@ -595,14 +598,19 @@ namespace stx
return d(m_val);
}
static uptr to_val(const volatile std::remove_extent_t<T>* ptr) noexcept
static fat_ptr to_val(const volatile std::remove_extent_t<T>* ptr) noexcept
{
return (reinterpret_cast<uptr>(ptr) << c_ref_size);
return fat_ptr{reinterpret_cast<uptr>(ptr), ptr != nullptr, 0};
}
static std::remove_extent_t<T>* ptr_to(uptr val) noexcept
static fat_ptr to_val(uptr ptr) noexcept
{
return reinterpret_cast<std::remove_extent_t<T>*>(val >> c_ref_size);
return fat_ptr{ptr, ptr != 0, 0};
}
static std::remove_extent_t<T>* ptr_to(fat_ptr val) noexcept
{
return reinterpret_cast<std::remove_extent_t<T>*>(val.ptr);
}
template <typename U>
@ -645,7 +653,7 @@ namespace stx
atomic_ptr(const shared_ptr<U>& r) noexcept
{
// Obtain a ref + as many refs as an atomic_ptr can additionally reference
if (uptr rval = to_val(r.m_ptr))
if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0)
{
m_val.raw() = rval;
d(rval)->refs += c_ref_mask + 1;
@ -655,7 +663,7 @@ namespace stx
template <typename U> requires same_ptr_implicit_v<T, U>
atomic_ptr(shared_ptr<U>&& r) noexcept
{
if (uptr rval = to_val(r.m_ptr))
if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0)
{
m_val.raw() = rval;
d(rval)->refs += c_ref_mask;
@ -667,7 +675,7 @@ namespace stx
template <typename U> requires same_ptr_implicit_v<T, U>
atomic_ptr(single_ptr<U>&& r) noexcept
{
if (uptr rval = to_val(r.m_ptr))
if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0)
{
m_val.raw() = rval;
d(rval)->refs += c_ref_mask;
@ -678,13 +686,13 @@ namespace stx
~atomic_ptr() noexcept
{
const uptr v = m_val.raw();
const fat_ptr v = m_val.raw();
if (v >> c_ref_size)
if (v.ptr)
{
const auto o = d(v);
if (!o->refs.sub_fetch(c_ref_mask + 1 - (v & c_ref_mask)))
if (!o->refs.sub_fetch(c_ref_mask + 1 - (v.ref_ctr & c_ref_mask)))
{
o->destroy.load()(o);
}
@ -733,11 +741,11 @@ namespace stx
shared_type r;
// Add reference
const auto [prev, did_ref] = m_val.fetch_op([](uptr& val)
const auto [prev, did_ref] = m_val.fetch_op([](fat_ptr& val)
{
if (val >> c_ref_size)
if (val.ptr)
{
val++;
val.ref_ctr++;
return true;
}
@ -755,11 +763,11 @@ namespace stx
r.d()->refs++;
// Dereference if still the same pointer
const auto [_, did_deref] = m_val.fetch_op([prev = prev](uptr& val)
const auto [_, did_deref] = m_val.fetch_op([prev = prev](fat_ptr& val)
{
if (val >> c_ref_size == prev >> c_ref_size)
if (val.ptr == prev.ptr)
{
val--;
val.ref_ctr--;
return true;
}
@ -782,11 +790,11 @@ namespace stx
shared_type r;
// Add reference
const auto [prev, did_ref] = m_val.fetch_op([](uptr& val)
const auto [prev, did_ref] = m_val.fetch_op([](fat_ptr& val)
{
if (val >> c_ref_size)
if (val.ptr)
{
val++;
val.ref_ctr++;
return true;
}
@ -823,11 +831,11 @@ namespace stx
}
// Dereference if still the same pointer
const auto [_, did_deref] = m_val.fetch_op([prev = prev](uptr& val)
const auto [_, did_deref] = m_val.fetch_op([prev = prev](fat_ptr& val)
{
if (val >> c_ref_size == prev >> c_ref_size)
if (val.ptr == prev.ptr)
{
val--;
val.ref_ctr--;
return true;
}
@ -888,7 +896,7 @@ namespace stx
atomic_ptr old;
old.m_val.raw() = m_val.exchange(to_val(r.m_ptr));
old.m_val.raw() += 1;
old.m_val.raw().ref_ctr += 1;
r.m_ptr = std::launder(ptr_to(old.m_val));
return r;
@ -904,7 +912,7 @@ namespace stx
atomic_ptr old;
old.m_val.raw() = m_val.exchange(to_val(value.m_ptr));
old.m_val.raw() += 1;
old.m_val.raw().ref_ctr += 1;
value.m_ptr = std::launder(ptr_to(old.m_val));
return value;
@ -923,21 +931,21 @@ namespace stx
atomic_ptr old;
const uptr _val = m_val.fetch_op([&](uptr& val)
const fat_ptr _val = m_val.fetch_op([&](fat_ptr& val)
{
if (val >> c_ref_size == _old)
if (val.ptr == _old)
{
// Set new value
val = _new << c_ref_size;
val = to_val(_new);
}
else if (val)
else if (val.ptr != 0)
{
// Reference previous value
val++;
val.ref_ctr++;
}
});
if (_val >> c_ref_size == _old)
if (_val.ptr == _old)
{
// Success (exch is consumed, cmp_and_old is unchanged)
if (exch.m_ptr)
@ -954,9 +962,10 @@ namespace stx
old_exch.m_val.raw() = to_val(std::exchange(exch.m_ptr, nullptr));
// Set to reset old cmp_and_old value
old.m_val.raw() = to_val(cmp_and_old.m_ptr) | c_ref_mask;
old.m_val.raw() = to_val(cmp_and_old.m_ptr);
old.m_val.raw().ref_ctr |= c_ref_mask;
if (!_val)
if (!_val.ptr)
{
return false;
}
@ -966,11 +975,11 @@ namespace stx
cmp_and_old.d()->refs++;
// Dereference if still the same pointer
const auto [_, did_deref] = m_val.fetch_op([_val](uptr& val)
const auto [_, did_deref] = m_val.fetch_op([_val](fat_ptr& val)
{
if (val >> c_ref_size == _val >> c_ref_size)
if (val.ptr == _val.ptr)
{
val--;
val.ref_ctr--;
return true;
}
@ -1009,12 +1018,12 @@ namespace stx
atomic_ptr old;
const auto [_val, ok] = m_val.fetch_op([&](uptr& val)
const auto [_val, ok] = m_val.fetch_op([&](fat_ptr& val)
{
if (val >> c_ref_size == _old)
if (val.ptr == _old)
{
// Set new value
val = _new << c_ref_size;
val = to_val(_new);
return true;
}
@ -1081,7 +1090,7 @@ namespace stx
if (next.m_ptr)
{
// Compensation for `next` assignment
old.m_val.raw() += 1;
old.m_val.raw().ref_ctr += 1;
}
}
@ -1093,7 +1102,7 @@ namespace stx
explicit constexpr operator bool() const noexcept
{
return m_val != 0;
return observe() != nullptr;
}
template <typename U> requires same_ptr_implicit_v<T, U>
@ -1110,17 +1119,17 @@ namespace stx
void wait(std::nullptr_t, atomic_wait_timeout timeout = atomic_wait_timeout::inf)
{
utils::bless<atomic_t<u32>>(&m_val)[1].wait(0, timeout);
utils::bless<atomic_t<u32>>(&m_val.raw().is_non_null)->wait(0, timeout);
}
void notify_one()
{
utils::bless<atomic_t<u32>>(&m_val)[1].notify_one();
utils::bless<atomic_t<u32>>(&m_val.raw().is_non_null)->notify_one();
}
void notify_all()
{
utils::bless<atomic_t<u32>>(&m_val)[1].notify_all();
utils::bless<atomic_t<u32>>(&m_val.raw().is_non_null)->notify_all();
}
};