diff --git a/3rdparty/7zip/7zip b/3rdparty/7zip/7zip
index e008ce3976..e5431fa6f5 160000
--- a/3rdparty/7zip/7zip
+++ b/3rdparty/7zip/7zip
@@ -1 +1 @@
-Subproject commit e008ce3976c087bfd21344af8f00a23cf69d4174
+Subproject commit e5431fa6f5505e385c6f9367260717e9c47dc2ee
diff --git a/3rdparty/FAudio b/3rdparty/FAudio
index 74d45e615c..b7c2e109ea 160000
--- a/3rdparty/FAudio
+++ b/3rdparty/FAudio
@@ -1 +1 @@
-Subproject commit 74d45e615c2e7510c7e0f2ccb91dc6d7ccae4bec
+Subproject commit b7c2e109ea86b82109244c9c4569ce9ad0c884df
diff --git a/3rdparty/OpenAL/openal-soft b/3rdparty/OpenAL/openal-soft
index d3875f333f..90191edd20 160000
--- a/3rdparty/OpenAL/openal-soft
+++ b/3rdparty/OpenAL/openal-soft
@@ -1 +1 @@
-Subproject commit d3875f333fb6abe2f39d82caca329414871ae53b
+Subproject commit 90191edd20bb877c5cbddfdac7ec0fe49ad93727
diff --git a/3rdparty/curl/curl b/3rdparty/curl/curl
index b1ef0e1a01..75a2079d5c 160000
--- a/3rdparty/curl/curl
+++ b/3rdparty/curl/curl
@@ -1 +1 @@
-Subproject commit b1ef0e1a01c0bb6ee5367bd9c186a603bde3615a
+Subproject commit 75a2079d5c28debb2eaa848ca9430f1fe0d7844c
diff --git a/3rdparty/libsdl-org/SDL b/3rdparty/libsdl-org/SDL
index c98c4fbff6..9c821dc21c 160000
--- a/3rdparty/libsdl-org/SDL
+++ b/3rdparty/libsdl-org/SDL
@@ -1 +1 @@
-Subproject commit c98c4fbff6d8f3016a3ce6685bf8f43433c3efcc
+Subproject commit 9c821dc21ccbd69b2bda421fdb35cb4ae2da8f5e
diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt
index f4f67a4ea1..2d63e67ab8 100644
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@@ -506,6 +506,7 @@ target_sources(rpcs3_emu PRIVATE
     RSX/GL/OpenGL.cpp
     RSX/GL/upscalers/fsr1/fsr_pass.cpp
     RSX/GSRender.cpp
+    RSX/Host/MM.cpp
     RSX/Host/RSXDMAWriter.cpp
     RSX/Null/NullGSRender.cpp
     RSX/NV47/FW/draw_call.cpp
diff --git a/rpcs3/Emu/Cell/Modules/cellAdec.cpp b/rpcs3/Emu/Cell/Modules/cellAdec.cpp
index 43c5f7989b..dfc91c8d2f 100644
--- a/rpcs3/Emu/Cell/Modules/cellAdec.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellAdec.cpp
@@ -1,11 +1,14 @@
 #include "stdafx.h"
+#include "Emu/perf_meter.hpp"
 #include "Emu/Cell/PPUModule.h"
 #include "Emu/Cell/lv2/sys_sync.h"
+#include "Emu/Cell/lv2/sys_ppu_thread.h"
 #include "Emu/savestate_utils.hpp"
+#include "sysPrxForUser.h"
 
 #include "cellAdec.h"
 
-#include <mutex>
+#include "util/simd.hpp"
 #include "util/asm.hpp"
 
 LOG_CHANNEL(cellAdec);
@@ -225,57 +228,732 @@ void fmt_class_string<CellAdecError>::format(std::string& out, u64 arg)
 
 vm::gvar<CellAdecCoreOps> g_cell_adec_core_ops_lpcm;
 
+void LpcmDecContext::exec(ppu_thread& ppu)
+{
+	perf_meter<"LPCMDEC"_u64> perf0;
+
+	switch (savestate)
+	{
+	case lpcm_dec_state::waiting_for_cmd_mutex_lock: break;
+	case lpcm_dec_state::waiting_for_cmd_cond_wait: break;
+	case lpcm_dec_state::waiting_for_output_mutex_lock: goto output_mutex_lock;
+	case lpcm_dec_state::waiting_for_output_cond_wait: goto output_cond_wait;
+	case lpcm_dec_state::queue_mutex_lock: goto queue_mutex_lock;
+	case lpcm_dec_state::executing_cmd: goto execute_cmd;
+	}
+
+	for (; run_thread; cmd_counter++)
+	{
+		cellAdec.trace("Command counter: %llu, waiting for next command...", cmd_counter);
+
+		// Wait for a command to become available
+		error_occurred |= static_cast<u32>(cmd_available.acquire(ppu, savestate) != CELL_OK);
+
+		if (ppu.state & cpu_flag::again)
+		{
+			return;
+		}
+
+		cellAdec.trace("Command available, waiting for output to be consumed...");
+
+		// Wait for the output to be consumed.
+		// The output has to be consumed even if the next command is not a decode command
+		savestate = lpcm_dec_state::waiting_for_output_mutex_lock;
+		output_mutex_lock:
+
+		error_occurred |= static_cast<u32>(sys_mutex_lock(ppu, output_mutex, 0) != CELL_OK);
+
+		if (ppu.state & cpu_flag::again)
+		{
+			return;
+		}
+
+		while (output_locked)
+		{
+			savestate = lpcm_dec_state::waiting_for_output_cond_wait;
+			output_cond_wait:
+
+			ensure(sys_cond_wait(ppu, output_consumed, 0) == CELL_OK); // Error code isn't checked on LLE
+
+			if (ppu.state & cpu_flag::again)
+			{
+				return;
+			}
+		}
+
+		cellAdec.trace("Output consumed");
+
+		// Pop command from queue
+		savestate = lpcm_dec_state::queue_mutex_lock;
+		queue_mutex_lock:
+
+		ensure(sys_mutex_lock(ppu, queue_mutex, 0) == CELL_OK); // Error code isn't checked on LLE
+
+		if (ppu.state & cpu_flag::again)
+		{
+			return;
+		}
+
+		cmd_queue.pop(cmd);
+
+		ensure(sys_mutex_unlock(ppu, queue_mutex) == CELL_OK); // Error code isn't checked on LLE
+
+		cellAdec.trace("Command type: %d", static_cast<u32>(cmd.type.get()));
+
+		savestate = lpcm_dec_state::executing_cmd;
+		execute_cmd:
+
+		switch (cmd.type)
+		{
+		case LpcmDecCmdType::start_seq:
+			// LLE sends a command to the SPU thread. The SPU thread consumes the command without doing anything, however
+			error_occurred |= static_cast<u32>(sys_mutex_unlock(ppu, output_mutex) != CELL_OK);
+			break;
+
+		case LpcmDecCmdType::end_seq:
+		{
+			// Block savestate creation during callbacks
+			std::unique_lock savestate_lock{g_fxo->get<hle_locks_t>(), std::try_to_lock};
+
+			if (!savestate_lock.owns_lock())
+			{
+				ppu.state += cpu_flag::again;
+				return;
+			}
+
+			// Doesn't do anything else
+			notify_seq_done.cbFunc(ppu, notify_seq_done.cbArg);
+
+			error_occurred |= static_cast<u32>(sys_mutex_unlock(ppu, output_mutex) != CELL_OK);
+			break;
+		}
+		case LpcmDecCmdType::close:
+			ensure(sys_mutex_unlock(ppu, output_mutex) == CELL_OK); // Error code isn't checked on LLE
+			return;
+
+		case LpcmDecCmdType::decode_au:
+		{
+			// For 20 and 24-bit samples
+			const u8* const input_u8 = static_cast<const u8*>(cmd.au_start_addr.get_ptr());
+			const s64 au_size_u8 = cmd.au_size;
+
+			// For 16-bit samples
+			const be_t<s16, 1>* const input_s16 = static_cast<const be_t<s16, 1>*>(cmd.au_start_addr.get_ptr());
+			const s64 au_size_s16 = static_cast<s64>(au_size_u8 / sizeof(s16));
+
+			be_t<f32>* const _output = std::assume_aligned<0x80>(output.get_ptr());
+			s64 output_size = cmd.au_size;
+
+			s32 sample_num = static_cast<s32>(utils::align(+lpcm_param->audioPayloadSize, 0x10));
+			s32 channel_num = 0;
+
+			if (!dvd_packing)
+			{
+				switch (lpcm_param->sizeOfWord)
+				{
+				case CELL_ADEC_BIT_LENGTH_16: output_size = output_size * 32 / 16; sample_num /= 2; break;
+				case CELL_ADEC_BIT_LENGTH_20: // Same as below
+				case CELL_ADEC_BIT_LENGTH_24: output_size = output_size * 32 / 24; sample_num /= 3; break;
+				default: ; // LLE skips decoding entirely, the output buffer isn't written to, and it outputs whatever was there before
+				}
+
+				// LPCM streams with an odd number of channels contain an empty dummy channel
+				switch (lpcm_param->channelNumber)
+				{
+				case CELL_ADEC_CH_MONO:    channel_num = 1; output_size = output_size * 1 / 2; break;
+				case CELL_ADEC_CH_STEREO:  channel_num = 2; break;
+				case CELL_ADEC_CH_3_0:     // Same as below
+				case CELL_ADEC_CH_2_1:     channel_num = 3; output_size = output_size * 3 / 4; break;
+				case CELL_ADEC_CH_3_1:     // Same as below
+				case CELL_ADEC_CH_2_2:     channel_num = 4; break;
+				case CELL_ADEC_CH_3_2:     channel_num = 5; output_size = output_size * 5 / 6; break;
+				case CELL_ADEC_CH_3_2_LFE: channel_num = 6; break;
+				case CELL_ADEC_CH_3_4:     channel_num = 7; output_size = output_size * 7 / 8; break;
+				case CELL_ADEC_CH_3_4_LFE: channel_num = 8; break;
+				default: ; // Don't do anything, LLE simply skips reordering channels
+				}
+
+				// LLE doesn't check the output size
+				ensure(output_size <= LPCM_DEC_OUTPUT_BUFFER_SIZE);
+				ensure(sample_num * sizeof(f32) <= LPCM_DEC_OUTPUT_BUFFER_SIZE);
+
+				// Convert to float
+				if (lpcm_param->sizeOfWord == CELL_ADEC_BIT_LENGTH_16)
+				{
+					s64 i = 0;
+					for (; i <= au_size_s16 - 8; i += 8)
+					{
+						const v128 s16be = v128::loadu(&input_s16[i]);
+
+						// Convert endianess if necessary and shift left by 16
+#if defined(ARCH_X64) && !defined(__SSSE3__)
+						const v128 s16le = gv_rol16<8>(s16be);
+						const v128 s32_1 = gv_unpacklo16(gv_bcst16(0), s16le);
+						const v128 s32_2 = gv_unpackhi16(gv_bcst16(0), s16le);
+#else
+						const v128 s32_1 = std::endian::native == std::endian::little
+							? gv_shuffle8(s16be, v128::normal_array_t<s8>{ -1, -1, 1, 0, -1, -1, 3, 2, -1, -1, 5, 4, -1, -1, 7, 6 })
+							: gv_unpacklo16(s16be, gv_bcst16(0));
+
+						const v128 s32_2 = std::endian::native == std::endian::little
+							? gv_shuffle8(s16be, v128::normal_array_t<s8>{ -1, -1, 9, 8, -1, -1, 11, 10, -1, -1, 13, 12, -1, -1, 15, 14 })
+							: gv_unpackhi16(s16be, gv_bcst16(0));
+#endif
+						// Convert to float and divide by INT32_MAX + 1
+						const v128 f32_1 = gv_mulfs(gv_cvts32_tofs(s32_1), 1.f / static_cast<f32>(0x80000000u));
+						const v128 f32_2 = gv_mulfs(gv_cvts32_tofs(s32_2), 1.f / static_cast<f32>(0x80000000u));
+
+						v128::storeu(gv_to_be32(f32_1), &_output[i]);
+						v128::storeu(gv_to_be32(f32_2), &_output[i + 4]);
+					}
+
+					for (; i < au_size_s16; i++)
+					{
+						_output[i] = static_cast<f32>(input_s16[i]) / 0x8000u;
+					}
+				}
+				else if (lpcm_param->sizeOfWord == CELL_ADEC_BIT_LENGTH_20 || lpcm_param->sizeOfWord == CELL_ADEC_BIT_LENGTH_24)
+				{
+					s64 i = 0;
+					for (; i * 3 <= au_size_u8 - static_cast<s64>(sizeof(v128)); i += 4)
+					{
+						// Load four samples, convert endianness if necessary and shift left by 8
+						const v128 _s32 = std::endian::native == std::endian::little
+							? gv_shuffle8(v128::loadu(&input_u8[i * 3]), v128::normal_array_t<s8>{ -1, 2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9 })
+							: gv_shuffle8(v128::loadu(&input_u8[i * 3]), v128::normal_array_t<s8>{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 });
+
+						// Convert to float and divide by INT32_MAX + 1
+						const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast<f32>(0x80000000u));
+
+						v128::storeu(gv_to_be32(_f32), &_output[i]);
+					}
+
+					for (; i * 3 <= au_size_u8 - 3; i++)
+					{
+						alignas(alignof(s32)) const u8 s32le[4] = { 0, input_u8[i * 3 + 2], input_u8[i * 3 + 1], input_u8[i * 3] };
+
+						_output[i] = static_cast<f32>(std::bit_cast<le_t<s32>>(s32le)) / static_cast<f32>(0x80000000u);
+					}
+				}
+
+				// Reorder channels and remove the dummy channel
+
+				// Input channel order:
+				// Front Left, Front Right, Center, Side Left, Rear Left, Rear Right, Side Right, LFE
+
+				// Output channel order:
+				// - up to 3_4: Front Left, Center, Front Right, Side Left, Side Right, Rear Left, Rear Right, LFE
+				// - 3_4_LFE: Front Left, Front Right, Center, LFE, Side Left, Side Right, Rear Left, Rear Right
+
+				// The following loops can access sample_num % channel_num * sizeof(f32) bytes more than LPCM_DEC_OUTPUT_BUFFER_SIZE (up to 28 bytes).
+				// This is intended, LLE does something similar. The buffer is much larger than LPCM_DEC_OUTPUT_BUFFER_SIZE (see _CellAdecCoreOpGetMemSize_lpcm())
+				switch (lpcm_param->channelNumber)
+				{
+				case CELL_ADEC_CH_MONO:
+					for (s32 i = 0; i < sample_num / 2; i += 4)
+					{
+						const v128 tmp1 = v128::loadu(&_output[i * 2]);
+						const v128 tmp2 = v128::loadu(&_output[i * 2 + 4]);
+						v128::storeu(gv_shufflefs<0, 2, 0, 2>(tmp1, tmp2), &_output[i]); // Remove every other sample
+					}
+					break;
+
+				case CELL_ADEC_CH_STEREO:
+				case CELL_ADEC_CH_2_2:
+					// Input order == output order, no need to do anything
+					break;
+
+				case CELL_ADEC_CH_3_0:
+					for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 4, i_out += 3)
+					{
+						const v128 tmp = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center
+						v128::storeu(tmp, &_output[i_out]);
+					}
+					break;
+
+				case CELL_ADEC_CH_2_1:
+					for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 4, i_out += 3)
+					{
+						v128::storeu(v128::loadu(&_output[i_in]), &_output[i_out]);
+					}
+					break;
+
+				case CELL_ADEC_CH_3_1:
+				case CELL_ADEC_CH_3_2_LFE:
+					for (s32 i = 0; i < sample_num; i += channel_num)
+					{
+						const u64 tmp = std::rotl(read_from_ptr<u64>(&_output[i + 1]), 0x20); // Swap Front Right and Center
+						std::memcpy(&_output[i + 1], &tmp, sizeof(u64));
+					}
+					break;
+
+				case CELL_ADEC_CH_3_2:
+					for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 6, i_out += 5)
+					{
+						const v128 tmp = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center
+						v128::storeu(tmp, &_output[i_out]);
+						_output[i_out + 4] = _output[i_in + 4];
+					}
+					break;
+
+				case CELL_ADEC_CH_3_4:
+					for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 8, i_out += 7)
+					{
+						const v128 tmp1 = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center
+						const v128 tmp2 = gv_shuffle32<2, 0, 1, 3>(v128::loadu(&_output[i_in + 4])); // Reorder Rear Left, Rear Right, Side Right -> Side Right, Rear Left, Rear Right
+						v128::storeu(tmp1, &_output[i_out]);
+						v128::storeu(tmp2, &_output[i_out + 4]);
+					}
+					break;
+
+				case CELL_ADEC_CH_3_4_LFE:
+					for (s32 i = 0; i < sample_num; i += 8)
+					{
+						const v128 tmp1 = gv_shuffle32<3, 2, 0, 1>(v128::loadu(&_output[i + 4])); // Reorder Rear Left, Rear Right, Side Right, LFE -> LFE, Side Right, Rear Left, Rear Right
+						v128::storeu(tmp1, &_output[i + 4]);
+						const u64 tmp2 = std::rotl(read_from_ptr<u64>(&_output[i + 3]), 0x20); // Swap Side Left and LFE
+						std::memcpy(&_output[i + 3], &tmp2, sizeof(u64));
+					}
+					break;
+
+				default:
+					; // Don't do anything
+				}
+			}
+			else
+			{
+				switch (lpcm_param->sizeOfWord)
+				{
+				case CELL_ADEC_BIT_LENGTH_16: output_size = output_size * 32 / 16; break;
+				case CELL_ADEC_BIT_LENGTH_20: output_size = output_size * 32 / 20; break;
+				case CELL_ADEC_BIT_LENGTH_24: output_size = output_size * 32 / 24; break;
+				default: fmt::throw_exception("Unreachable"); // Parameters get verified in adecSetLpcmDvdParams()
+				}
+
+				// Only the front left and right channels are decoded, all other channels are ignored
+				switch (lpcm_param->channelNumber)
+				{
+				case CELL_ADEC_LPCM_DVD_CH_MONO: // Set channel_num to two for mono as well
+				case CELL_ADEC_LPCM_DVD_CH_STEREO:  channel_num = 2; break;
+				case 4:                             channel_num = 3; output_size = output_size * 2 / 3; break;
+				case 5:                             channel_num = 4; output_size = output_size * 2 / 4; break;
+				case CELL_ADEC_LPCM_DVD_CH_3_2:     channel_num = 5; output_size = output_size * 2 / 5; break;
+				case CELL_ADEC_LPCM_DVD_CH_3_2_LFE: channel_num = 6; output_size = output_size * 2 / 6; break;
+				case CELL_ADEC_LPCM_DVD_CH_3_4:     channel_num = 7; output_size = output_size * 2 / 7; break;
+				case CELL_ADEC_LPCM_DVD_CH_3_4_LFE: channel_num = 8; output_size = output_size * 2 / 8; break;
+				default: fmt::throw_exception("Unreachable"); // Parameters get verified in adecSetLpcmDvdParams()
+				}
+
+				// LLE doesn't check the output size
+				ensure(output_size <= LPCM_DEC_OUTPUT_BUFFER_SIZE);
+
+				// Convert to float
+				switch (lpcm_param->sizeOfWord)
+				{
+				case CELL_ADEC_BIT_LENGTH_16:
+				{
+					s64 i_in = 0;
+					s64 i_out = 0;
+					for (; i_in <= au_size_s16 - channel_num - 2; i_in += channel_num * 2, i_out += 4)
+					{
+						// Load four samples
+						const v128 tmp1 = gv_loadu32(&input_s16[i_in]);
+						const v128 tmp2 = gv_loadu32(&input_s16[i_in + channel_num]);
+						const v128 s16be = gv_unpacklo32(tmp1, tmp2);
+
+						// Convert endianess if necessary and shift left by 16
+						const v128 _s32 = std::endian::native == std::endian::little
+							? gv_shuffle8(s16be, v128::normal_array_t<s8>{ -1, -1, 1, 0, -1, -1, 3, 2, -1, -1, 5, 4, -1, -1, 7, 6 })
+							: gv_unpacklo16(s16be, gv_bcst16(0));
+
+						// Convert to float and divide by INT32_MAX + 1
+						const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast<f32>(0x80000000u));
+
+						v128::storeu(gv_to_be32(_f32), &_output[i_out]);
+					}
+
+					for (; i_in <= au_size_s16 - 2; i_in += channel_num, i_out += 2)
+					{
+						const v128 s16be = gv_loadu32(&input_s16[i_in]);
+
+						const v128 _s32 = std::endian::native == std::endian::little
+							? gv_shuffle8(s16be, v128::normal_array_t<s8>{ -1, -1, 1, 0, -1, -1, 3, 2, -1, -1, 5, 4, -1, -1, 7, 6 })
+							: gv_unpacklo16(s16be, gv_bcst16(0));
+
+						const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast<f32>(0x80000000u));
+
+						std::memcpy(&_output[i_out], &gv_to_be32(_f32)._u64[0], sizeof(u64));
+					}
+					break;
+				}
+				case CELL_ADEC_BIT_LENGTH_20:
+				{
+					const s64 high_bytes_3_4_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 5 : channel_num * 2;
+					const s64 low_bits_1_2_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 4 : channel_num * 4;
+					const s64 low_bits_3_4_offset = channel_num * 4 + channel_num / 2 - !(channel_num & 1);
+					const s64 next_samples_offset = channel_num * 5;
+
+					// If channel_num is odd, the low bits of samples three and four are in different bytes
+					alignas(alignof(v128)) static constexpr auto shuffle_ctrl_same_offset = std::endian::native == std::endian::little
+						? v128::normal_array_t<s8>{ -1, 8, 1, 0, -1, 8, 3, 2, -1, 11, 5, 4, -1, 11, 7, 6 }
+						: v128::normal_array_t<s8>{ 0, 1, 8, -1, 2, 3, 8, -1, 4, 5, 11, -1, 6, 7, 11, -1 };
+
+					alignas(alignof(v128)) static constexpr auto shuffle_ctrl_different_offset = std::endian::native == std::endian::little
+						? v128::normal_array_t<s8>{ -1, 8, 1, 0, -1, 8, 3, 2, -1, 10, 5, 4, -1, 11, 7, 6 }
+						: v128::normal_array_t<s8>{ 0, 1, 8, -1, 2, 3, 8, -1, 4, 5, 10, -1, 6, 7, 11, -1 };
+
+					const v128 shuffle_ctrl = channel_num & 1 ? v128::loadu(&shuffle_ctrl_different_offset) : v128::loadu(&shuffle_ctrl_same_offset);
+
+					alignas(alignof(v128)) static constexpr auto low_bits_mask_same_offset = std::endian::native == std::endian::little
+						? v128::normal_array_t<u8>{ 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff }
+						: v128::normal_array_t<u8>{ 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00 };
+
+					alignas(alignof(v128)) static constexpr auto low_bits_mask_different_offset = std::endian::native == std::endian::little
+						? v128::normal_array_t<u8>{ 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0xf0, 0xff, 0xff }
+						: v128::normal_array_t<u8>{ 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0xf0, 0x00 };
+
+					const v128 low_bits_mask = channel_num & 1 ? v128::loadu(&low_bits_mask_different_offset) : v128::loadu(&low_bits_mask_same_offset);
+
+					for (s64 i_in = 0, i_out = 0; i_in <= au_size_u8 - low_bits_3_4_offset - (channel_num & 1); i_in += next_samples_offset, i_out += 4)
+					{
+						// Load all the high and low bits of four samples
+						const v128 tmp1 = gv_loadu32(&input_u8[i_in]);
+						const v128 tmp2 = gv_loadu32(&input_u8[i_in + high_bytes_3_4_offset]);
+						v128 s20be = gv_unpacklo32(tmp1, tmp2);
+						s20be = gv_insert16<4>(s20be, read_from_ptr<u16>(&input_u8[i_in + low_bits_1_2_offset]));
+						s20be = gv_insert16<5>(s20be, read_from_ptr<u16>(&input_u8[i_in + low_bits_3_4_offset]));
+
+						// Reorder bytes to form four 32-bit integer samples
+						v128 _s32 = gv_shuffle8(s20be, shuffle_ctrl);
+
+						// Set low 12 bits to zero for each sample
+						_s32 = _s32 & low_bits_mask;
+
+						// LLE is missing a step: each byte that was ANDed with 0x0f would still need to be shifted left by 4
+
+						// Convert to float and divide by INT32_MAX + 1
+						const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast<f32>(0x80000000u));
+
+						v128::storeu(gv_to_be32(_f32), &_output[i_out]);
+					}
+					break;
+				}
+				case CELL_ADEC_BIT_LENGTH_24:
+				{
+					const s64 high_bytes_3_4_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 6 : channel_num * 2;
+					const s64 low_bytes_1_2_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 4 : channel_num * 4;
+					const s64 low_bytes_3_4_offset = channel_num * 5;
+					const s64 next_samples_offset = channel_num * 6;
+
+					for (s64 i_in = 0, i_out = 0; i_in <= au_size_u8 - low_bytes_3_4_offset - 2; i_in += next_samples_offset, i_out += 4)
+					{
+						// Load all the high and low bytes of four samples
+						const v128 tmp1 = gv_loadu32(&input_u8[i_in]);
+						const v128 tmp2 = gv_loadu32(&input_u8[i_in + high_bytes_3_4_offset]);
+						v128 s24be = gv_unpacklo32(tmp1, tmp2);
+						s24be = gv_insert16<4>(s24be, read_from_ptr<u16>(&input_u8[i_in + low_bytes_1_2_offset]));
+						s24be = gv_insert16<5>(s24be, read_from_ptr<u16>(&input_u8[i_in + low_bytes_3_4_offset]));
+
+						// Reorder bytes to form four 32-bit integer samples
+						const v128 _s32 = std::endian::native == std::endian::little
+							? gv_shuffle8(s24be, v128::normal_array_t<s8>{ -1, 8, 1, 0, -1, 9, 3, 2, -1, 10, 5, 4, -1, 11, 7, 6 })
+							: gv_shuffle8(s24be, v128::normal_array_t<s8>{ 0, 1, 8, -1, 2, 3, 9, -1, 4, 5, 10, -1, 6, 7, 11, -1 });
+
+						// Convert to float and divide by INT32_MAX + 1
+						const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast<f32>(0x80000000u));
+
+						v128::storeu(gv_to_be32(_f32), &_output[i_out]);
+					}
+				}
+				}
+			}
+
+			// Block savestate creation during callbacks
+			std::unique_lock savestate_lock{g_fxo->get<hle_locks_t>(), std::try_to_lock};
+
+			if (!savestate_lock.owns_lock())
+			{
+				ppu.state += cpu_flag::again;
+				return;
+			}
+
+			if (error_occurred)
+			{
+				notify_error.cbFunc(ppu, CELL_ADEC_ERROR_FATAL, notify_error.cbArg);
+			}
+
+			notify_au_done.cbFunc(ppu, cmd.pcm_handle, notify_au_done.cbArg);
+
+			output_locked = true;
+			error_occurred |= static_cast<u32>(sys_mutex_unlock(ppu, output_mutex) != CELL_OK);
+
+			const vm::var<CellAdecLpcmInfo> bsi_info{{ lpcm_param->channelNumber, lpcm_param->sampleRate, static_cast<u32>(output_size) }};
+
+			notify_pcm_out.cbFunc(ppu, cmd.pcm_handle, output, static_cast<u32>(output_size), notify_pcm_out.cbArg, vm::make_var<vm::bcptr<void>>(bsi_info), ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV, error_occurred ? static_cast<s32>(CELL_ADEC_ERROR_FATAL) : CELL_OK);
+			break;
+		}
+		default:
+			fmt::throw_exception("Invalid command");
+		}
+	}
+}
+
+template <LpcmDecCmdType type>
+error_code LpcmDecContext::send_command(ppu_thread& ppu, auto&&... args)
+{
+	ppu.state += cpu_flag::wait;
+
+	if (error_code ret = sys_mutex_lock(ppu, queue_size_mutex, 0); ret != CELL_OK)
+	{
+		return ret;
+	}
+
+	if (cmd_queue.full())
+	{
+		ensure(sys_mutex_unlock(ppu, queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE
+		return CELL_ADEC_ERROR_BUSY;
+	}
+
+	// LLE copies the parameters directly into the context
+	if constexpr (type == LpcmDecCmdType::start_seq)
+	{
+		*lpcm_param = { args... };
+	}
+
+	if (error_code ret = sys_mutex_lock(ppu, queue_mutex, 0); ret != CELL_OK)
+	{
+		ensure(sys_mutex_unlock(ppu, queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE
+		return ret;
+	}
+
+	cmd_queue.emplace(type, std::forward<decltype(args)>(args)...);
+
+	if (error_code ret = sys_mutex_unlock(ppu, queue_mutex); ret != CELL_OK
+		|| (ret = cmd_available.release(ppu)) != CELL_OK)
+	{
+		ensure(sys_mutex_unlock(ppu, queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE
+		return ret;
+	}
+
+	return sys_mutex_unlock(ppu, queue_size_mutex);
+}
+
+inline error_code LpcmDecContext::release_output(ppu_thread& ppu)
+{
+	if (error_code ret = sys_mutex_lock(ppu, output_mutex, 0); ret != CELL_OK)
+	{
+		return ret;
+	}
+
+	output_locked = false;
+
+	if (error_code ret = sys_cond_signal(ppu, output_consumed); ret != CELL_OK)
+	{
+		return ret; // LLE doesn't unlock the mutex
+	}
+
+	return sys_mutex_unlock(ppu, output_mutex);
+}
+
+void lpcmDecEntry(ppu_thread& ppu, vm::ptr<LpcmDecContext> lpcm_dec)
+{
+	lpcm_dec->exec(ppu);
+
+	if (ppu.state & cpu_flag::again)
+	{
+		// For savestates, save argument
+		ppu.syscall_args[0] = lpcm_dec.addr();
+
+		return;
+	}
+
+	ppu_execute<&sys_ppu_thread_exit>(ppu, CELL_OK);
+}
+
 error_code _CellAdecCoreOpGetMemSize_lpcm(vm::ptr<CellAdecAttr> attr)
 {
-	cellAdec.todo("_CellAdecCoreOpGetMemSize_lpcm(attr=*0x%x)", attr);
+	cellAdec.notice("_CellAdecCoreOpGetMemSize_lpcm(attr=*0x%x)", attr);
+
+	constexpr u32 mem_size =
+		utils::align(static_cast<u32>(sizeof(LpcmDecContext)), 0x80)
+		+ utils::align(static_cast<u32>(sizeof(CellAdecParamLpcm)), 0x80)
+		+ 0x100 // Command data for Spurs task
+		+ LPCM_DEC_OUTPUT_BUFFER_SIZE
+		+ 0x2900 // sizeof(CellSpurs) + sizeof(CellSpursTaskset)
+		+ 0x3b400 // Spurs context
+		+ 0x300 // (sizeof(CellSpursQueue) + 0x80 + queue buffer) * 2
+		+ 0x855; // Unused
+
+	static_assert(mem_size == 0x7ebd5);
+
+	attr->workMemSize = mem_size;
 
 	return CELL_OK;
 }
 
-[[noreturn]] error_code _CellAdecCoreOpOpenExt_lpcm(vm::ptr<void> handle, vm::ptr<AdecNotifyAuDone> notifyAuDone, vm::ptr<void> notifyAuDoneArg, vm::ptr<AdecNotifyPcmOut> notifyPcmOut, vm::ptr<void> notifyPcmOutArg,
+error_code _CellAdecCoreOpOpenExt_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle, vm::ptr<AdecNotifyAuDone> notifyAuDone, vm::ptr<void> notifyAuDoneArg, vm::ptr<AdecNotifyPcmOut> notifyPcmOut, vm::ptr<void> notifyPcmOutArg,
 	vm::ptr<AdecNotifyError> notifyError, vm::ptr<void> notifyErrorArg, vm::ptr<AdecNotifySeqDone> notifySeqDone, vm::ptr<void> notifySeqDoneArg, vm::cptr<CellAdecResource> res, vm::cptr<CellAdecResourceSpurs> spursRes)
 {
-	cellAdec.todo("_CellAdecCoreOpOpenExt_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=0x%x, notifyError=*0x%x, notifyErrorArg=0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=0x%x, res=*0x%x, spursRes=*0x%x)",
+	cellAdec.notice("_CellAdecCoreOpOpenExt_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=0x%x, notifyError=*0x%x, notifyErrorArg=0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=0x%x, res=*0x%x, spursRes=*0x%x)",
 		handle, notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg, res, spursRes);
 
-	fmt::throw_exception("LPCM decoder not implemented, please disable HLE libadec.sprx");
+	ensure(!!handle && !!res); // Not checked on LLE
+	ensure(handle.aligned(0x80)); // LLE doesn't check the alignment or aligns the address itself
+	ensure(!!notifyAuDone && !!notifyAuDoneArg && !!notifyPcmOut && !!notifyPcmOutArg && !!notifyError && !!notifyErrorArg && !!notifySeqDone && !!notifySeqDoneArg); // These should always be set
+
+	const u32 end_of_context_addr = handle.addr() + utils::align(static_cast<u32>(sizeof(LpcmDecContext)), 0x80);
+
+	handle->cmd_queue.front = 0;
+	handle->cmd_queue.back = 0;
+	handle->cmd_queue.size = 0;
+	handle->run_thread = true;
+	handle->notify_au_done = { notifyAuDone, notifyAuDoneArg };
+	handle->notify_pcm_out = { notifyPcmOut, notifyPcmOutArg };
+	handle->notify_error = { notifyError, notifyErrorArg };
+	handle->notify_seq_done = { notifySeqDone, notifySeqDoneArg };
+	handle->output = vm::bptr<f32>::make(end_of_context_addr + 0x180);
+	handle->lpcm_param.set(end_of_context_addr);
+	handle->error_occurred = false;
+
+	const vm::var<sys_mutex_attribute_t> mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem04"_u64 } }};
+	const vm::var<sys_mutex_attribute_t> output_mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem05"_u64 } }};
+	const vm::var<sys_mutex_attribute_t> queue_mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem06"_u64 } }};
+	const vm::var<sys_cond_attribute_t> cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec03"_u64 } }};
+
+	error_code ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_size_mutex), mutex_attr);
+	ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::queue_size_cond), handle->queue_size_mutex, cond_attr);
+	ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::unk_mutex), mutex_attr);
+	ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::unk_cond), handle->unk_mutex, cond_attr);
+	ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::output_mutex), output_mutex_attr);
+	ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::output_consumed), handle->output_mutex, cond_attr);
+	ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_mutex), queue_mutex_attr);
+	ret = ret ? ret : handle->release_output(ppu);
+	ret = ret ? ret : handle->cmd_available.init(ppu, handle.ptr(&LpcmDecContext::cmd_available), 0);
+	ret = ret ? ret : handle->reserved2.init(ppu, handle.ptr(&LpcmDecContext::reserved2), 0);
+
+	if (ret != CELL_OK)
+	{
+		return ret;
+	}
+
+	// HLE exclusive
+	handle->savestate = lpcm_dec_state::waiting_for_cmd_mutex_lock;
+	handle->cmd_counter = 0;
+
+	const vm::var<char[]> _name = vm::make_str("HLE LPCM decoder");
+	const auto entry = g_fxo->get<ppu_function_manager>().func_addr(FIND_FUNC(lpcmDecEntry));
+
+	ret = ppu_execute<&sys_ppu_thread_create>(ppu, handle.ptr(&LpcmDecContext::thread_id), entry, handle.addr(), +res->ppuThreadPriority, +res->ppuThreadStackSize, SYS_PPU_THREAD_CREATE_JOINABLE, +_name);
+	ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_pop_mutex), mutex_attr);
+	ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_push_mutex), mutex_attr);
+
+	return ret;
 }
 
-[[noreturn]] error_code _CellAdecCoreOpOpen_lpcm(vm::ptr<void> handle, vm::ptr<AdecNotifyAuDone> notifyAuDone, vm::ptr<void> notifyAuDoneArg, vm::ptr<AdecNotifyPcmOut> notifyPcmOut, vm::ptr<void> notifyPcmOutArg,
+error_code _CellAdecCoreOpOpen_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle, vm::ptr<AdecNotifyAuDone> notifyAuDone, vm::ptr<void> notifyAuDoneArg, vm::ptr<AdecNotifyPcmOut> notifyPcmOut, vm::ptr<void> notifyPcmOutArg,
 	vm::ptr<AdecNotifyError> notifyError, vm::ptr<void> notifyErrorArg, vm::ptr<AdecNotifySeqDone> notifySeqDone, vm::ptr<void> notifySeqDoneArg, vm::cptr<CellAdecResource> res)
 {
-	cellAdec.todo("_CellAdecCoreOpOpen_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=*0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=*0x%x, notifyError=*0x%x, notifyErrorArg=*0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=*0x%x, res=*0x%x)",
+	cellAdec.notice("_CellAdecCoreOpOpen_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=*0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=*0x%x, notifyError=*0x%x, notifyErrorArg=*0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=*0x%x, res=*0x%x)",
 		handle, notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg, res);
 
-	fmt::throw_exception("LPCM decoder not implemented, please disable HLE libadec.sprx");
+	return _CellAdecCoreOpOpenExt_lpcm(ppu, handle, notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg, res, vm::null);
 }
 
-error_code _CellAdecCoreOpClose_lpcm(vm::ptr<void> handle)
+error_code _CellAdecCoreOpClose_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle)
 {
-	cellAdec.todo("_CellAdecCoreOpClose_lpcm(handle=*0x%x)", handle);
+	ppu.state += cpu_flag::wait;
 
-	return CELL_OK;
+	cellAdec.notice("_CellAdecCoreOpClose_lpcm(handle=*0x%x)", handle);
+
+	if (error_code ret = sys_mutex_lock(ppu, handle->queue_size_mutex, 0); ret != CELL_OK
+		|| (ret = sys_mutex_lock(ppu, handle->queue_mutex, 0)) != CELL_OK)
+	{
+		return ret;
+	}
+
+	if (handle->cmd_queue.empty())
+	{
+		handle->cmd_queue.emplace(LpcmDecCmdType::close);
+
+		if (error_code ret = sys_mutex_unlock(ppu, handle->queue_mutex); ret != CELL_OK)
+		{
+			return ret; // LLE doesn't unlock the queue size mutex
+		}
+
+		if (error_code ret = handle->cmd_available.release(ppu); ret != CELL_OK)
+		{
+			ensure(sys_mutex_unlock(ppu, handle->queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE
+			return ret;
+		}
+	}
+	else
+	{
+		for (auto& cmd : handle->cmd_queue.elements)
+		{
+			cmd.type = LpcmDecCmdType::close;
+		}
+
+		if (error_code ret = sys_mutex_unlock(ppu, handle->queue_mutex); ret != CELL_OK)
+		{
+			return ret; // LLE doesn't unlock the queue size mutex
+		}
+	}
+
+	error_code ret = sys_mutex_unlock(ppu, handle->queue_size_mutex);
+	ret = ret ? ret : handle->release_output(ppu);
+
+	vm::var<u64> thread_ret;
+	ret = ret ? ret : sys_ppu_thread_join(ppu, static_cast<u32>(handle->thread_id), +thread_ret);
+
+	ret = ret ? ret : sys_cond_destroy(ppu, handle->queue_size_cond);
+	ret = ret ? ret : sys_cond_destroy(ppu, handle->unk_cond);
+	ret = ret ? ret : sys_cond_destroy(ppu, handle->output_consumed);
+	ret = ret ? ret : sys_mutex_destroy(ppu, handle->queue_mutex);
+	ret = ret ? ret : sys_mutex_destroy(ppu, handle->queue_size_mutex);
+	ret = ret ? ret : sys_mutex_destroy(ppu, handle->unk_mutex);
+	ret = ret ? ret : sys_mutex_destroy(ppu, handle->output_mutex);
+	ret = ret ? ret : handle->cmd_available.finalize(ppu);
+	ret = ret ? ret : handle->reserved2.finalize(ppu);
+	ret = ret ? ret : sys_mutex_destroy(ppu, handle->spurs_queue_pop_mutex);
+	ret = ret ? ret : sys_mutex_destroy(ppu, handle->spurs_queue_push_mutex);
+
+	return ret;
 }
 
-error_code _CellAdecCoreOpStartSeq_lpcm(vm::ptr<void> handle, vm::ptr<CellAdecParamLpcm> lpcmParam)
+error_code _CellAdecCoreOpStartSeq_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle, vm::ptr<CellAdecParamLpcm> lpcmParam)
 {
-	cellAdec.todo("_CellAdecCoreOpStartSeq_lpcm(handle=*0x%x, lpcmParam=*0x%x)", handle, lpcmParam);
+	cellAdec.notice("_CellAdecCoreOpStartSeq_lpcm(handle=*0x%x, lpcmParam=*0x%x)", handle, lpcmParam);
 
-	return CELL_OK;
+	ensure(!!handle && !!lpcmParam); // Not checked on LLE
+
+	cellAdec.notice("_CellAdecCoreOpStartSeq_lpcm(): channelNumber=%d, sampleRate=%d, sizeOfWord=%d, audioPayloadSize=0x%x", lpcmParam->channelNumber, lpcmParam->sampleRate, lpcmParam->sizeOfWord, lpcmParam->audioPayloadSize);
+
+	if (lpcmParam->channelNumber >= 0x20u || lpcmParam->sampleRate >= 0x20u || lpcmParam->sizeOfWord >= 0x20u || lpcmParam->audioPayloadSize == 0u)
+	{
+		return CELL_ADEC_ERROR_LPCM_ARG;
+	}
+
+	return handle->send_command<LpcmDecCmdType::start_seq>(ppu, *lpcmParam);
 }
 
-error_code _CellAdecCoreOpEndSeq_lpcm(vm::ptr<void> handle)
+error_code _CellAdecCoreOpEndSeq_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle)
 {
-	cellAdec.todo("_CellAdecCoreOpEndSeq_lpcm(handle=*0x%x)", handle);
+	cellAdec.notice("_CellAdecCoreOpEndSeq_lpcm(handle=*0x%x)", handle);
 
-	return CELL_OK;
+	ensure(!!handle); // Not checked on LLE
+
+	return handle->send_command<LpcmDecCmdType::end_seq>(ppu);
 }
 
-error_code _CellAdecCoreOpDecodeAu_lpcm(vm::ptr<void> handle, s32 pcmHandle, vm::ptr<CellAdecAuInfo> auInfo)
+error_code _CellAdecCoreOpDecodeAu_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle, s32 pcmHandle, vm::ptr<CellAdecAuInfo> auInfo)
 {
-	cellAdec.todo("_CellAdecCoreOpDecodeAu_lpcm(handle=*0x%x, pcmHandle=%d, auInfo=*0x%x)", handle, pcmHandle, auInfo);
+	cellAdec.trace("_CellAdecCoreOpDecodeAu_lpcm(handle=*0x%x, pcmHandle=%d, auInfo=*0x%x)", handle, pcmHandle, auInfo);
 
-	return CELL_OK;
+	ensure(!!handle && !!auInfo); // Not checked on LLE
+
+	cellAdec.trace("_CellAdecCoreOpDecodeAu_lpcm(): startAddr=*0x%x, size=0x%x, pts=0x%x, userData=0x%016x", auInfo->startAddr, auInfo->size, std::bit_cast<be_t<u64>>(auInfo->pts), auInfo->userData);
+
+	return handle->send_command<LpcmDecCmdType::decode_au>(ppu, pcmHandle, *auInfo);
 }
 
 void _CellAdecCoreOpGetVersion_lpcm(vm::ptr<be_t<u32, 1>> version)
@@ -285,18 +963,35 @@ void _CellAdecCoreOpGetVersion_lpcm(vm::ptr<be_t<u32, 1>> version)
 	*version = 0x20070323;
 }
 
-error_code _CellAdecCoreOpRealign_lpcm(vm::ptr<void> handle, vm::ptr<void> outBuffer, vm::cptr<void> pcmStartAddr)
+error_code _CellAdecCoreOpRealign_lpcm(vm::ptr<LpcmDecContext> handle, vm::ptr<void> outBuffer, vm::cptr<void> pcmStartAddr)
 {
-	cellAdec.todo("_CellAdecCoreOpRealign_lpcm(handle=*0x%x, outBuffer=*0x%x, pcmStartAddr=*0x%x)", handle, outBuffer, pcmStartAddr);
+	cellAdec.trace("_CellAdecCoreOpRealign_lpcm(handle=*0x%x, outBuffer=*0x%x, pcmStartAddr=*0x%x)", handle, outBuffer, pcmStartAddr);
+
+	if (!pcmStartAddr)
+	{
+		return CELL_ADEC_ERROR_LPCM_ARG;
+	}
+
+	if (outBuffer)
+	{
+		ensure(!!handle); // Not checked on LLE
+		ensure(vm::check_addr(outBuffer.addr(), vm::page_info_t::page_writable, handle->output_size));
+
+		std::memcpy(outBuffer.get_ptr(), pcmStartAddr.get_ptr(), handle->output_size);
+	}
 
 	return CELL_OK;
 }
 
-error_code _CellAdecCoreOpReleasePcm_lpcm(vm::ptr<void> handle, s32 pcmHandle, vm::ptr<void> outBuffer)
+error_code _CellAdecCoreOpReleasePcm_lpcm(ppu_thread& ppu, vm::ptr<LpcmDecContext> handle, s32 pcmHandle, vm::ptr<void> outBuffer)
 {
-	cellAdec.todo("_CellAdecCoreOpReleasePcm_lpcm(handle=*0x%x, pcmHandle=%d, outBuffer=*0x%x)", handle, pcmHandle, outBuffer);
+	ppu.state += cpu_flag::wait;
 
-	return CELL_OK;
+	cellAdec.trace("_CellAdecCoreOpReleasePcm_lpcm(handle=*0x%x, pcmHandle=%d, outBuffer=*0x%x)", handle, pcmHandle, outBuffer);
+
+	ensure(!!handle); // Not checked on LLE
+
+	return handle->release_output(ppu);
 }
 
 s32 _CellAdecCoreOpGetPcmHandleNum_lpcm()
@@ -544,8 +1239,8 @@ error_code AdecContext::correct_pts_value(ppu_thread& ppu, s32 pcm_handle, s8 co
 		{
 			switch (correct_pts_type)
 			{
-			case ADEC_CORRECT_PTS_VALUE_TYPE_LPCM:           return 450;
-			case 1:                                          return 150;
+			case ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV:      return 450;
+			case ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_DVD:       return 150;
 			case ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_48000Hz: return 3840;
 			case ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_44100Hz: return 4180;
 			case ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_32000Hz: return 5760;
@@ -800,6 +1495,15 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr<CellAdecType> type, vm::cptr<CellAd
 
 	const auto core_ops = get_core_ops(type->audioCodecType);
 
+	// Block savestate creation during ppu_thread::fast_call()
+	std::unique_lock savestate_lock{g_fxo->get<hle_locks_t>(), std::try_to_lock};
+
+	if (!savestate_lock.owns_lock())
+	{
+		ppu.state += cpu_flag::again;
+		return {};
+	}
+
 	const s32 pcm_handle_num = core_ops->getPcmHandleNum(ppu);
 	const u32 bitstream_info_size = core_ops->getBsiInfoSize(ppu);
 
@@ -810,11 +1514,11 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr<CellAdecType> type, vm::cptr<CellAd
 
 	if (type->audioCodecType == CELL_ADEC_TYPE_LPCM_DVD)
 	{
-		// TODO
+		vm::static_ptr_cast<LpcmDecContext>(core_handle)->dvd_packing = true;
 	}
 	else if (type->audioCodecType == CELL_ADEC_TYPE_LPCM_PAMF || type->audioCodecType == CELL_ADEC_TYPE_LPCM_BLURAY)
 	{
-		// TODO
+		vm::static_ptr_cast<LpcmDecContext>(core_handle)->dvd_packing = false;
 	}
 
 	_this->_this = _this;
@@ -862,15 +1566,6 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr<CellAdecType> type, vm::cptr<CellAd
 	const auto notifyError = vm::ptr<AdecNotifyError>::make(g_fxo->get<ppu_function_manager>().func_addr(FIND_FUNC(adecNotifyError)));
 	const auto notifySeqDone = vm::ptr<AdecNotifySeqDone>::make(g_fxo->get<ppu_function_manager>().func_addr(FIND_FUNC(adecNotifySeqDone)));
 
-	// Block savestate creation during ppu_thread::fast_call()
-	std::unique_lock savestate_lock{g_fxo->get<hle_locks_t>(), std::try_to_lock};
-
-	if (!savestate_lock.owns_lock())
-	{
-		ppu.state += cpu_flag::again;
-		return {};
-	}
-
 	if (spursRes)
 	{
 		return core_ops->openExt(ppu, _this->core_handle, notifyAuDone, _this, notifyPcmOut, _this, notifyError, _this, notifySeqDone, _this, res, spursRes);
@@ -1013,6 +1708,177 @@ error_code cellAdecEndSeq(ppu_thread& ppu, vm::ptr<AdecContext> handle)
 	return handle->core_ops->endSeq(ppu, handle->core_handle);
 }
 
+error_code adecSetLpcmBlurayParams(vm::ptr<LpcmDecContext> handle, u64 userData)
+{
+	const u8 channel_number = static_cast<u8>(userData >> 32);
+	const u8 sample_rate = static_cast<u8>(userData >> 40);
+	const u8 size_of_word = static_cast<u8>(userData >> 48);
+	const u8 unk = static_cast<u8>(userData >> 56);
+
+	handle->lpcm_param->channelNumber = channel_number;
+	handle->lpcm_param->sampleRate = sample_rate;
+	handle->lpcm_param->sizeOfWord = size_of_word;
+
+	u32 allocated_channels;
+
+	switch (channel_number)
+	{
+	case CELL_ADEC_CH_MONO:
+	case CELL_ADEC_CH_STEREO:
+		allocated_channels = 2;
+		break;
+
+	case CELL_ADEC_CH_3_0:
+	case CELL_ADEC_CH_2_1:
+	case CELL_ADEC_CH_3_1:
+	case CELL_ADEC_CH_2_2:
+		allocated_channels = 4;
+		break;
+
+	case CELL_ADEC_CH_3_2:
+	case CELL_ADEC_CH_3_2_LFE:
+		allocated_channels = 6;
+		break;
+
+	case CELL_ADEC_CH_3_4:
+	case CELL_ADEC_CH_3_4_LFE:
+		allocated_channels = 8;
+		break;
+
+	default:
+		return CELL_ADEC_ERROR_FATAL;
+	}
+
+	u32 samples_per_frame;
+
+	switch (sample_rate)
+	{
+	case CELL_ADEC_FS_48kHz: samples_per_frame = 48000 / 200; break;
+	case CELL_ADEC_FS_96kHz: samples_per_frame = 96000 / 200; break;
+	case CELL_ADEC_FS_192kHz: samples_per_frame = 192000 / 200; break;
+	default: return CELL_ADEC_ERROR_FATAL;
+	}
+
+	u32 allocated_bytes_per_sample;
+
+	switch (size_of_word)
+	{
+	case CELL_ADEC_BIT_LENGTH_16: allocated_bytes_per_sample = 2; break;
+	case CELL_ADEC_BIT_LENGTH_20: // Same as below
+	case CELL_ADEC_BIT_LENGTH_24: allocated_bytes_per_sample = 3; break;
+	default: return CELL_ADEC_ERROR_FATAL;
+	}
+
+	handle->lpcm_param->audioPayloadSize = allocated_bytes_per_sample * allocated_channels * samples_per_frame * unk;
+
+	return CELL_OK;
+}
+
+error_code adecSetLpcmDvdParams(vm::ptr<LpcmDecContext> handle, u64 userData)
+{
+	const u8 channel_layout = static_cast<u8>(userData >> 32);
+	const u8 sample_rate = static_cast<u8>(userData >> 40);
+	const u8 size_of_word = static_cast<u8>(userData >> 48);
+	const u8 unk = static_cast<u8>(userData >> 56);
+
+	handle->lpcm_param->channelNumber = channel_layout;
+	handle->lpcm_param->sampleRate = sample_rate;
+	handle->lpcm_param->sizeOfWord = size_of_word;
+
+	u32 samples_per_frame;
+
+	switch (sample_rate)
+	{
+	case CELL_ADEC_FS_48kHz: samples_per_frame = 48000 / 600; break;
+	case CELL_ADEC_FS_96kHz: samples_per_frame = 96000 / 600; break;
+	default: return CELL_ADEC_ERROR_FATAL;
+	}
+
+	u32 bits_per_sample;
+
+	switch (size_of_word)
+	{
+	case CELL_ADEC_BIT_LENGTH_16: bits_per_sample = 16; break;
+	case CELL_ADEC_BIT_LENGTH_20: bits_per_sample = 20; break;
+	case CELL_ADEC_BIT_LENGTH_24: bits_per_sample = 24; break;
+	default: return CELL_ADEC_ERROR_FATAL;
+	}
+
+	u32 channel_number;
+
+	switch (channel_layout)
+	{
+	case CELL_ADEC_LPCM_DVD_CH_MONO:
+		channel_number = 1;
+		break;
+
+	case CELL_ADEC_LPCM_DVD_CH_STEREO:
+		channel_number = 2;
+		break;
+
+	case 4:
+		if (sample_rate == CELL_ADEC_FS_96kHz && size_of_word == CELL_ADEC_BIT_LENGTH_24)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
+
+		channel_number = 3;
+		break;
+
+	case 5:
+		if (sample_rate == CELL_ADEC_FS_96kHz && size_of_word != CELL_ADEC_BIT_LENGTH_16)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
+
+		channel_number = 4;
+		break;
+
+	case CELL_ADEC_LPCM_DVD_CH_3_2:
+		if (sample_rate == CELL_ADEC_FS_96kHz)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
+
+		channel_number = 5;
+		break;
+
+	case CELL_ADEC_LPCM_DVD_CH_3_2_LFE:
+		if (sample_rate == CELL_ADEC_FS_96kHz || size_of_word == CELL_ADEC_BIT_LENGTH_24)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
+
+		channel_number = 6;
+		break;
+
+	case CELL_ADEC_LPCM_DVD_CH_3_4:
+		if (sample_rate == CELL_ADEC_FS_96kHz || size_of_word != CELL_ADEC_BIT_LENGTH_16)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
+
+		channel_number = 7;
+		break;
+
+	case CELL_ADEC_LPCM_DVD_CH_3_4_LFE:
+		if (sample_rate == CELL_ADEC_FS_96kHz || size_of_word != CELL_ADEC_BIT_LENGTH_16)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
+
+		channel_number = 8;
+		break;
+
+	default:
+		return CELL_ADEC_ERROR_FATAL;
+	}
+
+	handle->lpcm_param->audioPayloadSize = bits_per_sample * channel_number * samples_per_frame / 8 * unk;
+
+	return CELL_OK;
+}
+
 error_code cellAdecDecodeAu(ppu_thread& ppu, vm::ptr<AdecContext> handle, vm::ptr<CellAdecAuInfo> auInfo)
 {
 	// Block savestate creation during ppu_thread::fast_call()
@@ -1052,11 +1918,17 @@ error_code cellAdecDecodeAu(ppu_thread& ppu, vm::ptr<AdecContext> handle, vm::pt
 
 	if (handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_BLURAY)
 	{
-		// TODO
+		if (adecSetLpcmBlurayParams(vm::static_ptr_cast<LpcmDecContext>(handle->core_handle), auInfo->userData) != CELL_OK)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
 	}
 	else if (handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_DVD)
 	{
-		// TODO
+		if (adecSetLpcmDvdParams(vm::static_ptr_cast<LpcmDecContext>(handle->core_handle), auInfo->userData) != CELL_OK)
+		{
+			return CELL_ADEC_ERROR_FATAL;
+		}
 	}
 
 	return handle->core_ops->decodeAu(ppu, handle->core_handle, pcmHandle, auInfo);
@@ -1104,7 +1976,7 @@ error_code cellAdecGetPcm(ppu_thread& ppu, vm::ptr<AdecContext> handle, vm::ptr<
 
 	if (handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_PAMF || handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_BLURAY || handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_DVD)
 	{
-		// TODO
+		vm::static_ptr_cast<LpcmDecContext>(handle->core_handle)->output_size = pcm_item->size;
 	}
 
 	if (error_code ret = handle->core_ops->realign(ppu, handle->core_handle, outBuffer, pcm_queue_entry->pcm_item->startAddr); ret != CELL_OK)
@@ -1206,6 +2078,8 @@ DECLARE(ppu_module_manager::cellAdec)("cellAdec", []()
 	REG_HIDDEN_FUNC(_CellAdecCoreOpGetPcmHandleNum_lpcm);
 	REG_HIDDEN_FUNC(_CellAdecCoreOpGetBsiInfoSize_lpcm);
 	REG_HIDDEN_FUNC(_CellAdecCoreOpOpenExt_lpcm);
+
+	REG_HIDDEN_FUNC(lpcmDecEntry);
 });
 
 DECLARE(ppu_module_manager::cell_libac3dec)("cell_libac3dec", []
diff --git a/rpcs3/Emu/Cell/Modules/cellAdec.h b/rpcs3/Emu/Cell/Modules/cellAdec.h
index 1cbeca08b8..a43c3f0aef 100644
--- a/rpcs3/Emu/Cell/Modules/cellAdec.h
+++ b/rpcs3/Emu/Cell/Modules/cellAdec.h
@@ -253,7 +253,7 @@ enum CellAdecSampleRate : s32
 	CELL_ADEC_FS_8kHz,
 };
 
-enum CellAdecBitLength : s32
+enum CellAdecBitLength : u32
 {
 	CELL_ADEC_BIT_LENGTH_RESERVED1,
 	CELL_ADEC_BIT_LENGTH_16,
@@ -352,8 +352,8 @@ enum AdecCorrectPtsValueType : s8
 	ADEC_CORRECT_PTS_VALUE_TYPE_UNSPECIFIED = -1,
 
 	// Adds a fixed amount
-	ADEC_CORRECT_PTS_VALUE_TYPE_LPCM = 0,
-	// 1
+	ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV = 0,
+	ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_DVD = 1, // Unused for some reason, the DVD player probably takes care of timestamps itself
 	ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_48000Hz = 2,
 	ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_44100Hz = 3,
 	ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_32000Hz = 4,
@@ -562,6 +562,11 @@ public:
 	{
 		ensure(sys_mutex_lock(ppu, mutex, 0) == CELL_OK); // Error code isn't checked on LLE
 
+		if (ppu.state & cpu_flag::again) // Savestate was created while waiting on the mutex
+		{
+			return {};
+		}
+
 		if (entries[front].state == 0xff)
 		{
 			ensure(sys_mutex_unlock(ppu, mutex) == CELL_OK); // Error code isn't checked on LLE
@@ -648,6 +653,20 @@ static_assert(std::is_standard_layout_v<AdecContext> && std::is_trivial_v<AdecCo
 CHECK_SIZE_ALIGN(AdecContext, 0x530, 8);
 
 
+enum : u32
+{
+	CELL_ADEC_LPCM_DVD_CH_RESERVED1,
+	CELL_ADEC_LPCM_DVD_CH_MONO,
+	CELL_ADEC_LPCM_DVD_CH_RESERVED2,
+	CELL_ADEC_LPCM_DVD_CH_STEREO,
+	CELL_ADEC_LPCM_DVD_CH_UNK1, // Either 3 front or 2 front + 1 surround
+	CELL_ADEC_LPCM_DVD_CH_UNK2, // Either 3 front + 1 surround or 2 front + 2 surround
+	CELL_ADEC_LPCM_DVD_CH_3_2,
+	CELL_ADEC_LPCM_DVD_CH_3_2_LFE,
+	CELL_ADEC_LPCM_DVD_CH_3_4,
+	CELL_ADEC_LPCM_DVD_CH_3_4_LFE,
+};
+
 struct CellAdecParamLpcm
 {
 	be_t<u32> channelNumber;
@@ -664,6 +683,216 @@ struct CellAdecLpcmInfo
 	be_t<u32> outputDataSize;
 };
 
+// HLE exclusive, for savestates
+enum class lpcm_dec_state : u8
+{
+	waiting_for_cmd_mutex_lock,
+	waiting_for_cmd_cond_wait,
+	waiting_for_output_mutex_lock,
+	waiting_for_output_cond_wait,
+	queue_mutex_lock,
+	executing_cmd
+};
+
+class LpcmDecSemaphore
+{
+	be_t<u32> value;
+	be_t<u32> mutex; // sys_mutex_t
+	be_t<u32> cond;  // sys_cond_t
+
+public:
+	error_code init(ppu_thread& ppu, vm::ptr<LpcmDecSemaphore> _this, u32 initial_value)
+	{
+		value = initial_value;
+
+		const vm::var<sys_mutex_attribute_t> mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem01"_u64 } }};
+		const vm::var<sys_cond_attribute_t> cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec01"_u64 } }};
+
+		if (error_code ret = sys_mutex_create(ppu, _this.ptr(&LpcmDecSemaphore::mutex), mutex_attr); ret != CELL_OK)
+		{
+			return ret;
+		}
+
+		return sys_cond_create(ppu, _this.ptr(&LpcmDecSemaphore::cond), mutex, cond_attr);
+	}
+
+	error_code finalize(ppu_thread& ppu) const
+	{
+		if (error_code ret = sys_cond_destroy(ppu, cond); ret != CELL_OK)
+		{
+			return ret;
+		}
+
+		return sys_mutex_destroy(ppu, mutex);
+	}
+
+	error_code release(ppu_thread& ppu)
+	{
+		if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK)
+		{
+			return ret;
+		}
+
+		value++;
+
+		if (error_code ret = sys_cond_signal(ppu, cond); ret != CELL_OK)
+		{
+			return ret; // LLE doesn't unlock the mutex
+		}
+
+		return sys_mutex_unlock(ppu, mutex);
+	}
+
+	error_code acquire(ppu_thread& ppu, lpcm_dec_state& savestate)
+	{
+		if (savestate == lpcm_dec_state::waiting_for_cmd_cond_wait)
+		{
+			goto cond_wait;
+		}
+
+		savestate = lpcm_dec_state::waiting_for_cmd_mutex_lock;
+
+		if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK)
+		{
+			return ret;
+		}
+
+		if (ppu.state & cpu_flag::again)
+		{
+			return {};
+		}
+
+		if (value == 0u)
+		{
+			savestate = lpcm_dec_state::waiting_for_cmd_cond_wait;
+			cond_wait:
+
+			if (error_code ret = sys_cond_wait(ppu, cond, 0); ret != CELL_OK)
+			{
+				return ret; // LLE doesn't unlock the mutex
+			}
+
+			if (ppu.state & cpu_flag::again)
+			{
+				return {};
+			}
+		}
+
+		value--;
+
+		return sys_mutex_unlock(ppu, mutex);
+	}
+};
+
+CHECK_SIZE(LpcmDecSemaphore, 0xc);
+
+enum class LpcmDecCmdType : u32
+{
+	start_seq,
+	end_seq,
+	decode_au,
+	close
+};
+
+struct LpcmDecCmd
+{
+	be_t<s32> pcm_handle;
+	vm::bcptr<void> au_start_addr;
+	be_t<u32> au_size;
+	u32 reserved1[2];
+	CellAdecParamLpcm lpcm_param;
+	be_t<LpcmDecCmdType> type;
+	u32 reserved2;
+
+	LpcmDecCmd() = default; // cellAdecOpen()
+
+	LpcmDecCmd(LpcmDecCmdType&& type) // End sequence
+		: type(type)
+	{
+	}
+
+	LpcmDecCmd(LpcmDecCmdType&& type, const CellAdecParamLpcm& lpcm_param) // Start sequence
+		: lpcm_param(lpcm_param), type(type)
+	{
+	}
+
+	LpcmDecCmd(LpcmDecCmdType&& type, const s32& pcm_handle, const CellAdecAuInfo& au_info) // Decode au
+		: pcm_handle(pcm_handle), au_start_addr(au_info.startAddr), au_size(au_info.size), type(type)
+	{
+	}
+};
+
+CHECK_SIZE(LpcmDecCmd, 0x2c);
+
+struct LpcmDecContext
+{
+	AdecCmdQueue<LpcmDecCmd> cmd_queue;
+
+	be_t<u64> thread_id; // sys_ppu_thread_t
+
+	be_t<u32> queue_size_mutex; // sys_mutex_t
+	be_t<u32> queue_size_cond;  // sys_cond_t, unused
+	be_t<u32> unk_mutex;        // sys_mutex_t, unused
+	be_t<u32> unk_cond;         // sys_cond_t, unused
+
+	be_t<u32> run_thread;
+
+	AdecCb<AdecNotifyAuDone> notify_au_done;
+	AdecCb<AdecNotifyPcmOut> notify_pcm_out;
+	AdecCb<AdecNotifyError> notify_error;
+	AdecCb<AdecNotifySeqDone> notify_seq_done;
+
+	be_t<u32> output_locked;
+	vm::bptr<f32> output;
+
+	vm::bptr<CellAdecParamLpcm> lpcm_param;
+
+	vm::bcptr<void> spurs_cmd_data;
+
+	// HLE exclusive
+	lpcm_dec_state savestate;
+	u64 cmd_counter; // For debugging
+
+	u8 reserved1[24]; // 36 bytes on LLE
+
+	be_t<u32> output_mutex;    // sys_mutex_t
+	be_t<u32> output_consumed; // sys_cond_t
+
+	LpcmDecSemaphore cmd_available;
+	LpcmDecSemaphore reserved2; // Unused
+
+	be_t<u32> queue_mutex; // sys_mutex_t
+
+	be_t<u32> error_occurred;
+
+	u8 spurs_stuff[32];
+
+	be_t<u32> spurs_queue_pop_mutex;
+	be_t<u32> spurs_queue_push_mutex;
+
+	be_t<u32> using_existing_spurs_instance;
+
+	be_t<u32> dvd_packing;
+
+	be_t<u32> output_size;
+
+	LpcmDecCmd cmd; // HLE exclusive, name of Spurs taskset (32 bytes) + CellSpursTaskLsPattern on LLE
+
+	u8 more_spurs_stuff[10]; // 52 bytes on LLE
+
+	void exec(ppu_thread& ppu);
+
+	template <LpcmDecCmdType type>
+	error_code send_command(ppu_thread& ppu, auto&&... args);
+
+	inline error_code release_output(ppu_thread& ppu);
+};
+
+static_assert(std::is_standard_layout_v<LpcmDecContext>);
+CHECK_SIZE_ALIGN(LpcmDecContext, 0x1c8, 8);
+
+constexpr s32 LPCM_DEC_OUTPUT_BUFFER_SIZE = 0x40000;
+
 // CELP Excitation Mode
 enum CELP_ExcitationMode : s32
 {
diff --git a/rpcs3/Emu/Cell/Modules/cellCamera.cpp b/rpcs3/Emu/Cell/Modules/cellCamera.cpp
index 89ab8ef9f9..64135ca5fa 100644
--- a/rpcs3/Emu/Cell/Modules/cellCamera.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellCamera.cpp
@@ -784,26 +784,26 @@ s32 cellCameraIsAttached(s32 dev_num)
 
 	if (g_cfg.io.camera == camera_handler::null)
 	{
-		return false;
+		return 0;
 	}
 
 	auto& g_camera = g_fxo->get<camera_thread>();
 
 	if (!g_camera.init)
 	{
-		return false;
+		return 0;
 	}
 
 	if (!check_dev_num(dev_num))
 	{
-		return false;
+		return 0;
 	}
 
 	vm::var<s32> type;
 
 	if (cellCameraGetType(dev_num, type) != CELL_OK)
 	{
-		return false;
+		return 0;
 	}
 
 	std::lock_guard lock(g_camera.mutex);
@@ -821,12 +821,12 @@ s32 cellCameraIsAttached(s32 dev_num)
 		}
 	}
 
-	return is_attached;
+	return is_attached ? 1 : 0;
 }
 
 s32 cellCameraIsOpen(s32 dev_num)
 {
-	cellCamera.notice("cellCameraIsOpen(dev_num=%d)", dev_num);
+	cellCamera.trace("cellCameraIsOpen(dev_num=%d)", dev_num);
 
 	if (g_cfg.io.camera == camera_handler::null)
 	{
@@ -852,7 +852,7 @@ s32 cellCameraIsOpen(s32 dev_num)
 
 s32 cellCameraIsStarted(s32 dev_num)
 {
-	cellCamera.notice("cellCameraIsStarted(dev_num=%d)", dev_num);
+	cellCamera.trace("cellCameraIsStarted(dev_num=%d)", dev_num);
 
 	if (g_cfg.io.camera == camera_handler::null)
 	{
diff --git a/rpcs3/Emu/Cell/Modules/cellGem.cpp b/rpcs3/Emu/Cell/Modules/cellGem.cpp
index df621efef3..ff596cbc34 100644
--- a/rpcs3/Emu/Cell/Modules/cellGem.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellGem.cpp
@@ -956,9 +956,9 @@ static inline void pos_to_gem_state(u32 gem_num, gem_config::gem_controller& con
 		static constexpr f32 PI = 3.14159265f;
 		const auto degree_to_rad = [](f32 degree) -> f32 { return degree * PI / 180.0f; };
 
-		static constexpr f32 CONE = 10.0f / 2.0f;
-		const f32 roll = -degree_to_rad((image_y - half_height) / half_height * CONE); // This is actually the pitch
-		const f32 pitch = -degree_to_rad((image_x - half_width) / half_width * CONE); // This is actually the yaw
+		const f32 max_angle_per_side = g_cfg.io.fake_move_rotation_cone / 2.0f;
+		const f32 roll = -degree_to_rad((image_y - half_height) / half_height * max_angle_per_side); // This is actually the pitch
+		const f32 pitch = -degree_to_rad((image_x - half_width) / half_width * max_angle_per_side); // This is actually the yaw
 		const f32 yaw = degree_to_rad(0.0f);
 		const f32 cr = std::cos(roll * 0.5f);
 		const f32 sr = std::sin(roll * 0.5f);
diff --git a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp
index ecda6ded37..c86487042c 100644
--- a/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_net/lv2_socket_p2p.cpp
@@ -333,6 +333,10 @@ void lv2_socket_p2p::close()
 	auto& nc = g_fxo->get<p2p_context>();
 	{
 		std::lock_guard lock(nc.list_p2p_ports_mutex);
+
+		if (!nc.list_p2p_ports.contains(port))
+			return;
+
 		auto& p2p_port = ::at32(nc.list_p2p_ports, port);
 		{
 			std::lock_guard lock(p2p_port.bound_p2p_vports_mutex);
diff --git a/rpcs3/Emu/Cell/lv2/sys_prx.cpp b/rpcs3/Emu/Cell/lv2/sys_prx.cpp
index 49b5839cfa..da24edbedb 100644
--- a/rpcs3/Emu/Cell/lv2/sys_prx.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_prx.cpp
@@ -35,7 +35,7 @@ extern const std::map<std::string_view, int> g_prx_list
 	{ "libaacenc_spurs.sprx", 0 },
 	{ "libac3dec.sprx", 0 },
 	{ "libac3dec2.sprx", 0 },
-	{ "libadec.sprx", 0 },
+	{ "libadec.sprx", 1 },
 	{ "libadec2.sprx", 0 },
 	{ "libadec_internal.sprx", 0 },
 	{ "libad_async.sprx", 0 },
diff --git a/rpcs3/Emu/Io/Null/null_camera_handler.h b/rpcs3/Emu/Io/Null/null_camera_handler.h
index 61e26a8d96..a250facd75 100644
--- a/rpcs3/Emu/Io/Null/null_camera_handler.h
+++ b/rpcs3/Emu/Io/Null/null_camera_handler.h
@@ -7,10 +7,10 @@ class null_camera_handler final : public camera_handler_base
 public:
 	null_camera_handler() : camera_handler_base() {}
 
-	void open_camera() override { m_state = camera_handler_state::open; }
-	void close_camera() override { m_state = camera_handler_state::closed; }
-	void start_camera() override { m_state = camera_handler_state::running; }
-	void stop_camera() override { m_state = camera_handler_state::open; }
+	void open_camera() override { set_state(camera_handler_state::open); }
+	void close_camera() override { set_state(camera_handler_state::closed); }
+	void start_camera() override { set_state(camera_handler_state::running); }
+	void stop_camera() override { set_state(camera_handler_state::open); }
 
 	void set_format(s32 format, u32 bytesize) override
 	{
@@ -45,6 +45,6 @@ public:
 		height = 0;
 		frame_number = 0;
 		bytes_read = 0;
-		return m_state;
+		return get_state();
 	}
 };
diff --git a/rpcs3/Emu/Io/camera_handler_base.h b/rpcs3/Emu/Io/camera_handler_base.h
index 531fa3abe3..49ce4dc635 100644
--- a/rpcs3/Emu/Io/camera_handler_base.h
+++ b/rpcs3/Emu/Io/camera_handler_base.h
@@ -30,22 +30,29 @@ public:
 	virtual u64 frame_number() const = 0; // Convenience function to check if there's a new frame.
 	virtual camera_handler_state get_image(u8* buf, u64 size, u32& width, u32& height, u64& frame_number, u64& bytes_read) = 0;
 
-	camera_handler_state get_state() const { return m_state.load(); };
+	camera_handler_state get_state() const { return m_state.load(); }
+	void set_state(camera_handler_state state) { m_state = m_state_expected = state; }
 
-	bool mirrored() const { return m_mirrored; };
-	s32 format() const { return m_format; };
-	u32 bytesize() const { return m_bytesize; };
-	u32 width() const { return m_width; };
-	u32 height() const { return m_height; };
-	u32 frame_rate() const { return m_frame_rate; };
+	camera_handler_state get_expected_state() const { return m_state_expected.load(); }
+	void set_expected_state(camera_handler_state state) { m_state_expected = state; }
+
+	bool mirrored() const { return m_mirrored; }
+	s32 format() const { return m_format; }
+	u32 bytesize() const { return m_bytesize; }
+	u32 width() const { return m_width; }
+	u32 height() const { return m_height; }
+	u32 frame_rate() const { return m_frame_rate; }
 
 protected:
 	std::mutex m_mutex;
-	atomic_t<camera_handler_state> m_state = camera_handler_state::closed;
 	bool m_mirrored = false;
 	s32 m_format = 2; // CELL_CAMERA_RAW8
 	u32 m_bytesize = 0;
 	u32 m_width = 640;
 	u32 m_height = 480;
 	u32 m_frame_rate = 30;
+
+private:
+	atomic_t<camera_handler_state> m_state = camera_handler_state::closed;
+	atomic_t<camera_handler_state> m_state_expected = camera_handler_state::closed;
 };
diff --git a/rpcs3/Emu/RSX/Common/texture_cache_utils.h b/rpcs3/Emu/RSX/Common/texture_cache_utils.h
index 79ad1b610e..d01660775e 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache_utils.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache_utils.h
@@ -5,6 +5,7 @@
 #include "TextureUtils.h"
 
 #include "Emu/Memory/vm.h"
+#include "Emu/RSX/Host/MM.h"
 #include "util/vm.hpp"
 
 #include <list>
@@ -29,8 +30,7 @@ namespace rsx
 	{
 		ensure(range.is_page_range());
 
-		//rsx_log.error("memory_protect(0x%x, 0x%x, %x)", static_cast<u32>(range.start), static_cast<u32>(range.length()), static_cast<u32>(prot));
-		utils::memory_protect(vm::base(range.start), range.length(), prot);
+		rsx::mm_protect(vm::base(range.start), range.length(), prot);
 
 #ifdef TEXTURE_CACHE_DEBUG
 		tex_cache_checker.set_protection(range, prot);
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index cc17333dde..c785ddc879 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -7,6 +7,7 @@
 
 #include "Emu/Memory/vm_locking.h"
 #include "Emu/RSX/rsx_methods.h"
+#include "Emu/RSX/Host/MM.h"
 #include "Emu/RSX/Host/RSXDMAWriter.h"
 #include "Emu/RSX/NV47/HW/context_accessors.define.h"
 
@@ -1082,6 +1083,8 @@ void GLGSRender::patch_transform_constants(rsx::context* ctx, u32 index, u32 cou
 
 bool GLGSRender::on_access_violation(u32 address, bool is_writing)
 {
+	rsx::mm_flush(address);
+
 	const bool can_flush = is_current_thread();
 	const rsx::invalidation_cause cause = is_writing
 		? (can_flush ? rsx::invalidation_cause::write : rsx::invalidation_cause::deferred_write)
diff --git a/rpcs3/Emu/RSX/Host/MM.cpp b/rpcs3/Emu/RSX/Host/MM.cpp
new file mode 100644
index 0000000000..cf21b6e046
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/MM.cpp
@@ -0,0 +1,110 @@
+#include "stdafx.h"
+#include "MM.h"
+#include <Emu/RSX/Common/simple_array.hpp>
+#include <Emu/RSX/RSXOffload.h>
+
+#include <Emu/Memory/vm.h>
+#include <Emu/IdManager.h>
+#include <Emu/system_config.h>
+#include <Utilities/address_range.h>
+#include <Utilities/mutex.h>
+
+namespace rsx
+{
+	rsx::simple_array<MM_block> g_deferred_mprotect_queue;
+	shared_mutex g_mprotect_queue_lock;
+
+	void mm_flush_mprotect_queue_internal()
+	{
+		for (const auto& block : g_deferred_mprotect_queue)
+		{
+			utils::memory_protect(reinterpret_cast<void*>(block.start), block.length, block.prot);
+		}
+
+		g_deferred_mprotect_queue.clear();
+	}
+
+	void mm_defer_mprotect_internal(u64 start, u64 length, utils::protection prot)
+	{
+		// We could stack and merge requests here, but that is more trouble than it is truly worth.
+		// A fresh call to memory_protect only takes a few nanoseconds of setup overhead, it is not worth the risk of hanging because of conflicts.
+		g_deferred_mprotect_queue.push_back({ start, length, prot });
+	}
+
+	void mm_protect(void* ptr, u64 length, utils::protection prot)
+	{
+		if (g_cfg.video.disable_async_host_memory_manager)
+		{
+			utils::memory_protect(ptr, length, prot);
+			return;
+		}
+
+		// Naive merge. Eventually it makes more sense to do conflict resolution, but it's not as important.
+		const auto start = reinterpret_cast<u64>(ptr);
+		const auto end = start + length;
+
+		std::lock_guard lock(g_mprotect_queue_lock);
+
+		if (prot == utils::protection::rw || prot == utils::protection::wx)
+		{
+			// Basically an unlock op. Flush if any overlap is detected
+			for (const auto& block : g_deferred_mprotect_queue)
+			{
+				if (block.overlaps(start, end))
+				{
+					mm_flush_mprotect_queue_internal();
+					break;
+				}
+			}
+
+			utils::memory_protect(ptr, length, prot);
+			return;
+		}
+
+		// No, Ro, etc.
+		mm_defer_mprotect_internal(start, length, prot);
+	}
+
+	void mm_flush()
+	{
+		std::lock_guard lock(g_mprotect_queue_lock);
+		mm_flush_mprotect_queue_internal();
+	}
+
+	void mm_flush(u32 vm_address)
+	{
+		std::lock_guard lock(g_mprotect_queue_lock);
+		if (g_deferred_mprotect_queue.empty())
+		{
+			return;
+		}
+
+		const auto addr = reinterpret_cast<u64>(vm::base(vm_address));
+		for (const auto& block : g_deferred_mprotect_queue)
+		{
+			if (block.overlaps(addr))
+			{
+				mm_flush_mprotect_queue_internal();
+				return;
+			}
+		}
+	}
+
+	void mm_flush_lazy()
+	{
+		if (!g_cfg.video.multithreaded_rsx)
+		{
+			mm_flush();
+			return;
+		}
+
+		std::lock_guard lock(g_mprotect_queue_lock);
+		if (g_deferred_mprotect_queue.empty())
+		{
+			return;
+		}
+
+		auto& rsxdma = g_fxo->get<rsx::dma_manager>();
+		rsxdma.backend_ctrl(mm_backend_ctrl::cmd_mm_flush, nullptr);
+	}
+}
diff --git a/rpcs3/Emu/RSX/Host/MM.h b/rpcs3/Emu/RSX/Host/MM.h
new file mode 100644
index 0000000000..e9415a685f
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/MM.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <util/types.hpp>
+#include <util/vm.hpp>
+
+namespace rsx
+{
+	struct MM_block
+	{
+		u64 start;
+		u64 length;
+		utils::protection prot;
+
+		inline bool overlaps(u64 start, u64 end) const
+		{
+			// [Start, End] is not a proper closed range, there is an off-by-one by design.
+			// FIXME: Use address_range64
+			const u64 this_end = this->start + this->length;
+			return (this->start < end && start < this_end);
+		}
+
+		inline bool overlaps(u64 addr) const
+		{
+			// [Start, End] is not a proper closed range, there is an off-by-one by design.
+			// FIXME: Use address_range64
+			const u64 this_end = this->start + this->length;
+			return (addr >= start && addr < this_end);
+		}
+	};
+
+	enum mm_backend_ctrl : u32
+	{
+		cmd_mm_flush = 0x81000000,
+	};
+
+	void mm_protect(void* start, u64 length, utils::protection prot);
+	void mm_flush_lazy();
+	void mm_flush(u32 vm_address);
+	void mm_flush();
+}
diff --git a/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp b/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp
index fe6dc21ba5..9c153b2056 100644
--- a/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp
+++ b/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp
@@ -7,6 +7,9 @@
 
 namespace rsx
 {
+	void mm_flush_lazy();
+	void mm_flush();
+
 	namespace util
 	{
 		template <bool FlushDMA, bool FlushPipe>
@@ -24,17 +27,24 @@ namespace rsx
 					return;
 				}
 
-				if constexpr (FlushDMA)
+				if constexpr (FlushDMA || FlushPipe)
 				{
-					// If the backend handled the request, this call will basically be a NOP
-					g_fxo->get<rsx::dma_manager>().sync();
-				}
+					// Release op must be acoompanied by MM flush.
+					// FlushPipe implicitly does a MM flush but FlushDMA does not. Trigger the flush here
+					rsx::mm_flush();
 
-				if constexpr (FlushPipe)
-				{
-					// Manually flush the pipeline.
-					// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
-					RSX(ctx)->sync();
+					if constexpr (FlushDMA)
+					{
+						// If the backend handled the request, this call will basically be a NOP
+						g_fxo->get<rsx::dma_manager>().sync();
+					}
+
+					if constexpr (FlushPipe)
+					{
+						// Manually flush the pipeline.
+						// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
+						RSX(ctx)->sync();
+					}
 				}
 
 				if (handled)
diff --git a/rpcs3/Emu/RSX/Overlays/HomeMenu/overlay_home_menu_settings.cpp b/rpcs3/Emu/RSX/Overlays/HomeMenu/overlay_home_menu_settings.cpp
index 50d690d56a..0639e41ea7 100644
--- a/rpcs3/Emu/RSX/Overlays/HomeMenu/overlay_home_menu_settings.cpp
+++ b/rpcs3/Emu/RSX/Overlays/HomeMenu/overlay_home_menu_settings.cpp
@@ -83,6 +83,7 @@ namespace rsx
 
 			add_dropdown(&g_cfg.io.pad_mode, localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_MODE);
 			add_unsigned_slider(&g_cfg.io.pad_sleep, localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_SLEEP, " µs", 100);
+			add_unsigned_slider(&g_cfg.io.fake_move_rotation_cone, localized_string_id::HOME_MENU_SETTINGS_INPUT_FAKE_MOVE_ROTATION_CONE, "°", 1);
 
 			apply_layout();
 		}
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index f266a25587..4abc484b5e 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -9,6 +9,7 @@
 #include "Common/time.hpp"
 #include "Core/RSXReservationLock.hpp"
 #include "Core/RSXEngLock.hpp"
+#include "Host/MM.h"
 #include "Host/RSXDMAWriter.h"
 #include "NV47/HW/context.h"
 #include "Program/GLSLCommon.h"
@@ -2603,8 +2604,14 @@ namespace rsx
 						rsx_log.error("Depth texture bound to pipeline with unexpected format 0x%X", format);
 					}
 				}
-				else if (!backend_config.supports_hw_renormalization)
+				else if (!backend_config.supports_hw_renormalization /* &&
+					tex.min_filter() == rsx::texture_minify_filter::nearest &&
+					tex.mag_filter() == rsx::texture_magnify_filter::nearest*/)
 				{
+					// FIXME: This check should only apply to point-sampled textures. However, it severely regresses some games (id tech 5).
+					// This is because even when filtering is active, the error from the PS3 texture expansion still applies.
+					// A proper fix is to expand these formats into BGRA8 when high texture precision is required. That requires different GUI settings and inflation shaders, so it will be handled separately.
+
 					switch (format)
 					{
 					case CELL_GCM_TEXTURE_A1R5G5B5:
@@ -3175,6 +3182,8 @@ namespace rsx
 	{
 		m_eng_interrupt_mask.clear(rsx::pipe_flush_interrupt);
 
+		mm_flush();
+
 		if (zcull_ctrl->has_pending())
 		{
 			zcull_ctrl->sync(this);
@@ -3627,10 +3636,25 @@ namespace rsx
 			on_invalidate_memory_range(m_invalidated_memory_range, rsx::invalidation_cause::read);
 		}
 
+		// Host sync
+		rsx::mm_flush();
+
 		on_invalidate_memory_range(m_invalidated_memory_range, rsx::invalidation_cause::unmap);
 		m_invalidated_memory_range.invalidate();
 	}
 
+	void thread::renderctl(u32 request_code, void* args)
+	{
+		switch (request_code)
+		{
+		case rsx::mm_backend_ctrl::cmd_mm_flush:
+			rsx::mm_flush();
+			break;
+		default:
+			fmt::throw_exception("Unknown backend request: 0x%x", request_code);
+		}
+	}
+
 	//Pause/cont wrappers for FIFO ctrl. Never call this from rsx thread itself!
 	void thread::pause()
 	{
@@ -3696,6 +3720,9 @@ namespace rsx
 	{
 		bool pause_emulator = false;
 
+		// MM sync. This is a pre-emptive operation, so we can use a deferred request.
+		rsx::mm_flush_lazy();
+
 		// Marks the end of a frame scope GPU-side
 		if (g_user_asked_for_frame_capture.exchange(false) && !capture_current_frame)
 		{
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index b7bf5ff83e..e92e02c6cd 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -404,7 +404,7 @@ namespace rsx
 		virtual void notify_tile_unbound(u32 /*tile*/) {}
 
 		// control
-		virtual void renderctl(u32 /*request_code*/, void* /*args*/) {}
+		virtual void renderctl(u32 request_code, void* args);
 
 		// zcull
 		void notify_zcull_info_changed();
diff --git a/rpcs3/Emu/RSX/VK/VKCommandStream.h b/rpcs3/Emu/RSX/VK/VKCommandStream.h
index e559a688f4..4ee4d00e19 100644
--- a/rpcs3/Emu/RSX/VK/VKCommandStream.h
+++ b/rpcs3/Emu/RSX/VK/VKCommandStream.h
@@ -9,7 +9,7 @@ namespace vk
 	enum // callback commands
 	{
 		rctrl_queue_submit = 0x80000000,
-		rctrl_run_gc       = 0x80000001
+		rctrl_run_gc       = 0x80000001,
 	};
 
 	struct submit_packet
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 371a777e94..5a02197548 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -15,6 +15,7 @@
 #include "vkutils/scratch.h"
 
 #include "Emu/RSX/rsx_methods.h"
+#include "Emu/RSX/Host/MM.h"
 #include "Emu/RSX/Host/RSXDMAWriter.h"
 #include "Emu/RSX/NV47/HW/context_accessors.define.h"
 #include "Emu/Memory/vm_locking.h"
@@ -1010,6 +1011,8 @@ VKGSRender::~VKGSRender()
 
 bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 {
+	rsx::mm_flush(address);
+
 	vk::texture_cache::thrashed_set result;
 	{
 		const rsx::invalidation_cause cause = is_writing ? rsx::invalidation_cause::deferred_write : rsx::invalidation_cause::deferred_read;
@@ -2460,6 +2463,9 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 {
 	ensure(!m_queue_status.test_and_set(flush_queue_state::flushing));
 
+	// Host MM sync before executing anything on the GPU
+	rsx::mm_flush();
+
 	// Workaround for deadlock occuring during RSX offloader fault
 	// TODO: Restructure command submission infrastructure to avoid this condition
 	const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
@@ -2824,7 +2830,7 @@ void VKGSRender::renderctl(u32 request_code, void* args)
 		break;
 	}
 	default:
-		fmt::throw_exception("Unhandled request code 0x%x", request_code);
+		rsx::thread::renderctl(request_code, args);
 	}
 }
 
diff --git a/rpcs3/Emu/localized_string_id.h b/rpcs3/Emu/localized_string_id.h
index 7476d58a59..4a380422b9 100644
--- a/rpcs3/Emu/localized_string_id.h
+++ b/rpcs3/Emu/localized_string_id.h
@@ -223,6 +223,7 @@ enum class localized_string_id
 	HOME_MENU_SETTINGS_INPUT_CAMERA_FLIP,
 	HOME_MENU_SETTINGS_INPUT_PAD_MODE,
 	HOME_MENU_SETTINGS_INPUT_PAD_SLEEP,
+	HOME_MENU_SETTINGS_INPUT_FAKE_MOVE_ROTATION_CONE,
 	HOME_MENU_SETTINGS_ADVANCED,
 	HOME_MENU_SETTINGS_ADVANCED_PREFERRED_SPU_THREADS,
 	HOME_MENU_SETTINGS_ADVANCED_MAX_CPU_PREEMPTIONS,
diff --git a/rpcs3/Emu/scoped_progress_dialog.cpp b/rpcs3/Emu/scoped_progress_dialog.cpp
index 35606b4600..ababe3d190 100644
--- a/rpcs3/Emu/scoped_progress_dialog.cpp
+++ b/rpcs3/Emu/scoped_progress_dialog.cpp
@@ -91,8 +91,8 @@ scoped_progress_dialog::scoped_progress_dialog(std::string text) noexcept
 
 scoped_progress_dialog& scoped_progress_dialog::operator=(std::string text) noexcept
 {
-	// Exchange text atomically
-	g_progr_text_queue[m_text_index].exchange(make_single_value(std::move(text)));
+	// Set text atomically
+	g_progr_text_queue[m_text_index].store(make_single_value(std::move(text)));
 	return *this;
 }
 
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 8a0e7737ca..04a4ce5399 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -178,6 +178,7 @@ struct cfg_root : cfg::node
 		cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
 		cfg::_bool host_label_synchronization{ this, "Allow Host GPU Labels", false };
 		cfg::_bool disable_msl_fast_math{ this, "Disable MSL Fast Math", false };
+		cfg::_bool disable_async_host_memory_manager{ this, "Disable Asynchronous Memory Manager", false, true };
 		cfg::_enum<output_scaling_mode> output_scaling{ this, "Output Scaling Mode", output_scaling_mode::bilinear, true };
 
 		struct node_vk : cfg::node
@@ -282,6 +283,7 @@ struct cfg_root : cfg::node
 		cfg::string midi_devices{this, "Emulated Midi devices", "ßßß@@@ßßß@@@ßßß@@@"};
 		cfg::_bool load_sdl_mappings{ this, "Load SDL GameController Mappings", true };
 		cfg::_bool debug_overlay{ this, "IO Debug overlay", false, true };
+		cfg::uint<1, 180> fake_move_rotation_cone{ this, "Fake Move Rotation Cone", 10, true };
 
 	} io{ this };
 
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index 9a196dd8af..f311845462 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -104,6 +104,7 @@
     <ClCompile Include="Emu\perf_monitor.cpp" />
     <ClCompile Include="Emu\RSX\Common\texture_cache.cpp" />
     <ClCompile Include="Emu\RSX\Core\RSXContext.cpp" />
+    <ClCompile Include="Emu\RSX\Host\MM.cpp" />
     <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
     <ClCompile Include="Emu\RSX\NV47\FW\draw_call.cpp" />
     <ClCompile Include="Emu\RSX\NV47\FW\reg_context.cpp" />
@@ -621,6 +622,7 @@
     <ClInclude Include="Emu\RSX\Core\RSXDisplay.h" />
     <ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp" />
     <ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h" />
+    <ClInclude Include="Emu\RSX\Host\MM.h" />
     <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h" />
     <ClInclude Include="Emu\RSX\NV47\FW\draw_call.hpp" />
     <ClInclude Include="Emu\RSX\NV47\FW\draw_call.inc.h" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index 584787892a..c516f50756 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -1312,6 +1312,9 @@
     <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp">
       <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
     </ClCompile>
+    <ClCompile Include="Emu\RSX\Host\MM.cpp">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="Crypto\aes.h">
@@ -2644,6 +2647,9 @@
     <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h">
       <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
     </ClInclude>
+    <ClInclude Include="Emu\RSX\Host\MM.h">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">
diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h
index f6b6268bd3..9eec1956ed 100644
--- a/rpcs3/rpcs3qt/emu_settings_type.h
+++ b/rpcs3/rpcs3qt/emu_settings_type.h
@@ -103,6 +103,7 @@ enum class emu_settings_type
 	DisableMSLFastMath,
 	OutputScalingMode,
 	ForceHwMSAAResolve,
+	DisableAsyncHostMM,
 
 	// Performance Overlay
 	PerfOverlayEnabled,
@@ -294,6 +295,7 @@ inline static const std::map<emu_settings_type, cfg_location> settings_location
 	{ emu_settings_type::DisableMSLFastMath,         { "Video", "Disable MSL Fast Math"}},
 	{ emu_settings_type::OutputScalingMode,          { "Video", "Output Scaling Mode"}},
 	{ emu_settings_type::ForceHwMSAAResolve,         { "Video", "Force Hardware MSAA Resolve"}},
+	{ emu_settings_type::DisableAsyncHostMM,         { "Video", "Disable Asynchronous Memory Manager"}},
 
 	// Vulkan
 	{ emu_settings_type::VulkanAsyncTextureUploads,           { "Video", "Vulkan", "Asynchronous Texture Streaming 2"}},
diff --git a/rpcs3/rpcs3qt/flow_layout.cpp b/rpcs3/rpcs3qt/flow_layout.cpp
index 579ae59404..92fe956b28 100644
--- a/rpcs3/rpcs3qt/flow_layout.cpp
+++ b/rpcs3/rpcs3qt/flow_layout.cpp
@@ -79,10 +79,14 @@ flow_layout::~flow_layout()
 
 void flow_layout::clear()
 {
+	// We can't use a ranged loop here, since deleting the widget will call takeAt on the layout. So let's also use takeAt.
 	while (QLayoutItem* item = takeAt(0))
 	{
-		delete item->widget();
-		delete item;
+		if (item)
+		{
+			delete item->widget();
+			delete item;
+		}
 	}
 	m_item_list.clear();
 	m_positions.clear();
@@ -185,8 +189,8 @@ int flow_layout::doLayout(const QRect& rect, bool testOnly) const
 	int x = effectiveRect.x();
 	int y = effectiveRect.y();
 	int lineHeight = 0;
-	int rows = 0;
-	int cols = 0;
+	int row_count = 0;
+	int col_count = 0;
 
 	if (m_dynamic_spacing)
 	{
@@ -259,8 +263,8 @@ int flow_layout::doLayout(const QRect& rect, bool testOnly) const
 		pos.row = row;
 		pos.col = col++;
 
-		rows = std::max(rows, pos.row + 1);
-		cols = std::max(cols, pos.col + 1);
+		row_count = std::max(row_count, pos.row + 1);
+		col_count = std::max(col_count, pos.col + 1);
 
 		if (!testOnly)
 			item->setGeometry(QRect(QPoint(x, y), item->sizeHint()));
@@ -269,8 +273,8 @@ int flow_layout::doLayout(const QRect& rect, bool testOnly) const
 		lineHeight = qMax(lineHeight, item->sizeHint().height());
 	}
 
-	m_rows = rows;
-	m_cols = cols;
+	m_rows = row_count;
+	m_cols = col_count;
 
 	return y + lineHeight - rect.y() + bottom;
 }
diff --git a/rpcs3/rpcs3qt/gui_save.h b/rpcs3/rpcs3qt/gui_save.h
index 66970172f6..cbeec7b5a6 100644
--- a/rpcs3/rpcs3qt/gui_save.h
+++ b/rpcs3/rpcs3qt/gui_save.h
@@ -11,16 +11,11 @@ struct gui_save
 
 	gui_save()
 	{
-		key = "";
-		name = "";
-		def = QVariant();
 	}
 
 	gui_save(const QString& k, const QString& n, const QVariant& d)
+		: key(k), name(n), def(d)
 	{
-		key = k;
-		name = n;
-		def = d;
 	}
 
 	bool operator==(const gui_save& rhs) const noexcept
diff --git a/rpcs3/rpcs3qt/localized_emu.h b/rpcs3/rpcs3qt/localized_emu.h
index 2f6c4d19ae..7b4dbd72f9 100644
--- a/rpcs3/rpcs3qt/localized_emu.h
+++ b/rpcs3/rpcs3qt/localized_emu.h
@@ -244,6 +244,7 @@ private:
 		case localized_string_id::HOME_MENU_SETTINGS_INPUT_CAMERA_FLIP: return tr("Camera Flip", "Input");
 		case localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_MODE: return tr("Pad Handler Mode", "Input");
 		case localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_SLEEP: return tr("Pad Handler Sleep", "Input");
+		case localized_string_id::HOME_MENU_SETTINGS_INPUT_FAKE_MOVE_ROTATION_CONE: return tr("Fake PS Move Rotation Cone", "Input");
 		case localized_string_id::HOME_MENU_SETTINGS_ADVANCED: return tr("Advanced");
 		case localized_string_id::HOME_MENU_SETTINGS_ADVANCED_PREFERRED_SPU_THREADS: return tr("Preferred SPU Threads", "Advanced");
 		case localized_string_id::HOME_MENU_SETTINGS_ADVANCED_MAX_CPU_PREEMPTIONS: return tr("Max Power Saving CPU-Preemptions", "Advanced");
diff --git a/rpcs3/rpcs3qt/qt_camera_handler.cpp b/rpcs3/rpcs3qt/qt_camera_handler.cpp
index f0d11f937c..88d8c15963 100644
--- a/rpcs3/rpcs3qt/qt_camera_handler.cpp
+++ b/rpcs3/rpcs3qt/qt_camera_handler.cpp
@@ -47,6 +47,7 @@ void qt_camera_handler::set_camera(const QCameraDevice& camera_info)
 {
 	if (camera_info.isNull())
 	{
+		set_expected_state(camera_handler_state::closed);
 		reset();
 		return;
 	}
@@ -57,9 +58,9 @@ void qt_camera_handler::set_camera(const QCameraDevice& camera_info)
 	camera_log.success("Using camera: id=\"%s\", description=\"%s\", front_facing=%d", camera_info.id().toStdString(), camera_info.description(), front_facing);
 
 	// Create camera and video surface
-	m_media_capture_session.reset(new QMediaCaptureSession(nullptr));
-	m_video_sink.reset(new qt_camera_video_sink(front_facing, nullptr));
-	m_camera.reset(new QCamera(camera_info));
+	m_media_capture_session = std::make_unique<QMediaCaptureSession>(nullptr);
+	m_video_sink = std::make_unique<qt_camera_video_sink>(front_facing, nullptr);
+	m_camera = std::make_unique<QCamera>(camera_info);
 
 	connect(m_camera.get(), &QCamera::activeChanged, this, &qt_camera_handler::handle_camera_active);
 	connect(m_camera.get(), &QCamera::errorOccurred, this, &qt_camera_handler::handle_camera_error);
@@ -76,14 +77,37 @@ void qt_camera_handler::handle_camera_active(bool is_active)
 {
 	camera_log.notice("Camera active status changed to %d", is_active);
 
-	if (is_active)
+	// Check if the camera does what it's supposed to do.
+	const camera_handler_state expected_state = get_expected_state();
+
+	switch (expected_state)
 	{
-		m_state = camera_handler_state::running;
-	}
-	else
+	case camera_handler_state::closed:
+	case camera_handler_state::open:
 	{
-		m_state = camera_handler_state::closed;
+		if (is_active)
+		{
+			// This is not supposed to happen and indicates an unexpected QCamera issue
+			camera_log.error("Camera started unexpectedly");
+			set_state(camera_handler_state::running);
+			return;
+		}
+		break;
 	}
+	case camera_handler_state::running:
+	{
+		if (!is_active)
+		{
+			// This is not supposed to happen and indicates an unexpected QCamera issue
+			camera_log.error("Camera stopped unexpectedly");
+			set_state(camera_handler_state::open);
+			return;
+		}
+		break;
+	}
+	}
+
+	set_state(expected_state);
 }
 
 void qt_camera_handler::handle_camera_error(QCamera::Error error, const QString& errorString)
@@ -100,7 +124,11 @@ void qt_camera_handler::open_camera()
 	{
 		camera_log.notice("Switching camera from %s to %s", m_camera_id, camera_id);
 		camera_log.notice("Stopping old camera...");
-		if (m_camera) m_camera->stop();
+		if (m_camera)
+		{
+			set_expected_state(camera_handler_state::open);
+			m_camera->stop();
+		}
 		m_camera_id = camera_id;
 	}
 
@@ -129,7 +157,7 @@ void qt_camera_handler::open_camera()
 	{
 		if (m_camera_id.empty()) camera_log.notice("Camera disabled");
 		else camera_log.error("No camera found");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return;
 	}
 
@@ -148,7 +176,7 @@ void qt_camera_handler::open_camera()
 	// Update camera and view finder settings
 	update_camera_settings();
 
-	m_state = camera_handler_state::open;
+	set_state(camera_handler_state::open);
 }
 
 void qt_camera_handler::close_camera()
@@ -159,11 +187,12 @@ void qt_camera_handler::close_camera()
 	{
 		if (m_camera_id.empty()) camera_log.notice("Camera disabled");
 		else camera_log.error("No camera found");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return;
 	}
 
 	// Unload/close camera
+	set_expected_state(camera_handler_state::closed);
 	m_camera->stop();
 }
 
@@ -175,7 +204,7 @@ void qt_camera_handler::start_camera()
 	{
 		if (m_camera_id.empty()) camera_log.notice("Camera disabled");
 		else camera_log.error("No camera found");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return;
 	}
 
@@ -206,6 +235,7 @@ void qt_camera_handler::start_camera()
 #endif
 
 	// Start camera. We will start receiving frames now.
+	set_expected_state(camera_handler_state::running);
 	m_camera->start();
 }
 
@@ -217,7 +247,7 @@ void qt_camera_handler::stop_camera()
 	{
 		if (m_camera_id.empty()) camera_log.notice("Camera disabled");
 		else camera_log.error("No camera found");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return;
 	}
 
@@ -228,6 +258,7 @@ void qt_camera_handler::stop_camera()
 	}
 
 	// Stop camera. The camera will still be drawing power.
+	set_expected_state(camera_handler_state::open);
 	m_camera->stop();
 }
 
@@ -284,26 +315,26 @@ camera_handler_base::camera_handler_state qt_camera_handler::get_image(u8* buf,
 		m_camera_id != camera_id)
 	{
 		camera_log.notice("Switching cameras");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return camera_handler_state::closed;
 	}
 
 	if (m_camera_id.empty())
 	{
 		camera_log.notice("Camera disabled");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return camera_handler_state::closed;
 	}
 
 	if (!m_camera || !m_video_sink)
 	{
 		camera_log.fatal("Error: camera invalid");
-		m_state = camera_handler_state::closed;
+		set_state(camera_handler_state::closed);
 		return camera_handler_state::closed;
 	}
 
 	// Backup current state. State may change through events.
-	const camera_handler_state current_state = m_state;
+	const camera_handler_state current_state = get_state();
 
 	if (current_state == camera_handler_state::running)
 	{
diff --git a/rpcs3/rpcs3qt/qt_camera_handler.h b/rpcs3/rpcs3qt/qt_camera_handler.h
index d828bd6c84..0759d739c6 100644
--- a/rpcs3/rpcs3qt/qt_camera_handler.h
+++ b/rpcs3/rpcs3qt/qt_camera_handler.h
@@ -17,8 +17,6 @@ public:
 	qt_camera_handler();
 	virtual ~qt_camera_handler();
 
-	void set_camera(const QCameraDevice& camera_info);
-
 	void open_camera() override;
 	void close_camera() override;
 	void start_camera() override;
@@ -31,11 +29,12 @@ public:
 	camera_handler_state get_image(u8* buf, u64 size, u32& width, u32& height, u64& frame_number, u64& bytes_read) override;
 
 private:
+	void set_camera(const QCameraDevice& camera_info);
 	void reset();
 	void update_camera_settings();
 
 	std::string m_camera_id;
-	std::shared_ptr<QCamera> m_camera;
+	std::unique_ptr<QCamera> m_camera;
 	std::unique_ptr<QMediaCaptureSession> m_media_capture_session;
 	std::unique_ptr<qt_camera_video_sink> m_video_sink;
 
diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp
index aa3245ba8d..6d17c7e9a5 100644
--- a/rpcs3/rpcs3qt/settings_dialog.cpp
+++ b/rpcs3/rpcs3qt/settings_dialog.cpp
@@ -94,7 +94,7 @@ void remove_item(QComboBox* box, int data_value, int def_value)
 
 extern const std::map<std::string_view, int> g_prx_list;
 
-settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, const int& tab_index, QWidget* parent, const GameInfo* game, bool create_cfg_from_global_cfg)
+settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, int tab_index, QWidget* parent, const GameInfo* game, bool create_cfg_from_global_cfg)
 	: QDialog(parent)
 	, m_tab_index(tab_index)
 	, ui(new Ui::settings_dialog)
@@ -1593,6 +1593,9 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
 	ui->disableMslFastMath->setVisible(false);
 #endif
 
+	m_emu_settings->EnhanceCheckBox(ui->disableAsyncHostMM, emu_settings_type::DisableAsyncHostMM);
+	SubscribeTooltip(ui->disableAsyncHostMM, tooltips.settings.disable_async_host_mm);
+
 	// Comboboxes
 
 	m_emu_settings->EnhanceComboBox(ui->maxSPURSThreads, emu_settings_type::MaxSPURSThreads, true);
@@ -2604,14 +2607,11 @@ void settings_dialog::ApplyStylesheet(bool reset)
 	}
 }
 
-int settings_dialog::exec()
+void settings_dialog::open()
 {
-	// singleShot Hack to fix following bug:
-	// If we use setCurrentIndex now we will miraculously see a resize of the dialog as soon as we
-	// switch to the cpu tab after conjuring the settings_dialog with another tab opened first.
-	// Weirdly enough this won't happen if we change the tab order so that anything else is at index 0.
-	ui->tab_widget_settings->setCurrentIndex(0);
-	QTimer::singleShot(0, [this]{ ui->tab_widget_settings->setCurrentIndex(m_tab_index); });
+	QDialog::open();
+
+	ui->tab_widget_settings->setCurrentIndex(m_tab_index);
 
 	// Open a dialog if your config file contained invalid entries
 	QTimer::singleShot(10, [this]
@@ -2637,8 +2637,6 @@ int settings_dialog::exec()
 			}
 		}
 	});
-
-	return QDialog::exec();
 }
 
 void settings_dialog::SubscribeDescription(QLabel* description)
diff --git a/rpcs3/rpcs3qt/settings_dialog.h b/rpcs3/rpcs3qt/settings_dialog.h
index 203fd1c8f9..0513227e80 100644
--- a/rpcs3/rpcs3qt/settings_dialog.h
+++ b/rpcs3/rpcs3qt/settings_dialog.h
@@ -21,9 +21,9 @@ class settings_dialog : public QDialog
 	Q_OBJECT
 
 public:
-	explicit settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, const int& tab_index = 0, QWidget* parent = nullptr, const GameInfo* game = nullptr, bool create_cfg_from_global_cfg = true);
+	explicit settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, int tab_index = 0, QWidget* parent = nullptr, const GameInfo* game = nullptr, bool create_cfg_from_global_cfg = true);
 	~settings_dialog();
-	int exec() override;
+	void open() override;
 Q_SIGNALS:
 	void GuiStylesheetRequest();
 	void GuiRepaintRequest();
diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui
index 74c5629092..66b3b32088 100644
--- a/rpcs3/rpcs3qt/settings_dialog.ui
+++ b/rpcs3/rpcs3qt/settings_dialog.ui
@@ -2695,6 +2695,13 @@
                  </property>
                 </widget>
                </item>
+               <item>
+                <widget class="QCheckBox" name="disableAsyncHostMM">
+                 <property name="text">
+                  <string>Disable Asynchronous Memory Manager</string>
+                 </property>
+                </widget>
+               </item>
               </layout>
              </widget>
             </item>
diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h
index 99de286ec2..c6bfa15cf7 100644
--- a/rpcs3/rpcs3qt/tooltips.h
+++ b/rpcs3/rpcs3qt/tooltips.h
@@ -40,6 +40,7 @@ public:
 		const QString allow_host_labels            = tr("Allows the host GPU to synchronize with CELL directly. This incurs a performance penalty, but exposes the true state of GPU objects to the guest CPU. Can help eliminate visual noise and glitching at the cost of performance. Use with caution.");
 		const QString force_hw_MSAA                = tr("Forces MSAA to use the host GPU's resolve capabilities for all sampling operations.\nThis option incurs a performance penalty as well as the risk of visual artifacts but can yield crisper visuals when MSAA is enabled.");
 		const QString disable_vertex_cache         = tr("Disables the vertex cache.\nMight resolve missing or flickering graphics output.\nMay degrade performance.");
+		const QString disable_async_host_mm        = tr("Force host memory management calls to be inlined instead of handled asynchronously.\nThis can cause severe performance degradation and stuttering in some games.\nThis option is only needed by developers to debug problems with texture cache memory protection.");
 		const QString zcull_operation_mode         = tr("Changes ZCULL report synchronization behaviour. Experiment to find the best option for your game. Approximate mode is recommended for most games.\n· Precise is the most accurate to PS3 behaviour. Required for accurate visuals in some titles such as Demon's Souls and The Darkness.\n· Approximate is a much faster way to generate occlusion data which may not always match what the PS3 would generate. Works well with most PS3 games.\n· Relaxed changes the synchronization method completely and can greatly improve performance in some games or completely break others.");
 		const QString max_spurs_threads            = tr("Limits the maximum number of SPURS threads in each thread group.\nMay improve performance in some cases, especially on systems with limited number of hardware threads.\nLimiting the number of threads is likely to cause crashes; it's recommended to keep this at the default value.");
 		const QString sleep_timers_accuracy        = tr("Changes the sleep period accuracy.\n'As Host' uses default accuracy of the underlying operating system, while 'All Timers' attempts to improve it.\n'Usleep Only' limits the adjustments to usleep syscall only.\nCan affect performance in unexpected ways.");
diff --git a/rpcs3/rpcs3qt/user_account.h b/rpcs3/rpcs3qt/user_account.h
index 46991f7565..dd2e1a3d3c 100644
--- a/rpcs3/rpcs3qt/user_account.h
+++ b/rpcs3/rpcs3qt/user_account.h
@@ -14,9 +14,9 @@ class user_account
 public:
 	explicit user_account(const std::string& user_id = "00000001");
 
-	std::string GetUserId() const { return m_user_id; }
-	std::string GetUserDir() const { return m_user_dir; }
-	std::string GetUsername() const { return m_username; }
+	const std::string& GetUserId() const { return m_user_id; }
+	const std::string& GetUserDir() const { return m_user_dir; }
+	const std::string& GetUsername() const { return m_username; }
 
 	static std::map<u32, user_account> GetUserAccounts(const std::string& base_dir);
 
diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp
index d357c800ab..750bb97fd1 100644
--- a/rpcs3/util/simd.hpp
+++ b/rpcs3/util/simd.hpp
@@ -21,6 +21,7 @@
 #include <arm_neon.h>
 #endif
 
+#include <algorithm>
 #include <cmath>
 #include <math.h>
 #include <cfenv>
@@ -1967,6 +1968,15 @@ inline v128 gv_mulfs(const v128& a, const v128& b)
 #endif
 }
 
+inline v128 gv_mulfs(const v128& a, f32 b)
+{
+#if defined(ARCH_X64)
+	return _mm_mul_ps(a, _mm_set_ps1(b));
+#elif defined(ARCH_ARM64)
+	return vmulq_n_f32(a, b);
+#endif
+}
+
 inline v128 gv_hadds8x2(const v128& a)
 {
 #if defined(__SSSE3__)
@@ -2979,6 +2989,23 @@ inline v128 gv_rol16(const v128& a, const v128& b)
 #endif
 }
 
+// For each 16-bit element, r = rotate a by count
+template <u8 Count>
+inline v128 gv_rol16(const v128& a)
+{
+	constexpr u8 count = Count & 0xf;
+#if defined(ARCH_X64)
+	return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count));
+#elif defined(ARCH_ARM64)
+	return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count));
+#else
+	v128 r;
+	for (u32 i = 0; i < 8; i++)
+		r._u16[i] = std::rotl(a._u16[i], count);
+	return r;
+#endif
+}
+
 // For each 32-bit element, r = rotate a by b
 inline v128 gv_rol32(const v128& a, const v128& b)
 {
@@ -2997,15 +3024,16 @@ inline v128 gv_rol32(const v128& a, const v128& b)
 }
 
 // For each 32-bit element, r = rotate a by count
-inline v128 gv_rol32(const v128& a, u32 count)
+template <u8 Count>
+inline v128 gv_rol32(const v128& a)
 {
-	count %= 32;
-#if defined(ARCH_X64)
-	return _mm_or_epi32(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
+	constexpr u8 count = Count & 0x1f;
+#if defined(__AVX512VL__)
+	return _mm_rol_epi32(a, count);
+#elif defined(ARCH_X64)
+	return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
 #elif defined(ARCH_ARM64)
-	const auto amt1 = vdupq_n_s32(count);
-	const auto amt2 = vdupq_n_s32(count - 32);
-	return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
+	return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count));
 #else
 	v128 r;
 	for (u32 i = 0; i < 4; i++)
@@ -3107,6 +3135,139 @@ inline auto gv_shuffle_right(A&& a)
 	FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
 }
 
+// Load 32-bit integer into the first element of a new vector, set other elements to zero
+inline v128 gv_loadu32(const void* ptr)
+{
+#if defined(ARCH_X64)
+	return _mm_loadu_si32(ptr);
+#elif defined(ARCH_ARM64)
+	return vld1q_lane_u32(static_cast<const u32*>(ptr), vdupq_n_u32(0), 0);
+#endif
+}
+
+// Load 16-bit integer into an existing vector at the position specified by Index
+template <u8 Index>
+inline v128 gv_insert16(const v128& vec, u16 value)
+{
+#if defined(ARCH_X64)
+	return _mm_insert_epi16(vec, value, Index);
+#elif defined(ARCH_ARM64)
+	return vsetq_lane_u16(value, vec, Index & 0x7);
+#endif
+}
+
+// For each 8-bit element,
+// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl],
+// else if ctrl < 0 then r = 0
+inline v128 gv_shuffle8(const v128& vec, const v128& ctrl)
+{
+	AUDIT(std::ranges::none_of(ctrl._chars, [](s8 i){ return i >= static_cast<s8>(sizeof(v128)); }), "All indices must be in the range [0, 15] or negative, since PSHUFB and TBL behave differently otherwise");
+#if defined(__SSSE3__)
+	return _mm_shuffle_epi8(vec, ctrl);
+#elif defined(ARCH_ARM64)
+	return vqtbl1q_s8(vec, ctrl);
+#else
+	v128 r;
+	for (s32 i = 0; i < 16; i++)
+		r._s8[i] = ctrl._s8[i] < 0 ? 0 : vec._s8[ctrl._s8[i] & 0xf];
+	return r;
+#endif
+}
+
+// For each 2-bit index in Control, r = vec[index]
+template <u8 Control>
+inline v128 gv_shuffle32(const v128& vec)
+{
+#if defined(ARCH_X64)
+	return _mm_shuffle_epi32(vec, Control);
+#elif defined(ARCH_ARM64)
+	constexpr u8 idx0 = (Control & 3) * sizeof(s32);
+	constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32);
+	constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32);
+	constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32);
+
+	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
+
+	return vqtbl1q_s8(vec, idx_vec);
+#endif
+}
+
+// For each index, r = vec[index & 3]
+template <u8 Index0, u8 Index1, u8 Index2, u8 Index3>
+inline v128 gv_shuffle32(const v128& vec)
+{
+#if defined(ARCH_X64)
+	return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6);
+#elif defined(ARCH_ARM64)
+	constexpr u8 idx0 = (Index0 & 3) * sizeof(s32);
+	constexpr u8 idx1 = (Index1 & 3) * sizeof(s32);
+	constexpr u8 idx2 = (Index2 & 3) * sizeof(s32);
+	constexpr u8 idx3 = (Index3 & 3) * sizeof(s32);
+
+	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
+
+	return vqtbl1q_s8(vec, idx_vec);
+#endif
+}
+
+// For the first two 2-bit indices in Control, r = a[index],
+// for the last two indices, r = b[index]
+template <u8 Control>
+inline v128 gv_shufflefs(const v128& a, const v128& b)
+{
+#if defined(ARCH_X64)
+	return _mm_shuffle_ps(a, b, Control);
+#elif defined(ARCH_ARM64)
+	constexpr u8 idx0 = (Control & 3) * sizeof(s32);
+	constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32);
+	constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128);
+	constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128);
+
+	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
+
+	return vqtbl2q_s8({ a, b }, idx_vec);
+#endif
+}
+
+// For the first two indices, r = a[index & 3],
+// for the last two indices, r = b[index & 3]
+template <u8 Index0, u8 Index1, u8 Index2, u8 Index3>
+inline v128 gv_shufflefs(const v128& a, const v128& b)
+{
+#if defined(ARCH_X64)
+	return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6);
+#elif defined(ARCH_ARM64)
+	constexpr u8 idx0 = (Index0 & 3) * sizeof(s32);
+	constexpr u8 idx1 = (Index1 & 3) * sizeof(s32);
+	constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128);
+	constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128);
+
+	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
+
+	return vqtbl2q_s8({ a, b }, idx_vec);
+#endif
+}
+
+// For each 32-bit element, reverse byte order
+inline v128 gv_rev32(const v128& vec)
+{
+#if defined(__SSSE3__)
+	return _mm_shuffle_epi8(vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12));
+#elif defined(ARCH_ARM64)
+	return vrev32q_u8(vec);
+#else
+	return gv_rol32<16>(gv_rol16<8>(vec));
+#endif
+}
+
+// For each 32-bit element, convert between big-endian and native-endian
+inline v128 gv_to_be32(const v128& vec)
+{
+	if constexpr (std::endian::native == std::endian::little)
+		return gv_rev32(vec);
+	return vec;
+}
+
 #if defined(__clang__)
 #pragma clang diagnostic pop
 #elif defined(__GNUC__)
diff --git a/rpcs3/util/types.hpp b/rpcs3/util/types.hpp
index 42c900898f..f01e558e92 100644
--- a/rpcs3/util/types.hpp
+++ b/rpcs3/util/types.hpp
@@ -989,14 +989,14 @@ template <typename To, typename From> requires (std::is_integral_v<decltype(std:
 	constexpr bool is_from_signed = std::is_signed_v<CommonFrom>;
 	constexpr bool is_to_signed = std::is_signed_v<CommonTo>;
 
-	constexpr auto from_mask = is_from_signed > is_to_signed ? UnFrom{umax} >> 1 : UnFrom{umax};
-	constexpr auto to_mask = is_to_signed > is_from_signed ? UnTo{umax} >> 1 : UnTo{umax};
+	constexpr auto from_mask = (is_from_signed && !is_to_signed) ? UnFrom{umax} >> 1 : UnFrom{umax};
+	constexpr auto to_mask = (is_to_signed && !is_from_signed) ? UnTo{umax} >> 1 : UnTo{umax};
 
 	constexpr auto mask = ~(from_mask & to_mask);
 
 	// Signed to unsigned always require test
 	// Otherwise, this is bit-wise narrowing or conversion between types of different signedness of the same size
-	if constexpr (is_from_signed > is_to_signed || to_mask < from_mask)
+	if constexpr ((is_from_signed && !is_to_signed) || to_mask < from_mask)
 	{
 		// Try to optimize test if both are of the same signedness
 		if (is_from_signed != is_to_signed ? !!(value & mask) : static_cast<CommonTo>(value) != value) [[unlikely]]