diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h
index 2b426e05b6..c5d7a544f6 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache.h
@@ -2490,7 +2490,7 @@ namespace rsx
 
 			// Invalidate
 			const address_range tex_range = address_range::start_length(attributes.address, tex_size);
-			invalidate_range_impl_base(cmd, tex_range, invalidation_cause::read, {}, std::forward<Args>(extras)...);
+			invalidate_range_impl_base(cmd, tex_range, invalidation_cause::cause_is_read | invalidation_cause::cause_uses_strict_data_bounds, {}, std::forward<Args>(extras)...);
 
 			// Upload from CPU. Note that sRGB conversion is handled in the FS
 			auto uploaded = upload_image_from_cpu(cmd, tex_range, attributes.width, attributes.height, attributes.depth, tex.get_exact_mipmap_count(), attributes.pitch, attributes.gcm_format,
diff --git a/rpcs3/Emu/RSX/Common/texture_cache_utils.h b/rpcs3/Emu/RSX/Common/texture_cache_utils.h
index 79ad1b610e..d01660775e 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache_utils.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache_utils.h
@@ -5,6 +5,7 @@
 #include "TextureUtils.h"
 
 #include "Emu/Memory/vm.h"
+#include "Emu/RSX/Host/MM.h"
 #include "util/vm.hpp"
 
 #include <list>
@@ -29,8 +30,7 @@ namespace rsx
 	{
 		ensure(range.is_page_range());
 
-		//rsx_log.error("memory_protect(0x%x, 0x%x, %x)", static_cast<u32>(range.start), static_cast<u32>(range.length()), static_cast<u32>(prot));
-		utils::memory_protect(vm::base(range.start), range.length(), prot);
+		rsx::mm_protect(vm::base(range.start), range.length(), prot);
 
 #ifdef TEXTURE_CACHE_DEBUG
 		tex_cache_checker.set_protection(range, prot);
diff --git a/rpcs3/Emu/RSX/Host/MM.cpp b/rpcs3/Emu/RSX/Host/MM.cpp
new file mode 100644
index 0000000000..b8968f1d29
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/MM.cpp
@@ -0,0 +1,104 @@
+#include "stdafx.h"
+#include "MM.h"
+#include <Emu/RSX/Common/simple_array.hpp>
+#include <Emu/RSX/RSXOffload.h>
+
+#include <Emu/Memory/vm.h>
+#include <Emu/IdManager.h>
+#include <Emu/system_config.h>
+#include <Utilities/address_range.h>
+#include <Utilities/mutex.h>
+
+namespace rsx
+{
+	rsx::simple_array<MM_block> g_deferred_mprotect_queue; // Protection changes recorded by mm_protect() that have not been applied to host memory yet
+	shared_mutex g_mprotect_queue_lock; // Taken by mm_protect() and by both mm_flush() overloads before they touch the queue
+
+	void mm_flush_mprotect_queue_internal() // Applies every queued protection change in queue iteration order, then empties the queue.
+	{ // Takes no lock itself; every caller in this file already holds g_mprotect_queue_lock.
+		for (const auto& block : g_deferred_mprotect_queue)
+		{
+			utils::memory_protect(reinterpret_cast<void*>(block.start), block.length, block.prot); // block.start was stored as a pointer value by mm_protect
+		}
+
+		g_deferred_mprotect_queue.clear();
+	}
+
+	void mm_defer_mprotect_internal(u64 start, u64 length, utils::protection prot) // Appends one pending request to the deferred queue; does no locking of its own.
+	{
+		// We could stack and merge requests here, but that is more trouble than it is truly worth.
+		// A fresh call to memory_protect only takes a few nanoseconds of setup overhead, it is not worth the risk of hanging because of conflicts.
+		g_deferred_mprotect_queue.push_back({ start, length, prot }); // Braced list follows MM_block's member order: start, length, prot
+	}
+
+	void mm_protect(void* ptr, u64 length, utils::protection prot) // Changes protection of [ptr, ptr + length): applied at once or queued, depending on config and on prot.
+	{
+		if (!g_cfg.video.async_host_memory_manager) // Setting off: behave exactly like a direct utils::memory_protect call, without touching the queue or its lock
+		{
+			utils::memory_protect(ptr, length, prot);
+			return;
+		}
+
+		// Naive merge. Eventually it makes more sense to do conflict resolution, but it's not as important.
+		const auto start = reinterpret_cast<u64>(ptr);
+		const auto end = start + length; // One past the last byte; only consumed by the overlap test below
+
+		std::lock_guard lock(g_mprotect_queue_lock); // Held until return, so it also covers any flush triggered below
+
+		if (prot == utils::protection::rw || prot == utils::protection::wx)
+		{
+			// rw/wx requests are never queued. If one overlaps a pending request, the whole queue is applied first so that this change lands last.
+			for (const auto& block : g_deferred_mprotect_queue)
+			{
+				if (block.overlaps(start, end))
+				{
+					mm_flush_mprotect_queue_internal();
+					break; // The flush cleared the queue, so there is nothing left to scan
+				}
+			}
+
+			utils::memory_protect(ptr, length, prot);
+			return;
+		}
+
+		// Everything else (No, Ro, etc.) is queued and only applied by a later flush.
+		mm_defer_mprotect_internal(start, length, prot);
+	}
+
+	void mm_flush() // Applies and clears the whole deferred queue while holding the queue lock.
+	{
+		std::lock_guard lock(g_mprotect_queue_lock);
+		mm_flush_mprotect_queue_internal();
+	}
+
+	void mm_flush(u32 vm_address) // Flushes the whole queue, but only when some pending block contains the address vm::base(vm_address).
+	{
+		std::lock_guard lock(g_mprotect_queue_lock);
+		if (g_deferred_mprotect_queue.empty()) // Nothing pending: skip the address translation and the scan
+		{
+			return;
+		}
+
+		const auto addr = reinterpret_cast<u64>(vm::base(vm_address)); // Queue entries hold pointer values, so translate before comparing
+		for (const auto& block : g_deferred_mprotect_queue)
+		{
+			if (block.overlaps(addr))
+			{
+				mm_flush_mprotect_queue_internal(); // A single hit applies every pending block, not just the matching one
+				return;
+			}
+		}
+	}
+
+	void mm_flush_lazy() // Flushes inline when multithreaded RSX is off; otherwise sends the mm_flush control code to the dma_manager.
+	{
+		if (!g_cfg.video.multithreaded_rsx)
+		{
+			mm_flush();
+			return;
+		}
+
+		auto& rsxdma = g_fxo->get<rsx::dma_manager>();
+		rsxdma.backend_ctrl(static_cast<u32>(mm_backend_ctrl::mm_flush), nullptr); // NOTE(review): presumably ends up in the renderer's renderctl(), where VKGSRender maps 0x80000002 to rsx::mm_flush() - confirm
+	}
+}
diff --git a/rpcs3/Emu/RSX/Host/MM.h b/rpcs3/Emu/RSX/Host/MM.h
new file mode 100644
index 0000000000..96e5229095
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/MM.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <util/types.hpp>
+#include <util/vm.hpp>
+
+namespace rsx
+{
+	struct MM_block // One pending protection request: the byte range [start, start + length) and the protection to apply to it
+	{
+		u64 start;
+		u64 length;
+		utils::protection prot;
+
+		inline bool overlaps(u64 start, u64 end) const
+		{
+			// Both ranges are treated as half-open: `end` and this_end are one past the last byte, so ranges that merely touch do not overlap.
+			// FIXME: Use address_range64
+			const u64 this_end = this->start + this->length;
+			return (this->start < end && start < this_end); // The parameter `start` shadows the member, hence the explicit this->
+		}
+
+		inline bool overlaps(u64 addr) const
+		{
+			// True when addr lies inside the half-open range [start, start + length).
+			// FIXME: Use address_range64
+			const u64 this_end = this->start + this->length;
+			return (addr >= start && addr < this_end);
+		}
+	};
+
+	enum class mm_backend_ctrl : u32 // Request code that mm_flush_lazy() passes to dma_manager::backend_ctrl
+	{
+		mm_flush = 0x80000002 // Same numeric value as vk::rctrl_mem_protect in VK/VKCommandStream.h
+	};
+
+	void mm_protect(void* start, u64 length, utils::protection prot); // Apply a protection change now, or queue it for a later flush
+	void mm_flush_lazy(); // Flush inline, or send a flush request to the dma_manager when multithreaded RSX is enabled
+	void mm_flush(u32 vm_address); // Flush the whole queue only if a pending block contains vm::base(vm_address)
+	void mm_flush(); // Flush the whole queue unconditionally
+}
diff --git a/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp b/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp
index fe6dc21ba5..674c66470d 100644
--- a/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp
+++ b/rpcs3/Emu/RSX/NV47/HW/nv47_sync.hpp
@@ -7,6 +7,9 @@
 
 namespace rsx
 {
+	void mm_flush_lazy();
+	void mm_flush();
+
 	namespace util
 	{
 		template <bool FlushDMA, bool FlushPipe>
@@ -27,6 +30,7 @@ namespace rsx
 				if constexpr (FlushDMA)
 				{
 					// If the backend handled the request, this call will basically be a NOP
+					rsx::mm_flush_lazy();
 					g_fxo->get<rsx::dma_manager>().sync();
 				}
 
@@ -34,6 +38,7 @@ namespace rsx
 				{
 					// Manually flush the pipeline.
 					// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
+					rsx::mm_flush();
 					RSX(ctx)->sync();
 				}
 
diff --git a/rpcs3/Emu/RSX/VK/VKCommandStream.h b/rpcs3/Emu/RSX/VK/VKCommandStream.h
index e559a688f4..31dc703593 100644
--- a/rpcs3/Emu/RSX/VK/VKCommandStream.h
+++ b/rpcs3/Emu/RSX/VK/VKCommandStream.h
@@ -9,7 +9,8 @@ namespace vk
 	enum // callback commands
 	{
 		rctrl_queue_submit = 0x80000000,
-		rctrl_run_gc       = 0x80000001
+		rctrl_run_gc       = 0x80000001,
+		rctrl_mem_protect  = 0x80000002,
 	};
 
 	struct submit_packet
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 371a777e94..c31a1ac515 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -15,6 +15,7 @@
 #include "vkutils/scratch.h"
 
 #include "Emu/RSX/rsx_methods.h"
+#include "Emu/RSX/Host/MM.h"
 #include "Emu/RSX/Host/RSXDMAWriter.h"
 #include "Emu/RSX/NV47/HW/context_accessors.define.h"
 #include "Emu/Memory/vm_locking.h"
@@ -1010,6 +1011,8 @@ VKGSRender::~VKGSRender()
 
 bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 {
+	rsx::mm_flush(address);
+
 	vk::texture_cache::thrashed_set result;
 	{
 		const rsx::invalidation_cause cause = is_writing ? rsx::invalidation_cause::deferred_write : rsx::invalidation_cause::deferred_read;
@@ -2460,6 +2463,9 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 {
 	ensure(!m_queue_status.test_and_set(flush_queue_state::flushing));
 
+	// Host MM sync before executing anything on the GPU
+	rsx::mm_flush();
+
 	// Workaround for deadlock occuring during RSX offloader fault
 	// TODO: Restructure command submission infrastructure to avoid this condition
 	const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
@@ -2823,6 +2829,11 @@ void VKGSRender::renderctl(u32 request_code, void* args)
 		vk::on_event_completed(eid, true);
 		break;
 	}
+	case vk::rctrl_mem_protect:
+	{
+		rsx::mm_flush();
+		break;
+	}
 	default:
 		fmt::throw_exception("Unhandled request code 0x%x", request_code);
 	}
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 8a0e7737ca..a3a14268f8 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -178,6 +178,7 @@ struct cfg_root : cfg::node
 		cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
 		cfg::_bool host_label_synchronization{ this, "Allow Host GPU Labels", false };
 		cfg::_bool disable_msl_fast_math{ this, "Disable MSL Fast Math", false };
+		cfg::_bool async_host_memory_manager{ this, "Asynchronous Host Memory Manager", true, true };
 		cfg::_enum<output_scaling_mode> output_scaling{ this, "Output Scaling Mode", output_scaling_mode::bilinear, true };
 
 		struct node_vk : cfg::node
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index 9a196dd8af..f311845462 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -104,6 +104,7 @@
     <ClCompile Include="Emu\perf_monitor.cpp" />
     <ClCompile Include="Emu\RSX\Common\texture_cache.cpp" />
     <ClCompile Include="Emu\RSX\Core\RSXContext.cpp" />
+    <ClCompile Include="Emu\RSX\Host\MM.cpp" />
     <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
     <ClCompile Include="Emu\RSX\NV47\FW\draw_call.cpp" />
     <ClCompile Include="Emu\RSX\NV47\FW\reg_context.cpp" />
@@ -621,6 +622,7 @@
     <ClInclude Include="Emu\RSX\Core\RSXDisplay.h" />
     <ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp" />
     <ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h" />
+    <ClInclude Include="Emu\RSX\Host\MM.h" />
     <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h" />
     <ClInclude Include="Emu\RSX\NV47\FW\draw_call.hpp" />
     <ClInclude Include="Emu\RSX\NV47\FW\draw_call.inc.h" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index 584787892a..c516f50756 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -1312,6 +1312,9 @@
     <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp">
       <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
     </ClCompile>
+    <ClCompile Include="Emu\RSX\Host\MM.cpp">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="Crypto\aes.h">
@@ -2644,6 +2647,9 @@
     <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h">
       <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
     </ClInclude>
+    <ClInclude Include="Emu\RSX\Host\MM.h">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">