diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 274365f4cb..1d083fce7c 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -3,6 +3,7 @@
 #include "../Overlays/Shaders/shader_loading_dialog_native.h"
 #include "GLGSRender.h"
 #include "GLCompute.h"
+#include "GLDMA.h"
 #include "Emu/Memory/vm_locking.h"
 #include "Emu/RSX/rsx_methods.h"
 
@@ -180,6 +181,20 @@ void GLGSRender::on_init_thread()
 		backend_config.supports_normalized_barycentrics = false;
 	}
 
+	if (gl_caps.AMD_pinned_memory)
+	{
+		backend_config.supports_host_gpu_labels = true;
+
+		if (g_cfg.video.host_label_synchronization)
+		{
+			m_host_gpu_context_data = std::make_unique<gl::buffer>();
+			m_host_gpu_context_data->create(gl::buffer::target::array, 4096);
+
+			auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::read));
+			m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
+		}
+	}
+
 	// Use industry standard resource alignment values as defaults
 	m_uniform_buffer_offset_align = 256;
 	m_min_texbuffer_alignment = 256;
@@ -397,6 +412,7 @@ void GLGSRender::on_exit()
 	// TODO: Move these
 	gl::destroy_compute_tasks();
 	gl::destroy_overlay_passes();
+	gl::clear_dma_resources();
 
 	gl::destroy_global_texture_resources();
 
@@ -407,6 +423,9 @@ void GLGSRender::on_exit()
 	m_prog_buffer.clear();
 	m_rtts.destroy();
 
+	m_host_dma_ctrl.reset();
+	m_host_gpu_context_data.reset();
+
 	for (auto &fbo : m_framebuffer_cache)
 	{
 		fbo.remove();
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index c339f7dc39..8ea87f8e5d 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -128,7 +128,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 
 	GLProgramBuffer m_prog_buffer;
 
-	//buffer
+	// Draw Buffers
 	gl::fbo* m_draw_fbo = nullptr;
 	std::list<gl::framebuffer_holder> m_framebuffer_cache;
 	std::unique_ptr<gl::texture> m_flip_tex_color[2];
@@ -137,7 +137,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 	std::unique_ptr<gl::upscaler> m_upscaler;
 	output_scaling_mode m_output_scaling = output_scaling_mode::bilinear;
 
-	//vaos are mandatory for core profile
+	// VAOs are mandatory for core profile
 	gl::vao m_vao;
 
 	shared_mutex m_sampler_mutex;
@@ -150,6 +150,9 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 	// Occlusion query type, can be SAMPLES_PASSED or ANY_SAMPLES_PASSED
 	GLenum m_occlusion_type = GL_ANY_SAMPLES_PASSED;
 
+	// Host context for GPU-driven work
+	std::unique_ptr<gl::buffer> m_host_gpu_context_data;
+
 public:
 	u64 get_cycles() final;
 
diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp b/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp
new file mode 100644
index 0000000000..d86b03712a
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp
@@ -0,0 +1,67 @@
+#include "stdafx.h"
+#include "RSXDMAWriter.h"
+
+#include "Utilities/Thread.h"
+#include <util/asm.hpp>
+
+namespace rsx
+{
+	void RSXDMAWriter::update()
+	{
+		if (m_dispatch_handlers.empty())
+		{
+			m_job_queue.clear();
+			return;
+		}
+
+		while (!m_job_queue.empty())
+		{
+			const auto job = m_job_queue.front();
+
+			if (const auto dispatch = m_dispatch_handlers.find(job.dispatch_class);
+				dispatch == m_dispatch_handlers.end() || dispatch->second.handler(m_host_context_ptr, &job))
+			{
+				// No handler registered, or the callback consumed the job
+				m_job_queue.pop_front();
+				continue;
+			}
+
+			// A dispatcher was found but rejected the job. Stop; we'll try again later.
+			break;
+		}
+	}
+
+	void RSXDMAWriter::register_handler(host_dispatch_handler_t handler)
+	{
+		m_dispatch_handlers[handler.dispatch_class] = handler;
+	}
+
+	void RSXDMAWriter::deregister_handler(int dispatch_class)
+	{
+		m_dispatch_handlers.erase(dispatch_class);
+	}
+
+	void RSXDMAWriter::enqueue(const host_gpu_write_op_t& request)
+	{
+		m_job_queue.push_back(request);
+	}
+
+	void RSXDMAWriter::drain_label_queue()
+	{
+		if (!m_host_context_ptr)
+		{
+			return;
+		}
+
+		// FIXME: This is a busy wait; consider yielding to improve responsiveness on weak devices.
+		while (!m_host_context_ptr->in_flight_commands_completed())
+		{
+			utils::pause();
+
+			if (thread_ctrl::state() == thread_state::aborting)
+			{
+				break;
+			}
+		}
+	}
+}
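For orientation, here is a minimal usage sketch of the new queue API. It is not part of the patch; the dispatch-class tag and the handler body are hypothetical, chosen only to show the consume/reject contract of update():

#include "Emu/RSX/Host/RSXDMAWriter.h"

constexpr int dispatch_label_write = 1; // hypothetical dispatch class

void example_pump(rsx::RSXDMAWriter& dma)
{
	rsx::host_dispatch_handler_t on_label_write{};
	on_label_write.dispatch_class = dispatch_label_write;
	on_label_write.handler = [](const volatile rsx::host_gpu_context_t* ctx, const rsx::host_gpu_write_op_t* /*op*/)
	{
		// Returning true consumes the job; returning false leaves it at the
		// head of the queue and ends this update() pass (in-order retry).
		return ctx->texture_loads_completed();
	};
	dma.register_handler(on_label_write);

	dma.enqueue({ .dispatch_class = dispatch_label_write, .userdata = nullptr });
	dma.update(); // pops jobs in FIFO order until one is rejected
}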
diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.h b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
new file mode 100644
index 0000000000..18d232bfda
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <util/types.hpp>
+
+#include <deque>
+#include <functional>
+#include <unordered_map>
+
+namespace rsx
+{
+	struct host_gpu_context_t
+	{
+		u64 magic = 0xCAFEBABE;
+		u64 event_counter = 0;
+		u64 texture_load_request_event = 0;
+		u64 texture_load_complete_event = 0;
+		u64 last_label_acquire_event = 0;
+		u64 last_label_release2_event = 0;
+		u64 commands_complete_event = 0;
+
+		inline u64 inc_counter() volatile
+		{
+			// Workaround for the volatile-increment warning. The GPU can see this value directly, but we do not currently modify it on the device.
+			event_counter = event_counter + 1;
+			return event_counter;
+		}
+
+		inline bool in_flight_commands_completed() const volatile
+		{
+			return last_label_release2_event == commands_complete_event;
+		}
+
+		inline bool texture_loads_completed() const volatile
+		{
+			// Returns true when all outstanding texture load requests are done.
+			return texture_load_complete_event == texture_load_request_event;
+		}
+
+		inline bool has_unflushed_texture_loads() const volatile
+		{
+			return texture_load_request_event > last_label_release2_event;
+		}
+
+		inline u64 on_texture_load_acquire() volatile
+		{
+			texture_load_request_event = inc_counter();
+			return texture_load_request_event;
+		}
+
+		inline void on_texture_load_release() volatile
+		{
+			// Normally released by the host device, but implemented nonetheless for the software fallback
+			texture_load_complete_event = texture_load_request_event;
+		}
+
+		inline u64 on_label_acquire() volatile
+		{
+			last_label_acquire_event = inc_counter();
+			return last_label_acquire_event;
+		}
+
+		inline void on_label_release() volatile
+		{
+			last_label_release2_event = last_label_acquire_event;
+		}
+
+		inline bool needs_label_release() const volatile
+		{
+			return last_label_acquire_event > last_label_release2_event;
+		}
+	};
+
+	struct host_gpu_write_op_t
+	{
+		int dispatch_class = 0;
+		void* userdata = nullptr;
+	};
+
+	struct host_dispatch_handler_t
+	{
+		int dispatch_class = 0;
+		std::function<bool(const volatile host_gpu_context_t*, const host_gpu_write_op_t*)> handler;
+	};
+
+	class RSXDMAWriter
+	{
+	public:
+		RSXDMAWriter(void* mem)
+			: m_host_context_ptr(new (mem) host_gpu_context_t)
+		{}
+
+		RSXDMAWriter(host_gpu_context_t* pctx)
+			: m_host_context_ptr(pctx)
+		{}
+
+		void update();
+
+		void register_handler(host_dispatch_handler_t handler);
+		void deregister_handler(int dispatch_class);
+
+		void enqueue(const host_gpu_write_op_t& request);
+		void drain_label_queue();
+
+		volatile host_gpu_context_t* host_ctx() const
+		{
+			return m_host_context_ptr;
+		}
+
+	private:
+		std::unordered_map<int, host_dispatch_handler_t> m_dispatch_handlers;
+		volatile host_gpu_context_t* m_host_context_ptr = nullptr;
+
+		std::deque<host_gpu_write_op_t> m_job_queue;
+	};
+}
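The context object is effectively a ticket ledger: each acquire draws a fresh ticket from the monotonic event_counter, and completion means the device (or the software fallback) has echoed that ticket into the matching *_complete slot, so equality of the two slots reads as "nothing outstanding". A minimal illustration, assuming only the header above:

#include "Emu/RSX/Host/RSXDMAWriter.h"

void ticket_protocol_example()
{
	rsx::host_gpu_context_t ctx;

	// Texture upload path: draw a ticket, then wait for it to be echoed back.
	const u64 ticket = ctx.on_texture_load_acquire(); // request = ++event_counter
	// Here texture_loads_completed() == false: request != complete.
	ctx.on_texture_load_release();                    // software fallback echoes the ticket
	// Now texture_loads_completed() == true again.

	// GCM labels follow the same acquire/release pairing.
	ctx.on_label_acquire();
	// needs_label_release() == true until the backend submits the release.
	ctx.on_label_release();
	(void)ticket;
}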
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 37ed9bac6f..c90b8a2079 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -1162,6 +1162,7 @@ namespace rsx
 
 			// Update other sub-units
 			zcull_ctrl->update(this);
+			if (m_host_dma_ctrl) m_host_dma_ctrl->update();
 		}
 
 		// Execute FIFO queue
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 3f7b3842f1..514d5f69bb 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -35,6 +35,8 @@
 
 #include "NV47/FW/GRAPH_backend.h"
 
+#include "Host/RSXDMAWriter.h"
+
 extern atomic_t<bool> g_user_asked_for_frame_capture;
 extern atomic_t<bool> g_disable_frame_limit;
 extern rsx::frame_trace_data frame_debug;
@@ -212,6 +214,9 @@ namespace rsx
 		// Context
 		context* m_ctx = nullptr;
 
+		// Host DMA
+		std::unique_ptr<RSXDMAWriter> m_host_dma_ctrl;
+
 	public:
 		atomic_t<u64> new_get_put = u64{umax};
 		u32 restore_point = 0;
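The Vulkan changes below hand RSXDMAWriter the raw mapping of the host object buffer; the void* constructor placement-news a fresh context into it, so the old manual ensure(magic == 0xCAFEBABE) check now holds by construction. A host-memory-only sketch, where the backing array is a hypothetical stand-in for the mapped allocation:

#include <cstddef>
#include "Emu/RSX/Host/RSXDMAWriter.h"

void placement_example()
{
	alignas(rsx::host_gpu_context_t) std::byte backing[sizeof(rsx::host_gpu_context_t)]{};

	rsx::RSXDMAWriter dma(static_cast<void*>(backing));
	// dma.host_ctx()->magic == 0xCAFEBABE holds by construction.
}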
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index fb5c245b87..6352c9c58e 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -867,8 +867,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
 			VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0, VMM_ALLOCATION_POOL_SYSTEM);
 
-		m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
-		ensure(m_host_data_ptr->magic == 0xCAFEBABE);
+		m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(m_host_object_data->map(0, 0x10000));
 	}
 	else
 	{
@@ -1784,6 +1783,11 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
 	m_current_command_buffer->begin();
 }
 
+std::pair<volatile vk::host_data_t*, VkBuffer> VKGSRender::map_host_object_data() const
+{
+	return { m_host_dma_ctrl->host_ctx(), m_host_object_data->value };
+}
+
 bool VKGSRender::release_GCM_label(u32 address, u32 args)
 {
 	if (!backend_config.supports_host_gpu_labels)
@@ -1791,25 +1795,13 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 		return false;
 	}
 
-	auto drain_label_queue = [this]()
-	{
-		while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
-		{
-			utils::pause();
-
-			if (thread_ctrl::state() == thread_state::aborting)
-			{
-				break;
-			}
-		}
-	};
-
-	ensure(m_host_data_ptr);
-	if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
+	auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
+
+	if (host_ctx->texture_loads_completed())
 	{
 		// All texture loads already seen by the host GPU
 		// Wait for all previously submitted labels to be flushed
-		drain_label_queue();
+		m_host_dma_ctrl->drain_label_queue();
 		return false;
 	}
@@ -1821,13 +1813,13 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 		// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
 		// Take the L and try the fallback.
 		rsx_log.warning("Host label update at 0x%x was not possible.", address);
-		drain_label_queue();
+		m_host_dma_ctrl->drain_label_queue();
 		return false;
 	}
 
-	m_host_data_ptr->last_label_release_event = m_host_data_ptr->inc_counter();
+	const auto release_event_id = host_ctx->on_label_acquire();
 
-	if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
+	if (host_ctx->has_unflushed_texture_loads())
 	{
 		if (vk::is_renderpass_open(*m_current_command_buffer))
 		{
@@ -1842,14 +1834,15 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 		auto cmd = m_secondary_cb_list.next();
 		cmd->begin();
 		vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
-		vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
+		vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, &release_event_id);
 		cmd->end();
 
 		vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
 		cmd->submit(submit_info);
 
-		m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
+		host_ctx->on_label_release();
 	}
+
 	return true;
 }
 
@@ -2516,15 +2509,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
 	}
 
-	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
+	if (m_host_dma_ctrl && m_host_dma_ctrl->host_ctx()->needs_label_release())
 	{
 		vkCmdUpdateBuffer(*m_current_command_buffer,
 			m_host_object_data->value,
 			::offset32(&vk::host_data_t::commands_complete_event),
 			sizeof(u64),
-			const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
+			const_cast<u64*>(&m_host_dma_ctrl->host_ctx()->last_label_acquire_event));
 
-		m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
+		m_host_dma_ctrl->host_ctx()->on_label_release();
 	}
 
 	m_current_command_buffer->end();
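Because the context sits at offset 0 of m_host_object_data, ::offset32(&vk::host_data_t::commands_complete_event) resolves to the field's offset inside rsx::host_gpu_context_t, which is the byte offset the device-side vkCmdUpdateBuffer writes target. A quick sanity check under that assumption (mine, not from the patch):

#include <cstddef>
#include "Emu/RSX/Host/RSXDMAWriter.h"

// commands_complete_event is the seventh u64 field, so six u64s precede it.
static_assert(offsetof(rsx::host_gpu_context_t, commands_complete_event) == 6 * sizeof(u64));
static_assert(sizeof(rsx::host_gpu_context_t) == 7 * sizeof(u64)); // all-u64 layout, no padding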
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index f1ae833938..92627b99ef 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -1,6 +1,4 @@
 #pragma once
-#include "Emu/RSX/GSRender.h"
-#include "Emu/Cell/timers.hpp"
 
 #include "upscalers/upscaling.h"
 
@@ -19,15 +17,23 @@
 #include "VKFramebuffer.h"
 #include "VKShaderInterpreter.h"
 #include "VKQueryPool.h"
-#include "../GCM.h"
 
 #include "util/asm.hpp"
 
+#include "Emu/RSX/GCM.h"
+#include "Emu/RSX/GSRender.h"
+#include "Emu/RSX/Host/RSXDMAWriter.h"
+
 #include <thread>
 #include <optional>
 
 using namespace vk::vmm_allocation_pool_; // clang workaround.
 using namespace vk::upscaling_flags_;     // ditto
 
+namespace vk
+{
+	using host_data_t = rsx::host_gpu_context_t;
+}
+
 class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 {
 private:
@@ -118,7 +124,6 @@ private:
 	vk::command_buffer_chain m_primary_cb_list;
 	vk::command_buffer_chunk* m_current_command_buffer = nullptr;
 
-	volatile vk::host_data_t* m_host_data_ptr = nullptr;
 	std::unique_ptr<vk::buffer> m_host_object_data;
 
 	vk::descriptor_pool m_descriptor_pool;
@@ -274,7 +279,7 @@ public:
 	void end_conditional_rendering() override;
 
 	// Host sync object
-	inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
+	std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() const;
 
 	// GRAPH backend
 	void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
index e1893626ab..b4d999e07c 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
@@ -6,6 +6,7 @@
 
 #include "Emu/RSX/Common/simple_array.hpp"
 #include "Emu/RSX/rsx_utils.h"
+#include "Emu/RSX/rsx_cache.h"
 #include "Utilities/mutex.h"
 #include "util/asm.hpp"
 
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index 3353dcf341..80630656c0 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -1246,8 +1246,8 @@ namespace vk
 			// Queue a sync update on the CB doing the load
 			auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
 			ensure(host_data);
-			const auto event_id = host_data->inc_counter();
-			host_data->texture_load_request_event = event_id;
+
+			const auto event_id = host_data->on_texture_load_acquire();
 			vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
 		}
 	}
diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.h b/rpcs3/Emu/RSX/VK/vkutils/sync.h
index a91cafebab..9177e84113 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/sync.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -18,25 +18,6 @@ namespace vk
 		gpu = 1
 	};
 
-	struct host_data_t // Pick a better name
-	{
-		u64 magic = 0xCAFEBABE;
-		u64 event_counter = 0;
-		u64 texture_load_request_event = 0;
-		u64 texture_load_complete_event = 0;
-		u64 last_label_release_event = 0;
-		u64 last_label_submit_event = 0;
-		u64 commands_complete_event = 0;
-		u64 last_label_request_timestamp = 0;
-
-		inline u64 inc_counter() volatile
-		{
-			// Workaround for volatile increment warning. GPU can see this value directly, but currently we do not modify it on the device.
-			event_counter = event_counter + 1;
-			return event_counter;
-		}
-	};
-
 	struct fence
 	{
 		atomic_t<bool> flushed = false;
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index 8119df399e..580a403653 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -104,6 +104,7 @@
+    <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
@@ -617,6 +618,7 @@
+    <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index df52d9c397..7a2bf3ed16 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -1300,6 +1300,9 @@
       <Filter>Emu\GPU\RSX\NV47\FW</Filter>
     </ClCompile>
+    <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClCompile>
@@ -2620,6 +2623,9 @@
       <Filter>Emu\GPU\RSX\Utils</Filter>
     </ClInclude>
+    <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClInclude>
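Taken together, the per-frame flow the patch aims at looks roughly like this. example_frame_tick is a hypothetical composite of the call sites above, not code from the patch:

#include "Emu/RSX/Host/RSXDMAWriter.h"

void example_frame_tick(rsx::RSXDMAWriter& dma)
{
	// RSXThread pumps queued host-write jobs once per FIFO iteration.
	dma.update();

	volatile rsx::host_gpu_context_t* ctx = dma.host_ctx();

	// At submit time the backend flushes any pending label ticket: it queues a
	// device-side write of last_label_acquire_event into commands_complete_event,
	// then marks the ticket released on the CPU.
	if (ctx->needs_label_release())
	{
		// ... vkCmdUpdateBuffer / pinned-memory write is recorded here ...
		ctx->on_label_release();
	}

	// Whoever must observe the final label value spins until the device has
	// echoed every released ticket back (see drain_label_queue above).
	dma.drain_label_queue();
}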