From 10fe14e783e91b89f7fe4e4f1f624fbb61b05487 Mon Sep 17 00:00:00 2001
From: kd-11 <karokidii@gmail.com>
Date: Thu, 4 Apr 2024 02:34:40 +0300
Subject: [PATCH] rsx: Separate register context from RSX thread

---
 rpcs3/Emu/RSX/NV47/common.cpp                 |   66 +
 rpcs3/Emu/RSX/NV47/common.h                   |   60 +
 rpcs3/Emu/RSX/NV47/context.h                  |   30 +
 rpcs3/Emu/RSX/NV47/context_accessors.define.h |    3 +
 rpcs3/Emu/RSX/NV47/context_accessors.undef.h  |    3 +
 rpcs3/Emu/RSX/NV47/nv3089.cpp                 |  655 ++++++
 rpcs3/Emu/RSX/NV47/nv3089.h                   |   10 +
 rpcs3/Emu/RSX/NV47/nv308a.cpp                 |  159 ++
 rpcs3/Emu/RSX/NV47/nv308a.h                   |   14 +
 rpcs3/Emu/RSX/NV47/nv406e.cpp                 |  125 ++
 rpcs3/Emu/RSX/NV47/nv406e.h                   |   15 +
 rpcs3/Emu/RSX/NV47/nv4097.cpp                 |  629 ++++++
 rpcs3/Emu/RSX/NV47/nv4097.h                   |  238 +++
 rpcs3/Emu/RSX/NV47/nv47.h                     |    7 +
 rpcs3/Emu/RSX/RSXDisAsm.cpp                   |    2 +-
 rpcs3/Emu/RSX/RSXFIFO.cpp                     |   23 +-
 rpcs3/Emu/RSX/RSXThread.cpp                   |    2 +-
 rpcs3/Emu/RSX/rsx_methods.cpp                 | 1879 +----------------
 rpcs3/Emu/RSX/rsx_methods.h                   |    4 +-
 rpcs3/emucore.vcxproj                         |   14 +
 rpcs3/emucore.vcxproj.filters                 |   45 +
 21 files changed, 2149 insertions(+), 1834 deletions(-)
 create mode 100644 rpcs3/Emu/RSX/NV47/common.cpp
 create mode 100644 rpcs3/Emu/RSX/NV47/common.h
 create mode 100644 rpcs3/Emu/RSX/NV47/context.h
 create mode 100644 rpcs3/Emu/RSX/NV47/context_accessors.define.h
 create mode 100644 rpcs3/Emu/RSX/NV47/context_accessors.undef.h
 create mode 100644 rpcs3/Emu/RSX/NV47/nv3089.cpp
 create mode 100644 rpcs3/Emu/RSX/NV47/nv3089.h
 create mode 100644 rpcs3/Emu/RSX/NV47/nv308a.cpp
 create mode 100644 rpcs3/Emu/RSX/NV47/nv308a.h
 create mode 100644 rpcs3/Emu/RSX/NV47/nv406e.cpp
 create mode 100644 rpcs3/Emu/RSX/NV47/nv406e.h
 create mode 100644 rpcs3/Emu/RSX/NV47/nv4097.cpp
 create mode 100644 rpcs3/Emu/RSX/NV47/nv4097.h
 create mode 100644 rpcs3/Emu/RSX/NV47/nv47.h

diff --git a/rpcs3/Emu/RSX/NV47/common.cpp b/rpcs3/Emu/RSX/NV47/common.cpp
new file mode 100644
index 0000000000..319a7c41bf
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/common.cpp
@@ -0,0 +1,66 @@
+#include "stdafx.h"
+#include "common.h"
+
+#include "Emu/RSX/RSXThread.h"
+
+#define RSX(ctx) ctx->rsxthr // Thread pointer stored in the context
+#define REGS(ctx) (&rsx::method_registers) // NOTE: ctx is ignored here; this always resolves to rsx::method_registers, unlike the REGS in context_accessors.define.h
+
+namespace rsx
+{
+	namespace util
+	{
+		void push_vertex_data(rsx::context* ctx, u32 attrib_index, u32 channel_select, int count, rsx::vertex_base_type vtype, u32 value)
+		{
+			// Inside a begin/end pair the write is additionally forwarded to the immediate mode push buffer.
+			// NOTE: Push buffers still behave like register writes.
+			// Attributes need not be respecified for every vertex; the register contents are referenced instead
+			// (likened to classic OpenGL 1.x behavior by the original author).
+			if (auto thr = RSX(ctx); thr->in_begin_end)
+			{
+				thr->append_to_push_buffer(attrib_index, count, channel_select, vtype, value);
+			}
+
+			// The register copy of the attribute is refreshed unconditionally.
+			auto& attrib = REGS(ctx)->register_vertex_info[attrib_index];
+			attrib.type = vtype;
+			attrib.size = count;
+			attrib.frequency = 0;
+			attrib.stride = 0;
+			attrib.data[channel_select] = value;
+		}
+		// Defers a draw parameter write issued mid-draw: the register is rolled back to the latched value and a barrier of the requested type is queued with the new value.
+		void push_draw_parameter_change(rsx::context* ctx, rsx::command_barrier_type type, u32 reg, u32 arg)
+		{
+			if (REGS(ctx)->latch == arg ||                // New value equals the latched one
+				!RSX(ctx)->in_begin_end ||                // Not inside a begin/end pair
+				REGS(ctx)->current_draw_clause.empty())   // Draw clause has nothing queued
+			{
+				return;
+			}
+
+			// Defer the change. Rollback...
+			REGS(ctx)->decode(reg, REGS(ctx)->latch);
+
+			// Insert barrier to reinsert the value later. Use the caller's barrier type instead of a hard-coded one.
+			REGS(ctx)->current_draw_clause.insert_command_barrier(type, arg);
+		}
+		// Resolves a report offset to an address using the report DMA context currently set in the registers.
+		u32 get_report_data_impl(rsx::context* ctx, u32 offset)
+		{
+			const blit_engine::context_dma report_dma = REGS(ctx)->context_dma_report();
+			u32 location = 0;
+
+			if (report_dma == blit_engine::context_dma::to_memory_get_report)
+				location = CELL_GCM_CONTEXT_DMA_REPORT_LOCATION_LOCAL;
+			else if (report_dma == blit_engine::context_dma::report_location_main)
+				location = CELL_GCM_CONTEXT_DMA_REPORT_LOCATION_MAIN;
+			else if (report_dma == blit_engine::context_dma::memory_host_buffer)
+				location = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER;
+			else
+				return vm::addr_t(0); // Unhandled DMA context
+
+			return vm::cast(get_address(offset, location));
+		}
+	}
+}
\ No newline at end of file
diff --git a/rpcs3/Emu/RSX/NV47/common.h b/rpcs3/Emu/RSX/NV47/common.h
new file mode 100644
index 0000000000..6fc50edbd1
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/common.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <util/types.hpp>
+#include "context.h"
+#include "context_accessors.define.h"
+
+namespace rsx
+{
+	enum command_barrier_type : u32;
+	enum vertex_base_type; // NOTE(review): opaque declaration without a fixed underlying type; standard C++ requires one for unscoped enums (some compilers accept this as an extension) - confirm against the real definition
+
+	namespace util
+	{
+		u32 get_report_data_impl(rsx::context* ctx, u32 offset); // Maps a report offset to an address via the report DMA context; yields 0 for an unhandled DMA context
+		// Stores one channel of immediate vertex attribute data; also appended to the push buffer while inside begin/end
+		void push_vertex_data(rsx::context* ctx, u32 attrib_index, u32 channel_select, int count, rsx::vertex_base_type vtype, u32 value);
+		// Rolls a mid-draw register write back to the latched value and queues a command barrier that carries the new value
+		void push_draw_parameter_change(rsx::context* ctx, rsx::command_barrier_type type, u32 reg, u32 arg);
+		// Writes `data` to the RsxSemaphore at `address`. FlushDMA / FlushPipe select at compile time whether the DMA manager and the RSX pipeline are synced before the write.
+		template <bool FlushDMA, bool FlushPipe>
+		void write_gcm_label(context* ctx, u32 address, u32 data)
+		{
+			const bool is_flip_sema = (address == (RSX(ctx)->label_addr + 0x10) || address == (RSX(ctx)->device_addr + 0x30));
+			if (!is_flip_sema) // The two flip semaphore addresses skip everything below and are written directly
+			{
+				// First, queue the GPU work. If it flushes the queue for us, the following routines will be faster.
+				const bool handled = RSX(ctx)->get_backend_config().supports_host_gpu_labels && RSX(ctx)->release_GCM_label(address, data);
+
+				if (vm::_ref<RsxSemaphore>(address).val == data)
+				{
+					// It's a no-op to write the same value (although there is a delay in real-hw so it's more accurate to allow GPU label in this case)
+					return;
+				}
+
+				if constexpr (FlushDMA)
+				{
+					// If the backend handled the request, this call will basically be a NOP
+					g_fxo->get<rsx::dma_manager>().sync();
+				}
+
+				if constexpr (FlushPipe)
+				{
+					// Manually flush the pipeline.
+					// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
+					RSX(ctx)->sync();
+				}
+
+				if (handled)
+				{
+					// Backend will handle it, nothing to write.
+					return;
+				}
+			}
+
+			vm::_ref<RsxSemaphore>(address).val = data;
+		}
+	}
+}
+ 
+#include "context_accessors.undef.h"
diff --git a/rpcs3/Emu/RSX/NV47/context.h b/rpcs3/Emu/RSX/NV47/context.h
new file mode 100644
index 0000000000..24f67bdfae
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/context.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <util/types.hpp>
+
+namespace rsx
+{
+	class thread;
+
+#if 0
+	// TODO: Separate GRAPH context from RSX state
+	struct GRAPH_context
+	{
+		u32 id;
+		std::array<u32, 0x10000 / 4> registers;
+
+		GRAPH_context(u32 ctx_id)
+			: id(ctx_id)
+		{
+			std::fill(registers.begin(), registers.end(), 0);
+		}
+	};
+#endif
+
+	struct context // Pointer bundle passed as `ctx` to the NV47 method handlers
+	{
+		thread* rsxthr; // Reached through the RSX(ctx) accessor macro
+		// GRAPH_context* graph;
+		rsx_state* register_state; // Reached through REGS(ctx). NOTE(review): rsx_state is not declared in this header; presumably an earlier include provides it - confirm
+	};
+}
diff --git a/rpcs3/Emu/RSX/NV47/context_accessors.define.h b/rpcs3/Emu/RSX/NV47/context_accessors.define.h
new file mode 100644
index 0000000000..b423f11cd9
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/context_accessors.define.h
@@ -0,0 +1,3 @@
+#define RSX(ctx) ctx->rsxthr // Thread pointer of the context
+#define REGS(ctx) ctx->register_state // Register state pointer of the context
+#define RSX_CAPTURE_EVENT(name) if (RSX(ctx)->capture_current_frame) { RSX(ctx)->capture_frame(name); } // Expects a variable named ctx in scope; expands to a bare if-statement, so keep it out of unbraced if/else bodies
diff --git a/rpcs3/Emu/RSX/NV47/context_accessors.undef.h b/rpcs3/Emu/RSX/NV47/context_accessors.undef.h
new file mode 100644
index 0000000000..4fd31eec4e
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/context_accessors.undef.h
@@ -0,0 +1,3 @@
+#undef RSX
+#undef REGS
+#undef RSX_CAPTURE_EVENT
diff --git a/rpcs3/Emu/RSX/NV47/nv3089.cpp b/rpcs3/Emu/RSX/NV47/nv3089.cpp
new file mode 100644
index 0000000000..97f837ecf8
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv3089.cpp
@@ -0,0 +1,655 @@
+#include "stdafx.h"
+#include "nv3089.h"
+
+#include "Emu/RSX/RSXThread.h"
+
+#include "context_accessors.define.h"
+
+namespace rsx
+{
+	namespace nv3089
+	{
+		static std::tuple<bool, blit_src_info, blit_dst_info> decode_transfer_registers(context* ctx) // Decodes and validates the NV3089 transfer registers
+		{
+			blit_src_info src_info = {};
+			blit_dst_info dst_info = {};
+			// Returned as { ok, src, dst }. Both infos stay value-initialized until the very end, so every early-out returns them empty.
+			const rsx::blit_engine::transfer_operation operation = REGS(ctx)->blit_engine_operation();
+
+			const u16 out_x = REGS(ctx)->blit_engine_output_x();
+			const u16 out_y = REGS(ctx)->blit_engine_output_y();
+			const u16 out_w = REGS(ctx)->blit_engine_output_width();
+			const u16 out_h = REGS(ctx)->blit_engine_output_height();
+
+			const u16 in_w = REGS(ctx)->blit_engine_input_width();
+			const u16 in_h = REGS(ctx)->blit_engine_input_height();
+
+			const blit_engine::transfer_origin in_origin = REGS(ctx)->blit_engine_input_origin();
+			auto src_color_format = REGS(ctx)->blit_engine_src_color_format();
+
+			const f32 scale_x = REGS(ctx)->blit_engine_ds_dx();
+			const f32 scale_y = REGS(ctx)->blit_engine_dt_dy();
+
+			// Clipping
+			// Validate that clipping rect will fit onto both src and dst regions
+			const u16 clip_w = std::min(REGS(ctx)->blit_engine_clip_width(), out_w);
+			const u16 clip_h = std::min(REGS(ctx)->blit_engine_clip_height(), out_h);
+
+			// Check both clip dimensions and dst dimensions
+			if (clip_w == 0 || clip_h == 0)
+			{
+				rsx_log.warning("NV3089_IMAGE_IN: Operation NOPed out due to empty regions");
+				return { false, src_info, dst_info };
+			}
+
+			if (in_w == 0 || in_h == 0)
+			{
+				// Input can't be an empty region
+				fmt::throw_exception("NV3089_IMAGE_IN_SIZE: Invalid blit dimensions passed (in_w=%d, in_h=%d)", in_w, in_h);
+			}
+
+			u16 clip_x = REGS(ctx)->blit_engine_clip_x();
+			u16 clip_y = REGS(ctx)->blit_engine_clip_y();
+
+			// Fit onto dst: a clip offset is dropped when output offset + clip offset + clip extent exceeds the output size
+			if (clip_x && (out_x + clip_x + clip_w) > out_w) clip_x = 0;
+			if (clip_y && (out_y + clip_y + clip_h) > out_h) clip_y = 0;
+
+			u16 in_pitch = REGS(ctx)->blit_engine_input_pitch();
+
+			switch (in_origin) // Validation only: an unknown origin is logged and processing continues
+			{
+			case blit_engine::transfer_origin::corner:
+			case blit_engine::transfer_origin::center:
+				break;
+			default:
+				rsx_log.warning("NV3089_IMAGE_IN_SIZE: unknown origin (%d)", static_cast<u8>(in_origin));
+			}
+
+			if (operation != rsx::blit_engine::transfer_operation::srccopy) // Any operation other than srccopy is rejected
+			{
+				rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown operation (0x%x)", REGS(ctx)->registers[NV3089_SET_OPERATION]);
+				RSX(ctx)->recover_fifo();
+				return { false, src_info, dst_info };
+			}
+
+			if (!src_color_format)
+			{
+				rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown src color format (0x%x)", REGS(ctx)->registers[NV3089_SET_COLOR_FORMAT]);
+				RSX(ctx)->recover_fifo();
+				return { false, src_info, dst_info };
+			}
+
+			const u32 src_offset = REGS(ctx)->blit_engine_input_offset();
+			const u32 src_dma = REGS(ctx)->blit_engine_input_location();
+
+			u32 dst_offset;
+			u32 dst_dma = 0;
+			rsx::blit_engine::transfer_destination_format dst_color_format;
+			u32 out_pitch = 0;
+			[[maybe_unused]] u32 out_alignment = 64;
+			bool is_block_transfer = false;
+			// Destination parameters come from a different register group depending on the bound context surface
+			switch (REGS(ctx)->blit_engine_context_surface())
+			{
+			case blit_engine::context_surface::surface2d:
+			{
+				dst_dma = REGS(ctx)->blit_engine_output_location_nv3062();
+				dst_offset = REGS(ctx)->blit_engine_output_offset_nv3062();
+				out_pitch = REGS(ctx)->blit_engine_output_pitch_nv3062();
+				out_alignment = REGS(ctx)->blit_engine_output_alignment_nv3062();
+				is_block_transfer = fcmp(scale_x, 1.f) && fcmp(scale_y, 1.f);
+
+				if (auto dst_fmt = REGS(ctx)->blit_engine_nv3062_color_format(); !dst_fmt)
+				{
+					rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown NV3062 dst color format (0x%x)", REGS(ctx)->registers[NV3062_SET_COLOR_FORMAT]);
+					RSX(ctx)->recover_fifo();
+					return { false, src_info, dst_info };
+				}
+				else
+				{
+					dst_color_format = dst_fmt;
+				}
+
+				break;
+			}
+			case blit_engine::context_surface::swizzle2d:
+			{
+				dst_dma = REGS(ctx)->blit_engine_nv309E_location();
+				dst_offset = REGS(ctx)->blit_engine_nv309E_offset();
+
+				if (auto dst_fmt = REGS(ctx)->blit_engine_output_format_nv309E(); !dst_fmt)
+				{
+					rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown NV309E dst color format (0x%x)", REGS(ctx)->registers[NV309E_SET_FORMAT]);
+					RSX(ctx)->recover_fifo();
+					return { false, src_info, dst_info };
+				}
+				else
+				{
+					dst_color_format = dst_fmt;
+				}
+
+				break;
+			}
+			default:
+				rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown m_context_surface (0x%x)", static_cast<u8>(REGS(ctx)->blit_engine_context_surface()));
+				return { false, src_info, dst_info };
+			}
+
+			const u32 in_bpp = (src_color_format == rsx::blit_engine::transfer_source_format::r5g6b5) ? 2 : 4; // bytes per pixel
+			const u32 out_bpp = (dst_color_format == rsx::blit_engine::transfer_destination_format::r5g6b5) ? 2 : 4;
+
+			if (out_pitch == 0) // Zero pitch: fall back to tightly packed rows
+			{
+				out_pitch = out_bpp * out_w;
+			}
+
+			if (in_pitch == 0) // Zero pitch: fall back to tightly packed rows
+			{
+				in_pitch = in_bpp * in_w;
+			}
+
+			if (in_bpp != out_bpp) // Differing pixel sizes rule out the block transfer path
+			{
+				is_block_transfer = false;
+			}
+
+			u16 in_x, in_y;
+			if (in_origin == blit_engine::transfer_origin::center)
+			{
+				// Convert to normal u,v addressing. Under this scheme offset of 1 is actually half-way inside pixel 0
+				const float x = std::max(REGS(ctx)->blit_engine_in_x(), 0.5f);
+				const float y = std::max(REGS(ctx)->blit_engine_in_y(), 0.5f);
+				in_x = static_cast<u16>(std::floor(x - 0.5f));
+				in_y = static_cast<u16>(std::floor(y - 0.5f));
+			}
+			else
+			{
+				in_x = static_cast<u16>(std::floor(REGS(ctx)->blit_engine_in_x()));
+				in_y = static_cast<u16>(std::floor(REGS(ctx)->blit_engine_in_y()));
+			}
+
+			// Check for subpixel addressing
+			if (scale_x < 1.f)
+			{
+				float dst_x = in_x * scale_x;
+				in_x = static_cast<u16>(std::floor(dst_x) / scale_x);
+			}
+
+			if (scale_y < 1.f)
+			{
+				float dst_y = in_y * scale_y;
+				in_y = static_cast<u16>(std::floor(dst_y) / scale_y);
+			}
+
+			const u32 in_offset = in_x * in_bpp + in_pitch * in_y;
+			const u32 out_offset = out_x * out_bpp + out_pitch * out_y;
+
+			const u32 src_line_length = (in_w * in_bpp);
+
+			u32 src_address = 0;
+			const u32 dst_address = get_address(dst_offset, dst_dma, 1); // TODO: Add size
+			// Block transfer path: taken for a single clipped line, or when rows are contiguous with equal pitches on both sides
+			if (is_block_transfer && (clip_h == 1 || (in_pitch == out_pitch && src_line_length == in_pitch)))
+			{
+				const u32 nb_lines = std::min(clip_h, in_h);
+				const u32 data_length = nb_lines * src_line_length;
+
+				if (src_address = get_address(src_offset, src_dma, data_length);
+					!src_address || !dst_address)
+				{
+					RSX(ctx)->recover_fifo();
+					return { false, src_info, dst_info };
+				}
+
+				RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, data_length);
+
+				if (const auto result = RSX(ctx)->read_barrier(src_address, data_length, false);
+					result == rsx::result_zcull_intr)
+				{
+					if (RSX(ctx)->copy_zcull_stats(src_address, data_length, dst_address) == data_length)
+					{
+						// All writes deferred
+						return { false, src_info, dst_info };
+					}
+				}
+			}
+			else // General path: the source span is computed from pitch and the number of rows read
+			{
+				const u16 read_h = std::min(static_cast<u16>(clip_h / scale_y), in_h);
+				const u32 data_length = in_pitch * (read_h - 1) + src_line_length;
+
+				if (src_address = get_address(src_offset, src_dma, data_length);
+					!src_address || !dst_address)
+				{
+					RSX(ctx)->recover_fifo();
+					return { false, src_info, dst_info };
+				}
+
+				RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, data_length);
+				RSX(ctx)->read_barrier(src_address, data_length, true);
+			}
+
+			if (src_address == dst_address &&
+				in_w == clip_w && in_h == clip_h &&
+				in_pitch == out_pitch &&
+				rsx::fcmp(scale_x, 1.f) && rsx::fcmp(scale_y, 1.f))
+			{
+				// NULL operation
+				rsx_log.warning("NV3089_IMAGE_IN: Operation writes memory onto itself with no modification (move-to-self). Will ignore.");
+				return { false, src_info, dst_info };
+			}
+
+			u8* pixels_src = vm::_ptr<u8>(src_address + in_offset);
+			u8* pixels_dst = vm::_ptr<u8>(dst_address + out_offset);
+
+			if (dst_color_format != rsx::blit_engine::transfer_destination_format::r5g6b5 &&
+				dst_color_format != rsx::blit_engine::transfer_destination_format::a8r8g8b8)
+			{
+				fmt::throw_exception("NV3089_IMAGE_IN_SIZE: unknown dst_color_format (%d)", static_cast<u8>(dst_color_format));
+			}
+
+			if (src_color_format != rsx::blit_engine::transfer_source_format::r5g6b5 &&
+				src_color_format != rsx::blit_engine::transfer_source_format::a8r8g8b8)
+			{
+				// Alpha has no meaning in both formats
+				if (src_color_format == rsx::blit_engine::transfer_source_format::x8r8g8b8)
+				{
+					src_color_format = rsx::blit_engine::transfer_source_format::a8r8g8b8;
+				}
+				else
+				{
+					// TODO: Support more formats
+					fmt::throw_exception("NV3089_IMAGE_IN_SIZE: unknown src_color_format (%d)", static_cast<u8>(*src_color_format));
+				}
+			}
+
+			u32 convert_w = static_cast<u32>(std::abs(scale_x) * in_w);
+			u32 convert_h = static_cast<u32>(std::abs(scale_y) * in_h);
+
+			if (convert_w == 0 || convert_h == 0)
+			{
+				rsx_log.error("NV3089_IMAGE_IN: Invalid dimensions or scaling factor. Request ignored (ds_dx=%f, dt_dy=%f)",
+					REGS(ctx)->blit_engine_ds_dx(), REGS(ctx)->blit_engine_dt_dy());
+				return { false, src_info, dst_info };
+			}
+			// All checks passed: publish the decoded source and destination descriptions
+			src_info.format = src_color_format;
+			src_info.origin = in_origin;
+			src_info.width = in_w;
+			src_info.height = in_h;
+			src_info.pitch = in_pitch;
+			src_info.bpp = in_bpp;
+			src_info.offset_x = in_x;
+			src_info.offset_y = in_y;
+			src_info.dma = src_dma;
+			src_info.rsx_address = src_address;
+			src_info.pixels = pixels_src;
+
+			dst_info.format = dst_color_format;
+			dst_info.width = convert_w;
+			dst_info.height = convert_h;
+			dst_info.clip_x = clip_x;
+			dst_info.clip_y = clip_y;
+			dst_info.clip_width = clip_w;
+			dst_info.clip_height = clip_h;
+			dst_info.offset_x = out_x;
+			dst_info.offset_y = out_y;
+			dst_info.pitch = out_pitch;
+			dst_info.bpp = out_bpp;
+			dst_info.scale_x = scale_x;
+			dst_info.scale_y = scale_y;
+			dst_info.dma = dst_dma;
+			dst_info.rsx_address = dst_address;
+			dst_info.pixels = pixels_dst;
+			dst_info.swizzled = (REGS(ctx)->blit_engine_context_surface() == blit_engine::context_surface::swizzle2d);
+
+			return { true, src_info, dst_info };
+		}
+		// CPU blit used by image_in for non-swizzled destinations: raw row copies when no conversion is needed, otherwise convert_scale_image (followed by clip_image when clipping).
+		void linear_copy(
+			const blit_dst_info& dst,
+			const blit_src_info& src,
+			u16 out_w,
+			u16 out_h,
+			u32 slice_h,
+			AVPixelFormat ffmpeg_src_format,
+			AVPixelFormat ffmpeg_dst_format,
+			bool need_convert,
+			bool need_clip,
+			bool src_is_modified, // When set, the overlap check below is skipped (image_in sets it after redirecting src.pixels to a temporary)
+			bool interpolate)
+		{
+			std::vector<u8> temp2;
+
+			if (!need_convert) [[ likely ]]
+			{
+				const bool is_overlapping = !src_is_modified && dst.dma == src.dma && [&]() -> bool
+				{
+					const auto src_range = utils::address_range::start_length(src.rsx_address, src.pitch * (src.height - 1) + (src.bpp * src.width));
+					const auto dst_range = utils::address_range::start_length(dst.rsx_address, dst.pitch * (dst.clip_height - 1) + (dst.bpp * dst.clip_width));
+					return src_range.overlaps(dst_range);
+				}();
+				// Overlapping source and destination: stage through temp2, or use memmove for the contiguous case
+					if (is_overlapping) [[ unlikely ]]
+					{
+						if (need_clip)
+						{
+							temp2.resize(dst.pitch * dst.clip_height);
+							clip_image_may_overlap(dst.pixels, src.pixels, dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, src.pitch, dst.pitch, temp2.data());
+							return;
+						}
+
+						if (dst.pitch != src.pitch || dst.pitch != dst.bpp * out_w)
+						{
+							const u32 buffer_pitch = dst.bpp * out_w;
+							temp2.resize(buffer_pitch * out_h);
+							std::add_pointer_t<u8> buf = temp2.data(), pixels = src.pixels;
+
+							// Read the whole buffer from source
+							for (u32 y = 0; y < out_h; ++y)
+							{
+								std::memcpy(buf, pixels, buffer_pitch);
+								pixels += src.pitch;
+								buf += buffer_pitch;
+							}
+
+							buf = temp2.data(), pixels = dst.pixels;
+
+							// Write to destination
+							for (u32 y = 0; y < out_h; ++y)
+							{
+								std::memcpy(pixels, buf, buffer_pitch);
+								pixels += dst.pitch;
+								buf += buffer_pitch;
+							}
+
+							return;
+						}
+
+						std::memmove(dst.pixels, src.pixels, dst.pitch * out_h);
+						return;
+					}
+
+					if (need_clip) [[ unlikely ]]
+					{
+						clip_image(dst.pixels, src.pixels, dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, src.pitch, dst.pitch);
+						return;
+					}
+
+					if (dst.pitch != src.pitch || dst.pitch != dst.bpp * out_w) [[ unlikely ]]
+					{
+						u8* dst_pixels = dst.pixels, * src_pixels = src.pixels;
+						// Pitches differ or rows are padded: copy row by row
+						for (u32 y = 0; y < out_h; ++y)
+						{
+							std::memcpy(dst_pixels, src_pixels, out_w * dst.bpp);
+							dst_pixels += dst.pitch;
+							src_pixels += src.pitch;
+						}
+
+						return;
+					}
+
+					std::memcpy(dst.pixels, src.pixels, dst.pitch * out_h); // Identical, unpadded pitches: one contiguous copy
+					return;
+			}
+
+			if (need_clip) [[ unlikely ]]
+			{
+				temp2.resize(dst.pitch * std::max<u32>(dst.height, dst.clip_height));
+				// Convert/scale into the scratch buffer first, then clip from it into the destination
+				convert_scale_image(temp2.data(), ffmpeg_dst_format, dst.width, dst.height, dst.pitch,
+					src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h, interpolate);
+
+				clip_image(dst.pixels, temp2.data(), dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, dst.pitch, dst.pitch);
+				return;
+			}
+
+			convert_scale_image(dst.pixels, ffmpeg_dst_format, out_w, out_h, dst.pitch,
+				src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h,
+				interpolate);
+		}
+		// First stage of a swizzled blit: builds the converted and/or clipped linear image. Returns an empty vector when neither step is needed (image_in then reads src.pixels directly).
+		std::vector<u8> swizzled_copy_1(
+			const blit_dst_info& dst,
+			const blit_src_info& src,
+			u16 out_w,
+			u16 out_h,
+			u32 slice_h,
+			AVPixelFormat ffmpeg_src_format,
+			AVPixelFormat ffmpeg_dst_format,
+			bool need_convert,
+			bool need_clip,
+			bool interpolate)
+		{
+			std::vector<u8> temp2, temp3; // temp2: conversion scratch, temp3: returned result
+
+			if (need_clip)
+			{
+				temp3.resize(dst.pitch * dst.clip_height);
+
+				if (need_convert)
+				{
+					temp2.resize(dst.pitch * std::max<u32>(dst.height, dst.clip_height));
+					// Convert/scale into temp2, then clip from temp2 into the result
+					convert_scale_image(temp2.data(), ffmpeg_dst_format, dst.width, dst.height, dst.pitch,
+						src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h,
+						interpolate);
+
+					clip_image(temp3.data(), temp2.data(), dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, dst.pitch, dst.pitch);
+					return temp3;
+				}
+				// Clip only: read straight from the source using the source pitch
+				clip_image(temp3.data(), src.pixels, dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, src.pitch, dst.pitch);
+				return temp3;
+			}
+
+			if (need_convert)
+			{
+				temp3.resize(dst.pitch * out_h);
+				// Convert only: scale directly into the result at the output size
+				convert_scale_image(temp3.data(), ffmpeg_dst_format, out_w, out_h, dst.pitch,
+					src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h,
+					interpolate);
+
+				return temp3;
+			}
+
+			return {};
+		}
+		// Second stage of a swizzled blit: swizzles linear_pixels into swizzled_pixels. Dimensions are out_w/out_h rounded up with next_pow2; the input is padded through a temporary when they differ.
+		void swizzled_copy_2(
+			u8* linear_pixels,
+			u8* swizzled_pixels,
+			u32 linear_pitch,
+			u16 out_w,
+			u16 out_h,
+			u8 out_bpp)
+		{
+			// TODO: Validate these claims. Are the registers always correctly initialized? Should we trust them at all?
+			// It looks like rsx may ignore the requested swizzle size and just always
+			// round up to nearest power of 2
+			/*
+			u8 sw_width_log2 = REGS(ctx)->nv309e_sw_width_log2();
+			u8 sw_height_log2 = REGS(ctx)->nv309e_sw_height_log2();
+
+			// 0 indicates height of 1 pixel
+			sw_height_log2 = sw_height_log2 == 0 ? 1 : sw_height_log2;
+
+			// swizzle based on destination size
+			u16 sw_width = 1 << sw_width_log2;
+			u16 sw_height = 1 << sw_height_log2;
+			*/
+
+			std::vector<u8> sw_temp; // Holds the padded image when the output is not a power of 2
+
+			u32 sw_width = next_pow2(out_w);
+			u32 sw_height = next_pow2(out_h);
+
+			// Check and pad texture out if we are given non power of 2 output
+			if (sw_width != out_w || sw_height != out_h)
+			{
+				sw_temp.resize(out_bpp * sw_width * sw_height);
+
+				switch (out_bpp) // No default case: other pixel sizes leave sw_temp as resized
+				{
+				case 1:
+					pad_texture<u8>(linear_pixels, sw_temp.data(), out_w, out_h, sw_width, sw_height);
+					break;
+				case 2:
+					pad_texture<u16>(linear_pixels, sw_temp.data(), out_w, out_h, sw_width, sw_height);
+					break;
+				case 4:
+					pad_texture<u32>(linear_pixels, sw_temp.data(), out_w, out_h, sw_width, sw_height);
+					break;
+				}
+
+				linear_pixels = sw_temp.data(); // Swizzle from the padded copy from here on
+			}
+
+			switch (out_bpp) // No default case: other pixel sizes write nothing
+			{
+			case 1:
+				convert_linear_swizzle<u8, false>(linear_pixels, swizzled_pixels, sw_width, sw_height, linear_pitch);
+				break;
+			case 2:
+				convert_linear_swizzle<u16, false>(linear_pixels, swizzled_pixels, sw_width, sw_height, linear_pitch);
+				break;
+			case 4:
+				convert_linear_swizzle<u32, false>(linear_pixels, swizzled_pixels, sw_width, sw_height, linear_pitch);
+				break;
+			}
+		}
+		// Builds a tightly packed, flipped copy of the source image. Returns an empty vector when neither flip is requested.
+		std::vector<u8> _mirror_transform(const blit_src_info& src, bool flip_x, bool flip_y)
+		{
+			std::vector<u8> mirrored;
+			if (!(flip_x || flip_y))
+			{
+				return mirrored;
+			}
+
+			// Output rows are stored back to back, without the padding of the source pitch
+			const u32 row_bytes = src.width * src.bpp;
+			mirrored.resize(row_bytes * src.height);
+			// A vertical flip walks the source with a negated pitch
+			const s32 src_step = static_cast<s32>(src.pitch) * (flip_y ? -1 : 1);
+
+			for (u32 row = 0; row < src.height; ++row)
+			{
+				u8* out_row = mirrored.data() + (row_bytes * row);
+				u8* in_row = src.pixels + (static_cast<s32>(row) * src_step);
+
+				if (!flip_x)
+				{
+					std::memcpy(out_row, in_row, row_bytes); // No horizontal flip: the row is copied verbatim
+				}
+				else if (src.bpp == 4) [[ likely ]]
+				{
+					rsx::memcpy_r<u32>(out_row, in_row, src.width);
+				}
+				else
+				{
+					rsx::memcpy_r<u16>(out_row, in_row, src.width);
+				}
+			}
+
+			return mirrored;
+		}
+		// NV3089 IMAGE_IN handler: decodes the transfer, tries the backend blit via scaled_image_from_memory, and otherwise performs the copy on the CPU (linear or swizzled, with optional tiling).
+		void image_in(context* ctx, u32 /*reg*/, u32 /*arg*/)
+		{
+			auto [success, src, dst] = decode_transfer_registers(ctx);
+			if (!success)
+			{
+				return;
+			}
+
+			// Decode extra params before locking
+			const blit_engine::transfer_interpolator in_inter = REGS(ctx)->blit_engine_input_inter();
+			const u16 out_w = REGS(ctx)->blit_engine_output_width();
+			const u16 out_h = REGS(ctx)->blit_engine_output_height();
+
+			// Lock here. RSX cannot execute any locking operations from this point, including ZCULL read barriers
+			auto res = ::rsx::reservation_lock<true>(
+				dst.rsx_address, dst.pitch * dst.clip_height,
+				src.rsx_address, src.pitch * src.height);
+
+			if (!g_cfg.video.force_cpu_blit_processing &&
+				(dst.dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER || src.dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER) &&
+				RSX(ctx)->scaled_image_from_memory(src, dst, in_inter == blit_engine::transfer_interpolator::foh))
+			{
+				// HW-accelerated blit
+				return;
+			}
+
+			std::vector<u8> mirror_tmp;
+			bool src_is_temp = false;
+
+			// Flip source if needed
+			if (dst.scale_y < 0 || dst.scale_x < 0)
+			{
+				mirror_tmp = _mirror_transform(src, dst.scale_x < 0, dst.scale_y < 0);
+				src.pixels = mirror_tmp.data();
+				src.pitch = src.width * src.bpp; // The mirrored copy is tightly packed
+				src_is_temp = true;
+			}
+
+			const AVPixelFormat in_format = (src.format == rsx::blit_engine::transfer_source_format::r5g6b5) ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB;
+			const AVPixelFormat out_format = (dst.format == rsx::blit_engine::transfer_destination_format::r5g6b5) ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB;
+
+			const bool need_clip =
+				dst.clip_width != src.width ||
+				dst.clip_height != src.height ||
+				dst.clip_x > 0 || dst.clip_y > 0 ||
+				dst.width != out_w || dst.height != out_h;
+
+			const bool need_convert = out_format != in_format || !rsx::fcmp(fabsf(dst.scale_x), 1.f) || !rsx::fcmp(fabsf(dst.scale_y), 1.f);
+			const u32 slice_h = static_cast<u32>(std::ceil(static_cast<f32>(dst.clip_height + dst.clip_y) / dst.scale_y));
+			const bool interpolate = in_inter == blit_engine::transfer_interpolator::foh;
+			// For tiled destinations the blit is rendered into a scratch buffer and tiled into place afterwards
+			auto real_dst = dst.pixels;
+			const auto tiled_region = RSX(ctx)->get_tiled_memory_region(utils::address_range::start_length(dst.rsx_address, dst.pitch * dst.clip_height));
+			std::vector<u8> tmp;
+
+			if (tiled_region)
+			{
+				tmp.resize(tiled_region.tile->size);
+				real_dst = dst.pixels; // Same value as the initializer above
+				dst.pixels = tmp.data();
+			}
+
+			if (REGS(ctx)->blit_engine_context_surface() != blit_engine::context_surface::swizzle2d)
+			{
+				linear_copy(dst, src, out_w, out_h, slice_h, in_format, out_format, need_convert, need_clip, src_is_temp, interpolate);
+			}
+			else
+			{
+				const auto swz_temp = swizzled_copy_1(dst, src, out_w, out_h, slice_h, in_format, out_format, need_convert, need_clip, interpolate);
+				auto pixels_src = swz_temp.empty() ? src.pixels : swz_temp.data(); // Empty result: no clip/convert pass ran, read the source directly
+
+				swizzled_copy_2(const_cast<u8*>(pixels_src), dst.pixels, src.pitch, out_w, out_h, dst.bpp);
+			}
+
+			if (tiled_region)
+			{
+				const auto tile_func = dst.bpp == 4
+					? rsx::tile_texel_data32
+					: rsx::tile_texel_data16;
+
+				tile_func(
+					real_dst,
+					dst.pixels,
+					tiled_region.base_address,
+					dst.rsx_address - tiled_region.base_address,
+					tiled_region.tile->size,
+					tiled_region.tile->bank,
+					tiled_region.tile->pitch,
+					dst.clip_width,
+					dst.clip_height
+				);
+			}
+		}
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv3089.h b/rpcs3/Emu/RSX/NV47/nv3089.h
new file mode 100644
index 0000000000..e54b4a48c5
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv3089.h
@@ -0,0 +1,10 @@
+#pragma once
+#include "context.h"
+
+namespace rsx
+{
+	namespace nv3089
+	{
+		void image_in(context* ctx, u32 reg, u32 arg); // NV3089 IMAGE_IN handler; the definition in nv3089.cpp ignores reg and arg
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv308a.cpp b/rpcs3/Emu/RSX/NV47/nv308a.cpp
new file mode 100644
index 0000000000..049f39d192
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv308a.cpp
@@ -0,0 +1,159 @@
+#include "stdafx.h"
+#include "nv308a.h"
+
+#include "Emu/RSX/RSXThread.h"
+
+#include "context_accessors.define.h"
+
+namespace rsx
+{
+	namespace nv308a
+	{
+		void color::impl(context* ctx, u32 reg, u32) // NV308A_COLOR handler: writes inline FIFO pixel data to the NV3062 output surface
+		{
+			const u32 out_x_max = REGS(ctx)->nv308a_size_out_x();
+			const u32 index = reg - NV308A_COLOR;
+
+			if (index >= out_x_max)
+			{
+				// Skip: this register lies at or beyond the configured output width
+				return;
+			}
+
+			// Get position of the current command arg
+			[[maybe_unused]] const u32 src_offset = RSX(ctx)->fifo_ctrl->get_pos();
+
+			// FIFO args count including this one
+			const u32 fifo_args_cnt = RSX(ctx)->fifo_ctrl->get_remaining_args_count() + 1;
+
+			// The range of methods this function is responsible for
+			const u32 method_range = std::min<u32>(0x700 - index, out_x_max - index);
+
+			// Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min)
+			const u32 fifo_read_limit = static_cast<u32>(((RSX(ctx)->ctrl->put & ~3ull) - (RSX(ctx)->fifo_ctrl->get_pos())) / 4);
+
+			u32 count = std::min<u32>({ fifo_args_cnt, fifo_read_limit, method_range });
+
+			const u32 dst_dma = REGS(ctx)->blit_engine_output_location_nv3062();
+			const u32 dst_offset = REGS(ctx)->blit_engine_output_offset_nv3062();
+			const u32 out_pitch = REGS(ctx)->blit_engine_output_pitch_nv3062();
+
+			const u32 x = REGS(ctx)->nv308a_x() + index;
+			const u32 y = REGS(ctx)->nv308a_y();
+
+			const auto fifo_span = RSX(ctx)->fifo_ctrl->get_current_arg_ptr();
+
+			if (fifo_span.size() < count)
+			{
+				count = ::size32(fifo_span);
+			}
+
+			// Skip "handled methods"
+			RSX(ctx)->fifo_ctrl->skip_methods(count - 1);
+
+			// 308A::COLOR can be used to create custom sync primitives.
+			// Hide this behind strict mode due to the potential performance implications.
+			if (count == 1 && g_cfg.video.strict_rendering_mode && !g_cfg.video.relaxed_zcull_sync)
+			{
+				RSX(ctx)->sync();
+			}
+
+			switch (*REGS(ctx)->blit_engine_nv3062_color_format())
+			{
+			case blit_engine::transfer_destination_format::a8r8g8b8:
+			case blit_engine::transfer_destination_format::y32:
+			{
+				// Bit cast - optimize to mem copy
+
+				const u32 data_length = count * 4;
+
+				const auto dst_address = get_address(dst_offset + (x * 4) + (out_pitch * y), dst_dma, data_length);
+
+				if (!dst_address)
+				{
+					RSX(ctx)->recover_fifo();
+					return;
+				}
+
+				const auto dst = vm::_ptr<u8>(dst_address);
+				const auto src = reinterpret_cast<const u8*>(fifo_span.data());
+
+				rsx::reservation_lock<true> rsx_lock(dst_address, data_length);
+
+				if (RSX(ctx)->fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) [[unlikely]]
+				{
+					// Move last 32 bits
+					reinterpret_cast<u32*>(dst)[0] = reinterpret_cast<const u32*>(src)[count - 1];
+					RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, 4);
+				}
+				else
+				{
+					if (dst_dma & CELL_GCM_LOCATION_MAIN)
+					{
+						// May overlap
+						std::memmove(dst, src, data_length);
+					}
+					else
+					{
+						// Never overlaps
+						std::memcpy(dst, src, data_length);
+					}
+
+					RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, count * 4);
+				}
+
+				break;
+			}
+			case blit_engine::transfer_destination_format::r5g6b5:
+			{
+				const auto data_length = count * 2;
+				const auto dst_address = get_address(dst_offset + (x * 2) + (y * out_pitch), dst_dma, data_length);
+
+				if (!dst_address)
+				{
+					RSX(ctx)->recover_fifo();
+					return;
+				}
+
+				const auto dst = vm::_ptr<u16>(dst_address);
+				const auto src = utils::bless<const be_t<u32>>(fifo_span.data());
+
+				rsx::reservation_lock<true> rsx_lock(dst_address, data_length);
+
+				auto convert = [](u32 input) -> u16
+				{
+					// Input is considered to be ARGB8
+					const u32 r = (input >> 16) & 0xFF;
+					const u32 g = (input >> 8) & 0xFF;
+					const u32 b = input & 0xFF;
+
+					// Keep the top 5/6/5 bits of each channel. The previous (c * 32) / 255 scaling
+					// produced 32 (or 64 for green) for c == 0xFF, which spilled into the neighbouring
+					// field; for every other input the shifts yield the same value as before.
+					return static_cast<u16>(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
+				};
+
+				if (RSX(ctx)->fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) [[unlikely]]
+				{
+					// Move last 16 bits
+					dst[0] = convert(src[count - 1]);
+					RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, 2);
+					break;
+				}
+
+				for (u32 i = 0; i < count; i++)
+				{
+					dst[i] = convert(src[i]);
+				}
+
+				RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, count * 2);
+				break;
+			}
+			default:
+			{
+				fmt::throw_exception("Unreachable");
+			}
+			}
+		}
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv308a.h b/rpcs3/Emu/RSX/NV47/nv308a.h
new file mode 100644
index 0000000000..eb28063c2b
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv308a.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "context.h"
+
+namespace rsx
+{
+	namespace nv308a
+	{
+		struct color // Handler wrapper for the NV308A_COLOR method range
+		{
+			static void impl(context* ctx, u32 reg, u32 arg); // Receives the context, the method register and the method argument
+		};
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv406e.cpp b/rpcs3/Emu/RSX/NV47/nv406e.cpp
new file mode 100644
index 0000000000..3c28acfcef
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv406e.cpp
@@ -0,0 +1,125 @@
+#include "stdafx.h"
+#include "nv406e.h"
+#include "common.h"
+
+#include "Emu/RSX/RSXThread.h"
+
+#include "context_accessors.define.h"
+
+namespace rsx
+{
+	namespace nv406e
+	{
+		void set_reference(context* ctx, u32 /*reg*/, u32 arg) // Syncs, then publishes the current FIFO position and `arg` to the DMA control block
+		{
+			RSX(ctx)->sync();
+			// GET is published together with REF (GET is written again with the same value at command end)
+			const u32 fifo_get = RSX(ctx)->fifo_ctrl->get_pos();
+			auto& control = vm::_ref<RsxDmaControl>(RSX(ctx)->dma_address);
+			control.get.release(fifo_get);
+			control.ref.store(arg);
+		}
+
+		void semaphore_acquire(context* ctx, u32 /*reg*/, u32 arg) // Waits until the 406E semaphore reads `arg`, the thread is stopped, or the driver timeout elapses
+		{
+			RSX(ctx)->sync_point_request.release(true);
+			const u32 addr = get_address(REGS(ctx)->semaphore_offset_406e(), REGS(ctx)->semaphore_context_dma_406e());
+
+			const auto& sema = vm::_ref<RsxSemaphore>(addr).val; // Bound by reference (not copied), so every comparison below reads the current value
+
+			if (sema == arg)
+			{
+				// Flip semaphore doesn't need wake-up delay
+				if (addr != RSX(ctx)->label_addr + 0x10)
+				{
+					RSX(ctx)->flush_fifo();
+					RSX(ctx)->fifo_wake_delay(2);
+				}
+
+				return;
+			}
+			else
+			{
+				RSX(ctx)->flush_fifo();
+			}
+
+			u64 start = rsx::uclock();
+			u64 last_check_val = start;
+
+			while (sema != arg)
+			{
+				if (RSX(ctx)->test_stopped())
+				{
+					RSX(ctx)->state += cpu_flag::again;
+					return;
+				}
+
+				if (const auto tdr = static_cast<u64>(g_cfg.video.driver_recovery_timeout)) // A zero timeout skips the timeout handling entirely
+				{
+					const u64 current = rsx::uclock();
+
+					if (current - last_check_val > 20'000)
+					{
+						// Suspicious amount of time has passed
+						// External pause such as debuggers' pause or operating system sleep may have taken place
+						// Ignore it
+						start += current - last_check_val; // Shift the start point so the gap does not count towards the timeout
+					}
+
+					last_check_val = current;
+
+					if ((current - start) > tdr)
+					{
+						// If longer than driver timeout force exit
+						rsx_log.error("nv406e::semaphore_acquire has timed out. semaphore_address=0x%X", addr);
+						break;
+					}
+				}
+
+				RSX(ctx)->cpu_wait({});
+			}
+
+			RSX(ctx)->fifo_wake_delay();
+			RSX(ctx)->performance_counters.idle_time += (rsx::uclock() - start); // Time spent in the wait loop is accounted as idle time
+		}
+
+		void semaphore_release(context* ctx, u32 /*reg*/, u32 arg) // Writes `arg` to the semaphore selected by the 406E offset and context DMA registers
+		{
+			const u32 sema_offset = REGS(ctx)->semaphore_offset_406e();
+
+			if (sema_offset % 4 != 0)
+			{
+				rsx_log.warning("NV406E semaphore release is using unaligned semaphore, ignoring. (offset=0x%x)", sema_offset);
+				return;
+			}
+
+			const u32 dma_context = REGS(ctx)->semaphore_context_dma_406e();
+
+			// The sync point request is skipped for flip's semaphore release:
+			// this allows last gcm's registers reset to occur in case of a crash
+			const bool is_flip_sema = (sema_offset == 0x10 && dma_context == CELL_GCM_CONTEXT_DMA_SEMAPHORE_R);
+			if (!is_flip_sema)
+			{
+				RSX(ctx)->sync_point_request.release(true);
+			}
+
+			const u32 label = get_address(sema_offset, dma_context);
+
+			// TODO: Check if possible to write on reservations
+			if ((RSX(ctx)->label_addr >> 28) != (label >> 28))
+			{
+				rsx_log.error("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", sema_offset, label);
+				RSX(ctx)->recover_fifo();
+				return;
+			}
+
+			// HW flip synchronization related, 1 is not written without display queue command (TODO: make it behave as real hw)
+			if (label == RSX(ctx)->device_addr + 0x30 && !arg)
+			{
+				arg = 1;
+			}
+
+			util::write_gcm_label<false, true>(ctx, label, arg);
+		}
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv406e.h b/rpcs3/Emu/RSX/NV47/nv406e.h
new file mode 100644
index 0000000000..426228741c
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv406e.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "context.h"
+
+namespace rsx
+{
+	namespace nv406e
+	{
+		void set_reference(context* ctx, u32 reg, u32 arg); // Syncs, then stores the FIFO position and `arg` in the DMA control block
+
+		void semaphore_acquire(context* ctx, u32 reg, u32 arg); // Waits for the 406E semaphore to read `arg` (bounded by the driver recovery timeout)
+
+		void semaphore_release(context* ctx, u32 reg, u32 arg); // Writes `arg` to the 406E semaphore label
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv4097.cpp b/rpcs3/Emu/RSX/NV47/nv4097.cpp
new file mode 100644
index 0000000000..c9155f5dbc
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv4097.cpp
@@ -0,0 +1,629 @@
+#include "stdafx.h"
+#include "nv4097.h"
+
+#include "Emu/RSX/RSXThread.h"
+#include "Emu/RSX/Common/BufferUtils.h"
+
+#define RSX(ctx) ctx->rsxthr
+#define REGS(ctx) (&rsx::method_registers)
+#define RSX_CAPTURE_EVENT(name) if (RSX(ctx)->capture_current_frame) { RSX(ctx)->capture_frame(name); }
+
+namespace rsx
+{
+	template<typename Type> struct vertex_data_type_from_element_type; // Maps a C++ element type to its vertex_base_type tag; only the specializations below are defined
+	template<> struct vertex_data_type_from_element_type<float> { static const vertex_base_type type = vertex_base_type::f; };
+	template<> struct vertex_data_type_from_element_type<f16> { static const vertex_base_type type = vertex_base_type::sf; };
+	template<> struct vertex_data_type_from_element_type<u8> { static const vertex_base_type type = vertex_base_type::ub; };
+	template<> struct vertex_data_type_from_element_type<u16> { static const vertex_base_type type = vertex_base_type::s32k; };
+	template<> struct vertex_data_type_from_element_type<s16> { static const vertex_base_type type = vertex_base_type::s1; };
+
+	namespace nv4097
+	{
+		///// Program management
+
+		void set_shader_program_dirty(context* ctx, u32, u32) // Unconditionally flags the fragment program ucode as dirty
+		{
+			RSX(ctx)->m_graphics_state |= rsx::pipeline_state::fragment_program_ucode_dirty;
+		}
+
+		void set_transform_constant::impl(context* ctx, u32 reg, u32 arg) // Bulk-copies consecutive transform constant words straight from the FIFO
+		{
+			const u32 index = reg - NV4097_SET_TRANSFORM_CONSTANT;
+			const u32 constant_id = index / 4; // Four 32-bit words per constant
+			const u8 subreg = index % 4;
+
+			// FIFO args count including this one
+			const u32 fifo_args_cnt = RSX(ctx)->fifo_ctrl->get_remaining_args_count() + 1;
+
+			// The range of methods this function is responsible for
+			const u32 method_range = 32 - index;
+
+			// Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min)
+			const u32 fifo_read_limit = static_cast<u32>(((RSX(ctx)->ctrl->put & ~3ull) - (RSX(ctx)->fifo_ctrl->get_pos())) / 4);
+
+			const u32 count = std::min<u32>({ fifo_args_cnt, fifo_read_limit, method_range });
+
+			const u32 load = REGS(ctx)->transform_constant_load();
+
+			u32 rcount = count;
+			if (const u32 max = (load + constant_id) * 4 + count + subreg, limit = 468 * 4; max > limit)
+			{
+				// Ignore addresses outside the usable [0, 467] range
+				rsx_log.warning("Invalid transform register index (load=%u, index=%u, count=%u)", load, index, count);
+
+				if ((max - count) < limit) // Write starts in range: trim only the overflowing tail
+					rcount -= max - limit;
+				else // Write starts out of range: copy nothing
+					rcount = 0;
+			}
+
+			const auto values = &REGS(ctx)->transform_constants[load + constant_id][subreg];
+
+			const auto fifo_span = RSX(ctx)->fifo_ctrl->get_current_arg_ptr();
+
+			if (fifo_span.size() < rcount)
+			{
+				rcount = ::size32(fifo_span);
+			}
+
+			if (RSX(ctx)->m_graphics_state & rsx::pipeline_state::transform_constants_dirty)
+			{
+				// Minor optimization: don't compare values if we already know we need invalidation
+				copy_data_swap_u32(values, fifo_span.data(), rcount);
+			}
+			else
+			{
+				if (copy_data_swap_u32_cmp(values, fifo_span.data(), rcount))
+				{
+					// Transform constants invalidation is expensive (~8k bytes per update)
+					RSX(ctx)->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty;
+				}
+			}
+
+			RSX(ctx)->fifo_ctrl->skip_methods(rcount - 1); // NOTE(review): rcount can be 0 here, making `rcount - 1` wrap - confirm skip_methods tolerates it
+		}
+
+		void set_transform_program::impl(context* ctx, u32 reg, u32 arg) // Bulk-copies vertex program words from the FIFO at the current load position
+		{
+			const u32 index = reg - NV4097_SET_TRANSFORM_PROGRAM;
+
+			// FIFO args count including this one
+			const u32 fifo_args_cnt = RSX(ctx)->fifo_ctrl->get_remaining_args_count() + 1;
+
+			// The range of methods this function is responsible for
+			const u32 method_range = 32 - index;
+
+			// Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min)
+			const u32 fifo_read_limit = static_cast<u32>(((RSX(ctx)->ctrl->put & ~3ull) - (RSX(ctx)->fifo_ctrl->get_pos())) / 4);
+
+			const u32 count = std::min<u32>({ fifo_args_cnt, fifo_read_limit, method_range });
+
+			const u32 load_pos = REGS(ctx)->transform_program_load();
+
+			u32 rcount = count;
+
+			if (const u32 max = load_pos * 4 + rcount + (index % 4);
+				max > max_vertex_program_instructions * 4)
+			{
+				rsx_log.warning("Program buffer overflow! Attempted to write %u VP instructions.", max / 4);
+				rcount -= std::min<u32>(rcount, max - (max_vertex_program_instructions * 4)); // Clamp: the overflow may exceed rcount, which previously wrapped the unsigned subtraction
+			}
+
+			const auto fifo_span = RSX(ctx)->fifo_ctrl->get_current_arg_ptr();
+
+			if (fifo_span.size() < rcount)
+			{
+				rcount = ::size32(fifo_span);
+			}
+
+			copy_data_swap_u32(&REGS(ctx)->transform_program[load_pos * 4 + index % 4], fifo_span.data(), rcount);
+
+			RSX(ctx)->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty;
+			REGS(ctx)->transform_program_load_set(load_pos + ((rcount + index % 4) / 4)); // Advance the load position by the number of whole instructions written
+			RSX(ctx)->fifo_ctrl->skip_methods(rcount - 1); // NOTE(review): rcount can be 0 here (as in set_transform_constant) - confirm skip_methods tolerates the wrap
+		}
+
+		///// Texture management
+
+		///// Surface management
+
+		void set_surface_dirty_bit(context* ctx, u32 reg, u32 arg) // Raises the dirty flags that follow from a changed surface register
+		{
+			// Value identical to the latched one: nothing to invalidate
+			if (arg == REGS(ctx)->latch)
+			{
+				return;
+			}
+
+			auto& gfx_state = RSX(ctx)->m_graphics_state;
+
+			if (reg == NV4097_SET_SURFACE_COLOR_TARGET)
+			{
+				gfx_state |= rsx::pipeline_state::pipeline_config_dirty;
+			}
+			else if (reg == NV4097_SET_SURFACE_CLIP_VERTICAL || reg == NV4097_SET_SURFACE_CLIP_HORIZONTAL)
+			{
+				gfx_state |= rsx::pipeline_state::vertex_state_dirty;
+			}
+
+			// Every changed surface register marks the RTT config dirty and drops the contested flag
+			gfx_state.set(rtt_config_dirty);
+			gfx_state.clear(rtt_config_contested);
+		}
+
+		void set_surface_format(context* ctx, u32 reg, u32 arg) // Flags pipeline (and possibly fragment) state when the surface format register changes
+		{
+			const u32 latched = REGS(ctx)->latch;
+			if ((arg & 0xFFFF) == (latched & 0xFFFF)) // The high bits of this register are just log2(dimension), ignore them
+			{
+				return;
+			}
+
+			// One of the important parameters (format, type, antialias) is different now
+			RSX(ctx)->m_graphics_state |= rsx::pipeline_state::pipeline_config_dirty;
+
+			// Decode both values to see whether fragment state is affected as well
+			const auto new_fmt = REGS(ctx)->decode<NV4097_SET_SURFACE_FORMAT>(arg);
+			const auto old_fmt = REGS(ctx)->decode<NV4097_SET_SURFACE_FORMAT>(latched);
+			const bool needs_rop_update = *new_fmt.antialias() != *old_fmt.antialias() ||   // Antialias control has changed, update ROP parameters
+				new_fmt.is_integer_color_format() != old_fmt.is_integer_color_format();     // The type of color format also requires ROP control update
+
+			if (needs_rop_update)
+			{
+				RSX(ctx)->m_graphics_state |= rsx::pipeline_state::fragment_state_dirty;
+			}
+
+			set_surface_dirty_bit(ctx, reg, arg);
+		}
+
+		void set_surface_options_dirty_bit(context* ctx, u32 reg, u32 arg) // Notifies the thread of a changed framebuffer option and flags the pipeline config
+		{
+			// Unchanged value: nothing to report
+			if (arg == REGS(ctx)->latch) return;
+
+			RSX(ctx)->on_framebuffer_options_changed(reg);
+			RSX(ctx)->m_graphics_state |= rsx::pipeline_config_dirty;
+		}
+
+		void set_color_mask(context* ctx, u32 reg, u32 arg) // Accepts a valid new color mask, otherwise restores the latched one
+		{
+			if (arg == REGS(ctx)->latch)
+			{
+				return;
+			}
+
+			const bool mask_ok = !REGS(ctx)->decode<NV4097_SET_COLOR_MASK>(arg).is_invalid();
+			if (mask_ok) [[ likely ]]
+			{
+				set_surface_options_dirty_bit(ctx, reg, arg);
+				return;
+			}
+
+			REGS(ctx)->decode(reg, REGS(ctx)->latch); // Invalid mask: re-apply the previously latched value
+		}
+
+		void set_stencil_op(context* ctx, u32 reg, u32 arg) // Accepts a valid new stencil op, otherwise restores the latched one
+		{
+			if (arg == REGS(ctx)->latch)
+			{
+				return;
+			}
+
+			if (!to_stencil_op(arg)) [[ unlikely ]]
+			{
+				// Not a recognised stencil op: re-apply the previously latched value
+				REGS(ctx)->decode(reg, REGS(ctx)->latch);
+				return;
+			}
+
+			// Valid op that differs from the latched one
+			set_surface_options_dirty_bit(ctx, reg, arg);
+		}
+
+		///// Draw call setup (vertex, etc)
+
+		void set_array_element16(context* ctx, u32, u32 arg) // Appends the two 16-bit elements packed in `arg`, low half first
+		{
+			// Elements are only accepted while in_begin_end is set
+			if (!RSX(ctx)->in_begin_end) return;
+
+			RSX(ctx)->append_array_element(arg & 0xFFFF);
+			RSX(ctx)->append_array_element(arg >> 16);
+		}
+
+		void set_array_element32(context* ctx, u32, u32 arg) // Appends `arg` as one array element while in_begin_end is set
+		{
+			if (!RSX(ctx)->in_begin_end) return;
+			RSX(ctx)->append_array_element(arg);
+		}
+
+		void draw_arrays([[maybe_unused]] context* ctx, u32 /*reg*/, u32 arg) // Appends the decoded (start, count) range to the current draw clause as an array draw
+		{
+			REGS(ctx)->current_draw_clause.command = rsx::draw_command::array;
+			rsx::registers_decoder<NV4097_DRAW_ARRAYS>::decoded_type v(arg);
+			// `ctx` is a real parameter (it was commented out) so REGS(ctx) stays valid if the accessor macro starts using its argument
+			REGS(ctx)->current_draw_clause.append(v.start(), v.count());
+		}
+
+		void draw_index_array([[maybe_unused]] context* ctx, u32 /*reg*/, u32 arg) // Appends the decoded (start, count) range to the current draw clause as an indexed draw
+		{
+			REGS(ctx)->current_draw_clause.command = rsx::draw_command::indexed;
+			rsx::registers_decoder<NV4097_DRAW_INDEX_ARRAY>::decoded_type v(arg);
+			// `ctx` is a real parameter (it was commented out) so REGS(ctx) stays valid if the accessor macro starts using its argument
+			REGS(ctx)->current_draw_clause.append(v.start(), v.count());
+		}
+
+		void draw_inline_array([[maybe_unused]] context* ctx, u32 /*reg*/, u32 arg) // Pushes one word onto the inline vertex array; `ctx` is named so REGS(ctx) stays valid if the macro uses its argument
+		{
+			arg = std::bit_cast<u32, be_t<u32>>(arg);
+			REGS(ctx)->current_draw_clause.command = rsx::draw_command::inlined_array;
+			REGS(ctx)->current_draw_clause.inline_vertex_array.push_back(arg);
+		}
+
+		void set_transform_program_start(context* ctx, u32 reg, u32) // Flags the vertex program ucode when the start register changes
+		{
+			// Register still holds the latched value: nothing changed
+			if (REGS(ctx)->registers[reg] == REGS(ctx)->latch) return;
+
+			RSX(ctx)->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty;
+		}
+
+		void set_vertex_attribute_output_mask(context* ctx, u32 reg, u32) // Flags vertex program state when the output mask register changes
+		{
+			// Register still holds the latched value: nothing changed
+			if (REGS(ctx)->registers[reg] == REGS(ctx)->latch) return;
+
+			RSX(ctx)->m_graphics_state |= rsx::pipeline_state::vertex_program_state_dirty;
+		}
+
+		void set_vertex_base_offset(context* ctx, u32 reg, u32 arg) // Forwards the write as a vertex_base_modifier_barrier draw parameter change
+		{
+			util::push_draw_parameter_change(ctx, vertex_base_modifier_barrier, reg, arg);
+		}
+
+		void set_index_base_offset(context* ctx, u32 reg, u32 arg) // Forwards the write as an index_base_modifier_barrier draw parameter change
+		{
+			util::push_draw_parameter_change(ctx, index_base_modifier_barrier, reg, arg);
+		}
+
+		void check_index_array_dma(context* ctx, u32 reg, u32 arg) // Rejects index array DMA values that carry unexpected bits
+		{
+			// Only the location bit and the index type bit may be set
+			const u32 valid_bits = CELL_GCM_LOCATION_MAIN | (CELL_GCM_DRAW_INDEX_ARRAY_TYPE_16 << 4);
+			if ((arg & ~valid_bits) == 0) return;
+
+			// Ignore invalid value, recover
+			REGS(ctx)->registers[reg] = REGS(ctx)->latch;
+			RSX(ctx)->recover_fifo();
+
+			rsx_log.error("Invalid NV4097_SET_INDEX_ARRAY_DMA value: 0x%x", arg);
+		}
+
+		///// Drawing
+
+		void set_begin_end(context* ctx, u32 /*reg*/, u32 arg) // A non-zero low byte begins a draw with that primitive; zero compiles and ends the current draw clause
+		{
+			// Ignore upper bits
+			if (const u8 prim = static_cast<u8>(arg))
+			{
+				const auto primitive_type = to_primitive_type(prim);
+				if (!primitive_type)
+				{
+					RSX(ctx)->in_begin_end = true; // NOTE(review): flag is raised although begin() is skipped for an invalid primitive - confirm intent
+
+					rsx_log.warning("Invalid NV4097_SET_BEGIN_END value: 0x%x", arg);
+					return;
+				}
+
+				REGS(ctx)->current_draw_clause.reset(primitive_type);
+				RSX(ctx)->begin();
+				return;
+			}
+
+			// Check if we have immediate mode vertex data in a driver-local buffer
+			if (REGS(ctx)->current_draw_clause.command == rsx::draw_command::none)
+			{
+				const u32 push_buffer_vertices_count = RSX(ctx)->get_push_buffer_vertex_count();
+				const u32 push_buffer_index_count = RSX(ctx)->get_push_buffer_index_count();
+
+				// Need to set this flag since it overrides some register contents
+				REGS(ctx)->current_draw_clause.is_immediate_draw = true;
+
+				if (push_buffer_index_count) // Pushed indices take precedence over pushed vertices
+				{
+					REGS(ctx)->current_draw_clause.command = rsx::draw_command::indexed;
+					REGS(ctx)->current_draw_clause.append(0, push_buffer_index_count);
+				}
+				else if (push_buffer_vertices_count)
+				{
+					REGS(ctx)->current_draw_clause.command = rsx::draw_command::array;
+					REGS(ctx)->current_draw_clause.append(0, push_buffer_vertices_count);
+				}
+			}
+			else
+			{
+				REGS(ctx)->current_draw_clause.is_immediate_draw = false;
+			}
+
+			if (!REGS(ctx)->current_draw_clause.empty())
+			{
+				REGS(ctx)->current_draw_clause.compile();
+
+				if (g_cfg.video.disable_video_output)
+				{
+					RSX(ctx)->execute_nop_draw();
+					RSX(ctx)->rsx::thread::end(); // Explicitly qualified call to the rsx::thread implementation of end()
+					return;
+				}
+
+				RSX(ctx)->end();
+			}
+			else
+			{
+				RSX(ctx)->in_begin_end = false; // Nothing to draw: only drop the begin/end flag
+			}
+
+			if (RSX(ctx)->pause_on_draw && RSX(ctx)->pause_on_draw.exchange(false)) // One-shot request: exchange(false) consumes it
+			{
+				RSX(ctx)->state -= cpu_flag::dbg_step;
+				RSX(ctx)->state += cpu_flag::dbg_pause;
+				RSX(ctx)->check_state();
+			}
+		}
+
+		void clear(context* ctx, u32 /*reg*/, u32 arg) // Forwards `arg` to clear_surface and records a "clear" capture event when capturing
+		{
+			RSX(ctx)->clear_surface(arg);
+
+			RSX_CAPTURE_EVENT("clear");
+		}
+
+		void clear_zcull(context* ctx, u32 /*reg*/, u32 /*arg*/) // Only records a capture event when capturing; no other work is done here
+		{
+			RSX_CAPTURE_EVENT("clear zcull memory");
+		}
+
+		void set_face_property(context* ctx, u32 reg, u32 arg) // Validates a cull-face / front-face write; invalid values restore the latched one
+		{
+			if (arg == REGS(ctx)->latch) // Compare the written value (previously the method index `reg`) with the latched one, like the sibling handlers
+			{
+				return;
+			}
+
+			bool valid = false;
+			switch (reg)
+			{
+			case NV4097_SET_CULL_FACE:
+				valid = !!to_cull_face(arg); break;
+			case NV4097_SET_FRONT_FACE:
+				valid = !!to_front_face(arg); break;
+			default:
+				valid = false; break;
+			}
+
+			if (valid) [[ likely ]]
+			{
+				RSX(ctx)->m_graphics_state |= rsx::pipeline_config_dirty;
+			}
+			else
+			{
+				REGS(ctx)->registers[reg] = REGS(ctx)->latch; // Discard the invalid value
+			}
+		}
+
+		void set_blend_equation(context* ctx, u32 reg, u32 arg) // Validates both 16-bit blend equations packed in `arg`; invalid values restore the latched one
+		{
+			if (arg == REGS(ctx)->latch) // Compare the written value (previously the method index `reg`) with the latched one, like the sibling handlers
+			{
+				return;
+			}
+
+			if (to_blend_equation(arg & 0xFFFF) &&
+				to_blend_equation((arg >> 16) & 0xFFFF)) [[ likely ]]
+			{
+				RSX(ctx)->m_graphics_state |= rsx::pipeline_config_dirty;
+			}
+			else
+			{
+				REGS(ctx)->decode(reg, REGS(ctx)->latch); // Re-apply the previously latched value
+			}
+		}
+
+		void set_blend_factor(context* ctx, u32 reg, u32 arg) // Validates both 16-bit blend factors packed in `arg`; invalid values restore the latched one
+		{
+			if (arg == REGS(ctx)->latch) // Compare the written value (previously the method index `reg`) with the latched one, like the sibling handlers
+			{
+				return;
+			}
+
+			if (to_blend_factor(arg & 0xFFFF) &&
+				to_blend_factor((arg >> 16) & 0xFFFF)) [[ likely ]]
+			{
+				RSX(ctx)->m_graphics_state |= rsx::pipeline_config_dirty;
+			}
+			else
+			{
+				REGS(ctx)->decode(reg, REGS(ctx)->latch); // Re-apply the previously latched value
+			}
+		}
+
+		///// Reports
+
+		void get_report(context* ctx, u32 /*reg*/, u32 arg) // Top byte of `arg` is the report type, low 24 bits the report offset
+		{
+			u8 report_type = arg >> 24;
+			u32 report_offset = arg & 0xffffff;
+
+			auto report_ptr = util::get_report_data_impl(ctx, report_offset);
+			if (!report_ptr)
+			{
+				rsx_log.error("Bad argument passed to NV4097_GET_REPORT, arg=0x%X", arg);
+				return;
+			}
+
+			switch (report_type)
+			{
+			case CELL_GCM_ZPASS_PIXEL_CNT:
+			case CELL_GCM_ZCULL_STATS:
+			case CELL_GCM_ZCULL_STATS1:
+			case CELL_GCM_ZCULL_STATS2:
+			case CELL_GCM_ZCULL_STATS3:
+				RSX(ctx)->get_zcull_stats(report_type, vm::cast(report_ptr));
+				break;
+			default:
+				rsx_log.error("NV4097_GET_REPORT: Bad type %d", report_type);
+				// Unknown type: only the timer and padding fields of the report are written
+				vm::_ref<atomic_t<CellGcmReportData>>(report_ptr).atomic_op([&](CellGcmReportData& report)
+				{
+					report.timer = RSX(ctx)->timestamp();
+					report.padding = 0;
+				});
+				break;
+			}
+		}
+
+		void clear_report_value(context* ctx, u32 /*reg*/, u32 arg) // Clears the zcull statistics selected by `arg`
+		{
+			// Only these two report types are expected here
+			const bool known_type =
+				arg == CELL_GCM_ZPASS_PIXEL_CNT ||
+				arg == CELL_GCM_ZCULL_STATS;
+
+			if (!known_type)
+			{
+				rsx_log.error("NV4097_CLEAR_REPORT_VALUE: Bad type: %d", arg);
+			}
+
+			RSX(ctx)->clear_zcull_stats(arg); // Issued for unknown types as well
+		}
+
+		void set_render_mode(context* ctx, u32, u32 arg) // Top byte of `arg` selects the mode, low 24 bits the report offset used by mode 2
+		{
+			const u32 mode = arg >> 24;
+			if (mode == 1)
+			{
+				RSX(ctx)->disable_conditional_rendering();
+				return;
+			}
+
+			if (mode != 2)
+			{
+				rsx_log.error("Unknown render mode %d", mode);
+				return;
+			}
+
+			// Mode 2: conditional rendering driven by the report at the given offset
+			const u32 report_offset = arg & 0xffffff;
+			auto report_ptr = util::get_report_data_impl(ctx, report_offset);
+			if (!report_ptr)
+			{
+				rsx_log.error("Bad argument passed to NV4097_SET_RENDER_ENABLE, arg=0x%X", arg);
+				return;
+			}
+
+			// Defer conditional render evaluation
+			RSX(ctx)->enable_conditional_rendering(vm::cast(report_ptr));
+		}
+
+		void set_zcull_render_enable(context* ctx, u32, u32) // Register and argument are ignored; only signals that zcull info changed
+		{
+			RSX(ctx)->notify_zcull_info_changed();
+		}
+
+		void set_zcull_stats_enable(context* ctx, u32, u32) // Register and argument are ignored; only signals that zcull info changed
+		{
+			RSX(ctx)->notify_zcull_info_changed();
+		}
+
+		void set_zcull_pixel_count_enable(context* ctx, u32, u32) // Register and argument are ignored; only signals that zcull info changed
+		{
+			RSX(ctx)->notify_zcull_info_changed();
+		}
+
+		///// Misc (sync objects, etc)
+
+		void set_notify(context* ctx, u32 /*reg*/, u32 /*arg*/) // Stores the current timestamp in the notifier slot selected by the notify context DMA
+		{
+			const u32 notify_dma = REGS(ctx)->context_dma_notify();
+
+			if ((notify_dma & ~7) != (CELL_GCM_CONTEXT_DMA_NOTIFY_MAIN_0 & ~7)) // All bits above the low 3 must match
+			{
+				if (rsx_log.trace)
+					rsx_log.trace("NV4097_NOTIFY: invalid context = 0x%x", REGS(ctx)->context_dma_notify());
+				return;
+			}
+
+			// The low 3 bits, inverted, pick the 0x40-byte slot
+			const u32 slot = (notify_dma & 0x7) ^ 0x7;
+			const u32 notify_addr = RSX(ctx)->iomap_table.get_addr(0xf100000 + (slot * 0x40));
+			ensure(notify_addr != umax);
+
+			vm::_ref<atomic_t<RsxNotify>>(notify_addr).store(
+			{
+				RSX(ctx)->timestamp(),
+				0
+			});
+		}
+
+		void texture_read_semaphore_release(context* ctx, u32 /*reg*/, u32 arg) // Writes `arg` to the 4097 semaphore label
+		{
+			// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier.
+			// Ideally the GPU only needs to have cached all textures declared up to this point before writing the label.
+
+			// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
+			// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
+			const u32 sema_offset = REGS(ctx)->semaphore_offset_4097();
+
+			if (sema_offset % 16 != 0)
+			{
+				rsx_log.error("NV4097 semaphore using unaligned offset, recovering. (offset=0x%x)", sema_offset);
+				RSX(ctx)->recover_fifo();
+				return;
+			}
+
+			const u32 label = get_address(sema_offset, REGS(ctx)->semaphore_context_dma_4097());
+
+			if ((RSX(ctx)->label_addr >> 28) != (label >> 28)) // Logged only; the write below still happens
+			{
+				rsx_log.error("NV4097 semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", sema_offset, label);
+			}
+
+			if (!g_cfg.video.strict_rendering_mode) [[ likely ]]
+			{
+				util::write_gcm_label<true, false>(ctx, label, arg);
+			}
+			else
+			{
+				util::write_gcm_label<true, true>(ctx, label, arg);
+			}
+		}
+
+		void back_end_write_semaphore_release(context* ctx, u32 /*reg*/, u32 arg) // Writes `arg`, with bytes 0 and 2 exchanged, to the 4097 semaphore label
+		{
+			// Full pipeline barrier. GPU must flush pipeline before writing the label
+			const u32 sema_offset = REGS(ctx)->semaphore_offset_4097();
+
+			if (sema_offset % 16 != 0)
+			{
+				rsx_log.error("NV4097 semaphore using unaligned offset, recovering. (offset=0x%x)", sema_offset);
+				RSX(ctx)->recover_fifo();
+				return;
+			}
+
+			const u32 label = get_address(sema_offset, REGS(ctx)->semaphore_context_dma_4097());
+
+			if ((RSX(ctx)->label_addr >> 28) != (label >> 28)) // Logged only; the write below still happens
+			{
+				rsx_log.error("NV4097 semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", sema_offset, label);
+			}
+
+			const u32 kept_bytes = arg & 0xff00ff00;                                     // Bytes 1 and 3 stay in place
+			const u32 swapped_bytes = ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);       // Bytes 0 and 2 trade places
+			util::write_gcm_label<true, true>(ctx, label, kept_bytes | swapped_bytes);
+		}
+
+		void sync(context* ctx, u32, u32) // Register and argument are ignored; forwards to the thread's sync()
+		{
+			RSX(ctx)->sync();
+		}
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv4097.h b/rpcs3/Emu/RSX/NV47/nv4097.h
new file mode 100644
index 0000000000..6007563434
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv4097.h
@@ -0,0 +1,238 @@
+// NV47 3D Engine
+#pragma once
+
+#include "common.h"
+
+namespace rsx
+{
+	enum command_barrier_type;
+	enum vertex_base_type;
+
+	namespace nv4097
+	{
+		void clear(context* ctx, u32 reg, u32 arg);
+
+		void clear_zcull(context* ctx, u32 reg, u32 arg);
+
+		void set_face_property(context* ctx, u32 reg, u32 arg);
+
+		void set_notify(context* ctx, u32 reg, u32 arg);
+
+		void texture_read_semaphore_release(context* ctx, u32 reg, u32 arg);
+
+		void back_end_write_semaphore_release(context* ctx, u32 reg, u32 arg);
+
+		void set_array_element16(context* ctx, u32, u32 arg);
+
+		void set_array_element32(context* ctx, u32, u32 arg);
+
+		void draw_arrays(context* /*rsx*/, u32 reg, u32 arg);
+
+		void draw_index_array(context* /*rsx*/, u32 reg, u32 arg);
+
+		void draw_inline_array(context* /*rsx*/, u32 reg, u32 arg);
+
+		void set_transform_program_start(context* ctx, u32 reg, u32);
+
+		void set_vertex_attribute_output_mask(context* ctx, u32 reg, u32);
+
+		void set_begin_end(context* ctxthr, u32 reg, u32 arg);
+
+		void get_report(context* ctx, u32 reg, u32 arg);
+
+		void clear_report_value(context* ctx, u32 reg, u32 arg);
+
+		void set_render_mode(context* ctx, u32, u32 arg);
+
+		void set_zcull_render_enable(context* ctx, u32, u32);
+
+		void set_zcull_stats_enable(context* ctx, u32, u32);
+
+		void set_zcull_pixel_count_enable(context* ctx, u32, u32);
+
+		void sync(context* ctx, u32, u32);
+
+		void set_shader_program_dirty(context* ctx, u32, u32);
+
+		void set_surface_dirty_bit(context* ctx, u32 reg, u32 arg);
+
+		void set_surface_format(context* ctx, u32 reg, u32 arg);
+
+		void set_surface_options_dirty_bit(context* ctx, u32 reg, u32 arg);
+
+		void set_color_mask(context* ctx, u32 reg, u32 arg);
+
+		void set_stencil_op(context* ctx, u32 reg, u32 arg);
+
+		void set_vertex_base_offset(context* ctx, u32 reg, u32 arg);
+
+		void set_index_base_offset(context* ctx, u32 reg, u32 arg);
+
+		void check_index_array_dma(context* ctx, u32 reg, u32 arg);
+
+		void set_blend_equation(context* ctx, u32 reg, u32 arg);
+
+		void set_blend_factor(context* ctx, u32 reg, u32 arg);
+
+#define RSX(ctx) ctx->rsxthr
+#define REGS(ctx) (&rsx::method_registers)
+
+		/** Converts one vertex data register write to BE layout and forwards it to util::push_vertex_data.
+		* id = base method register
+		* index = register index in method
+		* count = element count per attribute
+		* register_count = number of registers consumed per attribute. E.g 3-element methods have padding
+		*/
+		template<u32 id, u32 index, int count, int register_count, typename type>
+		void set_vertex_data_impl(context* ctx, u32 arg)
+		{
+			static constexpr usz increment_per_array_index = (register_count * sizeof(type)) / sizeof(u32);
+			// Split the flat register index into the attribute slot and the 32-bit register inside that slot
+			static constexpr usz attribute_index = index / increment_per_array_index;
+			static constexpr usz vertex_subreg = index % increment_per_array_index;
+
+			constexpr auto vtype = vertex_data_type_from_element_type<type>::type;
+			static_assert(vtype != rsx::vertex_base_type::cmp);
+			static_assert(vtype != rsx::vertex_base_type::ub256);
+
+			// Convert LE data to BE layout
+			if constexpr (sizeof(type) == 4)
+			{
+				arg = std::bit_cast<u32, be_t<u32>>(arg);
+			}
+			else if constexpr (sizeof(type) == 2)
+			{
+				// 2 16-bit values packed in 1 32-bit word
+				const auto be_data = std::bit_cast<u32, be_t<u32>>(arg);
+
+				// After u32 swap, the components are in the wrong position
+				arg = (be_data << 16) | (be_data >> 16);
+			}
+			// Forward the context and the converted word; without 'arg' the conversion above is dead and the data is lost
+			util::push_vertex_data(ctx, attribute_index, vertex_subreg, count, vtype, arg); // NOTE(review): order assumed (ctx first, value last) - confirm against the helper's declaration
+		}
+
+		template<u32 index>
+		struct set_vertex_data4ub_m // 4 x u8 per attribute, i.e. one 32-bit register per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA4UB_M, index, 4, 4, u8>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data1f_m // 1 x f32 per attribute, i.e. one 32-bit register per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA1F_M, index, 1, 1, f32>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data2f_m // 2 x f32 per attribute, i.e. two 32-bit registers per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA2F_M, index, 2, 2, f32>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data3f_m // 3 x f32 per attribute, occupying four 32-bit registers per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				// Register alignment is only 1, 2, or 4 (Ratchet & Clank 2), so register_count is 4 although count is 3
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA3F_M, index, 3, 4, f32>(ctx, arg);
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data4f_m // 4 x f32 per attribute, i.e. four 32-bit registers per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA4F_M, index, 4, 4, f32>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data2s_m // 2 x u16 per attribute, i.e. one 32-bit register per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA2S_M, index, 2, 2, u16>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data4s_m // 4 x u16 per attribute, i.e. two 32-bit registers per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA4S_M, index, 4, 4, u16>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_data_scaled4s_m // 4 x s16 per attribute, i.e. two 32-bit registers per attribute
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				set_vertex_data_impl<NV4097_SET_VERTEX_DATA_SCALED4S_M, index, 4, 4, s16>(ctx, arg); // 'reg' is not needed, 'index' already identifies the register
+			}
+		};
+
+		struct set_transform_constant
+		{
+			static void impl(context* ctx, u32 reg, u32 arg);
+		};
+
+		struct set_transform_program
+		{
+			static void impl(context* ctx, u32 reg, u32 arg);
+		};
+
+		template<u32 index>
+		struct set_vertex_array_offset // 'index' is not read by impl; the register is identified by 'reg'
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				util::push_draw_parameter_change(ctx, vertex_array_offset_modifier_barrier, reg, arg); // Hands the write to the shared helper, tagged as a vertex array offset change
+			}
+		};
+
+		template<u32 index>
+		struct set_texture_dirty_bit // Marks texture slot 'index' dirty; 'reg' and 'arg' are not read
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				RSX(ctx)->m_textures_dirty[index] = true;
+				// Fragment program state is flagged only when bit 'index' of the current program's referenced_textures_mask is set
+				if (RSX(ctx)->current_fp_metadata.referenced_textures_mask & (1 << index))
+				{
+					RSX(ctx)->m_graphics_state |= rsx::pipeline_state::fragment_program_state_dirty;
+				}
+			}
+		};
+
+		template<u32 index>
+		struct set_vertex_texture_dirty_bit // Marks vertex texture slot 'index' dirty; 'reg' and 'arg' are not read
+		{
+			static void impl(context* ctx, u32 reg, u32 arg)
+			{
+				RSX(ctx)->m_vertex_textures_dirty[index] = true;
+				// Vertex program state is flagged only when bit 'index' of the current program's referenced_textures_mask is set
+				if (RSX(ctx)->current_vp_metadata.referenced_textures_mask & (1 << index))
+				{
+					RSX(ctx)->m_graphics_state |= rsx::pipeline_state::vertex_program_state_dirty;
+				}
+			}
+		};
+
+#undef RSX
+#undef REGS
+	}
+}
diff --git a/rpcs3/Emu/RSX/NV47/nv47.h b/rpcs3/Emu/RSX/NV47/nv47.h
new file mode 100644
index 0000000000..8a27f4d14c
--- /dev/null
+++ b/rpcs3/Emu/RSX/NV47/nv47.h
@@ -0,0 +1,7 @@
+// 3D Engine definitions
+#pragma once
+
+#include "nv3089.h"
+#include "nv308a.h"
+#include "nv406e.h"
+#include "nv4097.h"
diff --git a/rpcs3/Emu/RSX/RSXDisAsm.cpp b/rpcs3/Emu/RSX/RSXDisAsm.cpp
index 81ba45c586..e1ac01c001 100644
--- a/rpcs3/Emu/RSX/RSXDisAsm.cpp
+++ b/rpcs3/Emu/RSX/RSXDisAsm.cpp
@@ -8,7 +8,7 @@
 
 namespace rsx
 {
-	void invalid_method(thread*, u32, u32);
+	void invalid_method(context*, u32, u32);
 }
 
 u32 RSXDisAsm::disasm(u32 pc)
diff --git a/rpcs3/Emu/RSX/RSXFIFO.cpp b/rpcs3/Emu/RSX/RSXFIFO.cpp
index 637db20913..825bf88e5f 100644
--- a/rpcs3/Emu/RSX/RSXFIFO.cpp
+++ b/rpcs3/Emu/RSX/RSXFIFO.cpp
@@ -7,6 +7,8 @@
 #include "Core/RSXReservationLock.hpp"
 #include "Emu/Memory/vm_reservation.h"
 #include "Emu/Cell/lv2/sys_rsx.h"
+#include "NV47/context.h"
+
 #include "util/asm.hpp"
 
 #include <bitset>
@@ -808,6 +810,9 @@ namespace rsx
 				}
 			}
 
+			// FIXME: This should be properly managed
+			rsx::context ctx{ .rsxthr = this, .register_state = &method_registers };
+
 			if (m_flattener.is_enabled()) [[unlikely]]
 			{
 				switch(m_flattener.test(command))
@@ -819,15 +824,15 @@ namespace rsx
 				case FIFO::EMIT_END:
 				{
 					// Emit end command to close existing scope
-					//ensure(in_begin_end);
-					methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, 0);
+					AUDIT(in_begin_end);
+					methods[NV4097_SET_BEGIN_END](&ctx, NV4097_SET_BEGIN_END, 0);
 					break;
 				}
 				case FIFO::EMIT_BARRIER:
 				{
-					//ensure(in_begin_end);
-					methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, 0);
-					methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, m_flattener.get_primitive());
+					AUDIT(in_begin_end);
+					methods[NV4097_SET_BEGIN_END](&ctx, NV4097_SET_BEGIN_END, 0);
+					methods[NV4097_SET_BEGIN_END](&ctx, NV4097_SET_BEGIN_END, m_flattener.get_primitive());
 					break;
 				}
 				default:
@@ -846,19 +851,19 @@ namespace rsx
 			const u32 reg = (command.reg & 0xffff) >> 2;
 			const u32 value = command.value;
 
-			method_registers.decode(reg, value);
+			ctx.register_state->decode(reg, value);
 
 			if (auto method = methods[reg])
 			{
-				method(this, reg, value);
+				method(&ctx, reg, value);
 
 				if (state & cpu_flag::again)
 				{
-					method_registers.decode(reg, method_registers.register_previous_value);
+					ctx.register_state->decode(reg, ctx.register_state->latch);
 					break;
 				}
 			}
-			else if (method_registers.register_previous_value != value)
+			else if (ctx.register_state->latch != value)
 			{
 				// Something changed, set signal flags if any specified
 				m_graphics_state |= state_signals[reg];
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 498433d776..0d195a18f2 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -3383,7 +3383,7 @@ namespace rsx
 		return fifo_ctrl->last_cmd();
 	}
 
-	void invalid_method(thread*, u32, u32);
+	void invalid_method(context*, u32, u32);
 
 	void thread::dump_regs(std::string& result, std::any& /*custom_data*/) const
 	{
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index 9ddce9c959..706d07219a 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -10,6 +10,9 @@
 #include "Emu/Cell/lv2/sys_rsx.h"
 #include "Emu/RSX/Common/BufferUtils.h"
 
+#include "Emu/RSX/NV47/nv47.h"
+#include "Emu/RSX/NV47/context_accessors.define.h"
+
 namespace rsx
 {
 	rsx_state method_registers;
@@ -17,1807 +20,36 @@ namespace rsx
 	std::array<rsx_method_t, 0x10000 / 4> methods{};
 	std::array<u32, 0x10000 / 4> state_signals{};
 
-	void invalid_method(thread* rsx, u32 reg, u32 arg)
+	void invalid_method(context* ctx, u32 reg, u32 arg)
 	{
 		//Don't throw, gather information and ignore broken/garbage commands
 		//TODO: Investigate why these commands are executed at all. (Heap corruption? Alignment padding?)
-		const u32 cmd = rsx->get_fifo_cmd();
+		const u32 cmd = RSX(ctx)->get_fifo_cmd();
 		rsx_log.error("Invalid RSX method 0x%x (arg=0x%x, start=0x%x, count=0x%x, non-inc=%s)", reg << 2, arg,
 		cmd & 0xfffc, (cmd >> 18) & 0x7ff, !!(cmd & RSX_METHOD_NON_INCREMENT_CMD));
 
 		if (g_cfg.core.rsx_fifo_accuracy != rsx_fifo_mode::as_ps3)
 		{
-			rsx->recover_fifo();
+			RSX(ctx)->recover_fifo();
 		}
 	}
 
-	static void trace_method(thread* /*rsx*/, u32 reg, u32 arg)
+	static void trace_method(context* /*ctx*/, u32 reg, u32 arg)
 	{
 		// For unknown yet valid methods
 		rsx_log.trace("RSX method 0x%x (arg=0x%x)", reg << 2, arg);
 	}
 
-	template <bool FlushDMA, bool FlushPipe>
-	void write_gcm_label(thread* rsx, u32 address, u32 data)
-	{
-		const bool is_flip_sema = (address == (rsx->label_addr + 0x10) || address == (rsx->device_addr + 0x30));
-		if (!is_flip_sema)
-		{
-			// First, queue the GPU work. If it flushes the queue for us, the following routines will be faster.
-			const bool handled = rsx->get_backend_config().supports_host_gpu_labels && rsx->release_GCM_label(address, data);
-
-			if (vm::_ref<RsxSemaphore>(address).val == data)
-			{
-				// It's a no-op to write the same value (although there is a delay in real-hw so it's more accurate to allow GPU label in this case)
-				return;
-			}
-
-			if constexpr (FlushDMA)
-			{
-				// If the backend handled the request, this call will basically be a NOP
-				g_fxo->get<rsx::dma_manager>().sync();
-			}
-
-			if constexpr (FlushPipe)
-			{
-				// Manually flush the pipeline.
-				// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
-				rsx->sync();
-			}
-
-			if (handled)
-			{
-				// Backend will handle it, nothing to write.
-				return;
-			}
-		}
-
-		vm::_ref<RsxSemaphore>(address).val = data;
-	}
-
-	template<typename Type> struct vertex_data_type_from_element_type;
-	template<> struct vertex_data_type_from_element_type<float> { static const vertex_base_type type = vertex_base_type::f; };
-	template<> struct vertex_data_type_from_element_type<f16> { static const vertex_base_type type = vertex_base_type::sf; };
-	template<> struct vertex_data_type_from_element_type<u8> { static const vertex_base_type type = vertex_base_type::ub; };
-	template<> struct vertex_data_type_from_element_type<u16> { static const vertex_base_type type = vertex_base_type::s32k; };
-	template<> struct vertex_data_type_from_element_type<s16> { static const vertex_base_type type = vertex_base_type::s1; };
-
-	namespace nv406e
-	{
-		void set_reference(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			rsx->sync();
-
-			// Write ref+get (get will be written again with the same value at command end)
-			auto& dma = vm::_ref<RsxDmaControl>(rsx->dma_address);
-			dma.get.release(rsx->fifo_ctrl->get_pos());
-			dma.ref.store(arg);
-		}
-
-		void semaphore_acquire(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			rsx->sync_point_request.release(true);
-			const u32 addr = get_address(method_registers.semaphore_offset_406e(), method_registers.semaphore_context_dma_406e());
-
-			const auto& sema = vm::_ref<RsxSemaphore>(addr).val;
-
-			if (sema == arg)
-			{
-				// Flip semaphore doesnt need wake-up delay
-				if (addr != rsx->label_addr + 0x10)
-				{
-					rsx->flush_fifo();
-					rsx->fifo_wake_delay(2);
-				}
-
-				return;
-			}
-			else
-			{
-				rsx->flush_fifo();
-			}
-
-			u64 start = rsx::uclock();
-			u64 last_check_val = start;
-
-			while (sema != arg)
-			{
-				if (rsx->test_stopped())
-				{
-					rsx->state += cpu_flag::again;
-					return;
-				}
-
-				if (const auto tdr = static_cast<u64>(g_cfg.video.driver_recovery_timeout))
-				{
-					const u64 current = rsx::uclock();
-
-					if (current - last_check_val > 20'000)
-					{
-						// Suspicious amnount of time has passed
-						// External pause such as debuggers' pause or operating system sleep may have taken place
-						// Ignore it
-						start += current - last_check_val;
-					}
-
-					last_check_val = current;
-
-					if ((current - start) > tdr)
-					{
-						// If longer than driver timeout force exit
-						rsx_log.error("nv406e::semaphore_acquire has timed out. semaphore_address=0x%X", addr);
-						break;
-					}
-				}
-
-				rsx->cpu_wait({});
-			}
-
-			rsx->fifo_wake_delay();
-			rsx->performance_counters.idle_time += (rsx::uclock() - start);
-		}
-
-		void semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			const u32 offset = method_registers.semaphore_offset_406e();
-
-			if (offset % 4)
-			{
-				rsx_log.warning("NV406E semaphore release is using unaligned semaphore, ignoring. (offset=0x%x)", offset);
-				return;
-			}
-
-			const u32 ctxt = method_registers.semaphore_context_dma_406e();
-
-			// By avoiding doing this on flip's semaphore release
-			// We allow last gcm's registers reset to occur in case of a crash
-			if (const bool is_flip_sema = (offset == 0x10 && ctxt == CELL_GCM_CONTEXT_DMA_SEMAPHORE_R);
-				!is_flip_sema)
-			{
-				rsx->sync_point_request.release(true);
-			}
-
-			const u32 addr = get_address(offset, ctxt);
-
-			// TODO: Check if possible to write on reservations
-			if (rsx->label_addr >> 28 != addr >> 28)
-			{
-				rsx_log.error("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
-				rsx->recover_fifo();
-				return;
-			}
-
-			if (addr == rsx->device_addr + 0x30 && !arg)
-			{
-				// HW flip synchronization related, 1 is not written without display queue command (TODO: make it behave as real hw)
-				arg = 1;
-			}
-
-			write_gcm_label<false, true>(rsx, addr, arg);
-		}
-	}
-
-	namespace nv4097
-	{
-		void clear(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			rsx->clear_surface(arg);
-
-			if (rsx->capture_current_frame)
-			{
-				rsx->capture_frame("clear");
-			}
-		}
-
-		void clear_zcull(thread* rsx, u32 /*reg*/, u32 /*arg*/)
-		{
-			if (rsx->capture_current_frame)
-			{
-				rsx->capture_frame("clear zcull memory");
-			}
-		}
-
-		void set_face_property(thread* rsx, u32 reg, u32 arg)
-		{
-			if (reg == method_registers.register_previous_value)
-			{
-				return;
-			}
-
-			bool valid;
-			switch (reg)
-			{
-			case NV4097_SET_CULL_FACE:
-				valid = !!to_cull_face(arg); break;
-			case NV4097_SET_FRONT_FACE:
-				valid = !!to_front_face(arg); break;
-			default:
-				valid = false; break;
-			}
-
-			if (valid) [[ likely ]]
-			{
-				rsx->m_graphics_state |= rsx::pipeline_config_dirty;
-			}
-			else
-			{
-				method_registers.registers[reg] = method_registers.register_previous_value;
-			}
-		}
-
-		void set_notify(thread* rsx, u32 /*reg*/, u32 /*arg*/)
-		{
-			const u32 location = method_registers.context_dma_notify();
-			const u32 index = (location & 0x7) ^ 0x7;
-
-			if ((location & ~7) != (CELL_GCM_CONTEXT_DMA_NOTIFY_MAIN_0 & ~7))
-			{
-				if (rsx_log.trace)
-					rsx_log.trace("NV4097_NOTIFY: invalid context = 0x%x", method_registers.context_dma_notify());
-				return;
-			}
-
-			const u32 addr = rsx->iomap_table.get_addr(0xf100000 + (index * 0x40));
-
-			ensure(addr != umax);
-
-			vm::_ref<atomic_t<RsxNotify>>(addr).store(
-			{
-				rsx->timestamp(),
-				0
-			});
-		}
-
-		void texture_read_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier.
-			// Ideally the GPU only needs to have cached all textures declared up to this point before writing the label.
-
-			// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
-			// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
-			const u32 offset = method_registers.semaphore_offset_4097();
-
-			if (offset % 16)
-			{
-				rsx_log.error("NV4097 semaphore using unaligned offset, recovering. (offset=0x%x)", offset);
-				rsx->recover_fifo();
-				return;
-			}
-
-			const u32 addr = get_address(offset, method_registers.semaphore_context_dma_4097());
-
-			if (rsx->label_addr >> 28 != addr >> 28)
-			{
-				rsx_log.error("NV4097 semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
-			}
-
-			if (g_cfg.video.strict_rendering_mode) [[ unlikely ]]
-			{
-				write_gcm_label<true, true>(rsx, addr, arg);
-			}
-			else
-			{
-				write_gcm_label<true, false>(rsx, addr, arg);
-			}
-		}
-
-		void back_end_write_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			// Full pipeline barrier. GPU must flush pipeline before writing the label
-
-			const u32 offset = method_registers.semaphore_offset_4097();
-
-			if (offset % 16)
-			{
-				rsx_log.error("NV4097 semaphore using unaligned offset, recovering. (offset=0x%x)", offset);
-				rsx->recover_fifo();
-				return;
-			}
-
-			const u32 addr = get_address(offset, method_registers.semaphore_context_dma_4097());
-
-			if (rsx->label_addr >> 28 != addr >> 28)
-			{
-				rsx_log.error("NV4097 semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
-			}
-
-			const u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
-			write_gcm_label<true, true>(rsx, addr, val);
-		}
-
-		/**
-		 * id = base method register
-		 * index = register index in method
-		 * count = element count per attribute
-		 * register_count = number of registers consumed per attribute. E.g 3-element methods have padding
-		 */
-		template<u32 id, u32 index, int count, int register_count, typename type>
-		void set_vertex_data_impl(thread* rsx, u32 arg)
-		{
-			static constexpr usz increment_per_array_index = (register_count * sizeof(type)) / sizeof(u32);
-
-			static constexpr usz attribute_index = index / increment_per_array_index;
-			static constexpr usz vertex_subreg = index % increment_per_array_index;
-
-			constexpr auto vtype = vertex_data_type_from_element_type<type>::type;
-			static_assert(vtype != rsx::vertex_base_type::cmp);
-			static_assert(vtype != rsx::vertex_base_type::ub256);
-
-			// Convert LE data to BE layout
-			if constexpr (sizeof(type) == 4)
-			{
-				arg = std::bit_cast<u32, be_t<u32>>(arg);
-			}
-			else if constexpr (sizeof(type) == 2)
-			{
-				// 2 16-bit values packed in 1 32-bit word
-				const auto be_data = std::bit_cast<u32, be_t<u32>>(arg);
-
-				// After u32 swap, the components are in the wrong position
-				arg = (be_data << 16) | (be_data >> 16);
-			}
-
-			if (rsx->in_begin_end)
-			{
-				// Update to immediate mode register/array
-				// NOTE: Push buffers still behave like register writes.
-				// You do not need to specify each attribute for each vertex, the register is referenced instead.
-				// This is classic OpenGL 1.x behavior as I remember.
-				rsx->append_to_push_buffer(attribute_index, count, vertex_subreg, vtype, arg);
-			}
-
-			auto& info = rsx::method_registers.register_vertex_info[attribute_index];
-
-			info.type = vtype;
-			info.size = count;
-			info.frequency = 0;
-			info.stride = 0;
-			rsx::method_registers.register_vertex_info[attribute_index].data[vertex_subreg] = arg;
-		}
-
-		template<u32 index>
-		struct set_vertex_data4ub_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA4UB_M, index, 4, 4, u8>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data1f_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA1F_M, index, 1, 1, f32>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data2f_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA2F_M, index, 2, 2, f32>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data3f_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				//Register alignment is only 1, 2, or 4 (Rachet & Clank 2)
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA3F_M, index, 3, 4, f32>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data4f_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA4F_M, index, 4, 4, f32>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data2s_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA2S_M, index, 2, 2, u16>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data4s_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA4S_M, index, 4, 4, u16>(rsx, arg);
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_data_scaled4s_m
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
-			{
-				set_vertex_data_impl<NV4097_SET_VERTEX_DATA_SCALED4S_M, index, 4, 4, s16>(rsx, arg);
-			}
-		};
-
-		void set_array_element16(thread* rsx, u32, u32 arg)
-		{
-			if (rsx->in_begin_end)
-			{
-				rsx->append_array_element(arg & 0xFFFF);
-				rsx->append_array_element(arg >> 16);
-			}
-		}
-
-		void set_array_element32(thread* rsx, u32, u32 arg)
-		{
-			if (rsx->in_begin_end)
-				rsx->append_array_element(arg);
-		}
-
-		void draw_arrays(thread* /*rsx*/, u32 /*reg*/, u32 arg)
-		{
-			rsx::method_registers.current_draw_clause.command = rsx::draw_command::array;
-			rsx::registers_decoder<NV4097_DRAW_ARRAYS>::decoded_type v(arg);
-
-			rsx::method_registers.current_draw_clause.append(v.start(), v.count());
-		}
-
-		void draw_index_array(thread* /*rsx*/, u32 /*reg*/, u32 arg)
-		{
-			rsx::method_registers.current_draw_clause.command = rsx::draw_command::indexed;
-			rsx::registers_decoder<NV4097_DRAW_INDEX_ARRAY>::decoded_type v(arg);
-
-			rsx::method_registers.current_draw_clause.append(v.start(), v.count());
-		}
-
-		void draw_inline_array(thread* /*rsx*/, u32 /*reg*/, u32 arg)
-		{
-			arg = std::bit_cast<u32, be_t<u32>>(arg);
-			rsx::method_registers.current_draw_clause.command = rsx::draw_command::inlined_array;
-			rsx::method_registers.current_draw_clause.inline_vertex_array.push_back(arg);
-		}
-
-		struct set_transform_constant
-		{
-			static void impl(thread* rsx, u32 _reg, u32 /*arg*/)
-			{
-				const u32 index = _reg - NV4097_SET_TRANSFORM_CONSTANT;
-				const u32 reg = index / 4;
-				const u8 subreg = index % 4;
-
-				// FIFO args count including this one
-				const u32 fifo_args_cnt = rsx->fifo_ctrl->get_remaining_args_count() + 1;
-
-				// The range of methods this function resposible to
-				const u32 method_range = 32 - index;
-
-				// Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min)
-				const u32 fifo_read_limit = static_cast<u32>(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos())) / 4);
-
-				const u32 count = std::min<u32>({fifo_args_cnt, fifo_read_limit, method_range});
-
-				const u32 load = rsx::method_registers.transform_constant_load();
-
-				u32 rcount = count;
-				if (const u32 max = (load + reg) * 4 + count + subreg, limit = 468 * 4; max > limit)
-				{
-					// Ignore addresses outside the usable [0, 467] range
-					rsx_log.warning("Invalid transform register index (load=%u, index=%u, count=%u)", load, index, count);
-
-					if ((max - count) < limit)
-						rcount -= max - limit;
-					else
-						rcount = 0;
-				}
-
-				const auto values = &rsx::method_registers.transform_constants[load + reg][subreg];
-
-				const auto fifo_span = rsx->fifo_ctrl->get_current_arg_ptr();
-
-				if (fifo_span.size() < rcount)
-				{
-					rcount = ::size32(fifo_span);
-				}
-
-				if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty)
-				{
-					// Minor optimization: don't compare values if we already know we need invalidation
-					copy_data_swap_u32(values, fifo_span.data(), rcount);
-				}
-				else
-				{
-					if (copy_data_swap_u32_cmp(values, fifo_span.data(), rcount))
-					{
-						// Transform constants invalidation is expensive (~8k bytes per update)
-						rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty;
-					}
-				}
-
-				rsx->fifo_ctrl->skip_methods(rcount - 1);
-			}
-		};
-
-		struct set_transform_program
-		{
-			static void impl(thread* rsx, u32 reg, u32 /*arg*/)
-			{
-				const u32 index = reg - NV4097_SET_TRANSFORM_PROGRAM;
-
-				// FIFO args count including this one
-				const u32 fifo_args_cnt = rsx->fifo_ctrl->get_remaining_args_count() + 1;
-
-				// The range of methods this function resposible to
-				const u32 method_range = 32 - index;
-
-				// Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min)
-				const u32 fifo_read_limit = static_cast<u32>(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos())) / 4);
-
-				const u32 count = std::min<u32>({fifo_args_cnt, fifo_read_limit, method_range});
-
-				const u32 load_pos = rsx::method_registers.transform_program_load();
-
-				u32 rcount = count;
-
-				if (const u32 max = load_pos * 4 + rcount + (index % 4);
-					max > max_vertex_program_instructions * 4)
-				{
-					rsx_log.warning("Program buffer overflow! Attempted to write %u VP instructions.", max / 4);
-					rcount -= max - (max_vertex_program_instructions * 4);
-				}
-
-				const auto fifo_span = rsx->fifo_ctrl->get_current_arg_ptr();
-
-				if (fifo_span.size() < rcount)
-				{
-					rcount = ::size32(fifo_span);
-				}
-
-				copy_data_swap_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4], fifo_span.data(), rcount);
-
-				rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty;
-				rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4));
-				rsx->fifo_ctrl->skip_methods(rcount - 1);
-			}
-		};
-
-		void set_transform_program_start(thread* rsx, u32 reg, u32)
-		{
-			if (method_registers.registers[reg] != method_registers.register_previous_value)
-			{
-				rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_ucode_dirty;
-			}
-		}
-
-		void set_vertex_attribute_output_mask(thread* rsx, u32 reg, u32)
-		{
-			if (method_registers.registers[reg] != method_registers.register_previous_value)
-			{
-				rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_state_dirty;
-			}
-		}
-
-		void set_begin_end(thread* rsxthr, u32 /*reg*/, u32 arg)
-		{
-			// Ignore upper bits
-			if (const u8 prim = static_cast<u8>(arg))
-			{
-				const auto primitive_type = to_primitive_type(prim);
-				if (!primitive_type)
-				{
-					rsxthr->in_begin_end = true;
-
-					rsx_log.warning("Invalid NV4097_SET_BEGIN_END value: 0x%x", arg);
-					return;
-				}
-
-				rsx::method_registers.current_draw_clause.reset(primitive_type);
-				rsxthr->begin();
-				return;
-			}
-
-			// Check if we have immediate mode vertex data in a driver-local buffer
-			if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::none)
-			{
-				const u32 push_buffer_vertices_count = rsxthr->get_push_buffer_vertex_count();
-				const u32 push_buffer_index_count = rsxthr->get_push_buffer_index_count();
-
-				// Need to set this flag since it overrides some register contents
-				rsx::method_registers.current_draw_clause.is_immediate_draw = true;
-
-				if (push_buffer_index_count)
-				{
-					rsx::method_registers.current_draw_clause.command = rsx::draw_command::indexed;
-					rsx::method_registers.current_draw_clause.append(0, push_buffer_index_count);
-				}
-				else if (push_buffer_vertices_count)
-				{
-					rsx::method_registers.current_draw_clause.command = rsx::draw_command::array;
-					rsx::method_registers.current_draw_clause.append(0, push_buffer_vertices_count);
-				}
-			}
-			else
-			{
-				rsx::method_registers.current_draw_clause.is_immediate_draw = false;
-			}
-
-			if (!rsx::method_registers.current_draw_clause.empty())
-			{
-				rsx::method_registers.current_draw_clause.compile();
-
-				if (g_cfg.video.disable_video_output)
-				{
-					rsxthr->execute_nop_draw();
-					rsxthr->rsx::thread::end();
-					return;
-				}
-
-				rsxthr->end();
-			}
-			else
-			{
-				rsxthr->in_begin_end = false;
-			}
-
-			if (rsxthr->pause_on_draw && rsxthr->pause_on_draw.exchange(false))
-			{
-				rsxthr->state -= cpu_flag::dbg_step;
-				rsxthr->state += cpu_flag::dbg_pause;
-				rsxthr->check_state();
-			}
-		}
-
-		vm::addr_t get_report_data_impl(u32 offset)
-		{
-			u32 location = 0;
-			blit_engine::context_dma report_dma = method_registers.context_dma_report();
-
-			switch (report_dma)
-			{
-			case blit_engine::context_dma::to_memory_get_report: location = CELL_GCM_CONTEXT_DMA_REPORT_LOCATION_LOCAL; break;
-			case blit_engine::context_dma::report_location_main: location = CELL_GCM_CONTEXT_DMA_REPORT_LOCATION_MAIN; break;
-			case blit_engine::context_dma::memory_host_buffer: location = CELL_GCM_CONTEXT_DMA_MEMORY_HOST_BUFFER; break;
-			default:
-				return vm::addr_t(0);
-			}
-
-			return vm::cast(get_address(offset, location));
-		}
-
-		void get_report(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			u8 type = arg >> 24;
-			u32 offset = arg & 0xffffff;
-
-			auto address_ptr = get_report_data_impl(offset);
-			if (!address_ptr)
-			{
-				rsx_log.error("Bad argument passed to NV4097_GET_REPORT, arg=0x%X", arg);
-				return;
-			}
-
-			switch (type)
-			{
-			case CELL_GCM_ZPASS_PIXEL_CNT:
-			case CELL_GCM_ZCULL_STATS:
-			case CELL_GCM_ZCULL_STATS1:
-			case CELL_GCM_ZCULL_STATS2:
-			case CELL_GCM_ZCULL_STATS3:
-				rsx->get_zcull_stats(type, address_ptr);
-				break;
-			default:
-				rsx_log.error("NV4097_GET_REPORT: Bad type %d", type);
-
-				vm::_ref<atomic_t<CellGcmReportData>>(address_ptr).atomic_op([&](CellGcmReportData& data)
-				{
-					data.timer = rsx->timestamp();
-					data.padding = 0;
-				});
-				break;
-			}
-		}
-
-		void clear_report_value(thread* rsx, u32 /*reg*/, u32 arg)
-		{
-			switch (arg)
-			{
-			case CELL_GCM_ZPASS_PIXEL_CNT:
-			case CELL_GCM_ZCULL_STATS:
-				break;
-			default:
-				rsx_log.error("NV4097_CLEAR_REPORT_VALUE: Bad type: %d", arg);
-				break;
-			}
-
-			rsx->clear_zcull_stats(arg);
-		}
-
-		void set_render_mode(thread* rsx, u32, u32 arg)
-		{
-			const u32 mode = arg >> 24;
-			switch (mode)
-			{
-			case 1:
-				rsx->disable_conditional_rendering();
-				return;
-			case 2:
-				break;
-			default:
-				rsx_log.error("Unknown render mode %d", mode);
-				return;
-			}
-
-			const u32 offset = arg & 0xffffff;
-			auto address_ptr = get_report_data_impl(offset);
-
-			if (!address_ptr)
-			{
-				rsx_log.error("Bad argument passed to NV4097_SET_RENDER_ENABLE, arg=0x%X", arg);
-				return;
-			}
-
-			// Defer conditional render evaluation
-			rsx->enable_conditional_rendering(address_ptr);
-		}
-
-		void set_zcull_render_enable(thread* rsx, u32, u32)
-		{
-			rsx->notify_zcull_info_changed();
-		}
-
-		void set_zcull_stats_enable(thread* rsx, u32, u32)
-		{
-			rsx->notify_zcull_info_changed();
-		}
-
-		void set_zcull_pixel_count_enable(thread* rsx, u32, u32)
-		{
-			rsx->notify_zcull_info_changed();
-		}
-
-		void sync(thread* rsx, u32, u32)
-		{
-			rsx->sync();
-		}
-
-		void set_shader_program_dirty(thread* rsx, u32, u32)
-		{
-			rsx->m_graphics_state |= rsx::pipeline_state::fragment_program_ucode_dirty;
-		}
-
-		void set_surface_dirty_bit(thread* rsx, u32 reg, u32 arg)
-		{
-			if (arg == method_registers.register_previous_value)
-			{
-				return;
-			}
-
-			switch (reg)
-			{
-			case NV4097_SET_SURFACE_COLOR_TARGET:
-				rsx->m_graphics_state |= rsx::pipeline_state::pipeline_config_dirty;
-				break;
-			case NV4097_SET_SURFACE_CLIP_VERTICAL:
-			case NV4097_SET_SURFACE_CLIP_HORIZONTAL:
-				rsx->m_graphics_state |= rsx::pipeline_state::vertex_state_dirty;
-				break;
-			default:
-				break;
-			}
-
-			rsx->m_graphics_state.set(rtt_config_dirty);
-			rsx->m_graphics_state.clear(rtt_config_contested);
-		}
-
-		void set_surface_format(thread* rsx, u32 reg, u32 arg)
-		{
-			// The high bits of this register are just log2(dimension), ignore them
-			if ((arg & 0xFFFF) == (method_registers.register_previous_value & 0xFFFF))
-			{
-				return;
-			}
-
-			// The important parameters have changed (format, type, antialias)
-			rsx->m_graphics_state |= rsx::pipeline_state::pipeline_config_dirty;
-
-			// Check if we need to also update fragment state
-			const auto current = method_registers.decode<NV4097_SET_SURFACE_FORMAT>(arg);
-			const auto previous = method_registers.decode<NV4097_SET_SURFACE_FORMAT>(method_registers.register_previous_value);
-
-			if (*current.antialias() != *previous.antialias() ||                         // Antialias control has changed, update ROP parameters
-				current.is_integer_color_format() != previous.is_integer_color_format()) // The type of color format also requires ROP control update
-			{
-				rsx->m_graphics_state |= rsx::pipeline_state::fragment_state_dirty;
-			}
-
-			set_surface_dirty_bit(rsx, reg, arg);
-		}
-
-		void set_surface_options_dirty_bit(thread* rsx, u32 reg, u32 arg)
-		{
-			if (arg != method_registers.register_previous_value)
-			{
-				rsx->on_framebuffer_options_changed(reg);
-				rsx->m_graphics_state |= rsx::pipeline_config_dirty;
-			}
-		}
-
-		void set_color_mask(thread* rsx, u32 reg, u32 arg)
-		{
-			if (arg == method_registers.register_previous_value)
-			{
-				return;
-			}
-
-			if (method_registers.decode<NV4097_SET_COLOR_MASK>(arg).is_invalid()) [[ unlikely ]]
-			{
-				method_registers.decode(reg, method_registers.register_previous_value);
-			}
-			else
-			{
-				set_surface_options_dirty_bit(rsx, reg, arg);
-			}
-		}
-
-		void set_stencil_op(thread* rsx, u32 reg, u32 arg)
-		{
-			if (arg == method_registers.register_previous_value)
-			{
-				return;
-			}
-
-			const auto typed = to_stencil_op(arg);
-			if (typed) [[ likely ]]
-			{
-				set_surface_options_dirty_bit(rsx, reg, arg);
-			}
-			else
-			{
-				method_registers.decode(reg, method_registers.register_previous_value);
-			}
-		}
-
-		void set_vertex_base_offset(thread* rsx, u32 reg, u32 arg)
-		{
-			if (rsx->in_begin_end &&
-				!rsx::method_registers.current_draw_clause.empty() &&
-				reg != method_registers.register_previous_value)
-			{
-				// Revert change to queue later
-				method_registers.decode(reg, method_registers.register_previous_value);
-
-				// Insert base mofifier barrier
-				method_registers.current_draw_clause.insert_command_barrier(vertex_base_modifier_barrier, arg);
-			}
-		}
-
-		void set_index_base_offset(thread* rsx, u32 reg, u32 arg)
-		{
-			if (rsx->in_begin_end &&
-				!rsx::method_registers.current_draw_clause.empty() &&
-				reg != method_registers.register_previous_value)
-			{
-				// Revert change to queue later
-				method_registers.decode(reg, method_registers.register_previous_value);
-
-				// Insert base mofifier barrier
-				method_registers.current_draw_clause.insert_command_barrier(index_base_modifier_barrier, arg);
-			}
-		}
-
-		template<u32 index>
-		struct set_vertex_array_offset
-		{
-			static void impl(thread* rsx, u32 reg, u32 arg)
-			{
-				if (rsx->in_begin_end &&
-					!rsx::method_registers.current_draw_clause.empty() &&
-					reg != method_registers.register_previous_value)
-				{
-					// Revert change to queue later
-					method_registers.decode(reg, method_registers.register_previous_value);
-
-					// Insert offset mofifier barrier
-					method_registers.current_draw_clause.insert_command_barrier(vertex_array_offset_modifier_barrier, arg, index);
-				}
-			}
-		};
-
-		void check_index_array_dma(thread* rsx, u32 reg, u32 arg)
-		{
-			// Check if either location or index type are invalid
-			if (arg & ~(CELL_GCM_LOCATION_MAIN | (CELL_GCM_DRAW_INDEX_ARRAY_TYPE_16 << 4)))
-			{
-				// Ignore invalid value, recover
-				method_registers.registers[reg] = method_registers.register_previous_value;
-				rsx->recover_fifo();
-
-				rsx_log.error("Invalid NV4097_SET_INDEX_ARRAY_DMA value: 0x%x", arg);
-			}
-		}
-
-		void set_blend_equation(thread* rsx, u32 reg, u32 arg)
-		{
-			if (reg == method_registers.register_previous_value)
-			{
-				return;
-			}
-
-			if (to_blend_equation(arg & 0xFFFF) &&
-				to_blend_equation((arg >> 16) & 0xFFFF)) [[ likely ]]
-			{
-				rsx->m_graphics_state |= rsx::pipeline_config_dirty;
-			}
-			else
-			{
-				method_registers.decode(reg, method_registers.register_previous_value);
-			}
-		}
-
-		void set_blend_factor(thread* rsx, u32 reg, u32 arg)
-		{
-			if (reg == method_registers.register_previous_value)
-			{
-				return;
-			}
-
-			if (to_blend_factor(arg & 0xFFFF) &&
-				to_blend_factor((arg >> 16) & 0xFFFF)) [[ likely ]]
-			{
-				rsx->m_graphics_state |= rsx::pipeline_config_dirty;
-			}
-			else
-			{
-				method_registers.decode(reg, method_registers.register_previous_value);
-			}
-		}
-
-		template<u32 index>
-		struct set_texture_dirty_bit
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 /*arg*/)
-			{
-				rsx->m_textures_dirty[index] = true;
-
-				if (rsx->current_fp_metadata.referenced_textures_mask & (1 << index))
-				{
-					rsx->m_graphics_state |= rsx::pipeline_state::fragment_program_state_dirty;
-				}
-			}
-		};
-
-		template<u32 index>
-		struct set_vertex_texture_dirty_bit
-		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 /*arg*/)
-			{
-				rsx->m_vertex_textures_dirty[index] = true;
-
-				if (rsx->current_vp_metadata.referenced_textures_mask & (1 << index))
-				{
-					rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_state_dirty;
-				}
-			}
-		};
-	}
-
-	namespace nv308a
-	{
-		struct color
-		{
-			static void impl(thread* rsx, u32 reg, u32)
-			{
-				const u32 out_x_max = method_registers.nv308a_size_out_x();
-				const u32 index = reg - NV308A_COLOR;
-
-				if (index >= out_x_max)
-				{
-					// Skip
-					return;
-				}
-
-				// Get position of the current command arg
-				[[maybe_unused]] const u32 src_offset = rsx->fifo_ctrl->get_pos();
-
-				// FIFO args count including this one
-				const u32 fifo_args_cnt = rsx->fifo_ctrl->get_remaining_args_count() + 1;
-
-				// The range of methods this function resposible to
-				const u32 method_range = std::min<u32>(0x700 - index, out_x_max - index);
-
-				// Get limit imposed by FIFO PUT (if put is behind get it will result in a number ignored by min)
-				const u32 fifo_read_limit = static_cast<u32>(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos())) / 4);
-
-				u32 count = std::min<u32>({fifo_args_cnt, fifo_read_limit, method_range});
-
-				const u32 dst_dma = method_registers.blit_engine_output_location_nv3062();
-				const u32 dst_offset = method_registers.blit_engine_output_offset_nv3062();
-				const u32 out_pitch = method_registers.blit_engine_output_pitch_nv3062();
-
-				const u32 x = method_registers.nv308a_x() + index;
-				const u32 y = method_registers.nv308a_y();
-
-				const auto fifo_span = rsx->fifo_ctrl->get_current_arg_ptr();
-
-				if (fifo_span.size() < count)
-				{
-					count = ::size32(fifo_span);
-				}
-
-				// Skip "handled methods"
-				rsx->fifo_ctrl->skip_methods(count - 1);
-
-				// 308A::COLOR can be used to create custom sync primitives.
-				// Hide this behind strict mode due to the potential performance implications.
-				if (count == 1 && g_cfg.video.strict_rendering_mode && !g_cfg.video.relaxed_zcull_sync)
-				{
-					rsx->sync();
-				}
-
-				switch (*method_registers.blit_engine_nv3062_color_format())
-				{
-				case blit_engine::transfer_destination_format::a8r8g8b8:
-				case blit_engine::transfer_destination_format::y32:
-				{
-					// Bit cast - optimize to mem copy
-
-					const u32 data_length = count * 4;
-
-					const auto dst_address = get_address(dst_offset + (x * 4) + (out_pitch * y), dst_dma, data_length);
-
-					if (!dst_address)
-					{
-						rsx->recover_fifo();
-						return;
-					}
-
-					const auto dst = vm::_ptr<u8>(dst_address);
-					const auto src = reinterpret_cast<const u8*>(fifo_span.data());
-
-					rsx::reservation_lock<true> rsx_lock(dst_address, data_length);
-
-					if (rsx->fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) [[unlikely]]
-					{
-						// Move last 32 bits
-						reinterpret_cast<u32*>(dst)[0] = reinterpret_cast<const u32*>(src)[count - 1];
-						rsx->invalidate_fragment_program(dst_dma, dst_offset, 4);
-					}
-					else
-					{
-						if (dst_dma & CELL_GCM_LOCATION_MAIN)
-						{
-							// May overlap
-							std::memmove(dst, src, data_length);
-						}
-						else
-						{
-							// Never overlaps
-							std::memcpy(dst, src, data_length);
-						}
-
-						rsx->invalidate_fragment_program(dst_dma, dst_offset, count * 4);
-					}
-
-					break;
-				}
-				case blit_engine::transfer_destination_format::r5g6b5:
-				{
-					const auto data_length = count * 2;
-
-					const auto dst_address = get_address(dst_offset + (x * 2) + (y * out_pitch), dst_dma, data_length);
-					const auto dst = vm::_ptr<u16>(dst_address);
-					const auto src = utils::bless<const be_t<u32>>(fifo_span.data());
-
-					if (!dst_address)
-					{
-						rsx->recover_fifo();
-						return;
-					}
-
-					rsx::reservation_lock<true> rsx_lock(dst_address, data_length);
-
-					auto convert = [](u32 input) -> u16
-					{
-						// Input is considered to be ARGB8
-						u32 r = (input >> 16) & 0xFF;
-						u32 g = (input >> 8) & 0xFF;
-						u32 b = input & 0xFF;
-
-						r = (r * 32) / 255;
-						g = (g * 64) / 255;
-						b = (b * 32) / 255;
-						return static_cast<u16>((r << 11) | (g << 5) | b);
-					};
-
-					if (rsx->fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) [[unlikely]]
-					{
-						// Move last 16 bits
-						dst[0] = convert(src[count - 1]);
-						rsx->invalidate_fragment_program(dst_dma, dst_offset, 2);
-						break;
-					}
-
-					for (u32 i = 0; i < count; i++)
-					{
-						dst[i] = convert(src[i]);
-					}
-
-					rsx->invalidate_fragment_program(dst_dma, dst_offset, count * 2);
-					break;
-				}
-				default:
-				{
-					fmt::throw_exception("Unreachable");
-				}
-				}
-			}
-		};
-	}
-
-	namespace nv3089
-	{
-		std::tuple<bool, blit_src_info, blit_dst_info> _decode_transfer_registers(thread* rsx)
-		{
-			blit_src_info src_info = {};
-			blit_dst_info dst_info = {};
-
-			const rsx::blit_engine::transfer_operation operation = method_registers.blit_engine_operation();
-
-			const u16 out_x = method_registers.blit_engine_output_x();
-			const u16 out_y = method_registers.blit_engine_output_y();
-			const u16 out_w = method_registers.blit_engine_output_width();
-			const u16 out_h = method_registers.blit_engine_output_height();
-
-			const u16 in_w = method_registers.blit_engine_input_width();
-			const u16 in_h = method_registers.blit_engine_input_height();
-
-			const blit_engine::transfer_origin in_origin = method_registers.blit_engine_input_origin();
-			auto src_color_format = method_registers.blit_engine_src_color_format();
-
-			const f32 scale_x = method_registers.blit_engine_ds_dx();
-			const f32 scale_y = method_registers.blit_engine_dt_dy();
-
-			// Clipping
-			// Validate that clipping rect will fit onto both src and dst regions
-			const u16 clip_w = std::min(method_registers.blit_engine_clip_width(), out_w);
-			const u16 clip_h = std::min(method_registers.blit_engine_clip_height(), out_h);
-
-			// Check both clip dimensions and dst dimensions
-			if (clip_w == 0 || clip_h == 0)
-			{
-				rsx_log.warning("NV3089_IMAGE_IN: Operation NOPed out due to empty regions");
-				return { false, src_info, dst_info };
-			}
-
-			if (in_w == 0 || in_h == 0)
-			{
-				// Input cant be an empty region
-				fmt::throw_exception("NV3089_IMAGE_IN_SIZE: Invalid blit dimensions passed (in_w=%d, in_h=%d)", in_w, in_h);
-			}
-
-			u16 clip_x = method_registers.blit_engine_clip_x();
-			u16 clip_y = method_registers.blit_engine_clip_y();
-
-			//Fit onto dst
-			if (clip_x && (out_x + clip_x + clip_w) > out_w) clip_x = 0;
-			if (clip_y && (out_y + clip_y + clip_h) > out_h) clip_y = 0;
-
-			u16 in_pitch = method_registers.blit_engine_input_pitch();
-
-			switch (in_origin)
-			{
-			case blit_engine::transfer_origin::corner:
-			case blit_engine::transfer_origin::center:
-				break;
-			default:
-				rsx_log.warning("NV3089_IMAGE_IN_SIZE: unknown origin (%d)", static_cast<u8>(in_origin));
-			}
-
-			if (operation != rsx::blit_engine::transfer_operation::srccopy)
-			{
-				rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown operation (0x%x)", method_registers.registers[NV3089_SET_OPERATION]);
-				rsx->recover_fifo();
-				return { false, src_info, dst_info };
-			}
-
-			if (!src_color_format)
-			{
-				rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown src color format (0x%x)", method_registers.registers[NV3089_SET_COLOR_FORMAT]);
-				rsx->recover_fifo();
-				return { false, src_info, dst_info };
-			}
-
-			const u32 src_offset = method_registers.blit_engine_input_offset();
-			const u32 src_dma = method_registers.blit_engine_input_location();
-
-			u32 dst_offset;
-			u32 dst_dma = 0;
-			rsx::blit_engine::transfer_destination_format dst_color_format;
-			u32 out_pitch = 0;
-			[[maybe_unused]] u32 out_alignment = 64;
-			bool is_block_transfer = false;
-
-			switch (method_registers.blit_engine_context_surface())
-			{
-			case blit_engine::context_surface::surface2d:
-			{
-				dst_dma = method_registers.blit_engine_output_location_nv3062();
-				dst_offset = method_registers.blit_engine_output_offset_nv3062();
-				out_pitch = method_registers.blit_engine_output_pitch_nv3062();
-				out_alignment = method_registers.blit_engine_output_alignment_nv3062();
-				is_block_transfer = fcmp(scale_x, 1.f) && fcmp(scale_y, 1.f);
-
-				if (auto dst_fmt = method_registers.blit_engine_nv3062_color_format(); !dst_fmt)
-				{
-					rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown NV3062 dst color format (0x%x)", method_registers.registers[NV3062_SET_COLOR_FORMAT]);
-					rsx->recover_fifo();
-					return { false, src_info, dst_info };
-				}
-				else
-				{
-					dst_color_format = dst_fmt;
-				}
-
-				break;
-			}
-			case blit_engine::context_surface::swizzle2d:
-			{
-				dst_dma = method_registers.blit_engine_nv309E_location();
-				dst_offset = method_registers.blit_engine_nv309E_offset();
-
-				if (auto dst_fmt = method_registers.blit_engine_output_format_nv309E(); !dst_fmt)
-				{
-					rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown NV309E dst color format (0x%x)", method_registers.registers[NV309E_SET_FORMAT]);
-					rsx->recover_fifo();
-					return { false, src_info, dst_info };
-				}
-				else
-				{
-					dst_color_format = dst_fmt;
-				}
-
-				break;
-			}
-			default:
-				rsx_log.error("NV3089_IMAGE_IN_SIZE: unknown m_context_surface (0x%x)", static_cast<u8>(method_registers.blit_engine_context_surface()));
-				return { false, src_info, dst_info };
-			}
-
-			const u32 in_bpp = (src_color_format == rsx::blit_engine::transfer_source_format::r5g6b5) ? 2 : 4; // bytes per pixel
-			const u32 out_bpp = (dst_color_format == rsx::blit_engine::transfer_destination_format::r5g6b5) ? 2 : 4;
-
-			if (out_pitch == 0)
-			{
-				out_pitch = out_bpp * out_w;
-			}
-
-			if (in_pitch == 0)
-			{
-				in_pitch = in_bpp * in_w;
-			}
-
-			if (in_bpp != out_bpp)
-			{
-				is_block_transfer = false;
-			}
-
-			u16 in_x, in_y;
-			if (in_origin == blit_engine::transfer_origin::center)
-			{
-				// Convert to normal u,v addressing. Under this scheme offset of 1 is actually half-way inside pixel 0
-				const float x = std::max(method_registers.blit_engine_in_x(), 0.5f);
-				const float y = std::max(method_registers.blit_engine_in_y(), 0.5f);
-				in_x = static_cast<u16>(std::floor(x - 0.5f));
-				in_y = static_cast<u16>(std::floor(y - 0.5f));
-			}
-			else
-			{
-				in_x = static_cast<u16>(std::floor(method_registers.blit_engine_in_x()));
-				in_y = static_cast<u16>(std::floor(method_registers.blit_engine_in_y()));
-			}
-
-			// Check for subpixel addressing
-			if (scale_x < 1.f)
-			{
-				float dst_x = in_x * scale_x;
-				in_x = static_cast<u16>(std::floor(dst_x) / scale_x);
-			}
-
-			if (scale_y < 1.f)
-			{
-				float dst_y = in_y * scale_y;
-				in_y = static_cast<u16>(std::floor(dst_y) / scale_y);
-			}
-
-			const u32 in_offset = in_x * in_bpp + in_pitch * in_y;
-			const u32 out_offset = out_x * out_bpp + out_pitch * out_y;
-
-			const u32 src_line_length = (in_w * in_bpp);
-
-			u32 src_address = 0;
-			const u32 dst_address = get_address(dst_offset, dst_dma, 1); // TODO: Add size
-
-			if (is_block_transfer && (clip_h == 1 || (in_pitch == out_pitch && src_line_length == in_pitch)))
-			{
-				const u32 nb_lines = std::min(clip_h, in_h);
-				const u32 data_length = nb_lines * src_line_length;
-
-				if (src_address = get_address(src_offset, src_dma, data_length);
-					!src_address || !dst_address)
-				{
-					rsx->recover_fifo();
-					return { false, src_info, dst_info };
-				}
-
-				rsx->invalidate_fragment_program(dst_dma, dst_offset, data_length);
-
-				if (const auto result = rsx->read_barrier(src_address, data_length, false);
-					result == rsx::result_zcull_intr)
-				{
-					if (rsx->copy_zcull_stats(src_address, data_length, dst_address) == data_length)
-					{
-						// All writes deferred
-						return { false, src_info, dst_info };
-					}
-				}
-			}
-			else
-			{
-				const u16 read_h = std::min(static_cast<u16>(clip_h / scale_y), in_h);
-				const u32 data_length = in_pitch * (read_h - 1) + src_line_length;
-
-				if (src_address = get_address(src_offset, src_dma, data_length);
-					!src_address || !dst_address)
-				{
-					rsx->recover_fifo();
-					return { false, src_info, dst_info };
-				}
-
-				rsx->invalidate_fragment_program(dst_dma, dst_offset, data_length);
-				rsx->read_barrier(src_address, data_length, true);
-			}
-
-			if (src_address == dst_address &&
-				in_w == clip_w && in_h == clip_h &&
-				in_pitch == out_pitch &&
-				rsx::fcmp(scale_x, 1.f) && rsx::fcmp(scale_y, 1.f))
-			{
-				// NULL operation
-				rsx_log.warning("NV3089_IMAGE_IN: Operation writes memory onto itself with no modification (move-to-self). Will ignore.");
-				return { false, src_info, dst_info };
-			}
-
-			u8* pixels_src = vm::_ptr<u8>(src_address + in_offset);
-			u8* pixels_dst = vm::_ptr<u8>(dst_address + out_offset);
-
-			if (dst_color_format != rsx::blit_engine::transfer_destination_format::r5g6b5 &&
-				dst_color_format != rsx::blit_engine::transfer_destination_format::a8r8g8b8)
-			{
-				fmt::throw_exception("NV3089_IMAGE_IN_SIZE: unknown dst_color_format (%d)", static_cast<u8>(dst_color_format));
-			}
-
-			if (src_color_format != rsx::blit_engine::transfer_source_format::r5g6b5 &&
-				src_color_format != rsx::blit_engine::transfer_source_format::a8r8g8b8)
-			{
-				// Alpha has no meaning in both formats
-				if (src_color_format == rsx::blit_engine::transfer_source_format::x8r8g8b8)
-				{
-					src_color_format = rsx::blit_engine::transfer_source_format::a8r8g8b8;
-				}
-				else
-				{
-					// TODO: Support more formats
-					fmt::throw_exception("NV3089_IMAGE_IN_SIZE: unknown src_color_format (%d)", static_cast<u8>(*src_color_format));
-				}
-			}
-
-			u32 convert_w = static_cast<u32>(std::abs(scale_x) * in_w);
-			u32 convert_h = static_cast<u32>(std::abs(scale_y) * in_h);
-
-			if (convert_w == 0 || convert_h == 0)
-			{
-				rsx_log.error("NV3089_IMAGE_IN: Invalid dimensions or scaling factor. Request ignored (ds_dx=%f, dt_dy=%f)",
-					method_registers.blit_engine_ds_dx(), method_registers.blit_engine_dt_dy());
-				return { false, src_info, dst_info };
-			}
-
-			src_info.format = src_color_format;
-			src_info.origin = in_origin;
-			src_info.width = in_w;
-			src_info.height = in_h;
-			src_info.pitch = in_pitch;
-			src_info.bpp = in_bpp;
-			src_info.offset_x = in_x;
-			src_info.offset_y = in_y;
-			src_info.dma = src_dma;
-			src_info.rsx_address = src_address;
-			src_info.pixels = pixels_src;
-
-			dst_info.format = dst_color_format;
-			dst_info.width = convert_w;
-			dst_info.height = convert_h;
-			dst_info.clip_x = clip_x;
-			dst_info.clip_y = clip_y;
-			dst_info.clip_width = clip_w;
-			dst_info.clip_height = clip_h;
-			dst_info.offset_x = out_x;
-			dst_info.offset_y = out_y;
-			dst_info.pitch = out_pitch;
-			dst_info.bpp = out_bpp;
-			dst_info.scale_x = scale_x;
-			dst_info.scale_y = scale_y;
-			dst_info.dma = dst_dma;
-			dst_info.rsx_address = dst_address;
-			dst_info.pixels = pixels_dst;
-			dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d);
-
-			return { true, src_info, dst_info };
-		}
-
-		void _linear_copy(
-			const blit_dst_info& dst,
-			const blit_src_info& src,
-			u16 out_w,
-			u16 out_h,
-			u32 slice_h,
-			AVPixelFormat ffmpeg_src_format,
-			AVPixelFormat ffmpeg_dst_format,
-			bool need_convert,
-			bool need_clip,
-			bool src_is_modified,
-			bool interpolate)
-		{
-			std::vector<u8> temp2;
-
-			if (!need_convert) [[ likely ]]
-			{
-				const bool is_overlapping = !src_is_modified && dst.dma == src.dma && [&]() -> bool
-				{
-					const auto src_range = utils::address_range::start_length(src.rsx_address, src.pitch * (src.height - 1) + (src.bpp * src.width));
-					const auto dst_range = utils::address_range::start_length(dst.rsx_address, dst.pitch * (dst.clip_height - 1) + (dst.bpp * dst.clip_width));
-					return src_range.overlaps(dst_range);
-				}();
-
-				if (is_overlapping) [[ unlikely ]]
-				{
-					if (need_clip)
-					{
-						temp2.resize(dst.pitch * dst.clip_height);
-						clip_image_may_overlap(dst.pixels, src.pixels, dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, src.pitch, dst.pitch, temp2.data());
-						return;
-					}
-
-					if (dst.pitch != src.pitch || dst.pitch != dst.bpp * out_w)
-					{
-						const u32 buffer_pitch = dst.bpp * out_w;
-						temp2.resize(buffer_pitch * out_h);
-						std::add_pointer_t<u8> buf = temp2.data(), pixels = src.pixels;
-
-						// Read the whole buffer from source
-						for (u32 y = 0; y < out_h; ++y)
-						{
-							std::memcpy(buf, pixels, buffer_pitch);
-							pixels += src.pitch;
-							buf += buffer_pitch;
-						}
-
-						buf = temp2.data(), pixels = dst.pixels;
-
-						// Write to destination
-						for (u32 y = 0; y < out_h; ++y)
-						{
-							std::memcpy(pixels, buf, buffer_pitch);
-							pixels += dst.pitch;
-							buf += buffer_pitch;
-						}
-
-						return;
-					}
-
-					std::memmove(dst.pixels, src.pixels, dst.pitch * out_h);
-					return;
-				}
-
-				if (need_clip) [[ unlikely ]]
-				{
-					clip_image(dst.pixels, src.pixels, dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, src.pitch, dst.pitch);
-					return;
-				}
-
-				if (dst.pitch != src.pitch || dst.pitch != dst.bpp * out_w) [[ unlikely ]]
-				{
-					u8 *dst_pixels = dst.pixels, *src_pixels = src.pixels;
-
-					for (u32 y = 0; y < out_h; ++y)
-					{
-						std::memcpy(dst_pixels, src_pixels, out_w * dst.bpp);
-						dst_pixels += dst.pitch;
-						src_pixels += src.pitch;
-					}
-
-					return;
-				}
-
-				std::memcpy(dst.pixels, src.pixels, dst.pitch * out_h);
-				return;
-			}
-
-			if (need_clip) [[ unlikely ]]
-			{
-				temp2.resize(dst.pitch * std::max<u32>(dst.height, dst.clip_height));
-
-				convert_scale_image(temp2.data(), ffmpeg_dst_format, dst.width, dst.height, dst.pitch,
-					src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h, interpolate);
-
-				clip_image(dst.pixels, temp2.data(), dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, dst.pitch, dst.pitch);
-				return;
-			}
-
-			convert_scale_image(dst.pixels, ffmpeg_dst_format, out_w, out_h, dst.pitch,
-				src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h,
-				interpolate);
-		}
-
-		std::vector<u8> _swizzled_copy_1(
-			const blit_dst_info& dst,
-			const blit_src_info& src,
-			u16 out_w,
-			u16 out_h,
-			u32 slice_h,
-			AVPixelFormat ffmpeg_src_format,
-			AVPixelFormat ffmpeg_dst_format,
-			bool need_convert,
-			bool need_clip,
-			bool interpolate)
-		{
-			std::vector<u8> temp2, temp3;
-
-			if (need_clip)
-			{
-				temp3.resize(dst.pitch * dst.clip_height);
-
-				if (need_convert)
-				{
-					temp2.resize(dst.pitch * std::max<u32>(dst.height, dst.clip_height));
-
-					convert_scale_image(temp2.data(), ffmpeg_dst_format, dst.width, dst.height, dst.pitch,
-						src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h,
-						interpolate);
-
-					clip_image(temp3.data(), temp2.data(), dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, dst.pitch, dst.pitch);
-					return temp3;
-				}
-
-				clip_image(temp3.data(), src.pixels, dst.clip_x, dst.clip_y, dst.clip_width, dst.clip_height, dst.bpp, src.pitch, dst.pitch);
-				return temp3;
-			}
-
-			if (need_convert)
-			{
-				temp3.resize(dst.pitch * out_h);
-
-				convert_scale_image(temp3.data(), ffmpeg_dst_format, out_w, out_h, dst.pitch,
-					src.pixels, ffmpeg_src_format, src.width, src.height, src.pitch, slice_h,
-					interpolate);
-
-				return temp3;
-			}
-
-			return {};
-		}
-
-		void _swizzled_copy_2(
-			u8* linear_pixels,
-			u8* swizzled_pixels,
-			u32 linear_pitch,
-			u16 out_w,
-			u16 out_h,
-			u8 out_bpp)
-		{
-			// TODO: Validate these claims. Are the registers always correctly initialized? Should we trust them at all?
-			// It looks like rsx may ignore the requested swizzle size and just always
-			// round up to nearest power of 2
-			/*
-			u8 sw_width_log2 = method_registers.nv309e_sw_width_log2();
-			u8 sw_height_log2 = method_registers.nv309e_sw_height_log2();
-
-			// 0 indicates height of 1 pixel
-			sw_height_log2 = sw_height_log2 == 0 ? 1 : sw_height_log2;
-
-			// swizzle based on destination size
-			u16 sw_width = 1 << sw_width_log2;
-			u16 sw_height = 1 << sw_height_log2;
-			*/
-
-			std::vector<u8> sw_temp;
-
-			u32 sw_width = next_pow2(out_w);
-			u32 sw_height = next_pow2(out_h);
-
-			// Check and pad texture out if we are given non power of 2 output
-			if (sw_width != out_w || sw_height != out_h)
-			{
-				sw_temp.resize(out_bpp * sw_width * sw_height);
-
-				switch (out_bpp)
-				{
-				case 1:
-					pad_texture<u8>(linear_pixels, sw_temp.data(), out_w, out_h, sw_width, sw_height);
-					break;
-				case 2:
-					pad_texture<u16>(linear_pixels, sw_temp.data(), out_w, out_h, sw_width, sw_height);
-					break;
-				case 4:
-					pad_texture<u32>(linear_pixels, sw_temp.data(), out_w, out_h, sw_width, sw_height);
-					break;
-				}
-
-				linear_pixels = sw_temp.data();
-			}
-
-			switch (out_bpp)
-			{
-			case 1:
-				convert_linear_swizzle<u8, false>(linear_pixels, swizzled_pixels, sw_width, sw_height, linear_pitch);
-				break;
-			case 2:
-				convert_linear_swizzle<u16, false>(linear_pixels, swizzled_pixels, sw_width, sw_height, linear_pitch);
-				break;
-			case 4:
-				convert_linear_swizzle<u32, false>(linear_pixels, swizzled_pixels, sw_width, sw_height, linear_pitch);
-				break;
-			}
-		}
-
-		std::vector<u8> _mirror_transform(const blit_src_info& src, bool flip_x, bool flip_y)
-		{
-			std::vector<u8> temp1;
-			if (!flip_x && !flip_y)
-			{
-				return temp1;
-			}
-
-			const u32 packed_pitch = src.width * src.bpp;
-			temp1.resize(packed_pitch * src.height);
-
-			const s32 stride_y = (flip_y ? -1 : 1) * static_cast<s32>(src.pitch);
-
-			for (u32 y = 0; y < src.height; ++y)
-			{
-				u8* dst_pixels = temp1.data() + (packed_pitch * y);
-				u8* src_pixels = src.pixels + (static_cast<s32>(y) * stride_y);
-
-				if (flip_x)
-				{
-					if (src.bpp == 4) [[ likely ]]
-						{
-							rsx::memcpy_r<u32>(dst_pixels, src_pixels, src.width);
-							continue;
-						}
-
-						rsx::memcpy_r<u16>(dst_pixels, src_pixels, src.width);
-						continue;
-				}
-
-				std::memcpy(dst_pixels, src_pixels, packed_pitch);
-			}
-
-			return temp1;
-		}
-
-		void image_in(thread* rsx, u32 /*reg*/, u32 /*arg*/)
-		{
-			auto [success, src, dst] = _decode_transfer_registers(rsx);
-			if (!success)
-			{
-				return;
-			}
-
-			// Decode extra params before locking
-			const blit_engine::transfer_interpolator in_inter = method_registers.blit_engine_input_inter();
-			const u16 out_w = method_registers.blit_engine_output_width();
-			const u16 out_h = method_registers.blit_engine_output_height();
-
-			// Lock here. RSX cannot execute any locking operations from this point, including ZCULL read barriers
-			auto res = ::rsx::reservation_lock<true>(
-				dst.rsx_address, dst.pitch * dst.clip_height,
-				src.rsx_address, src.pitch * src.height);
-
-			if (!g_cfg.video.force_cpu_blit_processing &&
-				(dst.dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER || src.dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER) &&
-				rsx->scaled_image_from_memory(src, dst, in_inter == blit_engine::transfer_interpolator::foh))
-			{
-				// HW-accelerated blit
-				return;
-			}
-
-			std::vector<u8> mirror_tmp;
-			bool src_is_temp = false;
-
-			// Flip source if needed
-			if (dst.scale_y < 0 || dst.scale_x < 0)
-			{
-				mirror_tmp = _mirror_transform(src, dst.scale_x < 0, dst.scale_y < 0);
-				src.pixels = mirror_tmp.data();
-				src.pitch = src.width * src.bpp;
-				src_is_temp = true;
-			}
-
-			const AVPixelFormat in_format = (src.format == rsx::blit_engine::transfer_source_format::r5g6b5) ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB;
-			const AVPixelFormat out_format = (dst.format == rsx::blit_engine::transfer_destination_format::r5g6b5) ? AV_PIX_FMT_RGB565BE : AV_PIX_FMT_ARGB;
-
-			const bool need_clip =
-				dst.clip_width != src.width ||
-				dst.clip_height != src.height ||
-				dst.clip_x > 0 || dst.clip_y > 0 ||
-				dst.width != out_w || dst.height != out_h;
-
-			const bool need_convert = out_format != in_format || !rsx::fcmp(fabsf(dst.scale_x), 1.f) || !rsx::fcmp(fabsf(dst.scale_y), 1.f);
-			const u32 slice_h = static_cast<u32>(std::ceil(static_cast<f32>(dst.clip_height + dst.clip_y) / dst.scale_y));
-			const bool interpolate = in_inter == blit_engine::transfer_interpolator::foh;
-
-			auto real_dst = dst.pixels;
-			const auto tiled_region = rsx->get_tiled_memory_region(utils::address_range::start_length(dst.rsx_address, dst.pitch * dst.clip_height));
-			std::vector<u8> tmp;
-
-			if (tiled_region)
-			{
-				tmp.resize(tiled_region.tile->size);
-				real_dst = dst.pixels;
-				dst.pixels = tmp.data();
-			}
-
-			if (method_registers.blit_engine_context_surface() != blit_engine::context_surface::swizzle2d)
-			{
-				_linear_copy(dst, src, out_w, out_h, slice_h, in_format, out_format, need_convert, need_clip, src_is_temp, interpolate);
-			}
-			else
-			{
-				const auto swz_temp = _swizzled_copy_1(dst, src, out_w, out_h, slice_h, in_format, out_format, need_convert, need_clip, interpolate);
-				auto pixels_src = swz_temp.empty() ? src.pixels : swz_temp.data();
-
-				_swizzled_copy_2(const_cast<u8*>(pixels_src), dst.pixels, src.pitch, out_w, out_h, dst.bpp);
-			}
-
-			if (tiled_region)
-			{
-				const auto tile_func = dst.bpp == 4
-					? rsx::tile_texel_data32
-					: rsx::tile_texel_data16;
-
-				tile_func(
-					real_dst,
-					dst.pixels,
-					tiled_region.base_address,
-					dst.rsx_address - tiled_region.base_address,
-					tiled_region.tile->size,
-					tiled_region.tile->bank,
-					tiled_region.tile->pitch,
-					dst.clip_width,
-					dst.clip_height
-				);
-			}
-		}
-	}
-
 	namespace nv0039
 	{
-		void buffer_notify(thread* rsx, u32, u32 arg)
+		void buffer_notify(context* ctx, u32, u32 arg)
 		{
-			s32 in_pitch = method_registers.nv0039_input_pitch();
-			s32 out_pitch = method_registers.nv0039_output_pitch();
-			const u32 line_length = method_registers.nv0039_line_length();
-			const u32 line_count = method_registers.nv0039_line_count();
-			const u8 out_format = method_registers.nv0039_output_format();
-			const u8 in_format = method_registers.nv0039_input_format();
+			s32 in_pitch = REGS(ctx)->nv0039_input_pitch();
+			s32 out_pitch = REGS(ctx)->nv0039_output_pitch();
+			const u32 line_length = REGS(ctx)->nv0039_line_length();
+			const u32 line_count = REGS(ctx)->nv0039_line_count();
+			const u8 out_format = REGS(ctx)->nv0039_output_format();
+			const u8 in_format = REGS(ctx)->nv0039_input_format();
 			const u32 notify = arg;
 
 			if (!line_count || !line_length)
@@ -1830,11 +62,11 @@ namespace rsx
 			rsx_log.trace("NV0039_BUFFER_NOTIFY: pitch(in=0x%x, out=0x%x), line(len=0x%x, cnt=0x%x), fmt(in=0x%x, out=0x%x), notify=0x%x",
 				in_pitch, out_pitch, line_length, line_count, in_format, out_format, notify);
 
-			u32 src_offset = method_registers.nv0039_input_offset();
-			u32 src_dma = method_registers.nv0039_input_location();
+			u32 src_offset = REGS(ctx)->nv0039_input_offset();
+			u32 src_dma = REGS(ctx)->nv0039_input_location();
 
-			u32 dst_offset = method_registers.nv0039_output_offset();
-			u32 dst_dma = method_registers.nv0039_output_location();
+			u32 dst_offset = REGS(ctx)->nv0039_output_offset();
+			u32 dst_dma = REGS(ctx)->nv0039_output_location();
 
 			const bool is_block_transfer = (in_pitch == out_pitch && out_pitch + 0u == line_length);
 			const auto read_address = get_address(src_offset, src_dma);
@@ -1842,13 +74,13 @@ namespace rsx
 			const auto read_length = in_pitch * (line_count - 1) + line_length;
 			const auto write_length = out_pitch * (line_count - 1) + line_length;
 
-			rsx->invalidate_fragment_program(dst_dma, dst_offset, write_length);
+			RSX(ctx)->invalidate_fragment_program(dst_dma, dst_offset, write_length);
 
-			if (const auto result = rsx->read_barrier(read_address, read_length, !is_block_transfer);
+			if (const auto result = RSX(ctx)->read_barrier(read_address, read_length, !is_block_transfer);
 				result == rsx::result_zcull_intr)
 			{
 				// This transfer overlaps will zcull data pool
-				if (rsx->copy_zcull_stats(read_address, read_length, write_address) == write_length)
+				if (RSX(ctx)->copy_zcull_stats(read_address, read_length, write_address) == write_length)
 				{
 					// All writes deferred
 					return;
@@ -1938,53 +170,53 @@ namespace rsx
 		}
 	}
 
-	void flip_command(thread* rsx, u32, u32 arg)
+	void flip_command(context* ctx, u32, u32 arg)
 	{
-		ensure(rsx->isHLE);
+		ensure(RSX(ctx)->isHLE);
 
-		if (rsx->vblank_at_flip != umax)
+		if (RSX(ctx)->vblank_at_flip != umax)
 		{
-			rsx->flip_notification_count++;
+			RSX(ctx)->flip_notification_count++;
 		}
 
-		if (auto ptr = rsx->queue_handler)
+		if (auto ptr = RSX(ctx)->queue_handler)
 		{
-			rsx->intr_thread->cmd_list
+			RSX(ctx)->intr_thread->cmd_list
 			({
 				{ ppu_cmd::set_args, 1 }, u64{1},
 				{ ppu_cmd::lle_call, ptr },
 				{ ppu_cmd::sleep, 0 }
 			});
 
-			rsx->intr_thread->cmd_notify++;
-			rsx->intr_thread->cmd_notify.notify_one();
+			RSX(ctx)->intr_thread->cmd_notify++;
+			RSX(ctx)->intr_thread->cmd_notify.notify_one();
 		}
 
-		rsx->reset();
-		rsx->on_frame_end(arg);
-		rsx->request_emu_flip(arg);
-		vm::_ref<atomic_t<u128>>(rsx->label_addr + 0x10).store(u128{});
+		RSX(ctx)->reset();
+		RSX(ctx)->on_frame_end(arg);
+		RSX(ctx)->request_emu_flip(arg);
+		vm::_ref<atomic_t<u128>>(RSX(ctx)->label_addr + 0x10).store(u128{});
 	}
 
-	void user_command(thread* rsx, u32, u32 arg)
+	void user_command(context* ctx, u32, u32 arg)
 	{
-		if (!rsx->isHLE)
+		if (!RSX(ctx)->isHLE)
 		{
 			sys_rsx_context_attribute(0x55555555, 0xFEF, 0, arg, 0, 0);
 			return;
 		}
 
-		if (auto ptr = rsx->user_handler)
+		if (auto ptr = RSX(ctx)->user_handler)
 		{
-			rsx->intr_thread->cmd_list
+			RSX(ctx)->intr_thread->cmd_list
 			({
 				{ ppu_cmd::set_args, 1 }, u64{arg},
 				{ ppu_cmd::lle_call, ptr },
 				{ ppu_cmd::sleep, 0 }
 			});
 
-			rsx->intr_thread->cmd_notify++;
-			rsx->intr_thread->cmd_notify.notify_one();
+			RSX(ctx)->intr_thread->cmd_notify++;
+			RSX(ctx)->intr_thread->cmd_notify.notify_one();
 		}
 	}
 
@@ -1993,7 +225,7 @@ namespace rsx
 		template<u32 index>
 		struct driver_flip
 		{
-			static void impl(thread*, u32 /*reg*/, u32 arg)
+			static void impl(context*, u32 /*reg*/, u32 arg)
 			{
 				sys_rsx_context_attribute(0x55555555, 0x102, index, arg, 0, 0);
 			}
@@ -2002,11 +234,11 @@ namespace rsx
 		template<u32 index>
 		struct queue_flip
 		{
-			static void impl(thread* rsx, u32 /*reg*/, u32 arg)
+			static void impl(context* ctx, u32 /*reg*/, u32 arg)
 			{
-				if (rsx->vblank_at_flip != umax)
+				if (RSX(ctx)->vblank_at_flip != umax)
 				{
-					rsx->flip_notification_count++;
+					RSX(ctx)->flip_notification_count++;
 				}
 
 				sys_rsx_context_attribute(0x55555555, 0x103, index, arg, 0, 0);
@@ -2016,14 +248,14 @@ namespace rsx
 
 	namespace fifo
 	{
-		void draw_barrier(thread* rsx, u32, u32)
+		void draw_barrier(context* ctx, u32, u32)
 		{
-			if (rsx->in_begin_end)
+			if (RSX(ctx)->in_begin_end)
 			{
-				if (!method_registers.current_draw_clause.is_disjoint_primitive)
+				if (!REGS(ctx)->current_draw_clause.is_disjoint_primitive)
 				{
 					// Enable primitive barrier request
-					method_registers.current_draw_clause.primitive_barrier_enable = true;
+					REGS(ctx)->current_draw_clause.primitive_barrier_enable = true;
 				}
 			}
 		}
@@ -2587,7 +819,12 @@ namespace rsx
 			registers[NV308A_SIZE_OUT] = 0x0;
 			registers[NV308A_SIZE_IN] = 0x0;
 			registers[NV406E_SET_REFERENCE] = umax;
-			if (auto rsx = Emu.IsStopped() ? nullptr : get_current_renderer(); rsx && rsx->ctrl) rsx->ctrl->ref = u32{umax};
+
+			if (auto rsx = Emu.IsStopped() ? nullptr : get_current_renderer(); rsx && rsx->ctrl)
+			{
+				// FIXME: Not multi-context aware - this always resets the reference on the current renderer's control block
+				rsx->ctrl->ref = u32{ umax };
+			}
 		}
 
 		{
@@ -3098,7 +1335,7 @@ namespace rsx
 	void rsx_state::decode(u32 reg, u32 value)
 	{
 		// Store new value and save previous
-		register_previous_value = std::exchange(registers[reg], value);
+		latch = std::exchange(registers[reg], value);
 	}
 
 	bool rsx_state::test(u32 reg, u32 value) const
@@ -3192,17 +1429,17 @@ namespace rsx
 				break;
 			case index_base_modifier_barrier:
 				// Change index base offset
-				method_registers.decode(NV4097_SET_VERTEX_DATA_BASE_INDEX, barrier.arg);
+				REGS(ctx)->decode(NV4097_SET_VERTEX_DATA_BASE_INDEX, barrier.arg);
 				result |= index_base_changed;
 				break;
 			case vertex_base_modifier_barrier:
 				// Change vertex base offset
-				method_registers.decode(NV4097_SET_VERTEX_DATA_BASE_OFFSET, barrier.arg);
+				REGS(ctx)->decode(NV4097_SET_VERTEX_DATA_BASE_OFFSET, barrier.arg);
 				result |= vertex_base_changed;
 				break;
 			case vertex_array_offset_modifier_barrier:
 				// Change vertex array offset
-				method_registers.decode(NV4097_SET_VERTEX_DATA_ARRAY_OFFSET + barrier.index, barrier.arg);
+				REGS(ctx)->decode(NV4097_SET_VERTEX_DATA_ARRAY_OFFSET + barrier.index, barrier.arg);
 				result |= vertex_arrays_changed;
 				break;
 			default:
@@ -3737,7 +1974,7 @@ namespace rsx
 		// FIFO
 		bind(FIFO::FIFO_DRAW_BARRIER >> 2, fifo::draw_barrier);
 
-		method_registers.init();
+		REGS(ctx)->init();
 
 		return true;
 	}();
diff --git a/rpcs3/Emu/RSX/rsx_methods.h b/rpcs3/Emu/RSX/rsx_methods.h
index 7347c24b81..fd3db6dcfc 100644
--- a/rpcs3/Emu/RSX/rsx_methods.h
+++ b/rpcs3/Emu/RSX/rsx_methods.h
@@ -375,7 +375,7 @@ namespace rsx
 		}
 	};
 
-	using rsx_method_t = void(*)(class thread*, u32 reg, u32 arg);
+	using rsx_method_t = void(*)(class context*, u32 reg, u32 arg);
 
 	//TODO
 	union alignas(4) method_registers_t
@@ -442,7 +442,7 @@ namespace rsx
 	{
 	public:
 		std::array<u32, 0x10000 / 4> registers{};
-		u32 register_previous_value{};
+		u32 latch{};
 
 		template<u32 opcode>
 		using decoded_type = typename registers_decoder<opcode>::decoded_type;
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index e7f0442c80..20d40bc3a2 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -95,6 +95,11 @@
     <ClCompile Include="Emu\perf_monitor.cpp" />
     <ClCompile Include="Emu\RSX\Common\texture_cache.cpp" />
     <ClCompile Include="Emu\RSX\Core\RSXContext.cpp" />
+    <ClCompile Include="Emu\RSX\NV47\common.cpp" />
+    <ClCompile Include="Emu\RSX\NV47\nv3089.cpp" />
+    <ClCompile Include="Emu\RSX\NV47\nv308a.cpp" />
+    <ClCompile Include="Emu\RSX\NV47\nv406e.cpp" />
+    <ClCompile Include="Emu\RSX\NV47\nv4097.cpp" />
     <ClCompile Include="Emu\RSX\Overlays\HomeMenu\overlay_home_menu.cpp" />
     <ClCompile Include="Emu\RSX\Overlays\HomeMenu\overlay_home_menu_components.cpp" />
     <ClCompile Include="Emu\RSX\Overlays\HomeMenu\overlay_home_menu_message_box.cpp" />
@@ -584,6 +589,15 @@
     <ClInclude Include="Emu\RSX\Core\RSXDisplay.h" />
     <ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp" />
     <ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h" />
+    <ClInclude Include="Emu\RSX\NV47\context.h" />
+    <ClInclude Include="Emu\RSX\NV47\context_accessors.define.h" />
+    <ClInclude Include="Emu\RSX\NV47\context_accessors.undef.h" />
+    <ClInclude Include="Emu\RSX\NV47\nv3089.h" />
+    <ClInclude Include="Emu\RSX\NV47\nv308a.h" />
+    <ClInclude Include="Emu\RSX\NV47\nv406e.h" />
+    <ClInclude Include="Emu\RSX\NV47\nv4097.h" />
+    <ClInclude Include="Emu\RSX\NV47\nv47.h" />
+    <ClInclude Include="Emu\RSX\NV47\common.h" />
     <ClInclude Include="Emu\RSX\Overlays\HomeMenu\overlay_home_menu.h" />
     <ClInclude Include="Emu\RSX\Overlays\HomeMenu\overlay_home_menu_components.h" />
     <ClInclude Include="Emu\RSX\Overlays\HomeMenu\overlay_home_menu_message_box.h" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index 665e056f92..12889ce92d 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -97,6 +97,9 @@
     <Filter Include="Emu\GPU\RSX\Program\Upscalers\FSR1">
       <UniqueIdentifier>{cab197c1-581c-49db-9d8b-670335b44cb2}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Emu\GPU\RSX\NV47">
+      <UniqueIdentifier>{213387bd-09c5-4247-8fb0-b3cae06ba34b}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="Crypto\aes.cpp">
@@ -1210,6 +1213,21 @@
     <ClCompile Include="Emu\RSX\Program\SPIRVCommon.cpp">
       <Filter>Emu\GPU\RSX\Program</Filter>
     </ClCompile>
+    <ClCompile Include="Emu\RSX\NV47\nv4097.cpp">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClCompile>
+    <ClCompile Include="Emu\RSX\NV47\common.cpp">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClCompile>
+    <ClCompile Include="Emu\RSX\NV47\nv406e.cpp">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClCompile>
+    <ClCompile Include="Emu\RSX\NV47\nv308a.cpp">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClCompile>
+    <ClCompile Include="Emu\RSX\NV47\nv3089.cpp">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="Crypto\aes.h">
@@ -2449,6 +2467,33 @@
     <ClInclude Include="Emu\RSX\Program\SPIRVCommon.h">
       <Filter>Emu\GPU\RSX\Program</Filter>
     </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\nv47.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\nv4097.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\nv406e.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\nv3089.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\nv308a.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\context.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\common.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\context_accessors.define.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\NV47\context_accessors.undef.h">
+      <Filter>Emu\GPU\RSX\NV47</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">