digant 2024-12-30 22:20:19 +01:00
parent 720dc9dd72
commit 1def3c49da
3 changed files with 1007 additions and 0 deletions


@@ -0,0 +1,843 @@
#include "stdafx.h"
#include "RSXDrawCommands.h"
#include "Emu/RSX/Common/BufferUtils.h"
#include "Emu/RSX/Common/buffer_stream.hpp"
#include "Emu/RSX/Common/io_buffer.h"
#include "Emu/RSX/Common/simple_array.hpp"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
#include "Emu/RSX/Program/GLSLCommon.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/RSXThread.h"
#include "Emu/Memory/vm.h"
namespace rsx
{
void draw_command_processor::analyse_inputs_interleaved(vertex_input_layout& result, const vertex_program_metadata_t& vp_metadata)
{
const rsx_state& state = *REGS(m_ctx);
const u32 input_mask = state.vertex_attrib_input_mask() & vp_metadata.referenced_inputs_mask;
result.clear();
result.attribute_mask = static_cast<u16>(input_mask);
if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
{
interleaved_range_info& info = *result.alloc_interleaved_block();
info.interleaved = true;
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
auto& vinfo = state.vertex_arrays_info[index];
result.attribute_placement[index] = attribute_buffer_placement::none;
if (vinfo.size() > 0)
{
// Stride must be updated even if the stream is disabled
info.attribute_stride += rsx::get_vertex_type_size_on_host(vinfo.type(), vinfo.size());
info.locations.push_back({ index, false, 1 });
if (input_mask & (1u << index))
{
result.attribute_placement[index] = attribute_buffer_placement::transient;
}
}
else if (state.register_vertex_info[index].size > 0 && input_mask & (1u << index))
{
// Reads from register
result.referenced_registers.push_back(index);
result.attribute_placement[index] = attribute_buffer_placement::transient;
}
}
if (info.attribute_stride)
{
// At least one array feed must be enabled for vertex input
result.interleaved_blocks.push_back(&info);
}
return;
}
const u32 frequency_divider_mask = REGS(m_ctx)->frequency_divider_operation_mask();
result.interleaved_blocks.reserve(16);
result.referenced_registers.reserve(16);
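// Interleave-merge sketch (illustrative): in the loop below, two attributes fold
// into one block when they share a stride and their base addresses differ by no
// more than that stride. E.g. { position at 0x1000, normal at 0x100C, stride 24 }
// becomes a single interleaved block with base_offset = 0x1000 and two locations.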
for (auto [ref_mask, index] = std::tuple{ input_mask, u8(0) }; ref_mask; ++index, ref_mask >>= 1)
{
ensure(index < rsx::limits::vertex_count);
if (!(ref_mask & 1u))
{
// Nothing to do, uninitialized
continue;
}
// Always reset attribute placement by default
result.attribute_placement[index] = attribute_buffer_placement::none;
// Check for interleaving
if (REGS(m_ctx)->current_draw_clause.is_immediate_draw &&
REGS(m_ctx)->current_draw_clause.command != rsx::draw_command::indexed)
{
// NOTE: In immediate rendering mode, all vertex setup is ignored
// Observed with GT5, immediate render bypasses array pointers completely, even falling back to fixed-function register defaults
if (m_vertex_push_buffers[index].vertex_count > 1)
{
// Ensure consistent number of vertices per attribute.
m_vertex_push_buffers[index].pad_to(m_vertex_push_buffers[0].vertex_count, false);
// Read temp buffer (register array)
std::pair<u8, u32> volatile_range_info = std::make_pair(index, static_cast<u32>(m_vertex_push_buffers[index].data.size() * sizeof(u32)));
result.volatile_blocks.push_back(volatile_range_info);
result.attribute_placement[index] = attribute_buffer_placement::transient;
}
else if (state.register_vertex_info[index].size > 0)
{
// Reads from register
result.referenced_registers.push_back(index);
result.attribute_placement[index] = attribute_buffer_placement::transient;
}
// Fall back to the default register value if no source is specified via register
continue;
}
const auto& info = state.vertex_arrays_info[index];
if (!info.size())
{
if (state.register_vertex_info[index].size > 0)
{
// Reads from register
result.referenced_registers.push_back(index);
result.attribute_placement[index] = attribute_buffer_placement::transient;
continue;
}
}
else
{
result.attribute_placement[index] = attribute_buffer_placement::persistent;
const u32 base_address = info.offset() & 0x7fffffff;
bool alloc_new_block = true;
bool modulo = !!(frequency_divider_mask & (1 << index));
for (auto& block : result.interleaved_blocks)
{
if (block->single_vertex)
{
// Single vertex definition, continue
continue;
}
if (block->attribute_stride != info.stride())
{
// Stride does not match, continue
continue;
}
if (base_address > block->base_offset)
{
const u32 diff = base_address - block->base_offset;
if (diff > info.stride())
{
// Not interleaved, continue
continue;
}
}
else
{
const u32 diff = block->base_offset - base_address;
if (diff > info.stride())
{
// Not interleaved, continue
continue;
}
// Matches, and this address is lower than existing
block->base_offset = base_address;
}
alloc_new_block = false;
block->locations.push_back({ index, modulo, info.frequency() });
block->interleaved = true;
break;
}
if (alloc_new_block)
{
interleaved_range_info& block = *result.alloc_interleaved_block();
block.base_offset = base_address;
block.attribute_stride = info.stride();
block.memory_location = info.offset() >> 31;
block.locations.reserve(16);
block.locations.push_back({ index, modulo, info.frequency() });
if (block.attribute_stride == 0)
{
block.single_vertex = true;
block.attribute_stride = rsx::get_vertex_type_size_on_host(info.type(), info.size());
}
result.interleaved_blocks.push_back(&block);
}
}
}
for (auto& info : result.interleaved_blocks)
{
// Calculate real data address to be used during upload
info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(state.vertex_data_base_offset(), info->base_offset), info->memory_location);
}
}
std::span<const std::byte> draw_command_processor::get_raw_index_array(const draw_clause& draw_indexed_clause) const
{
if (!m_element_push_buffer.empty()) [[ unlikely ]]
{
// Indices provided via immediate mode
return { reinterpret_cast<const std::byte*>(m_element_push_buffer.data()), ::narrow<u32>(m_element_push_buffer.size() * sizeof(u32)) };
}
const rsx::index_array_type type = REGS(m_ctx)->index_type();
const u32 type_size = get_index_type_size(type);
// Force aligned indices, as on real hardware
const u32 address = (0 - type_size) & get_address(REGS(m_ctx)->index_array_address(), REGS(m_ctx)->index_array_location());
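// e.g. with u16 indices type_size == 2, so (0 - 2) == 0xFFFFFFFE masks off the
// lowest address bit; with u32 indices the low two bits are cleared.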
const u32 first = draw_indexed_clause.min_index();
const u32 count = draw_indexed_clause.get_elements_count();
const auto ptr = vm::_ptr<const std::byte>(address);
return { ptr + first * type_size, count * type_size };
}
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
draw_command_processor::get_draw_command(const rsx::rsx_state& state) const
{
if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::indexed) [[ likely ]]
{
return draw_indexed_array_command
{
get_raw_index_array(state.current_draw_clause)
};
}
if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::array)
{
return draw_array_command{};
}
if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
{
return draw_inlined_array{};
}
fmt::throw_exception("ill-formed draw command");
}
void draw_command_processor::append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value)
{
if (!(REGS(m_ctx)->vertex_attrib_input_mask() & (1 << attribute)))
{
return;
}
// Enforce ATTR0 as vertex attribute for push buffers.
// This whole thing becomes a mess if we don't have a provoking attribute.
const auto vertex_id = m_vertex_push_buffers[0].get_vertex_id();
m_vertex_push_buffers[attribute].set_vertex_data(attribute, vertex_id, subreg_index, type, size, value);
RSX(m_ctx)->m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty;
}
u32 draw_command_processor::get_push_buffer_vertex_count() const
{
// Enforce ATTR0 as vertex attribute for push buffers.
// This whole thing becomes a mess if we don't have a provoking attribute.
return m_vertex_push_buffers[0].vertex_count;
}
void draw_command_processor::append_array_element(u32 index)
{
// Endianness is swapped because common upload code expects input in BE
// TODO: Implement fast upload path for LE inputs and do away with this
m_element_push_buffer.push_back(std::bit_cast<u32, be_t<u32>>(index));
}
u32 draw_command_processor::get_push_buffer_index_count() const
{
return ::size32(m_element_push_buffer);
}
void draw_command_processor::clear_push_buffers()
{
auto& graphics_state = RSX(m_ctx)->m_graphics_state;
if (graphics_state & rsx::pipeline_state::push_buffer_arrays_dirty)
{
for (auto& push_buf : m_vertex_push_buffers)
{
//Disabled, see https://github.com/RPCS3/rpcs3/issues/1932
//REGS(m_ctx)->register_vertex_info[index].size = 0;
push_buf.clear();
}
graphics_state.clear(rsx::pipeline_state::push_buffer_arrays_dirty);
}
m_element_push_buffer.clear();
}
void draw_command_processor::fill_vertex_layout_state(
const vertex_input_layout& layout,
const vertex_program_metadata_t& vp_metadata,
u32 first_vertex,
u32 vertex_count,
s32* buffer,
u32 persistent_offset_base,
u32 volatile_offset_base) const
{
std::array<s32, 16> offset_in_block = {};
u32 volatile_offset = volatile_offset_base;
u32 persistent_offset = persistent_offset_base;
// NOTE: Order is important! Transient layout is always push_buffers followed by register data
if (REGS(m_ctx)->current_draw_clause.is_immediate_draw)
{
for (const auto& info : layout.volatile_blocks)
{
offset_in_block[info.first] = volatile_offset;
volatile_offset += info.second;
}
}
for (u8 index : layout.referenced_registers)
{
offset_in_block[index] = volatile_offset;
volatile_offset += 16;
}
if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
{
const auto& block = layout.interleaved_blocks[0];
u32 inline_data_offset = volatile_offset;
for (const auto& attrib : block->locations)
{
auto& info = REGS(m_ctx)->vertex_arrays_info[attrib.index];
offset_in_block[attrib.index] = inline_data_offset;
inline_data_offset += rsx::get_vertex_type_size_on_host(info.type(), info.size());
}
}
else
{
for (const auto& block : layout.interleaved_blocks)
{
for (const auto& attrib : block->locations)
{
const u32 local_address = (REGS(m_ctx)->vertex_arrays_info[attrib.index].offset() & 0x7fffffff);
offset_in_block[attrib.index] = persistent_offset + (local_address - block->base_offset);
}
const auto range = block->calculate_required_range(first_vertex, vertex_count);
persistent_offset += block->attribute_stride * range.second;
}
}
// Fill the data
// Each descriptor field is 64 bits wide
// [0-8] attribute stride
// [8-24] attribute divisor
// [24-27] attribute type
// [27-30] attribute size
// [30-31] reserved
// [31-60] starting offset
// [60-61] swap bytes flag
// [61-62] volatile flag
// [62-63] modulo enable flag
const s32 default_frequency_mask = (1 << 8);
const s32 swap_storage_mask = (1 << 29);
const s32 volatile_storage_mask = (1 << 30);
const s32 modulo_op_frequency_mask = smin;
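// Packing sketch (illustrative): a tightly packed float4 attribute at block offset
// 0x40, with no divisor, sourced from persistent BE memory, would encode as:
//   attrib0 = 16 | default_frequency_mask | (static_cast<s32>(vertex_base_type::f) << 24) | (4 << 27);
//   attrib1 = 0x40 | swap_storage_mask;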
const u32 modulo_mask = REGS(m_ctx)->frequency_divider_operation_mask();
const auto max_index = (first_vertex + vertex_count) - 1;
for (u16 ref_mask = vp_metadata.referenced_inputs_mask, index = 0; ref_mask; ++index, ref_mask >>= 1)
{
if (!(ref_mask & 1u))
{
// Unused input, ignore this
continue;
}
if (layout.attribute_placement[index] == attribute_buffer_placement::none)
{
static constexpr u64 zero = 0;
std::memcpy(buffer + index * 2, &zero, sizeof(zero));
continue;
}
rsx::vertex_base_type type = {};
s32 size = 0;
s32 attrib0 = 0;
s32 attrib1 = 0;
if (layout.attribute_placement[index] == attribute_buffer_placement::transient)
{
if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
{
const auto& info = REGS(m_ctx)->vertex_arrays_info[index];
if (!info.size())
{
// Register
const auto& reginfo = REGS(m_ctx)->register_vertex_info[index];
type = reginfo.type;
size = reginfo.size;
attrib0 = rsx::get_vertex_type_size_on_host(type, size);
}
else
{
// Array
type = info.type();
size = info.size();
attrib0 = layout.interleaved_blocks[0]->attribute_stride | default_frequency_mask;
}
}
else
{
// Data is either from an immediate render or register input
// Immediate data overrides register input
if (REGS(m_ctx)->current_draw_clause.is_immediate_draw &&
m_vertex_push_buffers[index].vertex_count > 1)
{
// Push buffer
const auto& info = m_vertex_push_buffers[index];
type = info.type;
size = info.size;
attrib0 = rsx::get_vertex_type_size_on_host(type, size) | default_frequency_mask;
}
else
{
// Register
const auto& info = REGS(m_ctx)->register_vertex_info[index];
type = info.type;
size = info.size;
attrib0 = rsx::get_vertex_type_size_on_host(type, size);
}
}
attrib1 |= volatile_storage_mask;
}
else
{
auto& info = REGS(m_ctx)->vertex_arrays_info[index];
type = info.type();
size = info.size();
auto stride = info.stride();
attrib0 = stride;
if (stride > 0) // when stride is 0, input is not an array but a single element
{
const u32 frequency = info.frequency();
switch (frequency)
{
case 0:
case 1:
{
attrib0 |= default_frequency_mask;
break;
}
default:
{
if (modulo_mask & (1 << index))
{
if (max_index >= frequency)
{
// Only set modulo mask if a modulo op is actually necessary!
// This requires that the uploaded range for this attr = [0, freq-1]
// Ignoring modulo op if the rendered range does not wrap allows for range optimization
attrib0 |= (frequency << 8);
attrib1 |= modulo_op_frequency_mask;
}
else
{
attrib0 |= default_frequency_mask;
}
}
else
{
// Division
attrib0 |= (frequency << 8);
}
break;
}
}
}
} // end attribute placement check
// Special case: 4 components compressed into one 4-byte value, decoded as a single value.
if (type == rsx::vertex_base_type::cmp)
{
size = 1;
}
// All data is passed in PS3-native order (BE), so the swap flag must be set
attrib1 |= swap_storage_mask;
attrib0 |= (static_cast<s32>(type) << 24);
attrib0 |= (size << 27);
attrib1 |= offset_in_block[index];
buffer[index * 2 + 0] = attrib0;
buffer[index * 2 + 1] = attrib1;
}
}
void draw_command_processor::write_vertex_data_to_memory(
const vertex_input_layout& layout,
u32 first_vertex,
u32 vertex_count,
void* persistent_data,
void* volatile_data) const
{
auto transient = static_cast<char*>(volatile_data);
auto persistent = static_cast<char*>(persistent_data);
auto& draw_call = REGS(m_ctx)->current_draw_clause;
if (transient != nullptr)
{
if (draw_call.command == rsx::draw_command::inlined_array)
{
for (const u8 index : layout.referenced_registers)
{
memcpy(transient, REGS(m_ctx)->register_vertex_info[index].data.data(), 16);
transient += 16;
}
memcpy(transient, draw_call.inline_vertex_array.data(), draw_call.inline_vertex_array.size() * sizeof(u32));
// Is it possible to reference data outside of the inlined array?
return;
}
// NOTE: Order is important! Transient layout is always push_buffers followed by register data
if (draw_call.is_immediate_draw)
{
// NOTE: It is possible for immediate draw to only contain index data, so vertex data can be in persistent memory
for (const auto& info : layout.volatile_blocks)
{
memcpy(transient, m_vertex_push_buffers[info.first].data.data(), info.second);
transient += info.second;
}
}
for (const u8 index : layout.referenced_registers)
{
memcpy(transient, REGS(m_ctx)->register_vertex_info[index].data.data(), 16);
transient += 16;
}
}
if (persistent != nullptr)
{
for (interleaved_range_info* block : layout.interleaved_blocks)
{
auto range = block->calculate_required_range(first_vertex, vertex_count);
const u32 data_size = range.second * block->attribute_stride;
const u32 vertex_base = range.first * block->attribute_stride;
g_fxo->get<rsx::dma_manager>().copy(persistent, vm::_ptr<char>(block->real_offset_address) + vertex_base, data_size);
persistent += data_size;
}
}
}
void draw_command_processor::fill_scale_offset_data(void* buffer, bool flip_y) const
{
const int clip_w = REGS(m_ctx)->surface_clip_width();
const int clip_h = REGS(m_ctx)->surface_clip_height();
const float scale_x = REGS(m_ctx)->viewport_scale_x() / (clip_w / 2.f);
float offset_x = REGS(m_ctx)->viewport_offset_x() - (clip_w / 2.f);
offset_x /= clip_w / 2.f;
float scale_y = REGS(m_ctx)->viewport_scale_y() / (clip_h / 2.f);
float offset_y = (REGS(m_ctx)->viewport_offset_y() - (clip_h / 2.f));
offset_y /= clip_h / 2.f;
if (flip_y) scale_y *= -1;
if (flip_y) offset_y *= -1;
const float scale_z = REGS(m_ctx)->viewport_scale_z();
const float offset_z = REGS(m_ctx)->viewport_offset_z();
const float one = 1.f;
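// The four writes below emit the row-major matrix:
// | scale_x    0        0     offset_x |
// |    0    scale_y     0     offset_y |
// |    0       0     scale_z  offset_z |
// |    0       0        0        1     |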
utils::stream_vector(buffer, std::bit_cast<u32>(scale_x), 0, 0, std::bit_cast<u32>(offset_x));
utils::stream_vector(static_cast<char*>(buffer) + 16, 0, std::bit_cast<u32>(scale_y), 0, std::bit_cast<u32>(offset_y));
utils::stream_vector(static_cast<char*>(buffer) + 32, 0, 0, std::bit_cast<u32>(scale_z), std::bit_cast<u32>(offset_z));
utils::stream_vector(static_cast<char*>(buffer) + 48, 0, 0, 0, std::bit_cast<u32>(one));
}
void draw_command_processor::fill_user_clip_data(void* buffer) const
{
const rsx::user_clip_plane_op clip_plane_control[6] =
{
REGS(m_ctx)->clip_plane_0_enabled(),
REGS(m_ctx)->clip_plane_1_enabled(),
REGS(m_ctx)->clip_plane_2_enabled(),
REGS(m_ctx)->clip_plane_3_enabled(),
REGS(m_ctx)->clip_plane_4_enabled(),
REGS(m_ctx)->clip_plane_5_enabled(),
};
u8 data_block[64];
s32* clip_enabled_flags = reinterpret_cast<s32*>(data_block);
f32* clip_distance_factors = reinterpret_cast<f32*>(data_block + 32);
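// data_block layout: bytes [0,24) hold six s32 enable flags and bytes [32,56) hold
// six f32 distance factors; each half is padded to a 32-byte (two-float4) slot.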
for (int index = 0; index < 6; ++index)
{
switch (clip_plane_control[index])
{
default:
rsx_log.error("bad clip plane control (0x%x)", static_cast<u8>(clip_plane_control[index]));
[[fallthrough]];
case rsx::user_clip_plane_op::disable:
clip_enabled_flags[index] = 0;
clip_distance_factors[index] = 0.f;
break;
case rsx::user_clip_plane_op::greater_or_equal:
clip_enabled_flags[index] = 1;
clip_distance_factors[index] = 1.f;
break;
case rsx::user_clip_plane_op::less_than:
clip_enabled_flags[index] = 1;
clip_distance_factors[index] = -1.f;
break;
}
}
memcpy(buffer, data_block, 2 * 8 * sizeof(u32));
}
/**
* Fill buffer with vertex program constants.
* Buffer must be at least 512 float4 wide.
*/
void draw_command_processor::fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) const
{
if (!reloc_table.empty()) [[ likely ]]
{
char* dst = reinterpret_cast<char*>(buffer);
for (const auto& index : reloc_table)
{
utils::stream_vector_from_memory(dst, &REGS(m_ctx)->transform_constants[index]);
dst += 16;
}
}
else
{
memcpy(buffer, REGS(m_ctx)->transform_constants.data(), 468 * 4 * sizeof(float));
}
}
void draw_command_processor::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/) const
{
ROP_control_t rop_control{};
if (REGS(m_ctx)->alpha_test_enabled())
{
const u32 alpha_func = static_cast<u32>(REGS(m_ctx)->alpha_func());
rop_control.set_alpha_test_func(alpha_func);
rop_control.enable_alpha_test();
}
if (REGS(m_ctx)->polygon_stipple_enabled())
{
rop_control.enable_polygon_stipple();
}
if (REGS(m_ctx)->msaa_alpha_to_coverage_enabled() && !RSX(m_ctx)->get_backend_config().supports_hw_a2c)
{
// TODO: Properly support alpha-to-coverage and alpha-to-one behavior in shaders
// Alpha values generate a coverage mask for order independent blending
// Requires hardware AA to work properly (or just fragment sample stage in fragment shaders)
// Simulated using combined alpha blend and alpha test
rop_control.enable_alpha_to_coverage();
if (REGS(m_ctx)->msaa_sample_mask())
{
rop_control.enable_MSAA_writes();
}
// Sample configuration bits
switch (REGS(m_ctx)->surface_antialias())
{
case rsx::surface_antialiasing::center_1_sample:
break;
case rsx::surface_antialiasing::diagonal_centered_2_samples:
rop_control.set_msaa_control(1u);
break;
default:
rop_control.set_msaa_control(3u);
break;
}
}
const f32 fog0 = REGS(m_ctx)->fog_params_0();
const f32 fog1 = REGS(m_ctx)->fog_params_1();
const u32 fog_mode = static_cast<u32>(REGS(m_ctx)->fog_equation());
// Check if framebuffer is actually an XRGB format and not a WZYX format
switch (REGS(m_ctx)->surface_color())
{
case rsx::surface_color_format::w16z16y16x16:
case rsx::surface_color_format::w32z32y32x32:
case rsx::surface_color_format::x32:
// These behave very differently from "normal" formats.
break;
default:
// Integer framebuffer formats.
rop_control.enable_framebuffer_INT();
// Check if we want sRGB conversion.
if (REGS(m_ctx)->framebuffer_srgb_enabled())
{
rop_control.enable_framebuffer_sRGB();
}
break;
}
// Generate wpos coefficients
// wpos equation is now as follows:
// wpos.y = (frag_coord / resolution_scale) * ((window_origin!=top)?-1.: 1.) + ((window_origin!=top)? window_height : 0)
// wpos.x = (frag_coord / resolution_scale)
// wpos.zw = frag_coord.zw
const auto window_origin = REGS(m_ctx)->shader_window_origin();
const u32 window_height = REGS(m_ctx)->shader_window_height();
const f32 resolution_scale = (window_height <= static_cast<u32>(g_cfg.video.min_scalable_dimension)) ? 1.f : rsx::get_resolution_scale();
const f32 wpos_scale = (window_origin == rsx::window_origin::top) ? (1.f / resolution_scale) : (-1.f / resolution_scale);
const f32 wpos_bias = (window_origin == rsx::window_origin::top) ? 0.f : window_height;
const f32 alpha_ref = REGS(m_ctx)->alpha_ref();
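// Output layout, two float4 vectors:
//   [ fog0, fog1, rop_control, alpha_ref ]
//   [ 0, fog_mode, wpos_scale, wpos_bias ]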
u32* dst = static_cast<u32*>(buffer);
utils::stream_vector(dst, std::bit_cast<u32>(fog0), std::bit_cast<u32>(fog1), rop_control.value, std::bit_cast<u32>(alpha_ref));
utils::stream_vector(dst + 4, 0u, fog_mode, std::bit_cast<u32>(wpos_scale), std::bit_cast<u32>(wpos_bias));
}
void draw_command_processor::fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const
{
auto& draw_call = REGS(m_ctx)->current_draw_clause;
// Only call this for instanced draws!
ensure(draw_call.is_trivial_instanced_draw);
// Temp indirection table. Used to track "running" updates.
rsx::simple_array<u32> instancing_indirection_table;
// indirection table size
const auto reloc_table = prog.has_indexed_constants ? decltype(prog.constant_ids){} : prog.constant_ids;
const auto redirection_table_size = prog.has_indexed_constants ? 468u : ::size32(prog.constant_ids);
instancing_indirection_table.resize(redirection_table_size);
// Temp constants data
rsx::simple_array<u128> constants_data;
constants_data.reserve(redirection_table_size * draw_call.pass_count());
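// Indirection sketch (illustrative): each pass gets a table mapping constant slot ->
// index into the packed constants_data array; slots a pass does not patch keep their
// previous mapping, so only modified constants are duplicated. E.g. with 3 constants
// and pass 1 patching slot 2: pass 0 table = [0, 1, 2], pass 1 table = [0, 1, 3].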
// Allocate indirection buffer on GPU stream
indirection_table_buf.reserve(instancing_indirection_table.size_bytes() * draw_call.pass_count());
auto indirection_out = indirection_table_buf.data<u32>();
rsx::instanced_draw_config_t instance_config;
u32 indirection_table_offset = 0;
// We now replay the draw call here to pack the data.
draw_call.begin();
// Write initial draw data.
std::iota(instancing_indirection_table.begin(), instancing_indirection_table.end(), 0);
constants_data.resize(redirection_table_size);
fill_vertex_program_constants_data(constants_data.data(), reloc_table);
// Next draw. We're guaranteed more than one draw call by the caller.
draw_call.next();
do
{
// Write previous state
std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size_bytes());
indirection_table_offset += redirection_table_size;
// Decode next draw state
instance_config = {};
draw_call.execute_pipeline_dependencies(m_ctx, &instance_config);
if (!instance_config.transform_constants_data_changed)
{
continue;
}
const int translated_offset = prog.has_indexed_constants
? instance_config.patch_load_offset
: prog.TranslateConstantsRange(instance_config.patch_load_offset, instance_config.patch_load_count);
if (translated_offset >= 0)
{
// Trivially patchable in bulk
const u32 redirection_loc = ::size32(constants_data);
constants_data.resize(::size32(constants_data) + instance_config.patch_load_count);
std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[instance_config.patch_load_offset], instance_config.patch_load_count * sizeof(u128));
// Update indirection table
for (auto i = translated_offset, count = 0;
static_cast<u32>(count) < instance_config.patch_load_count;
++i, ++count)
{
instancing_indirection_table[i] = redirection_loc + count;
}
continue;
}
ensure(!prog.has_indexed_constants);
// Sparse update. Update records individually instead of bulk
// FIXME: Range batching optimization
const auto load_end = instance_config.patch_load_offset + instance_config.patch_load_count;
for (u32 i = 0; i < redirection_table_size; ++i)
{
const auto read_index = prog.constant_ids[i];
if (read_index < instance_config.patch_load_offset || read_index >= load_end)
{
// Reading outside "hot" range.
continue;
}
const u32 redirection_loc = ::size32(constants_data);
constants_data.resize(::size32(constants_data) + 1);
std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[read_index], sizeof(u128));
instancing_indirection_table[i] = redirection_loc;
}
} while (draw_call.next());
// Tail
ensure(indirection_table_offset < (instancing_indirection_table.size() * draw_call.pass_count()));
std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size_bytes());
// Now write the constants to the GPU buffer
constants_data_array_buffer.reserve(constants_data.size_bytes());
std::memcpy(constants_data_array_buffer.data(), constants_data.data(), constants_data.size_bytes());
}
}


@@ -0,0 +1,110 @@
#pragma once
#include <util/types.hpp>
#include "Emu/RSX/Core/RSXVertexTypes.h"
#include "Emu/RSX/NV47/FW/draw_call.hpp"
#include "Emu/RSX/Program/ProgramStateCache.h"
#include "Emu/RSX/rsx_vertex_data.h"
#include <span>
#include <variant>
namespace rsx
{
struct rsx_state;
struct context;
class io_buffer;
class draw_command_processor
{
using vertex_program_metadata_t = program_hash_util::vertex_program_utils::vertex_program_metadata;
context* m_ctx = nullptr;
protected:
friend class thread;
std::array<push_buffer_vertex_info, 16> m_vertex_push_buffers;
rsx::simple_array<u32> m_element_push_buffer;
public:
draw_command_processor() = default;
void init(context* ctx)
{
m_ctx = ctx;
}
// Analyze vertex inputs and group all interleaved blocks
void analyse_inputs_interleaved(vertex_input_layout& layout, const vertex_program_metadata_t& vp_metadata);
// Retrieve raw bytes for the index array (untyped)
std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
// Get compiled draw command for backend rendering
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
get_draw_command(const rsx::rsx_state& state) const;
// Push-buffers for immediate rendering (begin-end scopes)
void append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value);
u32 get_push_buffer_vertex_count() const;
void append_array_element(u32 index);
u32 get_push_buffer_index_count() const;
void clear_push_buffers();
const std::span<const u32> element_push_buffer() const
{
return m_element_push_buffer;
}
// Host driver helpers
void fill_vertex_layout_state(
const vertex_input_layout& layout,
const vertex_program_metadata_t& vp_metadata,
u32 first_vertex,
u32 vertex_count,
s32* buffer,
u32 persistent_offset_base,
u32 volatile_offset_base) const;
void write_vertex_data_to_memory(
const vertex_input_layout& layout,
u32 first_vertex,
u32 vertex_count,
void* persistent_data,
void* volatile_data) const;
/**
* Fill buffer with 4x4 scale offset matrix.
* Vertex shader's position is to be multiplied by this matrix.
* If flip_y is set, the matrix is modified to use the D3D convention.
*/
void fill_scale_offset_data(void* buffer, bool flip_y) const;
/**
* Fill buffer with user clip information
*/
void fill_user_clip_data(void* buffer) const;
/**
* Fill buffer with vertex program constants.
* The relocation table allows a partial fill with only selected registers.
*/
void fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) const;
/**
* Fill buffer with fragment rasterization state.
* Fills current fog values, alpha test parameters and texture scaling parameters
*/
void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program) const;
// Fill instancing buffers. A single iobuf is used for both; 256-byte alignment is enforced to allow a global bind.
// Returns offsets to the index redirection lookup table and constants field array
void fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const;
};
}


@@ -0,0 +1,54 @@
#pragma once
#include <util/types.hpp>
namespace rsx
{
enum pipeline_state : u32
{
fragment_program_ucode_dirty = (1 << 0), // Fragment program ucode changed
vertex_program_ucode_dirty = (1 << 1), // Vertex program ucode changed
fragment_program_state_dirty = (1 << 2), // Fragment program state changed
vertex_program_state_dirty = (1 << 3), // Vertex program state changed
fragment_state_dirty = (1 << 4), // Fragment state changed (alpha test, etc)
vertex_state_dirty = (1 << 5), // Vertex state changed (scale_offset, clip planes, etc)
transform_constants_dirty = (1 << 6), // Transform constants changed
fragment_constants_dirty = (1 << 7), // Fragment constants changed
framebuffer_reads_dirty = (1 << 8), // Framebuffer contents changed
fragment_texture_state_dirty = (1 << 9), // Fragment texture parameters changed
vertex_texture_state_dirty = (1 << 10), // Vertex texture parameters changed
scissor_config_state_dirty = (1 << 11), // Scissor region changed
zclip_config_state_dirty = (1 << 12), // Viewport Z clip changed
scissor_setup_invalid = (1 << 13), // Scissor configuration is broken
scissor_setup_clipped = (1 << 14), // Scissor region is cropped by viewport constraint
polygon_stipple_pattern_dirty = (1 << 15), // Rasterizer stippling pattern changed
line_stipple_pattern_dirty = (1 << 16), // Line stippling pattern changed
push_buffer_arrays_dirty = (1 << 17), // Push buffers have data written to them (immediate mode vertex buffers)
polygon_offset_state_dirty = (1 << 18), // Polygon offset config was changed
depth_bounds_state_dirty = (1 << 19), // Depth bounds configuration changed
pipeline_config_dirty = (1 << 20), // Generic pipeline configuration changes. Shader peek hint.
rtt_config_dirty = (1 << 21), // Render target configuration changed
rtt_config_contested = (1 << 22), // Render target configuration is indeterminate
rtt_config_valid = (1 << 23), // Render target configuration is valid
rtt_cache_state_dirty = (1 << 24), // Texture cache state is indeterminate
xform_instancing_state_dirty = (1 << 25), // Transform instancing state has changed
fragment_program_dirty = fragment_program_ucode_dirty | fragment_program_state_dirty,
vertex_program_dirty = vertex_program_ucode_dirty | vertex_program_state_dirty,
invalidate_pipeline_bits = fragment_program_dirty | vertex_program_dirty | xform_instancing_state_dirty,
invalidate_zclip_bits = vertex_state_dirty | zclip_config_state_dirty,
memory_barrier_bits = framebuffer_reads_dirty,
// Vulkan-specific signals
invalidate_vk_dynamic_state = zclip_config_state_dirty | scissor_config_state_dirty | polygon_offset_state_dirty | depth_bounds_state_dirty,
all_dirty = ~0u
};
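// Typical usage (illustrative, mirroring the draw command processor): dirty bits are
// OR'd into the RSX thread's graphics state as registers change, then cleared once
// the backend consumes them:
//   m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty;
//   m_graphics_state.clear(rsx::pipeline_state::push_buffer_arrays_dirty);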
}