diff --git a/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp b/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp
new file mode 100644
index 0000000000..9584f051c9
--- /dev/null
+++ b/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp
@@ -0,0 +1,843 @@
+#include "stdafx.h"
+#include "RSXDrawCommands.h"
+
+#include "Emu/RSX/Common/BufferUtils.h"
+#include "Emu/RSX/Common/buffer_stream.hpp"
+#include "Emu/RSX/Common/io_buffer.h"
+#include "Emu/RSX/Common/simple_array.hpp"
+#include "Emu/RSX/NV47/HW/context_accessors.define.h"
+#include "Emu/RSX/Program/GLSLCommon.h"
+#include "Emu/RSX/rsx_methods.h"
+#include "Emu/RSX/RSXThread.h"
+
+#include "Emu/Memory/vm.h"
+
+namespace rsx
+{
+    void draw_command_processor::analyse_inputs_interleaved(vertex_input_layout& result, const vertex_program_metadata_t& vp_metadata)
+    {
+        const rsx_state& state = *REGS(m_ctx);
+        const u32 input_mask = state.vertex_attrib_input_mask() & vp_metadata.referenced_inputs_mask;
+
+        result.clear();
+        result.attribute_mask = static_cast<u16>(input_mask);
+
+        if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
+        {
+            interleaved_range_info& info = *result.alloc_interleaved_block();
+            info.interleaved = true;
+
+            for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
+            {
+                auto& vinfo = state.vertex_arrays_info[index];
+                result.attribute_placement[index] = attribute_buffer_placement::none;
+
+                if (vinfo.size() > 0)
+                {
+                    // Stride must be updated even if the stream is disabled
+                    info.attribute_stride += rsx::get_vertex_type_size_on_host(vinfo.type(), vinfo.size());
+                    info.locations.push_back({ index, false, 1 });
+
+                    if (input_mask & (1u << index))
+                    {
+                        result.attribute_placement[index] = attribute_buffer_placement::transient;
+                    }
+                }
+                else if (state.register_vertex_info[index].size > 0 && input_mask & (1u << index))
+                {
+                    // Reads from register
+                    result.referenced_registers.push_back(index);
+                    result.attribute_placement[index] = attribute_buffer_placement::transient;
+                }
+            }
+
+            if (info.attribute_stride)
+            {
+                // At least one array feed must be enabled for vertex input
+                result.interleaved_blocks.push_back(&info);
+            }
+
+            return;
+        }
+
+        const u32 frequency_divider_mask = REGS(m_ctx)->frequency_divider_operation_mask();
+        result.interleaved_blocks.reserve(16);
+        result.referenced_registers.reserve(16);
+
+        for (auto [ref_mask, index] = std::tuple{ input_mask, u8(0) }; ref_mask; ++index, ref_mask >>= 1)
+        {
+            ensure(index < rsx::limits::vertex_count);
+
+            if (!(ref_mask & 1u))
+            {
+                // Nothing to do, uninitialized
+                continue;
+            }
+
+            // Always reset attribute placement by default
+            result.attribute_placement[index] = attribute_buffer_placement::none;
+
+            // Check for interleaving
+            if (REGS(m_ctx)->current_draw_clause.is_immediate_draw &&
+                REGS(m_ctx)->current_draw_clause.command != rsx::draw_command::indexed)
+            {
+                // NOTE: In immediate rendering mode, all vertex setup is ignored
+                // Observed with GT5, immediate render bypasses array pointers completely, even falling back to fixed-function register defaults
+                if (m_vertex_push_buffers[index].vertex_count > 1)
+                {
+                    // Ensure a consistent number of vertices per attribute
+                    m_vertex_push_buffers[index].pad_to(m_vertex_push_buffers[0].vertex_count, false);
+
+                    // Read temp buffer (register array)
+                    std::pair<u8, u32> volatile_range_info = std::make_pair(index, static_cast<u32>(m_vertex_push_buffers[index].data.size() * sizeof(u32)));
+                    result.volatile_blocks.push_back(volatile_range_info);
+                    result.attribute_placement[index] = attribute_buffer_placement::transient;
+                }
+                else if (state.register_vertex_info[index].size > 0)
+                {
+                    // Reads from register
+                    result.referenced_registers.push_back(index);
+                    result.attribute_placement[index] = attribute_buffer_placement::transient;
+                }
+
+                // Fall back to the default register value if no source is specified via register
+                continue;
+            }
+
+            const auto& info = state.vertex_arrays_info[index];
+            if (!info.size())
+            {
+                if (state.register_vertex_info[index].size > 0)
+                {
+                    // Reads from register
+                    result.referenced_registers.push_back(index);
+                    result.attribute_placement[index] = attribute_buffer_placement::transient;
+                    continue;
+                }
+            }
+            else
+            {
+                result.attribute_placement[index] = attribute_buffer_placement::persistent;
+                const u32 base_address = info.offset() & 0x7fffffff;
+                bool alloc_new_block = true;
+                bool modulo = !!(frequency_divider_mask & (1 << index));
+
+                for (auto& block : result.interleaved_blocks)
+                {
+                    if (block->single_vertex)
+                    {
+                        // Single vertex definition, continue
+                        continue;
+                    }
+
+                    if (block->attribute_stride != info.stride())
+                    {
+                        // Stride does not match, continue
+                        continue;
+                    }
+
+                    if (base_address > block->base_offset)
+                    {
+                        const u32 diff = base_address - block->base_offset;
+                        if (diff > info.stride())
+                        {
+                            // Not interleaved, continue
+                            continue;
+                        }
+                    }
+                    else
+                    {
+                        const u32 diff = block->base_offset - base_address;
+                        if (diff > info.stride())
+                        {
+                            // Not interleaved, continue
+                            continue;
+                        }
+
+                        // Matches, and this address is lower than the existing one
+                        block->base_offset = base_address;
+                    }
+
+                    alloc_new_block = false;
+                    block->locations.push_back({ index, modulo, info.frequency() });
+                    block->interleaved = true;
+                    break;
+                }
+
+                if (alloc_new_block)
+                {
+                    interleaved_range_info& block = *result.alloc_interleaved_block();
+                    block.base_offset = base_address;
+                    block.attribute_stride = info.stride();
+                    block.memory_location = info.offset() >> 31;
+                    block.locations.reserve(16);
+                    block.locations.push_back({ index, modulo, info.frequency() });
+
+                    if (block.attribute_stride == 0)
+                    {
+                        block.single_vertex = true;
+                        block.attribute_stride = rsx::get_vertex_type_size_on_host(info.type(), info.size());
+                    }
+
+                    result.interleaved_blocks.push_back(&block);
+                }
+            }
+        }
+
+        for (auto& info : result.interleaved_blocks)
+        {
+            // Calculate the real data address to be used during upload
+            info->real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(state.vertex_data_base_offset(), info->base_offset), info->memory_location);
+        }
+    }
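The interleaving test above merges two attribute streams into one block when their strides match and their base addresses sit within one stride of each other. A minimal standalone sketch of that predicate, with simplified stand-in types (block_t is illustrative, not the emulator's real struct):

    #include <cstdint>
    #include <cstdio>

    struct block_t { uint32_t base_offset; uint32_t stride; };

    static bool belongs_to_block(const block_t& block, uint32_t base_address, uint32_t stride)
    {
        if (block.stride != stride) return false; // different layouts cannot interleave
        const uint32_t diff = base_address > block.base_offset
            ? base_address - block.base_offset
            : block.base_offset - base_address;
        return diff <= stride; // both attributes fit within one vertex record
    }

    int main()
    {
        block_t block{ 0x1000, 24 };
        std::printf("%d\n", belongs_to_block(block, 0x100C, 24)); // 1: same vertex record
        std::printf("%d\n", belongs_to_block(block, 0x2000, 24)); // 0: separate stream
    }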
+
+    std::span<const std::byte> draw_command_processor::get_raw_index_array(const draw_clause& draw_indexed_clause) const
+    {
+        if (!m_element_push_buffer.empty()) [[unlikely]]
+        {
+            // Indices provided via immediate mode
+            return { reinterpret_cast<const std::byte*>(m_element_push_buffer.data()), ::narrow<u32>(m_element_push_buffer.size() * sizeof(u32)) };
+        }
+
+        const rsx::index_array_type type = REGS(m_ctx)->index_type();
+        const u32 type_size = get_index_type_size(type);
+
+        // Force aligned indices as real hardware does
+        const u32 address = (0 - type_size) & get_address(REGS(m_ctx)->index_array_address(), REGS(m_ctx)->index_array_location());
+
+        const u32 first = draw_indexed_clause.min_index();
+        const u32 count = draw_indexed_clause.get_elements_count();
+
+        const auto ptr = vm::_ptr<const std::byte>(address);
+        return { ptr + first * type_size, count * type_size };
+    }
+
+    std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
+    draw_command_processor::get_draw_command(const rsx::rsx_state& state) const
+    {
+        if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::indexed) [[likely]]
+        {
+            return draw_indexed_array_command
+            {
+                get_raw_index_array(state.current_draw_clause)
+            };
+        }
+
+        if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::array)
+        {
+            return draw_array_command{};
+        }
+
+        if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
+        {
+            return draw_inlined_array{};
+        }
+
+        fmt::throw_exception("ill-formed draw command");
+    }
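The masking expression `(0 - type_size) & address` rounds the index buffer address down to a multiple of the index size (2 or 4 bytes, both powers of two). A minimal standalone check with illustrative values:

    #include <cstdint>
    #include <cassert>

    int main()
    {
        // 0 - type_size wraps to ~(type_size - 1) for power-of-two sizes,
        // i.e. a mask that clears the low bits of the address.
        const uint32_t type_size = 4;        // u32 indices
        const uint32_t address = 0x0badf00d; // deliberately misaligned
        const uint32_t aligned = (0 - type_size) & address;
        assert(aligned == 0x0badf00c);       // rounded down to a 4-byte boundary
        assert((aligned % type_size) == 0);
    }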
+
+    void draw_command_processor::append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value)
+    {
+        if (!(REGS(m_ctx)->vertex_attrib_input_mask() & (1 << attribute)))
+        {
+            return;
+        }
+
+        // Enforce ATTR0 as the provoking vertex attribute for push buffers.
+        // This whole thing becomes a mess if we don't have a provoking attribute.
+        const auto vertex_id = m_vertex_push_buffers[0].get_vertex_id();
+        m_vertex_push_buffers[attribute].set_vertex_data(attribute, vertex_id, subreg_index, type, size, value);
+        RSX(m_ctx)->m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty;
+    }
+
+    u32 draw_command_processor::get_push_buffer_vertex_count() const
+    {
+        // Enforce ATTR0 as the provoking vertex attribute for push buffers.
+        // This whole thing becomes a mess if we don't have a provoking attribute.
+        return m_vertex_push_buffers[0].vertex_count;
+    }
+
+    void draw_command_processor::append_array_element(u32 index)
+    {
+        // Endianness is swapped because the common upload code expects input in BE
+        // TODO: Implement a fast upload path for LE inputs and do away with this
+        m_element_push_buffer.push_back(std::bit_cast<u32, be_t<u32>>(index));
+    }
+
+    u32 draw_command_processor::get_push_buffer_index_count() const
+    {
+        return ::size32(m_element_push_buffer);
+    }
+
+    void draw_command_processor::clear_push_buffers()
+    {
+        auto& graphics_state = RSX(m_ctx)->m_graphics_state;
+        if (graphics_state & rsx::pipeline_state::push_buffer_arrays_dirty)
+        {
+            for (auto& push_buf : m_vertex_push_buffers)
+            {
+                // Disabled, see https://github.com/RPCS3/rpcs3/issues/1932
+                // REGS(m_ctx)->register_vertex_info[index].size = 0;
+
+                push_buf.clear();
+            }
+
+            graphics_state.clear(rsx::pipeline_state::push_buffer_arrays_dirty);
+        }
+
+        m_element_push_buffer.clear();
+    }
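A minimal sketch of the byte swap that the `be_t<u32>` round-trip in append_array_element performs, using a standalone stand-in rather than the emulator's `be_t`:

    #include <bit>
    #include <cstdint>
    #include <cassert>

    int main()
    {
        const uint32_t le_index = 0x00000102;
        // Equivalent of storing the value through a big-endian wrapper and
        // reinterpreting the bytes as a native u32.
        const uint32_t be_repr = std::byteswap(le_index); // C++23 <bit>; use manual shifts pre-23
        assert(be_repr == 0x02010000);
    }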
+
+    void draw_command_processor::fill_vertex_layout_state(
+        const vertex_input_layout& layout,
+        const vertex_program_metadata_t& vp_metadata,
+        u32 first_vertex,
+        u32 vertex_count,
+        s32* buffer,
+        u32 persistent_offset_base,
+        u32 volatile_offset_base) const
+    {
+        std::array<u32, 16> offset_in_block = {};
+        u32 volatile_offset = volatile_offset_base;
+        u32 persistent_offset = persistent_offset_base;
+
+        // NOTE: Order is important! Transient layout is always push buffers followed by register data
+        if (REGS(m_ctx)->current_draw_clause.is_immediate_draw)
+        {
+            for (const auto& info : layout.volatile_blocks)
+            {
+                offset_in_block[info.first] = volatile_offset;
+                volatile_offset += info.second;
+            }
+        }
+
+        for (u8 index : layout.referenced_registers)
+        {
+            offset_in_block[index] = volatile_offset;
+            volatile_offset += 16;
+        }
+
+        if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
+        {
+            const auto& block = layout.interleaved_blocks[0];
+            u32 inline_data_offset = volatile_offset;
+            for (const auto& attrib : block->locations)
+            {
+                auto& info = REGS(m_ctx)->vertex_arrays_info[attrib.index];
+
+                offset_in_block[attrib.index] = inline_data_offset;
+                inline_data_offset += rsx::get_vertex_type_size_on_host(info.type(), info.size());
+            }
+        }
+        else
+        {
+            for (const auto& block : layout.interleaved_blocks)
+            {
+                for (const auto& attrib : block->locations)
+                {
+                    const u32 local_address = (REGS(m_ctx)->vertex_arrays_info[attrib.index].offset() & 0x7fffffff);
+                    offset_in_block[attrib.index] = persistent_offset + (local_address - block->base_offset);
+                }
+
+                const auto range = block->calculate_required_range(first_vertex, vertex_count);
+                persistent_offset += block->attribute_stride * range.second;
+            }
+        }
+
+        // Fill the data
+        // Each descriptor field is 64 bits wide
+        // [0-8] attribute stride
+        // [8-24] attribute divisor
+        // [24-27] attribute type
+        // [27-30] attribute size
+        // [30-31] reserved
+        // [31-60] starting offset
+        // [60-61] swap bytes flag
+        // [61-62] volatile flag
+        // [62-63] modulo enable flag
+
+        const s32 default_frequency_mask = (1 << 8);
+        const s32 swap_storage_mask = (1 << 29);
+        const s32 volatile_storage_mask = (1 << 30);
+        const s32 modulo_op_frequency_mask = smin;
+
+        const u32 modulo_mask = REGS(m_ctx)->frequency_divider_operation_mask();
+        const auto max_index = (first_vertex + vertex_count) - 1;
+
+        for (u16 ref_mask = vp_metadata.referenced_inputs_mask, index = 0; ref_mask; ++index, ref_mask >>= 1)
+        {
+            if (!(ref_mask & 1u))
+            {
+                // Unused input, ignore this
+                continue;
+            }
+
+            if (layout.attribute_placement[index] == attribute_buffer_placement::none)
+            {
+                static constexpr u64 zero = 0;
+                std::memcpy(buffer + index * 2, &zero, sizeof(zero));
+                continue;
+            }
+
+            rsx::vertex_base_type type = {};
+            s32 size = 0;
+            s32 attrib0 = 0;
+            s32 attrib1 = 0;
+
+            if (layout.attribute_placement[index] == attribute_buffer_placement::transient)
+            {
+                if (REGS(m_ctx)->current_draw_clause.command == rsx::draw_command::inlined_array)
+                {
+                    const auto& info = REGS(m_ctx)->vertex_arrays_info[index];
+
+                    if (!info.size())
+                    {
+                        // Register
+                        const auto& reginfo = REGS(m_ctx)->register_vertex_info[index];
+                        type = reginfo.type;
+                        size = reginfo.size;
+
+                        attrib0 = rsx::get_vertex_type_size_on_host(type, size);
+                    }
+                    else
+                    {
+                        // Array
+                        type = info.type();
+                        size = info.size();
+
+                        attrib0 = layout.interleaved_blocks[0]->attribute_stride | default_frequency_mask;
+                    }
+                }
+                else
+                {
+                    // Data is either from an immediate render or register input
+                    // Immediate data overrides register input
+
+                    if (REGS(m_ctx)->current_draw_clause.is_immediate_draw &&
+                        m_vertex_push_buffers[index].vertex_count > 1)
+                    {
+                        // Push buffer
+                        const auto& info = m_vertex_push_buffers[index];
+                        type = info.type;
+                        size = info.size;
+
+                        attrib0 = rsx::get_vertex_type_size_on_host(type, size) | default_frequency_mask;
+                    }
+                    else
+                    {
+                        // Register
+                        const auto& info = REGS(m_ctx)->register_vertex_info[index];
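A standalone sketch of how a consumer could unpack one of these packed 64-bit descriptors, checked against the packing code above (attrib0 holds stride/divisor/type/size, attrib1 holds the offset plus the swap/volatile/modulo flags; the struct and field names are illustrative):

    #include <cstdint>

    struct attrib_desc
    {
        uint32_t stride, divisor, type, size, offset;
        bool swap_bytes, is_volatile, modulo;
    };

    static attrib_desc decode(int32_t attrib0, int32_t attrib1)
    {
        attrib_desc d{};
        d.stride      = attrib0 & 0xff;          // [0-8]
        d.divisor     = (attrib0 >> 8) & 0xffff; // [8-24]
        d.type        = (attrib0 >> 24) & 0x7;   // [24-27]
        d.size        = (attrib0 >> 27) & 0x7;   // [27-30]
        d.offset      = attrib1 & ((1u << 29) - 1);
        d.swap_bytes  = attrib1 & (1 << 29);     // swap_storage_mask
        d.is_volatile = attrib1 & (1 << 30);     // volatile_storage_mask
        d.modulo      = attrib1 & (1u << 31);    // modulo_op_frequency_mask (smin)
        return d;
    }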
+                        type = info.type;
+                        size = info.size;
+
+                        attrib0 = rsx::get_vertex_type_size_on_host(type, size);
+                    }
+                }
+
+                attrib1 |= volatile_storage_mask;
+            }
+            else
+            {
+                auto& info = REGS(m_ctx)->vertex_arrays_info[index];
+                type = info.type();
+                size = info.size();
+
+                auto stride = info.stride();
+                attrib0 = stride;
+
+                if (stride > 0) // when stride is 0, the input is not an array but a single element
+                {
+                    const u32 frequency = info.frequency();
+                    switch (frequency)
+                    {
+                    case 0:
+                    case 1:
+                    {
+                        attrib0 |= default_frequency_mask;
+                        break;
+                    }
+                    default:
+                    {
+                        if (modulo_mask & (1 << index))
+                        {
+                            if (max_index >= frequency)
+                            {
+                                // Only set the modulo mask if a modulo op is actually necessary!
+                                // This requires that the uploaded range for this attr = [0, freq-1]
+                                // Ignoring the modulo op if the rendered range does not wrap allows for range optimization
+                                attrib0 |= (frequency << 8);
+                                attrib1 |= modulo_op_frequency_mask;
+                            }
+                            else
+                            {
+                                attrib0 |= default_frequency_mask;
+                            }
+                        }
+                        else
+                        {
+                            // Division
+                            attrib0 |= (frequency << 8);
+                        }
+                        break;
+                    }
+                    }
+                }
+            } // end attribute placement check
+
+            // Special compressed 4 components into one 4-byte value. Decoded as one value.
+            if (type == rsx::vertex_base_type::cmp)
+            {
+                size = 1;
+            }
+
+            // All data is passed in PS3-native order (BE) so the swap flag should be set
+            attrib1 |= swap_storage_mask;
+            attrib0 |= (static_cast<s32>(type) << 24);
+            attrib0 |= (size << 27);
+            attrib1 |= offset_in_block[index];
+
+            buffer[index * 2 + 0] = attrib0;
+            buffer[index * 2 + 1] = attrib1;
+        }
+    }
+
+    void draw_command_processor::write_vertex_data_to_memory(
+        const vertex_input_layout& layout,
+        u32 first_vertex,
+        u32 vertex_count,
+        void* persistent_data,
+        void* volatile_data) const
+    {
+        auto transient = static_cast<char*>(volatile_data);
+        auto persistent = static_cast<char*>(persistent_data);
+
+        auto& draw_call = REGS(m_ctx)->current_draw_clause;
+
+        if (transient != nullptr)
+        {
+            if (draw_call.command == rsx::draw_command::inlined_array)
+            {
+                for (const u8 index : layout.referenced_registers)
+                {
+                    memcpy(transient, REGS(m_ctx)->register_vertex_info[index].data.data(), 16);
+                    transient += 16;
+                }
+
+                memcpy(transient, draw_call.inline_vertex_array.data(), draw_call.inline_vertex_array.size() * sizeof(u32));
+                // Is it possible to reference data outside of the inlined array?
+                return;
+            }
+
+            // NOTE: Order is important! Transient layout is always push buffers followed by register data
+            if (draw_call.is_immediate_draw)
+            {
+                // NOTE: It is possible for an immediate draw to only contain index data, so vertex data can be in persistent memory
+                for (const auto& info : layout.volatile_blocks)
+                {
+                    memcpy(transient, m_vertex_push_buffers[info.first].data.data(), info.second);
+                    transient += info.second;
+                }
+            }
+
+            for (const u8 index : layout.referenced_registers)
+            {
+                memcpy(transient, REGS(m_ctx)->register_vertex_info[index].data.data(), 16);
+                transient += 16;
+            }
+        }
+
+        if (persistent != nullptr)
+        {
+            for (interleaved_range_info* block : layout.interleaved_blocks)
+            {
+                auto range = block->calculate_required_range(first_vertex, vertex_count);
+
+                const u32 data_size = range.second * block->attribute_stride;
+                const u32 vertex_base = range.first * block->attribute_stride;
+
+                g_fxo->get<rsx::dma_manager>().copy(persistent, vm::_ptr<char>(block->real_offset_address) + vertex_base, data_size);
+                persistent += data_size;
+            }
+        }
+    }
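A standalone illustration of the persistent/volatile split performed above: register-backed attributes land in the transient stream (16 bytes each, one value for the whole draw), while array-backed blocks are copied per vertex. Sizes and data are illustrative:

    #include <cstring>
    #include <cstdint>
    #include <vector>

    int main()
    {
        const float reg_value[4] = { 0.f, 0.f, 0.f, 1.f };  // fixed-function register default
        std::vector<uint8_t> array_data(24 * 100);          // 100 vertices, 24-byte stride

        std::vector<uint8_t> transient(16);
        std::vector<uint8_t> persistent(24 * 100);

        std::memcpy(transient.data(), reg_value, 16);                          // one copy per draw
        std::memcpy(persistent.data(), array_data.data(), array_data.size()); // per-vertex data
    }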
+
+    void draw_command_processor::fill_scale_offset_data(void* buffer, bool flip_y) const
+    {
+        const int clip_w = REGS(m_ctx)->surface_clip_width();
+        const int clip_h = REGS(m_ctx)->surface_clip_height();
+
+        const float scale_x = REGS(m_ctx)->viewport_scale_x() / (clip_w / 2.f);
+        float offset_x = REGS(m_ctx)->viewport_offset_x() - (clip_w / 2.f);
+        offset_x /= clip_w / 2.f;
+
+        float scale_y = REGS(m_ctx)->viewport_scale_y() / (clip_h / 2.f);
+        float offset_y = (REGS(m_ctx)->viewport_offset_y() - (clip_h / 2.f));
+        offset_y /= clip_h / 2.f;
+        if (flip_y) scale_y *= -1;
+        if (flip_y) offset_y *= -1;
+
+        const float scale_z = REGS(m_ctx)->viewport_scale_z();
+        const float offset_z = REGS(m_ctx)->viewport_offset_z();
+        const float one = 1.f;
+
+        utils::stream_vector(buffer, std::bit_cast<u32>(scale_x), 0, 0, std::bit_cast<u32>(offset_x));
+        utils::stream_vector(static_cast<char*>(buffer) + 16, 0, std::bit_cast<u32>(scale_y), 0, std::bit_cast<u32>(offset_y));
+        utils::stream_vector(static_cast<char*>(buffer) + 32, 0, 0, std::bit_cast<u32>(scale_z), std::bit_cast<u32>(offset_z));
+        utils::stream_vector(static_cast<char*>(buffer) + 48, 0, 0, 0, std::bit_cast<u32>(one));
+    }
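A quick numeric check of the matrix rows written above, under the assumption of a full-surface viewport (scale = w/2, offset = w/2 for a 1280-wide surface). In that case the matrix degenerates to identity on X, since the RSX window-space viewport transform is exactly undone:

    #include <cassert>

    int main()
    {
        const float clip_w = 1280.f;
        const float viewport_scale_x = clip_w / 2.f;   // typical full-surface viewport
        const float viewport_offset_x = clip_w / 2.f;

        const float scale_x = viewport_scale_x / (clip_w / 2.f);                      // 1.0
        const float offset_x = (viewport_offset_x - clip_w / 2.f) / (clip_w / 2.f);   // 0.0

        // Row 0 applies: x' = x * scale_x + w * offset_x (w == 1 after projection)
        assert(scale_x == 1.f && offset_x == 0.f);
    }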
+ */ + void draw_command_processor::fill_vertex_program_constants_data(void* buffer, const std::span& reloc_table) const + { + if (!reloc_table.empty()) [[ likely ]] + { + char* dst = reinterpret_cast(buffer); + for (const auto& index : reloc_table) + { + utils::stream_vector_from_memory(dst, ®S(m_ctx)->transform_constants[index]); + dst += 16; + } + } + else + { + memcpy(buffer, REGS(m_ctx)->transform_constants.data(), 468 * 4 * sizeof(float)); + } + } + + void draw_command_processor::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/) const + { + ROP_control_t rop_control{}; + + if (REGS(m_ctx)->alpha_test_enabled()) + { + const u32 alpha_func = static_cast(REGS(m_ctx)->alpha_func()); + rop_control.set_alpha_test_func(alpha_func); + rop_control.enable_alpha_test(); + } + + if (REGS(m_ctx)->polygon_stipple_enabled()) + { + rop_control.enable_polygon_stipple(); + } + + if (REGS(m_ctx)->msaa_alpha_to_coverage_enabled() && !RSX(m_ctx)->get_backend_config().supports_hw_a2c) + { + // TODO: Properly support alpha-to-coverage and alpha-to-one behavior in shaders + // Alpha values generate a coverage mask for order independent blending + // Requires hardware AA to work properly (or just fragment sample stage in fragment shaders) + // Simulated using combined alpha blend and alpha test + rop_control.enable_alpha_to_coverage(); + if (REGS(m_ctx)->msaa_sample_mask()) + { + rop_control.enable_MSAA_writes(); + } + + // Sample configuration bits + switch (REGS(m_ctx)->surface_antialias()) + { + case rsx::surface_antialiasing::center_1_sample: + break; + case rsx::surface_antialiasing::diagonal_centered_2_samples: + rop_control.set_msaa_control(1u); + break; + default: + rop_control.set_msaa_control(3u); + break; + } + } + + const f32 fog0 = REGS(m_ctx)->fog_params_0(); + const f32 fog1 = REGS(m_ctx)->fog_params_1(); + const u32 fog_mode = static_cast(REGS(m_ctx)->fog_equation()); + + // Check if framebuffer is actually an XRGB format and not a WZYX format + switch (REGS(m_ctx)->surface_color()) + { + case rsx::surface_color_format::w16z16y16x16: + case rsx::surface_color_format::w32z32y32x32: + case rsx::surface_color_format::x32: + // These behave very differently from "normal" formats. + break; + default: + // Integer framebuffer formats. + rop_control.enable_framebuffer_INT(); + + // Check if we want sRGB conversion. + if (REGS(m_ctx)->framebuffer_srgb_enabled()) + { + rop_control.enable_framebuffer_sRGB(); + } + break; + } + + // Generate wpos coefficients + // wpos equation is now as follows: + // wpos.y = (frag_coord / resolution_scale) * ((window_origin!=top)?-1.: 1.) + ((window_origin!=top)? window_height : 0) + // wpos.x = (frag_coord / resolution_scale) + // wpos.zw = frag_coord.zw + + const auto window_origin = REGS(m_ctx)->shader_window_origin(); + const u32 window_height = REGS(m_ctx)->shader_window_height(); + const f32 resolution_scale = (window_height <= static_cast(g_cfg.video.min_scalable_dimension)) ? 1.f : rsx::get_resolution_scale(); + const f32 wpos_scale = (window_origin == rsx::window_origin::top) ? (1.f / resolution_scale) : (-1.f / resolution_scale); + const f32 wpos_bias = (window_origin == rsx::window_origin::top) ? 
+
+    void draw_command_processor::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/) const
+    {
+        ROP_control_t rop_control{};
+
+        if (REGS(m_ctx)->alpha_test_enabled())
+        {
+            const u32 alpha_func = static_cast<u32>(REGS(m_ctx)->alpha_func());
+            rop_control.set_alpha_test_func(alpha_func);
+            rop_control.enable_alpha_test();
+        }
+
+        if (REGS(m_ctx)->polygon_stipple_enabled())
+        {
+            rop_control.enable_polygon_stipple();
+        }
+
+        if (REGS(m_ctx)->msaa_alpha_to_coverage_enabled() && !RSX(m_ctx)->get_backend_config().supports_hw_a2c)
+        {
+            // TODO: Properly support alpha-to-coverage and alpha-to-one behavior in shaders
+            // Alpha values generate a coverage mask for order-independent blending
+            // Requires hardware AA to work properly (or just the fragment sample stage in fragment shaders)
+            // Simulated using a combined alpha blend and alpha test
+            rop_control.enable_alpha_to_coverage();
+            if (REGS(m_ctx)->msaa_sample_mask())
+            {
+                rop_control.enable_MSAA_writes();
+            }
+
+            // Sample configuration bits
+            switch (REGS(m_ctx)->surface_antialias())
+            {
+            case rsx::surface_antialiasing::center_1_sample:
+                break;
+            case rsx::surface_antialiasing::diagonal_centered_2_samples:
+                rop_control.set_msaa_control(1u);
+                break;
+            default:
+                rop_control.set_msaa_control(3u);
+                break;
+            }
+        }
+
+        const f32 fog0 = REGS(m_ctx)->fog_params_0();
+        const f32 fog1 = REGS(m_ctx)->fog_params_1();
+        const u32 fog_mode = static_cast<u32>(REGS(m_ctx)->fog_equation());
+
+        // Check if the framebuffer is actually an XRGB format and not a WZYX format
+        switch (REGS(m_ctx)->surface_color())
+        {
+        case rsx::surface_color_format::w16z16y16x16:
+        case rsx::surface_color_format::w32z32y32x32:
+        case rsx::surface_color_format::x32:
+            // These behave very differently from "normal" formats.
+            break;
+        default:
+            // Integer framebuffer formats.
+            rop_control.enable_framebuffer_INT();
+
+            // Check if we want sRGB conversion.
+            if (REGS(m_ctx)->framebuffer_srgb_enabled())
+            {
+                rop_control.enable_framebuffer_sRGB();
+            }
+            break;
+        }
+
+        // Generate wpos coefficients
+        // The wpos equation is now as follows:
+        // wpos.y = (frag_coord / resolution_scale) * ((window_origin != top) ? -1. : 1.) + ((window_origin != top) ? window_height : 0)
+        // wpos.x = (frag_coord / resolution_scale)
+        // wpos.zw = frag_coord.zw
+
+        const auto window_origin = REGS(m_ctx)->shader_window_origin();
+        const u32 window_height = REGS(m_ctx)->shader_window_height();
+        const f32 resolution_scale = (window_height <= static_cast<u32>(g_cfg.video.min_scalable_dimension)) ? 1.f : rsx::get_resolution_scale();
+        const f32 wpos_scale = (window_origin == rsx::window_origin::top) ? (1.f / resolution_scale) : (-1.f / resolution_scale);
+        const f32 wpos_bias = (window_origin == rsx::window_origin::top) ? 0.f : window_height;
+        const f32 alpha_ref = REGS(m_ctx)->alpha_ref();
+
+        u32* dst = static_cast<u32*>(buffer);
+        utils::stream_vector(dst, std::bit_cast<u32>(fog0), std::bit_cast<u32>(fog1), rop_control.value, std::bit_cast<u32>(alpha_ref));
+        utils::stream_vector(dst + 4, 0u, fog_mode, std::bit_cast<u32>(wpos_scale), std::bit_cast<u32>(wpos_bias));
+    }
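A numeric check of the wpos coefficients above for a bottom-origin window (window_origin != top), a 720-pixel window and no resolution scaling; applying wpos.y = frag_coord.y * wpos_scale + wpos_bias flips the Y axis as the comment describes:

    #include <cassert>

    int main()
    {
        const float window_height = 720.f;
        const float wpos_scale = -1.f;  // -1 / resolution_scale
        const float wpos_bias = window_height;

        const float frag_top = 0.f;     // top row in backend coordinates
        const float frag_bottom = 720.f;
        assert(frag_top * wpos_scale + wpos_bias == 720.f); // maps to the window top
        assert(frag_bottom * wpos_scale + wpos_bias == 0.f);
    }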
+
+    void draw_command_processor::fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const
+    {
+        auto& draw_call = REGS(m_ctx)->current_draw_clause;
+
+        // Only call this for instanced draws!
+        ensure(draw_call.is_trivial_instanced_draw);
+
+        // Temp indirection table. Used to track "running" updates.
+        rsx::simple_array<u32> instancing_indirection_table;
+        // Indirection table size
+        const auto reloc_table = prog.has_indexed_constants ? decltype(prog.constant_ids){} : prog.constant_ids;
+        const auto redirection_table_size = prog.has_indexed_constants ? 468u : ::size32(prog.constant_ids);
+        instancing_indirection_table.resize(redirection_table_size);
+
+        // Temp constants data
+        rsx::simple_array<u128> constants_data;
+        constants_data.reserve(redirection_table_size * draw_call.pass_count());
+
+        // Allocate the indirection buffer on the GPU stream
+        indirection_table_buf.reserve(instancing_indirection_table.size_bytes() * draw_call.pass_count());
+        auto indirection_out = indirection_table_buf.data<u32>();
+
+        rsx::instanced_draw_config_t instance_config;
+        u32 indirection_table_offset = 0;
+
+        // We now replay the draw call here to pack the data.
+        draw_call.begin();
+
+        // Write initial draw data.
+        std::iota(instancing_indirection_table.begin(), instancing_indirection_table.end(), 0);
+
+        constants_data.resize(redirection_table_size);
+        fill_vertex_program_constants_data(constants_data.data(), reloc_table);
+
+        // Next draw. We're guaranteed more than one draw call by the caller.
+        draw_call.next();
+
+        do
+        {
+            // Write previous state
+            std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size_bytes());
+            indirection_table_offset += redirection_table_size;
+
+            // Decode next draw state
+            instance_config = {};
+            draw_call.execute_pipeline_dependencies(m_ctx, &instance_config);
+
+            if (!instance_config.transform_constants_data_changed)
+            {
+                continue;
+            }
+
+            const int translated_offset = prog.has_indexed_constants
+                ? instance_config.patch_load_offset
+                : prog.TranslateConstantsRange(instance_config.patch_load_offset, instance_config.patch_load_count);
+
+            if (translated_offset >= 0)
+            {
+                // Trivially patchable in bulk
+                const u32 redirection_loc = ::size32(constants_data);
+                constants_data.resize(::size32(constants_data) + instance_config.patch_load_count);
+                std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[instance_config.patch_load_offset], instance_config.patch_load_count * sizeof(u128));
+
+                // Update the indirection table
+                for (auto i = translated_offset, count = 0;
+                    static_cast<u32>(count) < instance_config.patch_load_count;
+                    ++i, ++count)
+                {
+                    instancing_indirection_table[i] = redirection_loc + count;
+                }
+
+                continue;
+            }
+
+            ensure(!prog.has_indexed_constants);
+
+            // Sparse update. Update records individually instead of in bulk.
+            // FIXME: Range batching optimization
+            const auto load_end = instance_config.patch_load_offset + instance_config.patch_load_count;
+            for (u32 i = 0; i < redirection_table_size; ++i)
+            {
+                const auto read_index = prog.constant_ids[i];
+                if (read_index < instance_config.patch_load_offset || read_index >= load_end)
+                {
+                    // Reading outside the "hot" range.
+                    continue;
+                }
+
+                const u32 redirection_loc = ::size32(constants_data);
+                constants_data.resize(::size32(constants_data) + 1);
+                std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[read_index], sizeof(u128));
+
+                instancing_indirection_table[i] = redirection_loc;
+            }
+
+        } while (draw_call.next());
+
+        // Tail
+        ensure(indirection_table_offset < (instancing_indirection_table.size() * draw_call.pass_count()));
+        std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size_bytes());
+
+        // Now write the constants to the GPU buffer
+        constants_data_array_buffer.reserve(constants_data.size_bytes());
+        std::memcpy(constants_data_array_buffer.data(), constants_data.data(), constants_data.size_bytes());
+    }
+}
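A standalone sketch of how a consumer would resolve a constant through the per-instance indirection table produced above: constant slot i of instance n reads constants_pool[indirection[n * table_size + i]]. The names and values are illustrative:

    #include <cstdint>
    #include <vector>
    #include <cassert>

    int main()
    {
        const uint32_t table_size = 3;
        // Two instances; instance 1 re-points slot 2 at a patched value.
        const std::vector<uint32_t> indirection = { 0, 1, 2,   0, 1, 3 };
        const std::vector<float> constants_pool = { 1.f, 2.f, 3.f, 99.f };

        auto fetch = [&](uint32_t instance, uint32_t slot) {
            return constants_pool[indirection[instance * table_size + slot]];
        };
        assert(fetch(0, 2) == 3.f);   // initial draw state
        assert(fetch(1, 2) == 99.f);  // patched for the second instance
    }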
diff --git a/rpcs3/Emu/RSX/Core/RSXDrawCommands.h b/rpcs3/Emu/RSX/Core/RSXDrawCommands.h
new file mode 100644
index 0000000000..52f03b1faa
--- /dev/null
+++ b/rpcs3/Emu/RSX/Core/RSXDrawCommands.h
@@ -0,0 +1,110 @@
+#pragma once
+
+#include <util/types.hpp>
+
+#include "Emu/RSX/Core/RSXVertexTypes.h"
+#include "Emu/RSX/NV47/FW/draw_call.hpp"
+#include "Emu/RSX/Program/ProgramStateCache.h"
+#include "Emu/RSX/rsx_vertex_data.h"
+
+#include <span>
+#include <variant>
+
+namespace rsx
+{
+    struct rsx_state;
+    struct context;
+    class io_buffer;
+
+    class draw_command_processor
+    {
+        using vertex_program_metadata_t = program_hash_util::vertex_program_utils::vertex_program_metadata;
+
+        context* m_ctx = nullptr;
+
+    protected:
+        friend class thread;
+
+        std::array<push_buffer_vertex_info, 16> m_vertex_push_buffers;
+        rsx::simple_array<u32> m_element_push_buffer;
+
+    public:
+        draw_command_processor() = default;
+
+        void init(context* ctx)
+        {
+            m_ctx = ctx;
+        }
+
+        // Analyze vertex inputs and group all interleaved blocks
+        void analyse_inputs_interleaved(vertex_input_layout& layout, const vertex_program_metadata_t& vp_metadata);
+
+        // Retrieve raw bytes for the index array (untyped)
+        std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
+
+        // Get the compiled draw command for backend rendering
+        std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
+        get_draw_command(const rsx::rsx_state& state) const;
+
+        // Push buffers for immediate rendering (begin-end scopes)
+        void append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value);
+
+        u32 get_push_buffer_vertex_count() const;
+
+        void append_array_element(u32 index);
+
+        u32 get_push_buffer_index_count() const;
+
+        void clear_push_buffers();
+
+        const std::span<const u32> element_push_buffer() const
+        {
+            return m_element_push_buffer;
+        }
+
+        // Host driver helpers
+        void fill_vertex_layout_state(
+            const vertex_input_layout& layout,
+            const vertex_program_metadata_t& vp_metadata,
+            u32 first_vertex,
+            u32 vertex_count,
+            s32* buffer,
+            u32 persistent_offset_base,
+            u32 volatile_offset_base) const;
+
+        void write_vertex_data_to_memory(
+            const vertex_input_layout& layout,
+            u32 first_vertex,
+            u32 vertex_count,
+            void* persistent_data,
+            void* volatile_data) const;
+
+        /**
+         * Fill the buffer with a 4x4 scale-offset matrix.
+         * The vertex shader's position is to be multiplied by this matrix.
+         * If flip_y is set, the matrix is modified to use the D3D convention.
+         */
+        void fill_scale_offset_data(void* buffer, bool flip_y) const;
+
+        /**
+         * Fill the buffer with user clip information.
+         */
+        void fill_user_clip_data(void* buffer) const;
+
+        /**
+         * Fill the buffer with vertex program constants.
+         * The relocation table allows a partial fill with only selected registers.
+         */
+        void fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) const;
+
+        /**
+         * Fill the buffer with fragment rasterization state.
+         * Fills current fog values, alpha test parameters and texture scaling parameters.
+         */
+        void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program) const;
+
+        // Fill the instancing buffers. A single iobuf is used for both; 256-byte alignment is enforced to allow a global bind.
+        // Writes the index redirection lookup table and the constants field array.
+        void fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const;
+    };
+}
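A minimal standalone sketch of the two-phase init pattern the class uses: the processor holds a non-owning back-pointer into a context that is only fully wired up after the processor itself is constructed (the types here are illustrative stand-ins, not the emulator's):

    #include <cassert>

    struct context;

    class processor
    {
        context* m_ctx = nullptr;
    public:
        processor() = default;
        void init(context* ctx) { m_ctx = ctx; } // late binding, mirrors draw_command_processor::init
        bool ready() const { return m_ctx != nullptr; }
    };

    struct context
    {
        processor proc;                 // member constructed first...
        context() { proc.init(this); }  // ...then wired to its owner
    };

    int main()
    {
        context ctx;
        assert(ctx.proc.ready());
    }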
diff --git a/rpcs3/Emu/RSX/Core/RSXDriverState.h b/rpcs3/Emu/RSX/Core/RSXDriverState.h
new file mode 100644
index 0000000000..d2ff65c18f
--- /dev/null
+++ b/rpcs3/Emu/RSX/Core/RSXDriverState.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <util/types.hpp>
+
+namespace rsx
+{
+    enum pipeline_state : u32
+    {
+        fragment_program_ucode_dirty = (1 << 0),   // Fragment program ucode changed
+        vertex_program_ucode_dirty = (1 << 1),     // Vertex program ucode changed
+        fragment_program_state_dirty = (1 << 2),   // Fragment program state changed
+        vertex_program_state_dirty = (1 << 3),     // Vertex program state changed
+        fragment_state_dirty = (1 << 4),           // Fragment state changed (alpha test, etc)
+        vertex_state_dirty = (1 << 5),             // Vertex state changed (scale_offset, clip planes, etc)
+        transform_constants_dirty = (1 << 6),      // Transform constants changed
+        fragment_constants_dirty = (1 << 7),       // Fragment constants changed
+        framebuffer_reads_dirty = (1 << 8),        // Framebuffer contents changed
+        fragment_texture_state_dirty = (1 << 9),   // Fragment texture parameters changed
+        vertex_texture_state_dirty = (1 << 10),    // Vertex texture parameters changed
+        scissor_config_state_dirty = (1 << 11),    // Scissor region changed
+        zclip_config_state_dirty = (1 << 12),      // Viewport Z clip changed
+
+        scissor_setup_invalid = (1 << 13),         // Scissor configuration is broken
+        scissor_setup_clipped = (1 << 14),         // Scissor region is cropped by viewport constraint
+
+        polygon_stipple_pattern_dirty = (1 << 15), // Rasterizer stippling pattern changed
+        line_stipple_pattern_dirty = (1 << 16),    // Line stippling pattern changed
+
+        push_buffer_arrays_dirty = (1 << 17),      // Push buffers have data written to them (immediate mode vertex buffers)
+
+        polygon_offset_state_dirty = (1 << 18),    // Polygon offset config was changed
+        depth_bounds_state_dirty = (1 << 19),      // Depth bounds configuration changed
+
+        pipeline_config_dirty = (1 << 20),         // Generic pipeline configuration changes. Shader peek hint.
+
+        rtt_config_dirty = (1 << 21),              // Render target configuration changed
+        rtt_config_contested = (1 << 22),          // Render target configuration is indeterminate
+        rtt_config_valid = (1 << 23),              // Render target configuration is valid
+        rtt_cache_state_dirty = (1 << 24),         // Texture cache state is indeterminate
+
+        xform_instancing_state_dirty = (1 << 25),  // Transform instancing state has changed
+
+        fragment_program_dirty = fragment_program_ucode_dirty | fragment_program_state_dirty,
+        vertex_program_dirty = vertex_program_ucode_dirty | vertex_program_state_dirty,
+        invalidate_pipeline_bits = fragment_program_dirty | vertex_program_dirty | xform_instancing_state_dirty,
+        invalidate_zclip_bits = vertex_state_dirty | zclip_config_state_dirty,
+        memory_barrier_bits = framebuffer_reads_dirty,
+
+        // Vulkan-specific signals
+        invalidate_vk_dynamic_state = zclip_config_state_dirty | scissor_config_state_dirty | polygon_offset_state_dirty | depth_bounds_state_dirty,
+
+        all_dirty = ~0u
+    };
+}
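A standalone sketch of how these dirty bits compose and clear; the flag holder is a plain u32 here, while the emulator wraps m_graphics_state in a richer bitfield type with |= and clear() helpers:

    #include <cstdint>
    #include <cassert>

    enum pipeline_state : uint32_t
    {
        fragment_program_ucode_dirty = (1 << 0),
        fragment_program_state_dirty = (1 << 2),
        fragment_program_dirty = fragment_program_ucode_dirty | fragment_program_state_dirty,
    };

    int main()
    {
        uint32_t graphics_state = 0;
        graphics_state |= fragment_program_state_dirty;      // a method write marks state dirty
        assert(graphics_state & fragment_program_dirty);     // the aggregate mask catches either bit
        graphics_state &= ~uint32_t(fragment_program_dirty); // clear after rebuilding the program
        assert(graphics_state == 0);
    }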