diff --git a/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp b/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp index a9d69f3a10..4ce0afa6e3 100644 --- a/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp +++ b/rpcs3/Emu/RSX/Core/RSXDrawCommands.cpp @@ -3,6 +3,8 @@ #include "Emu/RSX/Common/BufferUtils.h" #include "Emu/RSX/Common/buffer_stream.hpp" +#include "Emu/RSX/Common/io_buffer.h" +#include "Emu/RSX/NV47/HW/context_accessors.define.h" #include "Emu/RSX/Program/GLSLCommon.h" #include "Emu/RSX/rsx_methods.h" #include "Emu/RSX/RSXThread.h" @@ -244,7 +246,7 @@ namespace rsx // This whole thing becomes a mess if we don't have a provoking attribute. const auto vertex_id = m_vertex_push_buffers[0].get_vertex_id(); m_vertex_push_buffers[attribute].set_vertex_data(attribute, vertex_id, subreg_index, type, size, value); - m_thread->m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty; + RSX(m_ctx)->m_graphics_state |= rsx::pipeline_state::push_buffer_arrays_dirty; } u32 draw_command_processor::get_push_buffer_vertex_count() const @@ -268,7 +270,7 @@ namespace rsx void draw_command_processor::clear_push_buffers() { - auto& graphics_state = m_thread->m_graphics_state; + auto& graphics_state = RSX(m_ctx)->m_graphics_state; if (graphics_state & rsx::pipeline_state::push_buffer_arrays_dirty) { for (auto& push_buf : m_vertex_push_buffers) @@ -631,7 +633,7 @@ namespace rsx * Fill buffer with vertex program constants. * Buffer must be at least 512 float4 wide. 
*/ - void draw_command_processor::fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) + void draw_command_processor::fill_vertex_program_constants_data(void* buffer, const std::span<const u16>& reloc_table) const { if (!reloc_table.empty()) [[ likely ]] { @@ -648,7 +650,7 @@ } } - void draw_command_processor::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/) + void draw_command_processor::fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& /*fragment_program*/) const { ROP_control_t rop_control{}; @@ -664,7 +666,7 @@ rop_control.enable_polygon_stipple(); } - if (rsx::method_registers.msaa_alpha_to_coverage_enabled() && !m_thread->get_backend_config().supports_hw_a2c) + if (rsx::method_registers.msaa_alpha_to_coverage_enabled() && !RSX(m_ctx)->get_backend_config().supports_hw_a2c) { // TODO: Properly support alpha-to-coverage and alpha-to-one behavior in shaders // Alpha values generate a coverage mask for order independent blending @@ -731,4 +733,109 @@ utils::stream_vector(dst, std::bit_cast<u32>(fog0), std::bit_cast<u32>(fog1), rop_control.value, std::bit_cast<u32>(alpha_ref)); utils::stream_vector(dst + 4, 0u, fog_mode, std::bit_cast<u32>(wpos_scale), std::bit_cast<u32>(wpos_bias)); } + + void draw_command_processor::fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const + { + auto& draw_call = rsx::method_registers.current_draw_clause; + + // Only call this for instanced draws! + ensure(draw_call.is_trivial_instanced_draw); + + // Temp indirection table. Used to track "running" updates. + std::vector<u32> instancing_indirection_table; + // indirection table size + const auto redirection_table_size = prog.has_indexed_constants ? 
468u : ::size32(prog.constant_ids); + + // Temp constants data + std::vector<u128> constants_data; + constants_data.reserve(redirection_table_size * draw_call.pass_count()); + + // Allocate indirection buffer on GPU stream + indirection_table_buf.reserve(redirection_table_size * draw_call.pass_count() * sizeof(u32)); + auto indirection_out = indirection_table_buf.data(); + + rsx::instanced_draw_config_t instance_config; + u32 indirection_table_offset = 0; + + // We now replay the draw call here to pack the data. + draw_call.begin(); + + // Write initial draw data. + instancing_indirection_table.resize(redirection_table_size); + std::iota(instancing_indirection_table.begin(), instancing_indirection_table.end(), 0); + + constants_data.resize(redirection_table_size); + fill_vertex_program_constants_data(constants_data.data(), prog.constant_ids); + + // Next draw. We're guaranteed more than one draw call by the caller. + draw_call.next(); + + do + { + // Write previous state + std::memcpy(indirection_out + indirection_table_offset, instancing_indirection_table.data(), instancing_indirection_table.size() * sizeof(u32)); + indirection_table_offset += redirection_table_size; + + // Decode next draw state + instance_config = {}; + draw_call.execute_pipeline_dependencies(m_ctx, &instance_config); + + if (!instance_config.transform_constants_data_changed) + { + continue; + } + + const bool do_full_reload = prog.has_indexed_constants; + if (do_full_reload) + { + const u32 redirection_loc = ::size32(constants_data); + constants_data.resize(redirection_loc + redirection_table_size); + fill_vertex_program_constants_data(constants_data.data() + redirection_loc, prog.constant_ids); + + std::iota(instancing_indirection_table.begin(), instancing_indirection_table.end(), redirection_loc); + continue; + } + + if (auto xform_id = prog.TranslateConstantsRange(instance_config.patch_load_offset, instance_config.patch_load_count); xform_id >= 0) + { + // Trivially patchable in bulk + const u32 
redirection_loc = ::size32(constants_data); + constants_data.resize(::size32(constants_data) + instance_config.patch_load_count); + std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[instance_config.patch_load_offset], instance_config.patch_load_count * sizeof(u128)); + + // Update indirection table + for (auto i = xform_id, count = 0; + static_cast<u32>(count) < instance_config.patch_load_count; + ++i, ++count) + { + instancing_indirection_table[i] = redirection_loc + count; + } + + continue; + } + + // Sparse. Update records individually instead of bulk + const auto load_end = instance_config.patch_load_offset + instance_config.patch_load_count; + for (u32 i = 0; i < redirection_table_size; ++i) + { + const auto read_index = prog.constant_ids[i]; + if (read_index < instance_config.patch_load_offset || read_index >= load_end) + { + // Reading outside "hot" range. + continue; + } + + const u32 redirection_loc = ::size32(constants_data); + constants_data.resize(::size32(constants_data) + 1); + std::memcpy(constants_data.data() + redirection_loc, &REGS(m_ctx)->transform_constants[read_index], sizeof(u128)); + + instancing_indirection_table[i] = redirection_loc; + } + + } while (draw_call.next()); + + // Now write the constants to the GPU buffer + constants_data_array_buffer.reserve(constants_data.size() * sizeof(u128)); + std::memcpy(constants_data_array_buffer.data(), constants_data.data(), constants_data.size() * sizeof(u128)); + } } diff --git a/rpcs3/Emu/RSX/Core/RSXDrawCommands.h b/rpcs3/Emu/RSX/Core/RSXDrawCommands.h index 06570eac5c..b69a918401 100644 --- a/rpcs3/Emu/RSX/Core/RSXDrawCommands.h +++ b/rpcs3/Emu/RSX/Core/RSXDrawCommands.h @@ -13,13 +13,14 @@ namespace rsx { struct rsx_state; - class thread; + struct context; + class io_buffer; class draw_command_processor { using vertex_program_metadata_t = program_hash_util::vertex_program_utils::vertex_program_metadata; - thread* m_thread = nullptr; + context* m_ctx = nullptr; protected: friend class 
thread; @@ -30,9 +31,9 @@ namespace rsx public: draw_command_processor() = default; - void init(thread* rsxthr) + void init(context* ctx) { - m_thread = rsxthr; + m_ctx = ctx; } // Analyze vertex inputs and group all interleaved blocks @@ -94,12 +95,16 @@ namespace rsx * Fill buffer with vertex program constants. * Relocation table allows to do a partial fill with only selected registers. */ - void fill_vertex_program_constants_data(void* buffer, const std::span& reloc_table); + void fill_vertex_program_constants_data(void* buffer, const std::span& reloc_table) const; /** * Fill buffer with fragment rasterization state. * Fills current fog values, alpha test parameters and texture scaling parameters */ - void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program); + void fill_fragment_state_buffer(void* buffer, const RSXFragmentProgram& fragment_program) const; + + // Fill instancing buffers. A single iobuf is used for both. 256byte alignment enforced to allow global bind + // Returns offsets to the index redirection lookup table and constants field array + void fill_constants_instancing_buffer(rsx::io_buffer& indirection_table_buf, rsx::io_buffer& constants_data_array_buffer, const VertexProgramBase& prog) const; }; } diff --git a/rpcs3/Emu/RSX/Core/RSXDriverState.h b/rpcs3/Emu/RSX/Core/RSXDriverState.h index f49ce437f0..7c7ff3abbb 100644 --- a/rpcs3/Emu/RSX/Core/RSXDriverState.h +++ b/rpcs3/Emu/RSX/Core/RSXDriverState.h @@ -4,7 +4,7 @@ namespace rsx { - enum pipeline_state : u32 + enum pipeline_state : u32 { fragment_program_ucode_dirty = (1 << 0), // Fragment program ucode changed vertex_program_ucode_dirty = (1 << 1), // Vertex program ucode changed diff --git a/rpcs3/Emu/RSX/NV47/FW/draw_call.cpp b/rpcs3/Emu/RSX/NV47/FW/draw_call.cpp index 2d41a0ed75..089b97afeb 100644 --- a/rpcs3/Emu/RSX/NV47/FW/draw_call.cpp +++ b/rpcs3/Emu/RSX/NV47/FW/draw_call.cpp @@ -91,7 +91,7 @@ namespace rsx bool draw_clause::check_trivially_instanced() 
const { - if (draw_command_ranges.size() <= 1) + if (pass_count() <= 1) { // Cannot instance one draw call or less return false; @@ -145,7 +145,7 @@ namespace rsx is_disjoint_primitive = is_primitive_disjointed(primitive); } - u32 draw_clause::execute_pipeline_dependencies(context* ctx) const + u32 draw_clause::execute_pipeline_dependencies(context* ctx, instanced_draw_config_t* instance_config) const { u32 result = 0u; for (; @@ -191,7 +191,20 @@ namespace rsx // Update transform constants auto ptr = RSX(ctx)->fifo_ctrl->translate_address(barrier.arg0); auto buffer = std::span(static_cast(vm::base(ptr)), barrier.arg1); - nv4097::set_transform_constant::batch_decode(ctx, NV4097_SET_TRANSFORM_CONSTANT + barrier.index, buffer); + auto notify = [&](rsx::context*, u32 load, u32 count) + { + if (!instance_config) + { + return false; + } + + instance_config->transform_constants_data_changed = true; + instance_config->patch_load_offset = load; + instance_config->patch_load_count = count; + return true; + }; + + nv4097::set_transform_constant::batch_decode(ctx, NV4097_SET_TRANSFORM_CONSTANT + barrier.index, buffer, notify); result |= transform_constants_changed; break; } diff --git a/rpcs3/Emu/RSX/NV47/FW/draw_call.hpp b/rpcs3/Emu/RSX/NV47/FW/draw_call.hpp index fed734f025..4e350e985d 100644 --- a/rpcs3/Emu/RSX/NV47/FW/draw_call.hpp +++ b/rpcs3/Emu/RSX/NV47/FW/draw_call.hpp @@ -7,6 +7,14 @@ namespace rsx { + struct instanced_draw_config_t + { + bool transform_constants_data_changed; + + u32 patch_load_offset; + u32 patch_load_count; + }; + class draw_clause { // Stores the first and count argument from draw/draw indexed parameters between begin/end clauses. 
@@ -272,7 +280,7 @@ namespace rsx /** * Executes commands reqiured to make the current draw state valid */ - u32 execute_pipeline_dependencies(struct context* ctx) const; + u32 execute_pipeline_dependencies(struct context* ctx, instanced_draw_config_t* instance_config = nullptr) const; const draw_range_t& get_range() const { diff --git a/rpcs3/Emu/RSX/NV47/HW/nv4097.cpp b/rpcs3/Emu/RSX/NV47/HW/nv4097.cpp index 7efb92ba2b..7bccf6430e 100644 --- a/rpcs3/Emu/RSX/NV47/HW/nv4097.cpp +++ b/rpcs3/Emu/RSX/NV47/HW/nv4097.cpp @@ -30,7 +30,7 @@ namespace rsx REGS(ctx)->transform_constants[load + constant_id][subreg] = arg; } - void set_transform_constant::batch_decode(context* ctx, u32 reg, const std::span<const u32>& args) + void set_transform_constant::batch_decode(context* ctx, u32 reg, const std::span<const u32>& args, const std::function<bool(rsx::context*, u32, u32)>& notify) { const u32 index = reg - NV4097_SET_TRANSFORM_CONSTANT; const u32 constant_id = index / 4; @@ -40,8 +40,15 @@ auto dst = &REGS(ctx)->transform_constants[load + constant_id][subreg]; copy_data_swap_u32(dst, args.data(), ::size32(args)); + // Notify const u32 last_constant_id = ((reg + ::size32(args) + 3) - NV4097_SET_TRANSFORM_CONSTANT) / 4; // Aligned div - RSX(ctx)->patch_transform_constants(ctx, load + constant_id, last_constant_id - constant_id); + const u32 load_index = load + constant_id; + const u32 load_count = last_constant_id - constant_id; + + if (!notify || !notify(ctx, load_index, load_count)) + { + RSX(ctx)->patch_transform_constants(ctx, load_index, load_count); + } } void set_transform_constant::impl(context* ctx, u32 reg, [[maybe_unused]] u32 arg) diff --git a/rpcs3/Emu/RSX/NV47/HW/nv4097.h b/rpcs3/Emu/RSX/NV47/HW/nv4097.h index 68a2ddb9df..6d526d799a 100644 --- a/rpcs3/Emu/RSX/NV47/HW/nv4097.h +++ b/rpcs3/Emu/RSX/NV47/HW/nv4097.h @@ -204,7 +204,7 @@ namespace rsx static void decode_one(context* ctx, u32 reg, u32 arg); - static void 
batch_decode(context* ctx, u32 reg, const std::span<const u32>& args, const std::function<bool(rsx::context*, u32, u32)>& notify = {}); }; struct set_transform_program diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl index db2a91ff99..dfe7252ca8 100644 --- a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexPrologue.glsl @@ -56,10 +56,17 @@ vec4 apply_zclip_xform( #endif #if defined(_ENABLE_INSTANCED_CONSTANTS) +// Workaround for GL vs VK builtin variable naming +#ifdef VULKAN +#define _gl_InstanceID gl_InstanceIndex +#else +#define _gl_InstanceID gl_InstanceID +#endif + vec4 _fetch_constant(const in int base_offset) { // Get virtual draw/instance id. Normally will be 1:1 based on instance index - const int indirection_offset = (gl_InstanceID * CONSTANTS_ARRAY_LENGTH) + base_offset; + const int indirection_offset = (_gl_InstanceID * CONSTANTS_ARRAY_LENGTH) + base_offset; const int corrected_offset = constants_addressing_lookup[indirection_offset]; return instanced_constants_array[corrected_offset]; } diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index a7ed9894e1..9a242a3028 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -690,7 +690,6 @@ namespace rsx m_vertex_textures_dirty.fill(true); m_graphics_state |= pipeline_state::all_dirty; - m_draw_processor.init(this); g_user_asked_for_frame_capture = false; @@ -698,6 +697,8 @@ namespace rsx s_ctx.rsxthr = this; m_ctx = &s_ctx; + m_draw_processor.init(m_ctx); + if (g_cfg.misc.use_native_interface && (g_cfg.video.renderer == video_renderer::opengl || g_cfg.video.renderer == video_renderer::vulkan)) { m_overlay_manager = g_fxo->init<rsx::overlays::display_manager>(0); diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp index 7db23ee80d..7c40cc12ad 100644 --- a/rpcs3/Emu/RSX/VK/VKDraw.cpp +++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp @@ -929,7 +929,11 @@ void 
VKGSRender::emit_geometry(u32 sub_index) if (!upload_info.index_info) { - if (draw_call.is_single_draw()) + if (draw_call.is_trivial_instanced_draw) + { + vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, draw_call.pass_count(), 0, 0); + } + else if (draw_call.is_single_draw()) { vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0); } @@ -951,10 +955,13 @@ void VKGSRender::emit_geometry(u32 sub_index) vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type); - if (rsx::method_registers.current_draw_clause.is_single_draw()) + if (draw_call.is_trivial_instanced_draw) { - const u32 index_count = upload_info.vertex_draw_count; - vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0); + vkCmdDrawIndexed(*m_current_command_buffer, upload_info.vertex_draw_count, draw_call.pass_count(), 0, 0, 0); + } + else if (rsx::method_registers.current_draw_clause.is_single_draw()) + { + vkCmdDrawIndexed(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0, 0); } else { @@ -1052,7 +1059,10 @@ void VKGSRender::end() m_frame_stats.setup_time += m_profiler.duration(); // Apply write memory barriers - if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil)) ds->write_barrier(*m_current_command_buffer); + if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil)) + { + ds->write_barrier(*m_current_command_buffer); + } for (auto &rtt : m_rtts.m_bound_render_targets) { @@ -1111,12 +1121,19 @@ void VKGSRender::end() m_current_command_buffer->flags |= vk::command_buffer::cb_reload_dynamic_state; } - rsx::method_registers.current_draw_clause.begin(); + auto& draw_call = rsx::method_registers.current_draw_clause; + draw_call.begin(); do { emit_geometry(sub_index++); + + if (draw_call.is_trivial_instanced_draw) + { + // We already completed. End the draw. 
+ draw_call.end(); + } } - while (rsx::method_registers.current_draw_clause.next()); + while (draw_call.next()); if (m_current_command_buffer->flags & vk::command_buffer::cb_has_conditional_render) { diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index a43918b947..51d0df3580 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -477,6 +477,22 @@ namespace idx++; + bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[idx].descriptorCount = 1; + bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[idx].binding = binding_table.instancing_lookup_table_bind_slot; + bindings[idx].pImmutableSamplers = nullptr; + + idx++; + + bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[idx].descriptorCount = 1; + bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[idx].binding = binding_table.instancing_constants_buffer_slot; + bindings[idx].pImmutableSamplers = nullptr; + + idx++; + for (auto binding = binding_table.textures_first_bind_slot; binding < binding_table.vertex_textures_first_bind_slot; binding++) @@ -643,7 +659,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar) { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , (num_fs_samplers + 4) }, // Conditional rendering predicate slot; refactor to allow skipping this when not needed - { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 } + { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3 } }; m_descriptor_pool.create(*m_device, descriptor_type_sizes, max_draw_calls); @@ -661,6 +677,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar) m_index_buffer_ring_info.create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer"); m_texture_upload_buffer_ring_info.create(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000); m_raster_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 
VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "raster env buffer"); + m_instancing_buffer_ring_info.create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "instancing data buffer"); const auto shadermode = g_cfg.video.shadermode.get(); @@ -949,6 +966,7 @@ VKGSRender::~VKGSRender() m_vertex_instructions_buffer.destroy(); m_fragment_instructions_buffer.destroy(); m_raster_env_ring_info.destroy(); + m_instancing_buffer_ring_info.destroy(); // Fallback bindables null_buffer.reset(); @@ -1286,7 +1304,8 @@ void VKGSRender::check_heap_status(u32 flags) m_fragment_constants_ring_info.is_critical() || m_transform_constants_ring_info.is_critical() || m_index_buffer_ring_info.is_critical() || - m_raster_env_ring_info.is_critical(); + m_raster_env_ring_info.is_critical() || + m_instancing_buffer_ring_info.is_critical(); } else { @@ -1318,7 +1337,9 @@ void VKGSRender::check_heap_status(u32 flags) heap_critical = m_vertex_layout_ring_info.is_critical(); break; case VK_HEAP_CHECK_TRANSFORM_CONSTANTS_STORAGE: - heap_critical = m_transform_constants_ring_info.is_critical(); + heap_critical = (current_vertex_program.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS) + ? 
m_instancing_buffer_ring_info.is_critical() + : m_transform_constants_ring_info.is_critical(); break; case VK_HEAP_CHECK_FRAGMENT_CONSTANTS_STORAGE: heap_critical = m_fragment_constants_ring_info.is_critical(); @@ -1361,6 +1382,7 @@ void VKGSRender::check_heap_status(u32 flags) m_attrib_ring_info.reset_allocation_stats(); m_texture_upload_buffer_ring_info.reset_allocation_stats(); m_raster_env_ring_info.reset_allocation_stats(); + m_instancing_buffer_ring_info.reset_allocation_stats(); m_current_frame->reset_heap_ptrs(); m_last_heap_sync_time = rsx::get_shared_tag(); } @@ -2130,6 +2152,7 @@ void VKGSRender::load_program_env() const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty); const bool update_instruction_buffers = (!!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program)); const bool update_raster_env = (rsx::method_registers.polygon_stipple_enabled() && !!(m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty)); + const bool update_instancing_data = rsx::method_registers.current_draw_clause.is_trivial_instanced_draw; if (update_vertex_env) { @@ -2150,7 +2173,32 @@ void VKGSRender::load_program_env() m_vertex_env_buffer_info = { m_vertex_env_ring_info.heap->value, mem, 144 }; } - if (update_transform_constants) + if (update_instancing_data) + { + // Combines transform load + instancing lookup table + const auto alignment = m_device->gpu().get_limits().minStorageBufferOffsetAlignment; + usz indirection_table_offset = 0; + usz constants_data_table_offset = 0; + + rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair + { + indirection_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment)); + return std::make_pair(m_instancing_buffer_ring_info.map(indirection_table_offset, size), size); + }); + + rsx::io_buffer constants_array_buf([&](usz size) -> std::pair + { + constants_data_table_offset = 
m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment)); + return std::make_pair(m_instancing_buffer_ring_info.map(constants_data_table_offset, size), size); + }); + + m_draw_processor.fill_constants_instancing_buffer(indirection_table_buf, constants_array_buf, *m_vertex_prog); + m_instancing_buffer_ring_info.unmap(); + + m_instancing_indirection_buffer_info = { m_instancing_buffer_ring_info.heap->value, indirection_table_offset, indirection_table_buf.size() }; + m_instancing_constants_array_buffer_info = { m_instancing_buffer_ring_info.heap->value, constants_data_table_offset, constants_array_buf.size() }; + } + else if (update_transform_constants) { // Transform constants usz mem_offset = 0; @@ -2295,13 +2343,24 @@ void VKGSRender::load_program_env() m_program->bind_buffer({ predicate, 0, 4 }, binding_table.conditional_render_predicate_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set); } + if (current_vertex_program.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS) + { + m_program->bind_buffer(m_instancing_indirection_buffer_info, binding_table.instancing_lookup_table_bind_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set); + m_program->bind_buffer(m_instancing_constants_array_buffer_info, binding_table.instancing_constants_buffer_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set); + } + // Clear flags - m_graphics_state.clear( - rsx::pipeline_state::fragment_state_dirty | + u32 handled_flags = rsx::pipeline_state::fragment_state_dirty | rsx::pipeline_state::vertex_state_dirty | - rsx::pipeline_state::transform_constants_dirty | rsx::pipeline_state::fragment_constants_dirty | - rsx::pipeline_state::fragment_texture_state_dirty); + rsx::pipeline_state::fragment_texture_state_dirty; + + if (!update_instancing_data) + { + handled_flags |= rsx::pipeline_state::transform_constants_dirty; + } + + m_graphics_state.clear(handled_flags); } void VKGSRender::upload_transform_constants(const 
rsx::io_buffer& buffer) @@ -2488,7 +2547,8 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore m_index_buffer_ring_info.is_dirty() || m_transform_constants_ring_info.is_dirty() || m_texture_upload_buffer_ring_info.is_dirty() || - m_raster_env_ring_info.is_dirty()) + m_raster_env_ring_info.is_dirty() || + m_instancing_buffer_ring_info.is_dirty()) { auto secondary_command_buffer = m_secondary_cb_list.next(); secondary_command_buffer->begin(); @@ -2503,6 +2563,7 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore m_transform_constants_ring_info.sync(*secondary_command_buffer); m_texture_upload_buffer_ring_info.sync(*secondary_command_buffer); m_raster_env_ring_info.sync(*secondary_command_buffer); + m_instancing_buffer_ring_info.sync(*secondary_command_buffer); secondary_command_buffer->end(); diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index f99886c9dd..55c4b029bb 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -149,6 +149,7 @@ private: vk::data_heap m_index_buffer_ring_info; // Index data vk::data_heap m_texture_upload_buffer_ring_info; // Texture upload heap vk::data_heap m_raster_env_ring_info; // Raster control such as polygon and line stipple + vk::data_heap m_instancing_buffer_ring_info; // Instanced rendering data (constants indirection table + instanced constants) vk::data_heap m_fragment_instructions_buffer; vk::data_heap m_vertex_instructions_buffer; @@ -160,6 +161,8 @@ private: VkDescriptorBufferInfo m_fragment_constants_buffer_info {}; VkDescriptorBufferInfo m_fragment_texture_params_buffer_info {}; VkDescriptorBufferInfo m_raster_env_buffer_info {}; + VkDescriptorBufferInfo m_instancing_indirection_buffer_info {}; + VkDescriptorBufferInfo m_instancing_constants_array_buffer_info{}; VkDescriptorBufferInfo m_vertex_instructions_buffer_info {}; VkDescriptorBufferInfo m_fragment_instructions_buffer_info {}; diff --git 
a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp index 27113bd25e..21997508e1 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp +++ b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp @@ -197,6 +197,7 @@ namespace vk s64 index_heap_ptr = 0; s64 texture_upload_heap_ptr = 0; s64 rasterizer_env_heap_ptr = 0; + s64 instancing_heap_ptr = 0; u64 last_frame_sync_time = 0; @@ -218,6 +219,7 @@ namespace vk index_heap_ptr = other.index_heap_ptr; texture_upload_heap_ptr = other.texture_upload_heap_ptr; rasterizer_env_heap_ptr = other.rasterizer_env_heap_ptr; + instancing_heap_ptr = other.instancing_heap_ptr; } // Exchange storage (non-copyable) @@ -229,7 +231,7 @@ namespace vk void tag_frame_end( s64 attrib_loc, s64 vtxenv_loc, s64 fragenv_loc, s64 vtxlayout_loc, s64 fragtex_loc, s64 fragconst_loc, s64 vtxconst_loc, s64 index_loc, - s64 texture_loc, s64 rasterizer_loc) + s64 texture_loc, s64 rasterizer_loc, s64 instancing_loc) { attrib_heap_ptr = attrib_loc; vtx_env_heap_ptr = vtxenv_loc; @@ -241,6 +243,7 @@ namespace vk index_heap_ptr = index_loc; texture_upload_heap_ptr = texture_loc; rasterizer_env_heap_ptr = rasterizer_loc; + instancing_heap_ptr = instancing_loc; last_frame_sync_time = rsx::get_shared_tag(); } diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index 5fb4813d2c..0c32d9034d 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -163,7 +163,8 @@ void VKGSRender::advance_queued_frames() m_transform_constants_ring_info.get_current_put_pos_minus_one(), m_index_buffer_ring_info.get_current_put_pos_minus_one(), m_texture_upload_buffer_ring_info.get_current_put_pos_minus_one(), - m_raster_env_ring_info.get_current_put_pos_minus_one()); + m_raster_env_ring_info.get_current_put_pos_minus_one(), + m_instancing_buffer_ring_info.get_current_put_pos_minus_one()); m_queued_frames.push_back(m_current_frame); ensure(m_queued_frames.size() <= VK_MAX_ASYNC_FRAMES); @@ -266,6 +267,8 @@ void 
VKGSRender::frame_context_cleanup(vk::frame_context_t *ctx) m_fragment_texture_params_ring_info.m_get_pos = ctx->frag_texparam_heap_ptr; m_index_buffer_ring_info.m_get_pos = ctx->index_heap_ptr; m_texture_upload_buffer_ring_info.m_get_pos = ctx->texture_upload_heap_ptr; + m_raster_env_ring_info.m_get_pos = ctx->rasterizer_env_heap_ptr; + m_instancing_buffer_ring_info.m_get_pos = ctx->instancing_heap_ptr; m_attrib_ring_info.notify(); m_vertex_env_ring_info.notify(); @@ -276,6 +279,8 @@ void VKGSRender::frame_context_cleanup(vk::frame_context_t *ctx) m_fragment_texture_params_ring_info.notify(); m_index_buffer_ring_info.notify(); m_texture_upload_buffer_ring_info.notify(); + m_raster_env_ring_info.notify(); + m_instancing_buffer_ring_info.notify(); } } diff --git a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp index fae195b1dc..0e88dab75d 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexProgram.cpp @@ -32,31 +32,34 @@ void VKVertexDecompilerThread::insertHeader(std::stringstream &OS) OS << "#version 450\n\n"; OS << "#extension GL_ARB_separate_shader_objects : enable\n\n"; - OS << "layout(std140, set = 0, binding = 0) uniform VertexContextBuffer\n"; - OS << "{\n"; - OS << " mat4 scale_offset_mat;\n"; - OS << " ivec4 user_clip_enabled[2];\n"; - OS << " vec4 user_clip_factor[2];\n"; - OS << " uint transform_branch_bits;\n"; - OS << " float point_size;\n"; - OS << " float z_near;\n"; - OS << " float z_far;\n"; - OS << "};\n\n"; + OS << + "layout(std140, set = 0, binding = 0) uniform VertexContextBuffer\n" + "{\n" + " mat4 scale_offset_mat;\n" + " ivec4 user_clip_enabled[2];\n" + " vec4 user_clip_factor[2];\n" + " uint transform_branch_bits;\n" + " float point_size;\n" + " float z_near;\n" + " float z_far;\n" + "};\n\n"; if (m_device_props.emulate_conditional_rendering) { - OS << "layout(std430, set = 0, binding = 8) readonly buffer EXT_Conditional_Rendering\n"; - OS << "{\n"; - OS << " uint 
conditional_rendering_predicate;\n"; - OS << "};\n\n"; + OS << + "layout(std430, set = 0, binding = 8) readonly buffer EXT_Conditional_Rendering\n" + "{\n" + " uint conditional_rendering_predicate;\n" + "};\n\n"; } - OS << "layout(push_constant) uniform VertexLayoutBuffer\n"; - OS << "{\n"; - OS << " uint vertex_base_index;\n"; - OS << " uint vertex_index_offset;\n"; - OS << " uint draw_id;\n"; - OS << " uint layout_ptr_offset;\n"; + OS << + "layout(push_constant) uniform VertexLayoutBuffer\n" + "{\n" + " uint vertex_base_index;\n" + " uint vertex_index_offset;\n" + " uint draw_id;\n" + " uint layout_ptr_offset;\n"; if (m_device_props.emulate_conditional_rendering) { @@ -110,18 +113,50 @@ void VKVertexDecompilerThread::insertConstants(std::stringstream & OS, const std { if (PI.name.starts_with("vc[")) { - OS << "layout(std140, set=0, binding = " << static_cast(m_binding_table.vertex_constant_buffers_bind_slot) << ") uniform VertexConstantsBuffer\n"; - OS << "{\n"; - OS << " vec4 " << PI.name << ";\n"; - OS << "};\n\n"; + if (!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS)) + { + OS << "layout(std140, set=0, binding=" << static_cast(m_binding_table.vertex_constant_buffers_bind_slot) << ") uniform VertexConstantsBuffer\n"; + OS << "{\n"; + OS << " vec4 " << PI.name << ";\n"; + OS << "};\n\n"; - in.location = m_binding_table.vertex_constant_buffers_bind_slot; - in.domain = glsl::glsl_vertex_program; - in.name = "VertexConstantsBuffer"; - in.type = vk::glsl::input_type_uniform_buffer; + in.location = m_binding_table.vertex_constant_buffers_bind_slot; + in.domain = glsl::glsl_vertex_program; + in.name = "VertexConstantsBuffer"; + in.type = vk::glsl::input_type_uniform_buffer; - inputs.push_back(in); - continue; + inputs.push_back(in); + continue; + } + else + { + // 1. 
Bind indirection lookup buffer + OS << "layout(std430, set=0, binding=" << static_cast(m_binding_table.instancing_lookup_table_bind_slot) << ") readonly buffer InstancingData\n"; + OS << "{\n"; + OS << " int constants_addressing_lookup[];\n"; + OS << "};\n\n"; + + in.location = m_binding_table.instancing_lookup_table_bind_slot; + in.domain = glsl::glsl_vertex_program; + in.name = "InstancingData"; + in.type = vk::glsl::input_type_storage_buffer; + inputs.push_back(in); + + // 2. Bind actual constants buffer + OS << "layout(std430, set=0, binding=" << static_cast(m_binding_table.instancing_constants_buffer_slot) << ") readonly buffer VertexConstantsBuffer\n"; + OS << "{\n"; + OS << " vec4 instanced_constants_array[];\n"; + OS << "};\n\n"; + + OS << "#define CONSTANTS_ARRAY_LENGTH " << (properties.has_indexed_constants ? 468 : ::size32(m_constant_ids)) << "\n\n"; + + in.location = m_binding_table.instancing_constants_buffer_slot; + in.domain = glsl::glsl_vertex_program; + in.name = "VertexConstantsBuffer"; + in.type = vk::glsl::input_type_storage_buffer; + inputs.push_back(in); + continue; + } } if (PT.type == "sampler2D" || @@ -209,6 +244,7 @@ void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS) properties2.emulate_depth_clip_only = vk::g_render_device->get_shader_types_support().allow_float64; properties2.low_precision_tests = vk::is_NVIDIA(vk::get_driver_vendor()); properties2.require_explicit_invariance = (vk::is_NVIDIA(vk::get_driver_vendor()) && g_cfg.video.shader_precision != gpu_preset_level::low); + properties2.require_instanced_render = !!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS); glsl::insert_glsl_legacy_function(OS, properties2); glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_vulkan); diff --git a/rpcs3/Emu/RSX/VK/vkutils/pipeline_binding_table.h b/rpcs3/Emu/RSX/VK/vkutils/pipeline_binding_table.h index e2682a503e..fdf0ddd2b0 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/pipeline_binding_table.h +++ 
b/rpcs3/Emu/RSX/VK/vkutils/pipeline_binding_table.h @@ -14,8 +14,10 @@ namespace vk u8 vertex_buffers_first_bind_slot = 5; u8 conditional_render_predicate_slot = 8; u8 rasterizer_env_bind_slot = 9; - u8 textures_first_bind_slot = 10; - u8 vertex_textures_first_bind_slot = 10; // Invalid, has to be initialized properly + u8 instancing_lookup_table_bind_slot = 10; + u8 instancing_constants_buffer_slot = 11; + u8 textures_first_bind_slot = 12; + u8 vertex_textures_first_bind_slot = 12; // Invalid, has to be initialized properly u8 total_descriptor_bindings = vertex_textures_first_bind_slot; // Invalid, has to be initialized properly }; }