diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index faace8b3bf..a660ae927b 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -212,7 +212,7 @@ void GLGSRender::begin() __glcheck glDepthBoundsEXT(rsx::method_registers.depth_bounds_min(), rsx::method_registers.depth_bounds_max()); } - __glcheck glDepthRange(rsx::method_registers.clip_min(), rsx::method_registers.clip_max()); + //__glcheck glDepthRange(rsx::method_registers.clip_min(), rsx::method_registers.clip_max()); __glcheck enable(rsx::method_registers.dither_enabled(), GL_DITHER); if (__glcheck enable(rsx::method_registers.blend_enabled(), GL_BLEND)) @@ -373,8 +373,13 @@ void GLGSRender::end() int location; if (!rsx::method_registers.fragment_textures[i].enabled()) { - glActiveTexture(GL_TEXTURE0 + i); - glBindTexture(GL_TEXTURE_2D, 0); + if (m_textures_dirty[i]) + { + glActiveTexture(GL_TEXTURE0 + i); + glBindTexture(GL_TEXTURE_2D, 0); + + m_textures_dirty[i] = false; + } continue; } @@ -392,12 +397,12 @@ void GLGSRender::end() int texture_index = i + rsx::limits::fragment_textures_count; int location; - if (!rsx::method_registers.vertex_textures[i].enabled()) +/* if (!rsx::method_registers.vertex_textures[i].enabled()) { glActiveTexture(GL_TEXTURE0 + texture_index); glBindTexture(GL_TEXTURE_2D, 0); continue; - } + } */ if (m_program->uniforms.has_location("vtex" + std::to_string(i), &location)) { @@ -409,10 +414,20 @@ void GLGSRender::end() std::chrono::time_point textures_end = steady_clock::now(); m_textures_upload_time += (u32)std::chrono::duration_cast(textures_end - textures_start).count(); - u32 vertex_draw_count; + u32 vertex_draw_count = m_last_vertex_count; std::optional > indexed_draw_info; - std::tie(vertex_draw_count, indexed_draw_info) = set_vertex_buffer(); - m_vao.bind(); + bool skip_upload = false; + + if (!is_probable_instanced_draw()) + { + std::tie(vertex_draw_count, indexed_draw_info) = set_vertex_buffer(); + 
m_last_vertex_count = vertex_draw_count; + } + else + { + //LOG_ERROR(RSX, "No work is needed for this draw call! Muhahahahahahaha"); + skip_upload = true; + } std::chrono::time_point draw_start = steady_clock::now(); @@ -427,19 +442,28 @@ void GLGSRender::end() m_index_ring_buffer->unmap(); } - if (indexed_draw_info) + if (indexed_draw_info || (skip_upload && m_last_draw_indexed == true)) { if (__glcheck enable(rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART)) { - GLenum index_type = std::get<0>(indexed_draw_info.value()); + GLenum index_type = (skip_upload)? m_last_ib_type: std::get<0>(indexed_draw_info.value()); __glcheck glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT)? 0xffff: 0xffffffff); } - __glcheck glDrawElements(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), vertex_draw_count, std::get<0>(indexed_draw_info.value()), (GLvoid *)(std::ptrdiff_t)std::get<1>(indexed_draw_info.value())); + m_last_draw_indexed = true; + + if (!skip_upload) + { + m_last_ib_type = std::get<0>(indexed_draw_info.value()); + m_last_index_offset = std::get<1>(indexed_draw_info.value()); + } + + __glcheck glDrawElements(gl::draw_mode(rsx::method_registers.current_draw_clause.primitive), vertex_draw_count, m_last_ib_type, (GLvoid *)(std::ptrdiff_t)m_last_index_offset); } else { draw_fbo.draw_arrays(rsx::method_registers.current_draw_clause.primitive, vertex_draw_count); + m_last_draw_indexed = false; } m_attrib_ring_buffer->notify(); diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index 1b9283194b..d324824ddb 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -75,6 +75,11 @@ private: bool flush_draw_buffers = false; + bool m_last_draw_indexed; + GLenum m_last_ib_type; + size_t m_last_index_offset; + u32 m_last_vertex_count; + public: gl::fbo draw_fbo; diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 07f83e8123..dba396aad5 100644 --- 
a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -307,6 +307,32 @@ namespace rsx return (u32)element_push_buffer.size(); } + bool thread::is_probable_instanced_draw() + { + if (!g_cfg.video.batch_instanced_geometry) + return false; + + //If the array registers have not been touched, the index array has also not been touched via notify or via register set, its likely an instanced draw + //gcm lib will set the registers once and then call the same draw command over and over with different transform params to achieve this + if (m_index_buffer_changed || m_vertex_attribs_changed) + return false; + + auto& draw_clause = rsx::method_registers.current_draw_clause; + if (draw_clause.command != m_last_command) + return false; + + if (draw_clause.command != rsx::draw_command::inlined_array) + { + if (draw_clause.first_count_commands.back().second != m_last_first_count.second || + draw_clause.first_count_commands.front().first != m_last_first_count.first) + return false; + } + else if (m_last_first_count.second != draw_clause.inline_vertex_array.size()) + return false; + + return true; + } + void thread::end() { rsx::method_registers.transform_constants.clear(); @@ -327,6 +353,17 @@ namespace rsx u32 element_count = rsx::method_registers.current_draw_clause.get_elements_count(); capture_frame("Draw " + rsx::to_string(rsx::method_registers.current_draw_clause.primitive) + std::to_string(element_count)); } + + auto& clause = rsx::method_registers.current_draw_clause; + + m_last_command = clause.command; + if (m_last_command == rsx::draw_command::inlined_array) + m_last_first_count = std::make_pair(0, (u32)clause.inline_vertex_array.size()); + else + m_last_first_count = std::make_pair(clause.first_count_commands.front().first, clause.first_count_commands.back().second); + + m_index_buffer_changed = false; + m_vertex_attribs_changed = false; } void thread::on_task() @@ -499,6 +536,17 @@ namespace rsx m_vblank_thread->join(); m_vblank_thread.reset(); } + + if 
(m_vertex_streaming_task.processing_threads.size() > 0) + { + for (auto &thr : m_vertex_streaming_task.processing_threads) + { + thr->join(); + thr.reset(); + } + + m_vertex_streaming_task.processing_threads.resize(0); + } } std::string thread::get_name() const @@ -1072,4 +1120,112 @@ namespace rsx fmt::throw_exception("%s(addr=0x%x): RSXIO memory not mapped" HERE, __FUNCTION__, addr); } } + + void thread::post_vertex_stream_to_upload(gsl::span src, gsl::span dst, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, std::function callback) + { + upload_stream_packet packet; + packet.dst_span = dst; + packet.src_span = src; + packet.src_stride = attribute_src_stride; + packet.type = type; + packet.dst_stride = dst_stride; + packet.vector_width = vector_element_count; + packet.post_upload_func = callback; + + m_vertex_streaming_task.packets.push_back(packet); + } + + void thread::start_vertex_upload_task(u32 vertex_count) + { + if (m_vertex_streaming_task.processing_threads.size() == 0) + { + const u32 streaming_thread_count = (u32)g_cfg.video.vertex_upload_threads; + m_vertex_streaming_task.processing_threads.resize(streaming_thread_count); + + for (u32 n = 0; n < streaming_thread_count; ++n) + { + thread_ctrl::spawn(m_vertex_streaming_task.processing_threads[n], "Vertex Stream " + std::to_string(n), [this, n]() + { + auto &task = m_vertex_streaming_task; + const u32 index = n; + + while (!Emu.IsStopped()) + { + if (task.remaining_packets != 0) + { + //Wait for me! 
+ task.ready_threads--; + + const size_t step = task.processing_threads.size(); + const size_t job_count = task.packets.size(); + //Process every nth packet + + size_t current_job = index; + + while (true) + { + if (current_job >= job_count) + break; + + auto &packet = task.packets[current_job]; + + write_vertex_array_data_to_buffer(packet.dst_span, packet.src_span, task.vertex_count, packet.type, packet.vector_width, packet.src_stride, packet.dst_stride); + + if (packet.post_upload_func) + packet.post_upload_func(packet.dst_span.data(), packet.type, (u8)packet.vector_width, task.vertex_count); + + _mm_sfence(); + task.remaining_packets--; + current_job += step; + } + + _mm_mfence(); + + while (task.remaining_packets > 0 && !Emu.IsStopped()) + { + _mm_lfence(); + std::this_thread::sleep_for(0us); + } + + _mm_sfence(); + task.ready_threads++; + } + else + std::this_thread::sleep_for(0us); + //thread_ctrl::wait(); + //busy_wait(); + } + }); + } + } + + while (m_vertex_streaming_task.ready_threads != 0 && !Emu.IsStopped()) + { + _mm_lfence(); + busy_wait(); + } + + m_vertex_streaming_task.vertex_count = vertex_count; + m_vertex_streaming_task.ready_threads = 0; + m_vertex_streaming_task.remaining_packets = (int)m_vertex_streaming_task.packets.size(); + } + + void thread::wait_for_vertex_upload_task() + { + while (m_vertex_streaming_task.remaining_packets > 0 && !Emu.IsStopped()) + { + _mm_lfence(); + busy_wait(); + } + + m_vertex_streaming_task.packets.resize(0); + } + + bool thread::vertex_upload_task_ready() + { + if (g_cfg.video.vertex_upload_threads < 2) + return false; + + return (m_vertex_streaming_task.remaining_packets == 0 && m_vertex_streaming_task.ready_threads == 0); + } } diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index ff4fb6cf3a..1af33228d8 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "GCM.h" #include "rsx_cache.h" #include "RSXTexture.h" @@ 
-148,6 +149,8 @@ namespace rsx bool m_rtts_dirty; bool m_transform_constants_dirty; bool m_textures_dirty[16]; + bool m_vertex_attribs_changed; + bool m_index_buffer_changed; protected: std::array get_color_surface_addresses() const; u32 get_zeta_surface_address() const; @@ -221,6 +224,44 @@ namespace rsx void append_array_element(u32 index); u32 get_push_buffer_index_count() const; + protected: + //Save draw call parameters to detect instanced renders + std::pair m_last_first_count; + rsx::draw_command m_last_command; + + bool is_probable_instanced_draw(); + + public: + //MT vertex streaming + struct upload_stream_packet + { + std::function post_upload_func; + gsl::span src_span; + gsl::span dst_span; + rsx::vertex_base_type type; + u32 vector_width; + u32 src_stride; + u8 dst_stride; + }; + + struct upload_stream_task + { + std::vector packets; + std::atomic remaining_packets = { 0 }; + std::atomic ready_threads = { 0 }; + std::atomic vertex_count; + + std::vector> processing_threads; + }; + + upload_stream_task m_vertex_streaming_task; + void post_vertex_stream_to_upload(gsl::span src, gsl::span dst, rsx::vertex_base_type type, + u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, + std::function callback); + void start_vertex_upload_task(u32 vertex_count); + void wait_for_vertex_upload_task(); + bool vertex_upload_task_ready(); + private: std::mutex m_mtx_task; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 02f15f9e65..53bae2c53d 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -637,6 +637,7 @@ VKGSRender::~VKGSRender() } //Close recording and wait for all to finish + close_render_pass(); CHECK_RESULT(vkEndCommandBuffer(*m_current_command_buffer)); for (auto &cb : m_primary_cb_list) @@ -753,12 +754,18 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing) } //This is awful! 
- while (m_flush_commands) _mm_pause(); + while (m_flush_commands) + { + _mm_lfence(); + _mm_pause(); + } std::lock_guard lock(m_secondary_cb_guard); bool status = m_texture_cache.flush_address(address, *m_device, m_secondary_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue()); m_queued_threads--; + _mm_sfence(); + return status; } else @@ -800,6 +807,7 @@ void VKGSRender::begin() flush_command_queue(true); CHECK_RESULT(vkResetDescriptorPool(*m_device, descriptor_pool, 0)); + m_last_descriptor_set = VK_NULL_HANDLE; m_used_descriptors = 0; m_uniform_buffer_ring_info.reset_allocation_stats(); @@ -811,8 +819,6 @@ void VKGSRender::begin() m_flip_time += std::chrono::duration_cast(submit_end - submit_start).count(); } - std::chrono::time_point start = steady_clock::now(); - VkDescriptorSetAllocateInfo alloc_info = {}; alloc_info.descriptorPool = descriptor_pool; alloc_info.descriptorSetCount = 1; @@ -823,6 +829,9 @@ void VKGSRender::begin() CHECK_RESULT(vkAllocateDescriptorSets(*m_device, &alloc_info, &new_descriptor_set)); descriptor_sets = new_descriptor_set; + m_used_descriptors++; + + std::chrono::time_point start = steady_clock::now(); init_buffers(); @@ -834,26 +843,116 @@ void VKGSRender::begin() std::chrono::time_point stop = steady_clock::now(); m_setup_time += std::chrono::duration_cast(stop - start).count(); - - m_used_descriptors++; } -void VKGSRender::end() +void VKGSRender::emit_geometry_instance(u32/* instance_count*/) { + begin_render_pass(); + + m_instanced_draws++; + //Repeat last command + if (!m_last_draw_indexed) + vkCmdDraw(*m_current_command_buffer, m_last_vertex_count, 1, 0, 0); + else + { + vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, m_last_ib_offset, m_last_ib_type); + vkCmdDrawIndexed(*m_current_command_buffer, m_last_vertex_count, 1, 0, 0, 0); + } +} + +void VKGSRender::begin_render_pass() +{ + if (render_pass_open) + return; + size_t idx = vk::get_render_pass_location( 
vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first, vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()), (u8)vk::get_draw_buffers(rsx::method_registers.surface_color_target()).size()); VkRenderPass current_render_pass = m_render_passes[idx]; + VkRenderPassBeginInfo rp_begin = {}; + rp_begin.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + rp_begin.renderPass = current_render_pass; + rp_begin.framebuffer = m_framebuffer_to_clean.back()->value; + rp_begin.renderArea.offset.x = 0; + rp_begin.renderArea.offset.y = 0; + rp_begin.renderArea.extent.width = m_framebuffer_to_clean.back()->width(); + rp_begin.renderArea.extent.height = m_framebuffer_to_clean.back()->height(); + + vkCmdBeginRenderPass(*m_current_command_buffer, &rp_begin, VK_SUBPASS_CONTENTS_INLINE); + render_pass_open = true; +} + +void VKGSRender::close_render_pass() +{ + if (!render_pass_open) + return; + + vkCmdEndRenderPass(*m_current_command_buffer); + render_pass_open = false; +} + +void VKGSRender::end() +{ std::chrono::time_point program_start = steady_clock::now(); + const bool is_instanced = is_probable_instanced_draw() && m_last_descriptor_set != VK_NULL_HANDLE && m_program != nullptr; + + if (is_instanced) + { + //Copy descriptor set + VkCopyDescriptorSet copy_info[39]; + u8 descriptors_count = 0; + + for (u8 i = 0; i < 39; ++i) + { + if ((m_program->attribute_location_mask & (1ull << i)) == 0) + continue; + + const u8 n = descriptors_count; + + copy_info[n] = {}; + copy_info[n].sType = VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET; + copy_info[n].srcSet = m_last_descriptor_set; + copy_info[n].dstSet = descriptor_sets; + copy_info[n].srcBinding = i; + copy_info[n].dstBinding = i; + copy_info[n].srcArrayElement = 0; + copy_info[n].dstArrayElement = 0; + copy_info[n].descriptorCount = 1; + + descriptors_count++; + } + + vkUpdateDescriptorSets(*m_device, 0, nullptr, descriptors_count, (const 
VkCopyDescriptorSet*)&copy_info); + } + //Load program here since it is dependent on vertex state - load_program(); + load_program(is_instanced); std::chrono::time_point program_stop = steady_clock::now(); m_setup_time += (u32)std::chrono::duration_cast(program_stop - program_start).count(); + if (is_instanced) + { + //Only the program constants descriptors should have changed + vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline); + vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &descriptor_sets, 0, nullptr); + + emit_geometry_instance(1); + m_last_instanced_cb_index = m_current_cb_index; + rsx::thread::end(); + return; + } + + close_render_pass(); //Texture upload stuff conflicts active RPs + + std::chrono::time_point vertex_start0 = steady_clock::now(); + auto upload_info = upload_vertex_data(); + std::chrono::time_point vertex_end0 = steady_clock::now(); + m_vertex_upload_time += std::chrono::duration_cast(vertex_end0 - vertex_start0).count(); + std::chrono::time_point textures_start = steady_clock::now(); for (int i = 0; i < rsx::limits::fragment_textures_count; ++i) @@ -948,21 +1047,7 @@ void VKGSRender::end() //Only textures are synchronized tightly with the GPU and they have been read back above vk::enter_uninterruptible(); - auto upload_info = upload_vertex_data(); - - std::chrono::time_point vertex_end = steady_clock::now(); - m_vertex_upload_time += std::chrono::duration_cast(vertex_end - textures_end).count(); - - VkRenderPassBeginInfo rp_begin = {}; - rp_begin.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - rp_begin.renderPass = current_render_pass; - rp_begin.framebuffer = m_framebuffer_to_clean.back()->value; - rp_begin.renderArea.offset.x = 0; - rp_begin.renderArea.offset.y = 0; - rp_begin.renderArea.extent.width = m_framebuffer_to_clean.back()->width(); - rp_begin.renderArea.extent.height = m_framebuffer_to_clean.back()->height(); - - 
vkCmdBeginRenderPass(*m_current_command_buffer, &rp_begin, VK_SUBPASS_CONTENTS_INLINE); + begin_render_pass(); vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline); vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &descriptor_sets, 0, nullptr); @@ -984,8 +1069,20 @@ void VKGSRender::end() std::optional > index_info = std::get<2>(upload_info); + if (m_attrib_ring_info.mapped) + { + wait_for_vertex_upload_task(); + m_attrib_ring_info.unmap(); + } + + std::chrono::time_point vertex_end = steady_clock::now(); + m_vertex_upload_time += std::chrono::duration_cast(vertex_end - textures_end).count(); + if (!index_info) + { vkCmdDraw(*m_current_command_buffer, std::get<1>(upload_info), 1, 0, 0); + m_last_draw_indexed = false; + } else { VkIndexType index_type; @@ -996,9 +1093,15 @@ void VKGSRender::end() vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type); vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0); + + m_last_draw_indexed = false; + m_last_ib_type = index_type; + m_last_ib_offset = offset; + m_last_vertex_count = index_count; } - vkCmdEndRenderPass(*m_current_command_buffer); + m_last_instanced_cb_index = ~0; + m_last_descriptor_set = descriptor_sets; vk::leave_uninterruptible(); @@ -1139,27 +1242,9 @@ void VKGSRender::clear_surface(u32 mask) clear_regions.push_back(region); } - size_t idx = vk::get_render_pass_location( - vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first, - vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, surface_depth_format), - (u8)targets.size()); - VkRenderPass current_render_pass = m_render_passes[idx]; - - VkRenderPassBeginInfo rp_begin = {}; - rp_begin.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; - rp_begin.renderPass = current_render_pass; - rp_begin.framebuffer = m_framebuffer_to_clean.back()->value; - 
rp_begin.renderArea.offset.x = 0; - rp_begin.renderArea.offset.y = 0; - rp_begin.renderArea.extent.width = m_framebuffer_to_clean.back()->width(); - rp_begin.renderArea.extent.height = m_framebuffer_to_clean.back()->height(); - - vkCmdBeginRenderPass(*m_current_command_buffer, &rp_begin, VK_SUBPASS_CONTENTS_INLINE); - + begin_render_pass(); vkCmdClearAttachments(*m_current_command_buffer, (u32)clear_descriptors.size(), clear_descriptors.data(), (u32)clear_regions.size(), clear_regions.data()); - vkCmdEndRenderPass(*m_current_command_buffer); - if (mask & 0x3) { if (std::get<0>(m_rtts.m_bound_depth_stencil) != 0) @@ -1217,6 +1302,7 @@ void VKGSRender::copy_render_targets_to_dma_location() void VKGSRender::flush_command_queue(bool hard_sync) { + close_render_pass(); close_and_submit_command_buffer({}, m_current_command_buffer->submit_fence); if (hard_sync) @@ -1325,7 +1411,11 @@ void VKGSRender::do_local_task() flush_command_queue(); m_flush_commands = false; - while (m_queued_threads) _mm_pause(); + while (m_queued_threads) + { + _mm_lfence(); + _mm_pause(); + } } } @@ -1345,177 +1435,185 @@ bool VKGSRender::do_method(u32 cmd, u32 arg) } } -bool VKGSRender::load_program() +bool VKGSRender::load_program(bool fast_update) { - auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple + RSXVertexProgram vertex_program; + RSXFragmentProgram fragment_program; + + if (!fast_update) { - vk::render_target *surface = nullptr; - if (!is_depth) - surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr); + auto rtt_lookup_func = [this](u32 texaddr, rsx::fragment_texture&, bool is_depth) -> std::tuple + { + vk::render_target *surface = nullptr; + if (!is_depth) + surface = m_rtts.get_texture_from_render_target_if_applicable(texaddr); + else + surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr); + + if (!surface) return std::make_tuple(false, 0); + + return std::make_tuple(true, surface->native_pitch); + }; + + 
vertex_program = get_current_vertex_program(); + fragment_program = get_current_fragment_program(rtt_lookup_func); + + vk::pipeline_props properties = {}; + + properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + bool unused; + properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused); + + if (rsx::method_registers.restart_index_enabled()) + { + properties.ia.primitiveRestartEnable = VK_TRUE; + } else - surface = m_rtts.get_texture_from_depth_stencil_if_applicable(texaddr); - - if (!surface) return std::make_tuple(false, 0); - - return std::make_tuple(true, surface->native_pitch); - }; - - RSXVertexProgram vertex_program = get_current_vertex_program(); - RSXFragmentProgram fragment_program = get_current_fragment_program(rtt_lookup_func); - - vk::pipeline_props properties = {}; - - properties.ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; - bool unused; - properties.ia.topology = vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, unused); - - if (rsx::method_registers.restart_index_enabled()) - { - properties.ia.primitiveRestartEnable = VK_TRUE; - } - else - properties.ia.primitiveRestartEnable = VK_FALSE; + properties.ia.primitiveRestartEnable = VK_FALSE; - for (int i = 0; i < 4; ++i) - { - properties.att_state[i].colorWriteMask = 0xf; - properties.att_state[i].blendEnable = VK_FALSE; - } + for (int i = 0; i < 4; ++i) + { + properties.att_state[i].colorWriteMask = 0xf; + properties.att_state[i].blendEnable = VK_FALSE; + } - VkColorComponentFlags mask = 0; - if (rsx::method_registers.color_mask_a()) mask |= VK_COLOR_COMPONENT_A_BIT; - if (rsx::method_registers.color_mask_b()) mask |= VK_COLOR_COMPONENT_B_BIT; - if (rsx::method_registers.color_mask_g()) mask |= VK_COLOR_COMPONENT_G_BIT; - if (rsx::method_registers.color_mask_r()) mask |= VK_COLOR_COMPONENT_R_BIT; + VkColorComponentFlags mask = 0; + if 
(rsx::method_registers.color_mask_a()) mask |= VK_COLOR_COMPONENT_A_BIT; + if (rsx::method_registers.color_mask_b()) mask |= VK_COLOR_COMPONENT_B_BIT; + if (rsx::method_registers.color_mask_g()) mask |= VK_COLOR_COMPONENT_G_BIT; + if (rsx::method_registers.color_mask_r()) mask |= VK_COLOR_COMPONENT_R_BIT; - VkColorComponentFlags color_masks[4] = { mask }; + VkColorComponentFlags color_masks[4] = { mask }; - u8 render_targets[] = { 0, 1, 2, 3 }; - - for (u8 idx = 0; idx < m_draw_buffers_count; ++idx) - { - properties.att_state[render_targets[idx]].colorWriteMask = mask; - } - - if (rsx::method_registers.blend_enabled()) - { - VkBlendFactor sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb()); - VkBlendFactor sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a()); - VkBlendFactor dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb()); - VkBlendFactor dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a()); - - VkBlendOp equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb()); - VkBlendOp equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a()); + u8 render_targets[] = { 0, 1, 2, 3 }; for (u8 idx = 0; idx < m_draw_buffers_count; ++idx) { - properties.att_state[render_targets[idx]].blendEnable = VK_TRUE; - properties.att_state[render_targets[idx]].srcColorBlendFactor = sfactor_rgb; - properties.att_state[render_targets[idx]].dstColorBlendFactor = dfactor_rgb; - properties.att_state[render_targets[idx]].srcAlphaBlendFactor = sfactor_a; - properties.att_state[render_targets[idx]].dstAlphaBlendFactor = dfactor_a; - properties.att_state[render_targets[idx]].colorBlendOp = equation_rgb; - properties.att_state[render_targets[idx]].alphaBlendOp = equation_a; + properties.att_state[render_targets[idx]].colorWriteMask = mask; } - - auto blend_colors = rsx::get_constant_blend_colors(); - properties.cs.blendConstants[0] = blend_colors[0]; - 
properties.cs.blendConstants[1] = blend_colors[1]; - properties.cs.blendConstants[2] = blend_colors[2]; - properties.cs.blendConstants[3] = blend_colors[3]; - } - else - { - for (u8 idx = 0; idx < m_draw_buffers_count; ++idx) + + if (rsx::method_registers.blend_enabled()) { - properties.att_state[render_targets[idx]].blendEnable = VK_FALSE; - } - } - - properties.cs.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - properties.cs.attachmentCount = m_draw_buffers_count; - properties.cs.pAttachments = properties.att_state; - - if (rsx::method_registers.logic_op_enabled()) - { - properties.cs.logicOpEnable = true; - properties.cs.logicOp = vk::get_logic_op(rsx::method_registers.logic_operation()); - } + VkBlendFactor sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb()); + VkBlendFactor sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a()); + VkBlendFactor dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb()); + VkBlendFactor dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a()); - properties.ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - properties.ds.depthWriteEnable = rsx::method_registers.depth_write_enabled() ? 
VK_TRUE : VK_FALSE; + VkBlendOp equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb()); + VkBlendOp equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a()); - if (rsx::method_registers.depth_bounds_test_enabled()) - { - properties.ds.depthBoundsTestEnable = VK_TRUE; - properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min(); - properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max(); - } - else - properties.ds.depthBoundsTestEnable = VK_FALSE; + for (u8 idx = 0; idx < m_draw_buffers_count; ++idx) + { + properties.att_state[render_targets[idx]].blendEnable = VK_TRUE; + properties.att_state[render_targets[idx]].srcColorBlendFactor = sfactor_rgb; + properties.att_state[render_targets[idx]].dstColorBlendFactor = dfactor_rgb; + properties.att_state[render_targets[idx]].srcAlphaBlendFactor = sfactor_a; + properties.att_state[render_targets[idx]].dstAlphaBlendFactor = dfactor_a; + properties.att_state[render_targets[idx]].colorBlendOp = equation_rgb; + properties.att_state[render_targets[idx]].alphaBlendOp = equation_a; + } - if (rsx::method_registers.stencil_test_enabled()) - { - properties.ds.stencilTestEnable = VK_TRUE; - properties.ds.front.writeMask = rsx::method_registers.stencil_mask(); - properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask(); - properties.ds.front.reference = rsx::method_registers.stencil_func_ref(); - properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail()); - properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass()); - properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail()); - properties.ds.front.compareOp = vk::get_compare_func(rsx::method_registers.stencil_func()); - - if (rsx::method_registers.two_sided_stencil_test_enabled()) - { - properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask(); - properties.ds.back.compareMask = 
rsx::method_registers.back_stencil_func_mask(); - properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref(); - properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail()); - properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass()); - properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail()); - properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func()); + auto blend_colors = rsx::get_constant_blend_colors(); + properties.cs.blendConstants[0] = blend_colors[0]; + properties.cs.blendConstants[1] = blend_colors[1]; + properties.cs.blendConstants[2] = blend_colors[2]; + properties.cs.blendConstants[3] = blend_colors[3]; } else - properties.ds.back = properties.ds.front; + { + for (u8 idx = 0; idx < m_draw_buffers_count; ++idx) + { + properties.att_state[render_targets[idx]].blendEnable = VK_FALSE; + } + } + + properties.cs.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + properties.cs.attachmentCount = m_draw_buffers_count; + properties.cs.pAttachments = properties.att_state; + + if (rsx::method_registers.logic_op_enabled()) + { + properties.cs.logicOpEnable = true; + properties.cs.logicOp = vk::get_logic_op(rsx::method_registers.logic_operation()); + } + + properties.ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + properties.ds.depthWriteEnable = rsx::method_registers.depth_write_enabled() ? 
VK_TRUE : VK_FALSE; + + if (rsx::method_registers.depth_bounds_test_enabled()) + { + properties.ds.depthBoundsTestEnable = VK_TRUE; + properties.ds.minDepthBounds = rsx::method_registers.depth_bounds_min(); + properties.ds.maxDepthBounds = rsx::method_registers.depth_bounds_max(); + } + else + properties.ds.depthBoundsTestEnable = VK_FALSE; + + if (rsx::method_registers.stencil_test_enabled()) + { + properties.ds.stencilTestEnable = VK_TRUE; + properties.ds.front.writeMask = rsx::method_registers.stencil_mask(); + properties.ds.front.compareMask = rsx::method_registers.stencil_func_mask(); + properties.ds.front.reference = rsx::method_registers.stencil_func_ref(); + properties.ds.front.failOp = vk::get_stencil_op(rsx::method_registers.stencil_op_fail()); + properties.ds.front.passOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zpass()); + properties.ds.front.depthFailOp = vk::get_stencil_op(rsx::method_registers.stencil_op_zfail()); + properties.ds.front.compareOp = vk::get_compare_func(rsx::method_registers.stencil_func()); + + if (rsx::method_registers.two_sided_stencil_test_enabled()) + { + properties.ds.back.writeMask = rsx::method_registers.back_stencil_mask(); + properties.ds.back.compareMask = rsx::method_registers.back_stencil_func_mask(); + properties.ds.back.reference = rsx::method_registers.back_stencil_func_ref(); + properties.ds.back.failOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail()); + properties.ds.back.passOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass()); + properties.ds.back.depthFailOp = vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail()); + properties.ds.back.compareOp = vk::get_compare_func(rsx::method_registers.back_stencil_func()); + } + else + properties.ds.back = properties.ds.front; + } + else + properties.ds.stencilTestEnable = VK_FALSE; + + if (rsx::method_registers.depth_test_enabled()) + { + properties.ds.depthTestEnable = VK_TRUE; + properties.ds.depthCompareOp = 
vk::get_compare_func(rsx::method_registers.depth_func()); + } + else + properties.ds.depthTestEnable = VK_FALSE; + + properties.rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + properties.rs.polygonMode = VK_POLYGON_MODE_FILL; + properties.rs.depthClampEnable = VK_FALSE; + properties.rs.rasterizerDiscardEnable = VK_FALSE; + properties.rs.depthBiasEnable = VK_FALSE; + + if (rsx::method_registers.cull_face_enabled()) + properties.rs.cullMode = vk::get_cull_face(rsx::method_registers.cull_face_mode()); + else + properties.rs.cullMode = VK_CULL_MODE_NONE; + + properties.rs.frontFace = vk::get_front_face(rsx::method_registers.front_face_mode()); + + size_t idx = vk::get_render_pass_location( + vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first, + vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()), + (u8)vk::get_draw_buffers(rsx::method_registers.surface_color_target()).size()); + + properties.render_pass = m_render_passes[idx]; + + properties.num_targets = m_draw_buffers_count; + + vk::enter_uninterruptible(); + + //Load current program from buffer + m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get(); } else - properties.ds.stencilTestEnable = VK_FALSE; - - if (rsx::method_registers.depth_test_enabled()) - { - properties.ds.depthTestEnable = VK_TRUE; - properties.ds.depthCompareOp = vk::get_compare_func(rsx::method_registers.depth_func()); - } - else - properties.ds.depthTestEnable = VK_FALSE; - - properties.rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - properties.rs.polygonMode = VK_POLYGON_MODE_FILL; - properties.rs.depthClampEnable = VK_FALSE; - properties.rs.rasterizerDiscardEnable = VK_FALSE; - properties.rs.depthBiasEnable = VK_FALSE; - - if (rsx::method_registers.cull_face_enabled()) - properties.rs.cullMode = 
vk::get_cull_face(rsx::method_registers.cull_face_mode()); - else - properties.rs.cullMode = VK_CULL_MODE_NONE; - - properties.rs.frontFace = vk::get_front_face(rsx::method_registers.front_face_mode()); - - size_t idx = vk::get_render_pass_location( - vk::get_compatible_surface_format(rsx::method_registers.surface_color()).first, - vk::get_compatible_depth_surface_format(m_optimal_tiling_supported_formats, rsx::method_registers.surface_depth_fmt()), - (u8)vk::get_draw_buffers(rsx::method_registers.surface_color_target()).size()); - - properties.render_pass = m_render_passes[idx]; - - properties.num_targets = m_draw_buffers_count; - - vk::enter_uninterruptible(); - - //Load current program from buffer - m_program = m_prog_buffer.getGraphicPipelineState(vertex_program, fragment_program, properties, *m_device, pipeline_layout).get(); + vk::enter_uninterruptible(); //TODO: Update constant buffers.. //1. Update scale-offset matrix @@ -1534,26 +1632,35 @@ bool VKGSRender::load_program() m_uniform_buffer_ring_info.unmap(); - const size_t vertex_constants_offset = m_uniform_buffer_ring_info.alloc<256>(512 * 4 * sizeof(float)); - buf = (u8*)m_uniform_buffer_ring_info.map(vertex_constants_offset, 512 * 4 * sizeof(float)); - fill_vertex_program_constants_data(buf); - *(reinterpret_cast(buf + (468 * 4 * sizeof(float)))) = rsx::method_registers.transform_branch_bits(); - m_uniform_buffer_ring_info.unmap(); - - const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program); - const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float)); - const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz); - - buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz); - if (fragment_constants_sz) - m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast(buf), ::narrow(fragment_constants_sz) }, fragment_program); - - 
fill_fragment_state_buffer(buf+fragment_constants_sz, fragment_program); - m_uniform_buffer_ring_info.unmap(); - m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, scale_offset_offset, 256 }, SCALE_OFFSET_BIND_SLOT, descriptor_sets); - m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_constants_offset, 512 * 4 * sizeof(float) }, VERTEX_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets); - m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets); + + if (!fast_update || m_transform_constants_dirty) + { + const size_t vertex_constants_offset = m_uniform_buffer_ring_info.alloc<256>(512 * 4 * sizeof(float)); + buf = (u8*)m_uniform_buffer_ring_info.map(vertex_constants_offset, 512 * 4 * sizeof(float)); + fill_vertex_program_constants_data(buf); + *(reinterpret_cast(buf + (468 * 4 * sizeof(float)))) = rsx::method_registers.transform_branch_bits(); + m_uniform_buffer_ring_info.unmap(); + + m_program->bind_uniform({ m_uniform_buffer_ring_info.heap->value, vertex_constants_offset, 512 * 4 * sizeof(float) }, VERTEX_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets); + m_transform_constants_dirty = false; + } + + if (!fast_update) + { + const size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program); + const size_t fragment_buffer_sz = fragment_constants_sz + (17 * 4 * sizeof(float)); + const size_t fragment_constants_offset = m_uniform_buffer_ring_info.alloc<256>(fragment_buffer_sz); + + buf = (u8*)m_uniform_buffer_ring_info.map(fragment_constants_offset, fragment_buffer_sz); + if (fragment_constants_sz) + m_prog_buffer.fill_fragment_constants_buffer({ reinterpret_cast(buf), ::narrow(fragment_constants_sz) }, fragment_program); + + fill_fragment_state_buffer(buf + fragment_constants_sz, fragment_program); + m_uniform_buffer_ring_info.unmap(); + + m_program->bind_uniform({ 
m_uniform_buffer_ring_info.heap->value, fragment_constants_offset, fragment_buffer_sz }, FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT, descriptor_sets); + } vk::leave_uninterruptible(); @@ -1642,6 +1749,7 @@ void VKGSRender::prepare_rtts() if (!m_rtts_dirty) return; + close_render_pass(); copy_render_targets_to_dma_location(); m_rtts_dirty = false; @@ -1773,6 +1881,7 @@ void VKGSRender::flip(int buffer) std::chrono::time_point flip_start = steady_clock::now(); + close_render_pass(); process_swap_request(); if (!resize_screen) @@ -1859,7 +1968,7 @@ void VKGSRender::flip(int buffer) swap_image_view.push_back(std::make_unique(*m_device, target_image, VK_IMAGE_VIEW_TYPE_2D, m_swap_chain->get_surface_format(), vk::default_component_map(), subres)); direct_fbo.reset(new vk::framebuffer(*m_device, single_target_pass, m_client_width, m_client_height, std::move(swap_image_view))); - m_text_writer->print_text(*m_current_command_buffer, *direct_fbo, 0, 0, direct_fbo->width(), direct_fbo->height(), "draw calls: " + std::to_string(m_draw_calls)); + m_text_writer->print_text(*m_current_command_buffer, *direct_fbo, 0, 0, direct_fbo->width(), direct_fbo->height(), "draw calls: " + std::to_string(m_draw_calls) + ", instanced repeats: " + std::to_string(m_instanced_draws)); m_text_writer->print_text(*m_current_command_buffer, *direct_fbo, 0, 18, direct_fbo->width(), direct_fbo->height(), "draw call setup: " + std::to_string(m_setup_time) + "us"); m_text_writer->print_text(*m_current_command_buffer, *direct_fbo, 0, 36, direct_fbo->width(), direct_fbo->height(), "vertex upload time: " + std::to_string(m_vertex_upload_time) + "us"); m_text_writer->print_text(*m_current_command_buffer, *direct_fbo, 0, 54, direct_fbo->width(), direct_fbo->height(), "texture upload time: " + std::to_string(m_textures_upload_time) + "us"); @@ -1956,6 +2065,7 @@ void VKGSRender::flip(int buffer) //Resource destruction is handled within the real swap handler m_draw_calls = 0; + m_instanced_draws = 0; m_draw_time = 0; 
m_setup_time = 0; m_vertex_upload_time = 0; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index cfaef161a1..3f9d490110 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -156,6 +156,8 @@ private: u32 m_client_height = 0; u32 m_draw_calls = 0; + u32 m_instanced_draws = 0; + s64 m_setup_time = 0; s64 m_vertex_upload_time = 0; s64 m_textures_upload_time = 0; @@ -176,6 +178,16 @@ private: std::atomic m_queued_threads = { 0 }; std::thread::id rsx_thread; + + VkPrimitiveTopology m_last_primititve_type; + VkIndexType m_last_ib_type; + VkDescriptorSet m_last_descriptor_set; + size_t m_last_ib_offset; + u32 m_last_vertex_count; + bool m_last_draw_indexed; + u32 m_last_instanced_cb_index; + + bool render_pass_open = false; #ifdef __linux__ Display *m_display_handle = nullptr; @@ -197,10 +209,15 @@ private: void queue_swap_request(); void process_swap_request(); + void begin_render_pass(); + void close_render_pass(); + + void emit_geometry_instance(u32 instance_count); + /// returns primitive topology, is_indexed, index_count, offset in index buffer, index type std::tuple > > upload_vertex_data(); public: - bool load_program(); + bool load_program(bool fast_update = false); void init_buffers(bool skip_reading = false); void read_buffers(); void write_buffers(); diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index 6f74c25634..311c09aeec 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -18,7 +18,7 @@ #include "../Common/TextureUtils.h" #include "../Common/ring_buffer_helper.h" -#define DESCRIPTOR_MAX_DRAW_CALLS 1024 +#define DESCRIPTOR_MAX_DRAW_CALLS 4096 #define VERTEX_BUFFERS_FIRST_BIND_SLOT 3 #define FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT 2 @@ -1391,6 +1391,7 @@ namespace vk VkDevice m_device; public: VkPipeline pipeline; + u64 attribute_location_mask; program(VkDevice dev, VkPipeline p, const std::vector &vertex_input, const std::vector& fragment_inputs); 
program(const program&) = delete; @@ -1409,14 +1410,17 @@ namespace vk struct vk_data_heap : public data_heap { std::unique_ptr heap; + bool mapped = false; void* map(size_t offset, size_t size) { + mapped = true; return heap->map(offset, size); } void unmap() { + mapped = false; heap->unmap(); } }; diff --git a/rpcs3/Emu/RSX/VK/VKProgramPipeline.cpp b/rpcs3/Emu/RSX/VK/VKProgramPipeline.cpp index 0805c70e76..50118456f6 100644 --- a/rpcs3/Emu/RSX/VK/VKProgramPipeline.cpp +++ b/rpcs3/Emu/RSX/VK/VKProgramPipeline.cpp @@ -10,6 +10,7 @@ namespace vk { load_uniforms(glsl::program_domain::glsl_vertex_program, vertex_input); load_uniforms(glsl::program_domain::glsl_vertex_program, fragment_inputs); + attribute_location_mask = 0; } program::~program() @@ -60,6 +61,7 @@ namespace vk descriptor_writer.dstBinding = uniform.location + TEXTURES_FIRST_BIND_SLOT; vkUpdateDescriptorSets(m_device, 1, &descriptor_writer, 0, nullptr); + attribute_location_mask |= (1ull << (uniform.location + TEXTURES_FIRST_BIND_SLOT)); return; } } @@ -79,6 +81,7 @@ namespace vk descriptor_writer.dstBinding = binding_point; vkUpdateDescriptorSets(m_device, 1, &descriptor_writer, 0, nullptr); + attribute_location_mask |= (1ull << binding_point); } void program::bind_uniform(const VkBufferView &buffer_view, const std::string &binding_name, VkDescriptorSet &descriptor_set) @@ -97,6 +100,7 @@ namespace vk descriptor_writer.dstBinding = uniform.location + VERTEX_BUFFERS_FIRST_BIND_SLOT; vkUpdateDescriptorSets(m_device, 1, &descriptor_writer, 0, nullptr); + attribute_location_mask |= (1ull << (uniform.location + VERTEX_BUFFERS_FIRST_BIND_SLOT)); return; } } diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp index 40849d4b90..38993b170f 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp @@ -347,11 +347,13 @@ namespace std::vector>& buffer_view_to_clean, std::function>&)> - get_vertex_buffers_f) + get_vertex_buffers_f, + VKGSRender 
*thread) : m_device(device), m_index_buffer_ring_info(index_buffer_ring_info), m_attrib_ring_info(attrib_ring_info), m_program(program), m_descriptor_sets(descriptor_sets), m_buffer_view_to_clean(buffer_view_to_clean), - get_vertex_buffers(get_vertex_buffers_f) + get_vertex_buffers(get_vertex_buffers_f), + rsxthr(thread) { } @@ -450,14 +452,129 @@ namespace std::function>&)> get_vertex_buffers; + VKGSRender* rsxthr; void upload_vertex_buffers(u32 min_index, u32 vertex_max_index) { - vertex_buffer_visitor visitor(vertex_max_index - min_index + 1, m_device, + const u32 vertex_count = vertex_max_index - min_index + 1; + + vertex_buffer_visitor visitor(vertex_count, m_device, m_attrib_ring_info, m_program, m_descriptor_sets, m_buffer_view_to_clean); + const auto& vertex_buffers = get_vertex_buffers( rsx::method_registers, {{min_index, vertex_max_index - min_index + 1}}); - for (const auto& vbo : vertex_buffers) std::apply_visitor(visitor, vbo); + + //1. Check if we can get all these allocations at once + std::vector memory_allocations(16); + std::vector allocated_sizes(16); + std::vector upload_jobs(16); + + memory_allocations.resize(0); + allocated_sizes.resize(0); + upload_jobs.resize(0); + + for (int i = 0; i < vertex_buffers.size(); ++i) + { + const auto &vbo = vertex_buffers[i]; + + if (vbo.which() == 0 && vertex_count > 128 && vertex_buffers.size() > 2 && rsxthr->vertex_upload_task_ready()) + { + //vertex array buffer. 
We can thread this thing heavily + const auto& v = vbo.get(); + + u32 element_size = rsx::get_vertex_type_size_on_host(v.type, v.attribute_size); + u32 real_element_size = vk::get_suitable_vk_size(v.type, v.attribute_size); + + u32 upload_size = real_element_size * vertex_count; + size_t offset = m_attrib_ring_info.alloc<256>(upload_size); + + memory_allocations.push_back(offset); + allocated_sizes.push_back(upload_size); + upload_jobs.push_back(i); + + const VkFormat format = vk::get_suitable_vk_format(v.type, v.attribute_size); + + m_buffer_view_to_clean.push_back(std::make_unique(m_device, m_attrib_ring_info.heap->value, format, offset, upload_size)); + m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets); + } + else + std::apply_visitor(visitor, vbo); + } + + if (memory_allocations.size() > 0) + { + if (memory_allocations.size() > 1) + { + //2 sets in case the blocks dont fit + u8 available_jobs[2] = {}; + u32 allocated_block[2] = {}; + + size_t last_offset = memory_allocations[0]; + u8 current_index = 0; + + for (int n = 0; n < memory_allocations.size(); ++n) + { + if (memory_allocations[n] < last_offset) + { + //queue went around + current_index = 1; + } + + available_jobs[current_index] ++; + allocated_block[current_index] += allocated_sizes[n]; + } + + int n = 0; + for (int task = 0; task < 2; ++task) + { + if (available_jobs[task]) + { + if (m_attrib_ring_info.mapped) + { + rsxthr->wait_for_vertex_upload_task(); + m_attrib_ring_info.unmap(); + } + + size_t space_remaining = allocated_block[task]; + size_t offset_base = memory_allocations[n]; + + gsl::byte* dst = (gsl::byte*)m_attrib_ring_info.map(memory_allocations[n], space_remaining); + + while (true) + { + if (space_remaining == 0) + break; + + const auto& vertex_array = vertex_buffers[upload_jobs[n]].get(); + const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size); + + gsl::span dest_span(dst + 
(memory_allocations[n] - offset_base), allocated_sizes[n]); + rsxthr->post_vertex_stream_to_upload(vertex_array.data, dest_span, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size, vk::prepare_buffer_for_writing); + + space_remaining -= allocated_sizes[n]; + n++; + } + + rsxthr->start_vertex_upload_task(vertex_count); + } + } + } + else + { + const size_t offset_in_attrib_buffer = memory_allocations[0]; + const u32 upload_size = allocated_sizes[0]; + const auto& vertex_array = vertex_buffers[upload_jobs[0]].get(); + const u32 real_element_size = vk::get_suitable_vk_size(vertex_array.type, vertex_array.attribute_size); + + void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size); + gsl::span dest_span(static_cast(dst), upload_size); + + write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size); + vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count); + + m_attrib_ring_info.unmap(); + } + } } u32 upload_inlined_array() @@ -551,6 +668,6 @@ VKGSRender::upload_vertex_data() { draw_command_visitor visitor(*m_device, m_index_buffer_ring_info, m_attrib_ring_info, m_program, descriptor_sets, m_buffer_view_to_clean, - [this](const auto& state, const auto& range) { return this->get_vertex_buffers(state, range); }); + [this](const auto& state, const auto& range) { return this->get_vertex_buffers(state, range);}, this); return std::apply_visitor(visitor, get_draw_command(rsx::method_registers)); } diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index c93d682d87..93fc7f5b81 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -385,6 +385,20 @@ namespace rsx rsx->m_textures_dirty[index] = true; } }; + + template + struct set_vertex_array_dirty_bit + { + static void impl(thread* rsx, u32, u32) + { + 
rsx->m_vertex_attribs_changed = true; + } + }; + + void set_idbuf_dirty_bit(thread* rsx, u32, u32) + { + rsx->m_index_buffer_changed = true; + } } namespace nv308a @@ -1428,6 +1442,8 @@ namespace rsx bind_range(); bind_range(); bind_range(); + bind_range(); + bind(); //NV308A bind_range(); diff --git a/rpcs3/Emu/RSX/rsx_vertex_data.h b/rpcs3/Emu/RSX/rsx_vertex_data.h index 14141a430c..9cd09fdd86 100644 --- a/rpcs3/Emu/RSX/rsx_vertex_data.h +++ b/rpcs3/Emu/RSX/rsx_vertex_data.h @@ -117,6 +117,7 @@ struct push_buffer_vertex_info case vertex_base_type::f: *(u32*)dst = se_storage::swap(arg); break; + case vertex_base_type::s1: case vertex_base_type::ub: case vertex_base_type::ub256: *(u32*)dst = arg; diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 16800fc802..3ee136c4e3 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -314,6 +314,9 @@ struct cfg_root : cfg::node cfg::_bool invalidate_surface_cache_every_frame{this, "Invalidate Cache Every Frame", true}; cfg::_bool strict_rendering_mode{this, "Strict Rendering Mode"}; + cfg::_bool batch_instanced_geometry{this, "Batch Instanced Geometry", false}; + cfg::_int<1, 16> vertex_upload_threads{ this, "Vertex Upload Threads", 1 }; + struct node_d3d12 : cfg::node { node_d3d12(cfg::node* _this) : cfg::node(_this, "D3D12") {}