diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index f3e1397423..0bd4212288 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -809,6 +809,7 @@ void VKGSRender::begin() std::chrono::time_point submit_start = steady_clock::now(); flush_command_queue(true); + m_vertex_cache.purge(); CHECK_RESULT(vkResetDescriptorPool(*m_device, descriptor_pool, 0)); m_last_descriptor_set = VK_NULL_HANDLE; @@ -1534,6 +1535,8 @@ void VKGSRender::process_swap_request() m_text_writer->reset_descriptors(); } + m_vertex_cache.purge(); + m_swap_command_buffer = nullptr; } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 300eda6b99..6d021135d3 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -90,6 +90,49 @@ struct command_buffer_chunk: public vk::command_buffer } }; +struct weak_vertex_cache +{ + struct uploaded_range + { + u32 offset_in_heap; + + VkFormat buffer_format; + uintptr_t local_address; + u32 data_length; + }; + +private: + std::vector vertex_ranges; +public: + + uploaded_range* find_vertex_range(uintptr_t local_addr, VkFormat fmt, u32 data_length) + { + for (auto &v : vertex_ranges) + { + if (v.local_address == local_addr && v.buffer_format == fmt && v.data_length == data_length) + return &v; + } + + return nullptr; + } + + void store_range(uintptr_t local_addr, VkFormat fmt, u32 data_length, u32 offset_in_heap) + { + uploaded_range v = {}; + v.buffer_format = fmt; + v.data_length = data_length; + v.local_address = local_addr; + v.offset_in_heap = offset_in_heap; + + vertex_ranges.push_back(v); + } + + void purge() + { + vertex_ranges.resize(0); + } +}; + class VKGSRender : public GSRender { private: @@ -114,6 +157,7 @@ private: public: //vk::fbo draw_fbo; + weak_vertex_cache m_vertex_cache; private: VKProgramBuffer m_prog_buffer; diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp index c15b1980ec..26fed96214 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp @@ -251,9 +251,11 @@ namespace { vertex_buffer_visitor(u32 vtx_cnt, VkDevice dev, vk::vk_data_heap& heap, vk::glsl::program* prog, VkDescriptorSet desc_set, - std::vector>& buffer_view_to_clean) + std::vector>& buffer_view_to_clean, + weak_vertex_cache& vertex_cache) : vertex_count(vtx_cnt), m_attrib_ring_info(heap), device(dev), m_program(prog), - descriptor_sets(desc_set), m_buffer_view_to_clean(buffer_view_to_clean) + descriptor_sets(desc_set), m_buffer_view_to_clean(buffer_view_to_clean), + vertex_cache(&vertex_cache) { } @@ -281,6 +283,9 @@ namespace m_attrib_ring_info.unmap(); const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size); + const uintptr_t local_addr = (uintptr_t)vertex_array.data.data(); + vertex_cache->store_range(local_addr, format, upload_size, (u32)offset_in_attrib_buffer); + m_buffer_view_to_clean.push_back(std::make_unique(device, m_attrib_ring_info.heap->value, format, offset_in_attrib_buffer, upload_size)); m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[vertex_array.index], descriptor_sets); } @@ -336,6 +341,7 @@ namespace vk::glsl::program* m_program; VkDescriptorSet descriptor_sets; std::vector>& m_buffer_view_to_clean; + weak_vertex_cache* vertex_cache; }; using attribute_storage = std::vectorm_vertex_cache); const auto& vertex_buffers = get_vertex_buffers( rsx::method_registers, {{min_index, vertex_max_index - min_index + 1}}); @@ -483,26 +489,38 @@ namespace const auto &vbo = vertex_buffers[i]; bool can_multithread = false; - if (vbo.which() == 0 && vertex_count >= (u32)g_cfg.video.mt_vertex_upload_threshold && rsxthr->vertex_upload_task_ready()) + if (vbo.which() == 0) { //vertex array buffer. We can thread this thing heavily const auto& v = vbo.get(); + + const u32 element_size = rsx::get_vertex_type_size_on_host(v.type, v.attribute_size); + const u32 real_element_size = vk::get_suitable_vk_size(v.type, v.attribute_size); + const u32 upload_size = real_element_size * vertex_count; + const VkFormat format = vk::get_suitable_vk_format(v.type, v.attribute_size); + const uintptr_t local_addr = (uintptr_t)v.data.data(); + + const auto cached = rsxthr->m_vertex_cache.find_vertex_range(local_addr, format, upload_size); + if (cached) + { + m_buffer_view_to_clean.push_back(std::make_unique(m_device, m_attrib_ring_info.heap->value, format, cached->offset_in_heap, upload_size)); + m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets); + + continue; + } - if (v.attribute_size > 1) + if (v.attribute_size > 1 && vertex_count >= (u32)g_cfg.video.mt_vertex_upload_threshold && rsxthr->vertex_upload_task_ready()) { can_multithread = true; - u32 element_size = rsx::get_vertex_type_size_on_host(v.type, v.attribute_size); - u32 real_element_size = vk::get_suitable_vk_size(v.type, v.attribute_size); - - u32 upload_size = real_element_size * vertex_count; size_t offset = m_attrib_ring_info.alloc<256>(upload_size); memory_allocations.push_back(offset); allocated_sizes.push_back(upload_size); upload_jobs.push_back(i); - const VkFormat format = vk::get_suitable_vk_format(v.type, v.attribute_size); + const uintptr_t local_addr = (uintptr_t)v.data.data(); + rsxthr->m_vertex_cache.store_range(local_addr, format, upload_size, (u32)offset); m_buffer_view_to_clean.push_back(std::make_unique(m_device, m_attrib_ring_info.heap->value, format, offset, upload_size)); m_program->bind_uniform(m_buffer_view_to_clean.back()->value, s_reg_table[v.index], m_descriptor_sets);