diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index b345092287..106554c2e0 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -493,6 +493,11 @@ VKGSRender::VKGSRender() : GSRender(frame_type::Vulkan)
 	//create command buffer...
 	m_command_buffer_pool.create((*m_device));
 	m_command_buffer.create(m_command_buffer_pool);
+
+	//Create secondary command_buffer for parallel operations
+	m_secondary_command_buffer_pool.create((*m_device));
+	m_secondary_command_buffer.create(m_secondary_command_buffer_pool);
+
 	open_command_buffer();
 
 	for (u32 i = 0; i < m_swap_chain->get_swap_image_count(); ++i)
@@ -620,6 +625,9 @@ VKGSRender::~VKGSRender()
 	m_command_buffer.destroy();
 	m_command_buffer_pool.destroy();
 
+	m_secondary_command_buffer.destroy();
+	m_secondary_command_buffer_pool.destroy();
+
 	//Device handles/contexts
 	m_swap_chain->destroy();
 	m_thread_context.close();
@@ -632,7 +640,29 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 	if (is_writing)
 		return m_texture_cache.invalidate_address(address);
 	else
-		return m_texture_cache.flush_address(address, *m_device, m_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue());
+	{
+		if (!m_texture_cache.address_is_flushable(address))
+			return false;
+
+		if (std::this_thread::get_id() != rsx_thread)
+		{
+			//TODO: Guard this when the renderer is flushing the command queue, might deadlock otherwise
+			m_flush_commands = true;
+			m_queued_threads++;
+
+			//This is awful!
+			while (m_flush_commands);
+
+			std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
+			bool status = m_texture_cache.flush_address(address, *m_device, m_secondary_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue());
+
+			m_queued_threads--;
+			return status;
+		}
+
+		std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
+		return m_texture_cache.flush_address(address, *m_device, m_secondary_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue());
+	}
 
 	return false;
 }
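The spin-wait above (`while (m_flush_commands);`) burns a CPU core on every off-thread flush until the RSX thread services the request, and the code itself flags this ("This is awful!"). A condition-variable handshake is the conventional alternative; the sketch below is illustrative only, and `m_flush_mutex`, `m_flush_cv`, `wait_for_rsx_flush` and `signal_flush_done` are hypothetical names, not part of this patch:

    // Hypothetical alternative to the busy-wait (not part of this patch).
    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    std::mutex m_flush_mutex;            // would guard m_flush_commands
    std::condition_variable m_flush_cv;  // signalled once the flush completes

    // Faulting thread: request a flush, then sleep instead of spinning.
    void wait_for_rsx_flush(std::atomic<bool>& flush_commands)
    {
        std::unique_lock<std::mutex> lock(m_flush_mutex);
        m_flush_cv.wait(lock, [&] { return !flush_commands; });
    }

    // RSX thread: finishes the flush inside do_local_task(), then wakes all waiters.
    void signal_flush_done(std::atomic<bool>& flush_commands)
    {
        {
            std::lock_guard<std::mutex> lock(m_flush_mutex);
            flush_commands = false;
        }
        m_flush_cv.notify_all();
    }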
@@ -646,7 +676,9 @@ void VKGSRender::begin()
 {
 	std::chrono::time_point<steady_clock> submit_start = steady_clock::now();
 
-	close_and_submit_command_buffer({}, m_submit_fence);
+	//??Should we wait for the queue to actually render to the GPU? or just flush the queue?
+	//Needs investigation to determine what drivers expect here, bottom_of_pipe is guaranteed to work, but will be too slow
+	close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
 	CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
 
 	vkResetDescriptorPool(*m_device, descriptor_pool, 0);
@@ -833,9 +865,9 @@ void VKGSRender::end()
 	std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
 	m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - vertex_end).count();
 
-	rsx::thread::end();
-
 	copy_render_targets_to_dma_location();
+
+	rsx::thread::end();
 }
 
 void VKGSRender::set_viewport()
@@ -875,6 +907,8 @@ void VKGSRender::on_init_thread()
 	GSRender::on_init_thread();
 	m_attrib_ring_info.init(8 * RING_BUFFER_SIZE);
 	m_attrib_ring_info.heap.reset(new vk::buffer(*m_device, 8 * RING_BUFFER_SIZE, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT|VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0));
+
+	rsx_thread = std::this_thread::get_id();
 }
 
 void VKGSRender::on_exit()
@@ -987,13 +1021,6 @@ void VKGSRender::clear_surface(u32 mask)
 
 void VKGSRender::sync_at_semaphore_release()
 {
-	close_and_submit_command_buffer({}, m_submit_fence);
-	CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
-
-	CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
-	CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));
-	open_command_buffer();
-
 	m_flush_draw_buffers = true;
 }
 
@@ -1002,6 +1029,13 @@ void VKGSRender::copy_render_targets_to_dma_location()
 	if (!m_flush_draw_buffers)
 		return;
 
+	if (!g_cfg_rsx_write_color_buffers && !g_cfg_rsx_write_depth_buffer)
+		return;
+
+	//TODO: Make this asynchronous. Should be similar to a glFlush() but in this case it's similar to glFinish
+	//This is due to all the hard waits for fences
+	//TODO: Use a command buffer array to allow explicit draw command tracking
+
 	if (g_cfg_rsx_write_color_buffers)
 	{
 		for (u8 index = 0; index < rsx::limits::color_buffers_count; index++)
@@ -1023,7 +1057,28 @@ void VKGSRender::copy_render_targets_to_dma_location()
 		}
 	}
 
-	m_flush_draw_buffers = false;
+	close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
+	CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
+
+	CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
+	CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));
+	open_command_buffer();
+}
+
+void VKGSRender::do_local_task()
+{
+	if (m_flush_commands)
+	{
+		close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
+		CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
+
+		CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
+		CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));
+		open_command_buffer();
+
+		m_flush_commands = false;
+		while (m_queued_threads);
+	}
 }
 
 bool VKGSRender::do_method(u32 cmd, u32 arg)
@@ -1294,17 +1349,16 @@ void VKGSRender::write_buffers()
 {
 }
 
-void VKGSRender::close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence)
+void VKGSRender::close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags)
 {
 	CHECK_RESULT(vkEndCommandBuffer(m_command_buffer));
 
-	VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
 	VkCommandBuffer cmd = m_command_buffer;
 
 	VkSubmitInfo infos = {};
 	infos.commandBufferCount = 1;
 	infos.pCommandBuffers = &cmd;
-	infos.pWaitDstStageMask = &pipe_stage_flags;
+	infos.pWaitDstStageMask = &pipeline_stage_flags;
 	infos.pWaitSemaphores = semaphores.data();
 	infos.waitSemaphoreCount = static_cast<u32>(semaphores.size());
 	infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
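On the `pipeline_stage_flags` parameter added to `close_and_submit_command_buffer` above: `VkSubmitInfo::pWaitDstStageMask` only names the pipeline stages at which the *semaphore waits* of that submit take effect, one entry per wait semaphore. The `close_and_submit_command_buffer({}, ...)` call sites pass an empty semaphore list, so `waitSemaphoreCount` is 0 and the mask is never read; TOP_OF_PIPE vs BOTTOM_OF_PIPE changes nothing for those submissions. It only matters for a submit that actually waits, e.g. (sketch; `acquire_semaphore`, `queue`, `cmd` and `fence` are assumed declared elsewhere):

    // Sketch: the stage mask pairs with a wait semaphore.
    VkPipelineStageFlags wait_stage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

    VkSubmitInfo info = {};
    info.sType              = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    info.waitSemaphoreCount = 1;
    info.pWaitSemaphores    = &acquire_semaphore; // e.g. from vkAcquireNextImageKHR
    info.pWaitDstStageMask  = &wait_stage;        // stages that stall until the semaphore signals
    info.commandBufferCount = 1;
    info.pCommandBuffers    = &cmd;

    CHECK_RESULT(vkQueueSubmit(queue, 1, &info, fence));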
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 3ff32dd20b..de42ae1937 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -11,6 +11,7 @@
 #include "VKProgramBuffer.h"
 #include "../GCM.h"
 #include "../rsx_utils.h"
+#include <thread>
 
 #pragma comment(lib, "VKstatic.1.lib")
 
@@ -60,6 +61,9 @@ private:
 	vk::command_pool m_command_buffer_pool;
 	vk::command_buffer m_command_buffer;
 
+	std::mutex m_secondary_cb_guard;
+	vk::command_pool m_secondary_command_buffer_pool;
+	vk::command_buffer m_secondary_command_buffer;
 
 	std::array<VkRenderPass, 120> m_render_passes;
 	VkDescriptorSetLayout descriptor_layouts;
@@ -86,7 +90,13 @@ private:
 	rsx::gcm_framebuffer_info m_surface_info[rsx::limits::color_buffers_count];
 	rsx::gcm_framebuffer_info m_depth_surface_info;
+
 	bool m_flush_draw_buffers = false;
+
+	std::atomic<bool> m_flush_commands = false;
+	std::atomic<int> m_queued_threads = 0;
+
+	std::thread::id rsx_thread;
 
 public:
 	VKGSRender();
@@ -94,7 +104,7 @@ public:
 
 private:
 	void clear_surface(u32 mask);
-	void close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence);
+	void close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
 	void open_command_buffer();
 	void sync_at_semaphore_release();
 	void prepare_rtts();
@@ -117,5 +127,7 @@ protected:
 	bool do_method(u32 id, u32 arg) override;
 	void flip(int buffer) override;
 
+	void do_local_task() override;
+
 	bool on_access_violation(u32 address, bool is_writing) override;
 };
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h
index 40f0ba9bd7..3f624b64f1 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.h
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.h
@@ -32,7 +32,7 @@ namespace rsx
 
 namespace vk
 {
-#define CHECK_RESULT(expr) do { VkResult _res = (expr); if (_res != VK_SUCCESS) fmt::throw_exception("Assertion failed! Result is %Xh", (s32)_res); } while (0)
+#define CHECK_RESULT(expr) { VkResult _res = (expr); if (_res != VK_SUCCESS) fmt::throw_exception("Assertion failed! Result is %Xh" HERE, (s32)_res); }
 
 	VKAPI_ATTR void *VKAPI_CALL mem_realloc(void *pUserData, void *pOriginal, size_t size, size_t alignment, VkSystemAllocationScope allocationScope);
 	VKAPI_ATTR void *VKAPI_CALL mem_alloc(void *pUserData, size_t size, size_t alignment, VkSystemAllocationScope allocationScope);
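One caveat with the `CHECK_RESULT` rewrite above: dropping the `do { ... } while (0)` wrapper means the macro no longer expands to a single statement. With a bare block, the semicolon the caller writes after `CHECK_RESULT(...)` becomes an empty statement, which breaks `if`/`else` chains. A minimal illustration (`CHECK_RESULT_SAFE` is just an illustrative name, not proposed for the tree):

    // With the new bare-block form this fails to compile:
    //
    //     if (use_fence)
    //         CHECK_RESULT(vkQueueSubmit(queue, 1, &info, fence));       // expands to { ... } ;
    //     else                                                           // error: 'else' without a matching 'if'
    //         CHECK_RESULT(vkQueueSubmit(queue, 1, &info, VK_NULL_HANDLE));
    //
    // The do/while(0) idiom keeps the expansion a single statement:
    #define CHECK_RESULT_SAFE(expr) \
        do { VkResult _res = (expr); if (_res != VK_SUCCESS) \
            fmt::throw_exception("Assertion failed! Result is %Xh" HERE, (s32)_res); } while (0)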
Result is %Xh" HERE, (s32)_res); } VKAPI_ATTR void *VKAPI_CALL mem_realloc(void *pUserData, void *pOriginal, size_t size, size_t alignment, VkSystemAllocationScope allocationScope); VKAPI_ATTR void *VKAPI_CALL mem_alloc(void *pUserData, size_t size, size_t alignment, VkSystemAllocationScope allocationScope); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index f4c25cd6e6..e85541c962 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -28,6 +28,14 @@ namespace vk cached_texture_section() {} + void reset(u32 base, u32 length) + { + if (length > cpu_address_range) + release_dma_resources(); + + rsx::buffered_section::reset(base, length); + } + void create(const u16 w, const u16 h, const u16 depth, const u16 mipmaps, vk::image_view *view, vk::image *image, const u32 native_pitch = 0, bool managed=true) { width = w; @@ -38,8 +46,7 @@ namespace vk uploaded_image_view.reset(view); vram_texture = image; - if (managed) - managed_texture.reset(image); + if (managed) managed_texture.reset(image); //TODO: Properly compute these values this->native_pitch = native_pitch; @@ -105,16 +112,18 @@ namespace vk bool is_flushable() const { - if (protection == utils::protection::ro || protection == utils::protection::no) - return true; - - if (uploaded_image_view.get() == nullptr && vram_texture != nullptr) - return true; - - return false; + //This section is active and can be flushed to cpu + return (protection == utils::protection::no); } - void copy_texture(vk::command_buffer& cmd, u32 heap_index, VkQueue submit_queue, VkImageLayout layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) + bool is_flushed() const + { + //This memory section was flushable, but a flush has already removed protection + return (protection == utils::protection::rw && uploaded_image_view.get() == nullptr && managed_texture.get() == nullptr); + } + + void copy_texture(vk::command_buffer& cmd, u32 heap_index, VkQueue submit_queue, + bool manage_cb_lifetime = false, VkImageLayout layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { if (m_device == nullptr) { @@ -130,7 +139,21 @@ namespace vk if (dma_buffer.get() == nullptr) { - dma_buffer.reset(new vk::buffer(*m_device, native_pitch * height, heap_index, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0)); + dma_buffer.reset(new vk::buffer(*m_device, align(cpu_address_range, 256), heap_index, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0)); + } + + if (manage_cb_lifetime) + { + //cb has to be guaranteed to be in a closed state + //This function can be called asynchronously + VkCommandBufferInheritanceInfo inheritance_info = {}; + inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO; + + VkCommandBufferBeginInfo begin_infos = {}; + begin_infos.pInheritanceInfo = &inheritance_info; + begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + CHECK_RESULT(vkBeginCommandBuffer(cmd, &begin_infos)); } VkBufferImageCopy copyRegion = {}; @@ -147,52 +170,47 @@ namespace vk vkCmdCopyImageToBuffer(cmd, vram_texture->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, ©Region); change_image_layout(cmd, vram_texture->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, layout, subresource_range); - CHECK_RESULT(vkEndCommandBuffer(cmd)); + if (manage_cb_lifetime) + { + CHECK_RESULT(vkEndCommandBuffer(cmd)); - VkPipelineStageFlags pipe_stage_flags = 
@@ -147,52 +170,47 @@ namespace vk
 			vkCmdCopyImageToBuffer(cmd, vram_texture->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, &copyRegion);
 			change_image_layout(cmd, vram_texture->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, layout, subresource_range);
 
-			CHECK_RESULT(vkEndCommandBuffer(cmd));
+			if (manage_cb_lifetime)
+			{
+				CHECK_RESULT(vkEndCommandBuffer(cmd));
 
-			VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
-			VkCommandBuffer command_buffer = cmd;
+				VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+				VkCommandBuffer command_buffer = cmd;
 
-			VkSubmitInfo infos = {};
-			infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-			infos.commandBufferCount = 1;
-			infos.pCommandBuffers = &command_buffer;
-			infos.pWaitDstStageMask = &pipe_stage_flags;
-			infos.pWaitSemaphores = nullptr;
-			infos.waitSemaphoreCount = 0;
+				VkSubmitInfo infos = {};
+				infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+				infos.commandBufferCount = 1;
+				infos.pCommandBuffers = &command_buffer;
+				infos.pWaitDstStageMask = &pipe_stage_flags;
+				infos.pWaitSemaphores = nullptr;
+				infos.waitSemaphoreCount = 0;
 
-			CHECK_RESULT(vkQueueSubmit(submit_queue, 1, &infos, dma_fence));
+				CHECK_RESULT(vkQueueSubmit(submit_queue, 1, &infos, dma_fence));
 
-			//Now we need to restart the command-buffer to restore it to the way it was before...
-			CHECK_RESULT(vkWaitForFences(*m_device, 1, &dma_fence, VK_TRUE, UINT64_MAX));
-			CHECK_RESULT(vkResetCommandPool(*m_device, cmd.get_command_pool(), 0));
-			CHECK_RESULT(vkResetFences(*m_device, 1, &dma_fence));
-
-			VkCommandBufferInheritanceInfo inheritance_info = {};
-			inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
-
-			VkCommandBufferBeginInfo begin_infos = {};
-			begin_infos.pInheritanceInfo = &inheritance_info;
-			begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
-			begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
-			CHECK_RESULT(vkBeginCommandBuffer(cmd, &begin_infos));
+				//Now we need to restart the command-buffer to restore it to the way it was before...
+				CHECK_RESULT(vkWaitForFences(*m_device, 1, &dma_fence, VK_TRUE, UINT64_MAX));
+				CHECK_RESULT(vkResetCommandPool(*m_device, cmd.get_command_pool(), 0));
+				CHECK_RESULT(vkResetFences(*m_device, 1, &dma_fence));
+			}
 		}
 
 		template <typename T>
 		void do_memory_transfer(void *pixels_dst, void *pixels_src)
 		{
+			//LOG_ERROR(RSX, "COPY %d -> %d", native_pitch, pitch);
 			if (pitch == native_pitch)
 			{
 				if (sizeof T == 1)
-					memcpy(pixels_dst, pixels_src, native_pitch * height);
+					memcpy(pixels_dst, pixels_src, cpu_address_range);
 				else
 				{
-					const u32 block_size = native_pitch * height / sizeof T;
+					const u32 block_size = width * height;
 					auto typed_dst = (be_t<T> *)pixels_dst;
 					auto typed_src = (T *)pixels_src;
 
-					for (u8 n = 0; n < block_size; ++n)
-						typed_dst[n] = typed_src[n];
+					for (u32 px = 0; px < block_size; ++px)
+						typed_dst[px] = typed_src[px];
 				}
 			}
 			else
@@ -203,7 +221,7 @@ namespace vk
 				u8 *typed_src = (u8 *)pixels_src;
 
 				//TODO: Scaling
-				for (int row = 0; row < height; ++row)
+				for (u16 row = 0; row < height; ++row)
 				{
 					memcpy(typed_dst, typed_src, native_pitch);
 					typed_dst += pitch;
@@ -218,9 +236,9 @@ namespace vk
 				auto typed_dst = (be_t<T> *)pixels_dst;
 				auto typed_src = (T *)pixels_src;
 
-				for (int row = 0; row < height; ++row)
+				for (u16 row = 0; row < height; ++row)
 				{
-					for (int px = 0; px < width; ++px)
+					for (u16 px = 0; px < width; ++px)
 					{
 						typed_dst[px] = typed_src[px];
 					}
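The loop-index changes above appear to be correctness fixes rather than style: the old fast path counted with a `u8` against a 32-bit `block_size`, so the counter wrapped at 255 and the loop could never terminate for a surface larger than 255 texels. Worked through with concrete (hypothetical) numbers:

    // Why 'for (u8 n = 0; n < block_size; ++n)' was broken:
    const u32 block_size = 256 * 256;       // a 256x256 surface = 65536 texels

    for (u8 n = 0; n < block_size; ++n)     // n wraps 255 -> 0 (unsigned wraparound),
    {                                       // so n < 65536 stays true: infinite loop,
        /* copy texel n */                  // and texels past index 255 are never reached
    }

    for (u32 px = 0; px < block_size; ++px) // fixed: the index spans the whole surface
    {
        /* copy texel px */
    }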
@@ -240,15 +258,13 @@ namespace vk
 			if (dma_fence == VK_NULL_HANDLE || dma_buffer.get() == nullptr)
 			{
 				LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", cpu_address_base);
-				copy_texture(cmd, heap_index, submit_queue, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
-
-				verify (HERE), (dma_fence != VK_NULL_HANDLE && dma_buffer.get());
+				copy_texture(cmd, heap_index, submit_queue, true, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
 			}
 
 			protect(utils::protection::rw);
 
 			//TODO: Image scaling, etc
-			void* pixels_src = dma_buffer->map(0, VK_WHOLE_SIZE);
+			void* pixels_src = dma_buffer->map(0, cpu_address_range);
 			void* pixels_dst = vm::base(cpu_address_base);
 
 			//We have to do our own byte swapping since the driver doesnt do it for us
@@ -273,12 +289,7 @@ namespace vk
 			}
 
 			dma_buffer->unmap();
-
-			//Cleanup
-			//These sections are usually one-use only so we destroy system resources
-			//TODO: Recycle dma buffers
-			release_dma_resources();
-
-			vram_texture = nullptr; //Let m_rtts handle lifetime management
+			//It's highly likely that this surface will be reused, so we just leave resources in place
 		}
 	};
@@ -333,7 +344,7 @@ namespace vk
 			for (auto &tex : m_cache)
 			{
 				if (tex.is_dirty()) continue;
-				if (!tex.is_flushable()) continue;
+				if (!tex.is_flushable() && !tex.is_flushed()) continue;
 
 				if (tex.matches(address, range))
 					return &tex;
@@ -529,15 +540,16 @@ namespace vk
 		void lock_memory_region(vk::render_target* image, const u32 memory_address, const u32 memory_size, const u32 width, const u32 height)
 		{
 			cached_texture_section& region = find_cached_texture(memory_address, memory_size, true, width, height, 1);
-			region.create(width, height, 1, 1, nullptr, image, image->native_pitch, false);
 
 			if (!region.is_locked())
 			{
 				region.reset(memory_address, memory_size);
-				region.protect(utils::protection::no);
 				region.set_dirty(false);
 				texture_cache_range = region.get_min_max(texture_cache_range);
 			}
+
+			region.protect(utils::protection::no);
+			region.create(width, height, 1, 1, nullptr, image, image->native_pitch, false);
 		}
 
 		void flush_memory_to_cache(const u32 memory_address, const u32 memory_size, vk::command_buffer&cmd, vk::memory_type_mapping& memory_types, VkQueue submit_queue)
@@ -554,6 +566,20 @@ namespace vk
 			region->copy_texture(cmd, memory_types.host_visible_coherent, submit_queue);
 		}
 
+		bool address_is_flushable(u32 address)
+		{
+			for (auto &tex : m_cache)
+			{
+				if (tex.is_dirty()) continue;
+				if (!tex.is_flushable()) continue;
+
+				if (tex.overlaps(address))
+					return true;
+			}
+
+			return false;
+		}
+
 		bool flush_address(u32 address, vk::render_device& dev, vk::command_buffer& cmd, vk::memory_type_mapping& memory_types, VkQueue submit_queue)
 		{
 			if (address < texture_cache_range.first ||
@@ -584,8 +610,6 @@ namespace vk
 					//TODO: Map basic host_visible memory without coherent constraint
 					tex.flush(dev, cmd, memory_types.host_visible_coherent, submit_queue);
-					tex.set_dirty(true);
-
 					response = true;
 				}
 			}
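Taken together, the `is_flushable()`/`is_flushed()` predicates and the `is_dirty()` checks above imply a small per-section state machine. The cache stores no explicit enum; it derives the state from `protection` and the resource pointers. Summarized for reference (illustrative only, not code from this patch):

    // Illustrative summary; the cache derives these states rather than storing them.
    enum class section_state
    {
        flushable, // protection == utils::protection::no: the GPU owns the data and
                   //   any CPU access faults into on_access_violation() -> flush_address()
        flushed,   // protection == rw with image view/managed texture released:
                   //   contents already written back to guest memory, may be re-locked
        dirty      // invalidated; skipped by every lookup until the slot is recycled
    };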
@@ -607,6 +631,7 @@ namespace vk
 				auto &tex = m_cache[i];
 
 				if (tex.is_dirty()) continue;
+				if (!tex.is_locked()) continue;	//flushable sections can be 'clean' but unlocked. TODO: Handle this better
 
 				auto overlapped = tex.overlaps_page(trampled_range, address);
 				if (std::get<0>(overlapped))
diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h
index 7691fe152e..0ac24bbe12 100644
--- a/rpcs3/Emu/RSX/rsx_cache.h
+++ b/rpcs3/Emu/RSX/rsx_cache.h
@@ -136,12 +136,13 @@ namespace rsx
 			locked_address_range = align(base + length, 4096) - locked_address_base;
 
 			protection = utils::protection::rw;
-			locked = false;
 		}
 
 		void protect(utils::protection prot)
 		{
+			if (prot == protection) return;
+
 			utils::memory_protect(vm::base(locked_address_base), locked_address_range, prot);
 			protection = prot;
 			locked = prot != utils::protection::rw;
@@ -149,7 +150,8 @@ namespace rsx
 
 		void unprotect()
 		{
-			return protect(utils::protection::rw);
+			protect(utils::protection::rw);
+			locked = false;
 		}
 
 		bool overlaps(std::pair<u32, u32> range)
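Finally, on the `rsx_cache.h` changes: `protect()` now early-outs when the requested protection already matches the current one, so redundant transitions no longer cost a `memory_protect()` call, and clearing `locked` moved from `reset()` into `unprotect()`. A small usage sketch of the resulting behaviour (`base_address` is a placeholder, and the standalone instance is hypothetical):

    rsx::buffered_section section;           // hypothetical instance
    section.reset(base_address, 4096);       // protection starts out as rw

    section.protect(utils::protection::no);  // memory_protect() runs, locked = true
    section.protect(utils::protection::no);  // no-op: prot == protection already

    section.unprotect();                     // back to rw, and locked = false again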