From cef512a123159828f0d504e8be51f106eb017488 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 30 Jan 2022 14:56:22 +0300
Subject: [PATCH] vk: Spec-compliant async compute

---
 rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp      | 134 +++++++-------------
 rpcs3/Emu/RSX/VK/VKAsyncScheduler.h        |  58 ++-------
 rpcs3/Emu/RSX/VK/VKCommandStream.cpp       |  30 +++--
 rpcs3/Emu/RSX/VK/VKDraw.cpp                |  29 ++++-
 rpcs3/Emu/RSX/VK/VKGSRender.cpp            |  91 ++++++++++----
 rpcs3/Emu/RSX/VK/VKGSRender.h              |   1 +
 rpcs3/Emu/RSX/VK/VKHelpers.h               |   4 +-
 rpcs3/Emu/RSX/VK/VKResourceManager.h       |   5 +
 rpcs3/Emu/RSX/VK/VKTexture.cpp             |  21 +++-
 rpcs3/Emu/RSX/VK/VKTextureCache.cpp        |  47 +++++++-
 rpcs3/Emu/RSX/VK/vkutils/commands.cpp      |  31 ++---
 rpcs3/Emu/RSX/VK/vkutils/commands.h        |  38 +++++-
 rpcs3/Emu/RSX/VK/vkutils/image.cpp         |  70 ++++++-----
 rpcs3/Emu/RSX/VK/vkutils/image.h           |  11 +-
 rpcs3/Emu/RSX/VK/vkutils/image_helpers.cpp |   8 +-
 rpcs3/Emu/RSX/VK/vkutils/image_helpers.h   |   4 +-
 rpcs3/Emu/RSX/VK/vkutils/sync.cpp          |  22 +++-
 rpcs3/Emu/RSX/VK/vkutils/sync.h            |  17 +++
 rpcs3/Emu/system_config.h                  |   2 +-
 rpcs3/Emu/system_config_types.cpp          |   4 +-
 rpcs3/Emu/system_config_types.h            |   4 +-
 21 files changed, 381 insertions(+), 250 deletions(-)

diff --git a/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp
index 6e06490582..219b020ed6 100644
--- a/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp
+++ b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp
@@ -10,7 +10,7 @@
 namespace vk
 {
-	void AsyncTaskScheduler::operator()()
+	AsyncTaskScheduler::AsyncTaskScheduler(vk_gpu_scheduler_mode mode)
 	{
 		if (g_cfg.video.renderer != video_renderer::vulkan || !g_cfg.video.vk.asynchronous_texture_streaming)
 		{
@@ -21,40 +21,15 @@ namespace vk
 		}
 
 		init_config_options();
-		if (!m_use_host_scheduler)
+	}
+
+	AsyncTaskScheduler::~AsyncTaskScheduler()
+	{
+		if (!m_async_command_queue.empty())
 		{
-			// No need to keep the GPU alive using a CPU thread.
-			rsx_log.notice("Host scheduler is disabled. This thread will now exit.");
-			return;
+			// Driver resources should be destroyed before driver is detached or you get crashes. RAII won't save you here.
+			rsx_log.error("Async task scheduler resources were not freed correctly!");
 		}
-
-		// If this thread is unavailable for too long, your GPU will hard crash and force a full reset
-		// TODO: Investigate if this can be executed outside the application context. Attach a debugger to rpcs3 and boom - GPU reset. Not fun rebooting so often.
-		thread_ctrl::set_native_priority(1);
-
-		add_ref();
-
-		while (thread_ctrl::state() != thread_state::aborting)
-		{
-			for (auto&& job : m_event_queue.pop_all())
-			{
-				if (job->type == xqueue_event_type::barrier)
-				{
-					// Blocks the queue from progressing until the work items are actually submitted to the GPU
-					// Avoids spamming the GPU with event requests when the events have not even been submitted yet
-					while (job->completion_eid == m_submit_count.load())
-					{
-						thread_ctrl::wait_for(100);
-					}
-					continue;
-				}
-
-				vk::wait_for_event(job->queue1_signal.get(), GENERAL_WAIT_TIMEOUT);
-				job->queue2_signal->host_signal();
-			}
-		}
-
-		release();
 	}
 
 	void AsyncTaskScheduler::init_config_options()
@@ -66,8 +41,8 @@ namespace vk
 			return;
 		}
 
-		m_use_host_scheduler = g_cfg.video.vk.asynchronous_scheduler == vk_gpu_scheduler_mode::host || g_cfg.video.strict_rendering_mode;
-		rsx_log.notice("Asynchronous task scheduler is active running in %s mode", m_use_host_scheduler? "'Host'" : "'Device'");
+		m_use_host_scheduler = g_cfg.video.vk.asynchronous_scheduler == vk_gpu_scheduler_mode::safe || g_cfg.video.strict_rendering_mode;
+		rsx_log.notice("Asynchronous task scheduler is active running in %s mode", m_use_host_scheduler? "'Safe'" : "'Fast'");
 	}
 
 	void AsyncTaskScheduler::delayed_init()
@@ -77,58 +52,32 @@ namespace vk
 
 		auto pdev = get_current_renderer();
 		m_command_pool.create(*const_cast<vk::render_device*>(pdev), pdev->get_transfer_queue_family());
 
-		for (usz i = 0; i < events_pool_size; ++i)
+		if (m_use_host_scheduler)
 		{
-			auto ev1 = std::make_unique<event>(*get_current_renderer(), sync_domain::gpu);
-			auto ev2 = std::make_unique<event>(*get_current_renderer(), sync_domain::gpu);
-			m_events_pool.emplace_back(ev1, ev2, 0ull, i);
+			for (usz i = 0; i < events_pool_size; ++i)
+			{
+				auto sema = std::make_unique<semaphore>(*pdev);
+				m_semaphore_pool.emplace_back(std::move(sema));
+			}
+
+			return;
 		}
 
-		for (usz i = 0; i < VK_MAX_ASYNC_COMPUTE_QUEUES; ++i)
+		for (usz i = 0; i < events_pool_size; ++i)
 		{
-			m_barriers_pool.emplace_back(0ull, 0xFFFF0000 + i);
+			auto ev = std::make_unique<event>(*pdev, sync_domain::gpu);
+			m_events_pool.emplace_back(std::move(ev));
 		}
 	}
 
 	void AsyncTaskScheduler::insert_sync_event()
 	{
 		ensure(m_current_cb);
-		ensure(m_next_event_id < events_pool_size);
-		auto sync_label = &m_events_pool[m_next_event_id];
+		auto& sync_label = m_events_pool[m_next_event_id++ % events_pool_size];
 
-		if (++m_next_event_id == events_pool_size)
-		{
-			// Wrap
-			m_next_event_id = 0;
-		}
-
-		ensure(sync_label->completion_eid <= vk::last_completed_event_id());
-
-		m_sync_label_debug_uid = sync_label->uid;
-		sync_label->queue1_signal->reset();
-		sync_label->queue2_signal->reset();
-		sync_label->completion_eid = vk::current_event_id();
-
-		sync_label->queue1_signal->signal(*m_current_cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
-
-		if (m_use_host_scheduler)
-		{
-			m_event_queue.push(sync_label);
-			m_sync_label = sync_label->queue2_signal.get();
-		}
-		else
-		{
-			m_sync_label = sync_label->queue1_signal.get();
-		}
-	}
-
-	AsyncTaskScheduler::~AsyncTaskScheduler()
-	{
-		if (!m_async_command_queue.empty())
-		{
-			// Driver resources should be destroyed before driver is detached or you get crashes. RAII won't save you here.
-			rsx_log.error("Async task scheduler resources were not freed correctly!");
-		}
+		sync_label->reset();
+		sync_label->signal(*m_current_cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
+		m_sync_label = sync_label.get();
 	}
 
 	command_buffer* AsyncTaskScheduler::get_current()
@@ -168,17 +117,14 @@ namespace vk
 			}
 		}
 
-		// 3. Insert a barrier for this CB. A job is about to be scheduled on it immediately.
-		auto barrier = &m_barriers_pool[m_next_cb_index];
-		barrier->completion_eid = m_submit_count;
-		m_event_queue.push(barrier);
-
 		m_next_cb_index++;
 		return m_current_cb;
 	}
 
 	event* AsyncTaskScheduler::get_primary_sync_label()
 	{
+		ensure(!m_use_host_scheduler);
+
 		if (m_sync_required) [[unlikely]]
 		{
 			std::lock_guard lock(m_submit_mutex); // For some reason this is inexplicably expensive. WTF!
@@ -190,40 +136,45 @@ namespace vk
 		return std::exchange(m_sync_label, nullptr);
 	}
 
-	u64 AsyncTaskScheduler::get_primary_sync_label_debug_uid()
+	semaphore* AsyncTaskScheduler::get_sema()
 	{
-		return std::exchange(m_sync_label_debug_uid, ~0ull);
+		if (m_semaphore_pool.empty())
+		{
+			delayed_init();
+			ensure(!m_semaphore_pool.empty());
+		}
+
+		const u32 sema_id = (m_next_semaphore_id++ % m_semaphore_pool.size());
+		return m_semaphore_pool[sema_id].get();
 	}
 
-	void AsyncTaskScheduler::flush(VkBool32 force_flush, VkSemaphore wait_semaphore, VkPipelineStageFlags wait_dst_stage_mask)
+	void AsyncTaskScheduler::flush(queue_submit_t& submit_info, VkBool32 force_flush)
 	{
 		if (!m_current_cb)
 		{
 			return;
 		}
 
+		submit_info.queue = get_current_renderer()->get_transfer_queue();
+
 		std::lock_guard lock(m_submit_mutex);
-		if (m_sync_required)
+		if (m_sync_required && !m_use_host_scheduler)
 		{
 			insert_sync_event();
 		}
 
 		m_current_cb->end();
-		m_current_cb->submit(get_current_renderer()->get_transfer_queue(), wait_semaphore, VK_NULL_HANDLE, nullptr, wait_dst_stage_mask, force_flush);
+		m_current_cb->submit(submit_info, force_flush);
 
 		m_submit_count++;
-		thread_ctrl::notify(g_fxo->get<async_scheduler_thread>());
 
 		m_last_used_cb = m_current_cb;
 		m_current_cb = nullptr;
 		m_sync_required = false;
	}
 
-	void AsyncTaskScheduler::kill()
+	void AsyncTaskScheduler::destroy()
 	{
-		g_fxo->get<async_scheduler_thread>() = thread_state::aborting;
-		while (has_refs());
-
 		for (auto& cb : m_async_command_queue)
 		{
 			cb.destroy();
@@ -233,5 +184,6 @@ namespace vk
 		m_next_cb_index = 0;
 		m_command_pool.destroy();
 		m_events_pool.clear();
+		m_semaphore_pool.clear();
 	}
 }
diff --git a/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h
index b85370d959..540a4f7d2d 100644
--- a/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h
+++ b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h
@@ -9,36 +9,7 @@
 
 namespace vk
 {
-	enum class xqueue_event_type
-	{
-		label,
-		barrier
-	};
-
-	struct xqueue_event
-	{
-		// Type
-		xqueue_event_type type;
-
-		// Payload
-		std::unique_ptr<event> queue1_signal;
-		std::unique_ptr<event> queue2_signal;
-
-		// Identifiers
-		u64 completion_eid;
-		u64 uid;
-
-		xqueue_event(u64 eid, u64 _uid)
-			: type(xqueue_event_type::barrier), completion_eid(eid), uid(_uid)
-		{}
-
-		xqueue_event(std::unique_ptr<event>& trigger, std::unique_ptr<event>& payload, u64 eid, u64 _uid)
-			: type(xqueue_event_type::label), queue1_signal(std::move(trigger)), queue2_signal(std::move(payload)),
-			completion_eid(eid), uid(_uid)
-		{}
-	};
-
-	class AsyncTaskScheduler : private rsx::ref_counted
+	class AsyncTaskScheduler
 	{
 		// Vulkan resources
 		std::vector<command_buffer> m_async_command_queue;
@@ -48,7 +19,6 @@ namespace vk
 		command_buffer* m_last_used_cb = nullptr;
 		command_buffer* m_current_cb = nullptr;
 		usz m_next_cb_index = 0;
-		std::vector<xqueue_event> m_barriers_pool;
 		atomic_t<u64> m_submit_count = 0;
 
 		// Scheduler
@@ -59,13 +29,14 @@ namespace vk
 		// Sync
 		event* m_sync_label = nullptr;
 		atomic_t<bool> m_sync_required = false;
-		u64 m_sync_label_debug_uid = 0;
 
 		static constexpr u32 events_pool_size = 16384;
-		std::vector<xqueue_event> m_events_pool;
-		atomic_t<u32> m_next_event_id = 0;
+		std::vector<std::unique_ptr<event>> m_events_pool;
+		atomic_t<u32> m_next_event_id = 0;
+
+		std::vector<std::unique_ptr<semaphore>> m_semaphore_pool;
+		atomic_t<u32> m_next_semaphore_id = 0;
 
-		lf_queue<xqueue_event*> m_event_queue;
 		shared_mutex m_submit_mutex;
 
 		void init_config_options();
@@ -73,21 +44,18 @@ namespace vk
 		void insert_sync_event();
 
 	public:
-		AsyncTaskScheduler(const std::string_view& name) : thread_name(name) {} // This ctor stops default initialization by fxo
+		AsyncTaskScheduler(vk_gpu_scheduler_mode mode); // This ctor stops default initialization by fxo
 		~AsyncTaskScheduler();
 
 		command_buffer* get_current();
 		event* get_primary_sync_label();
-		u64 get_primary_sync_label_debug_uid();
+		semaphore* get_sema();
 
-		void flush(VkBool32 force_flush, VkSemaphore wait_semaphore = VK_NULL_HANDLE, VkPipelineStageFlags wait_dst_stage_mask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
-		void kill();
+		void flush(queue_submit_t& submit_info, VkBool32 force_flush);
+		void destroy();
 
-		// Thread entry-point
-		void operator()();
-
-		const std::string_view thread_name;
+		// Inline getters
+		inline bool is_recording() const { return m_current_cb != nullptr; }
+		inline bool is_host_mode() const { return m_use_host_scheduler; }
 	};
-
-	using async_scheduler_thread = named_thread<AsyncTaskScheduler>;
 }
diff --git a/rpcs3/Emu/RSX/VK/VKCommandStream.cpp b/rpcs3/Emu/RSX/VK/VKCommandStream.cpp
index fc2bdd7a4a..428df67ac6 100644
--- a/rpcs3/Emu/RSX/VK/VKCommandStream.cpp
+++ b/rpcs3/Emu/RSX/VK/VKCommandStream.cpp
@@ -24,17 +24,31 @@ namespace vk
 	}
 
 	FORCE_INLINE
-	static void queue_submit_impl(VkQueue queue, const VkSubmitInfo* info, fence* pfence)
+	static void queue_submit_impl(const queue_submit_t& submit_info)
 	{
+		ensure(submit_info.fence);
 		acquire_global_submit_lock();
-		vkQueueSubmit(queue, 1, info, pfence->handle);
+
+		VkSubmitInfo info
+		{
+			.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+			.pNext = nullptr,
+			.waitSemaphoreCount = submit_info.wait_semaphores_count,
+			.pWaitSemaphores = submit_info.wait_semaphores.data(),
+			.pWaitDstStageMask = submit_info.wait_stages.data(),
+			.commandBufferCount = 1,
+			.pCommandBuffers = &submit_info.commands,
+			.signalSemaphoreCount = submit_info.signal_semaphores_count,
+			.pSignalSemaphores = submit_info.signal_semaphores.data()
+		};
+
+		vkQueueSubmit(submit_info.queue, 1, &info, submit_info.fence->handle);
 		release_global_submit_lock();
 
 		// Signal fence
-		pfence->signal_flushed();
+		submit_info.fence->signal_flushed();
 	}
 
-	void queue_submit(VkQueue queue, const VkSubmitInfo* info, fence* pfence, VkBool32 flush)
+	void queue_submit(const queue_submit_t& submit_info, VkBool32 flush)
 	{
 		// Access to this method must be externally synchronized.
 		// Offloader is guaranteed to never call this for async flushes.
@@ -42,18 +56,18 @@ namespace vk
 
 		if (!flush && g_cfg.video.multithreaded_rsx)
 		{
-			auto packet = new submit_packet(queue, pfence, info);
+			auto packet = new queue_submit_t(submit_info);
 			g_fxo->get<rsx::dma_manager>().backend_ctrl(rctrl_queue_submit, packet);
 		}
 		else
 		{
-			queue_submit_impl(queue, info, pfence);
+			queue_submit_impl(submit_info);
 		}
 	}
 
-	void queue_submit(const vk::submit_packet* packet)
+	void queue_submit(const queue_submit_t* packet)
 	{
 		// Flush-only version used by asynchronous submit processing (MTRSX)
-		queue_submit_impl(packet->queue, &packet->submit_info, packet->pfence);
+		queue_submit_impl(*packet);
 	}
 }
diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp
index e1483ab046..6eff31d1c8 100644
--- a/rpcs3/Emu/RSX/VK/VKDraw.cpp
+++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp
@@ -380,6 +380,29 @@ void VKGSRender::load_texture_env()
 			m_cached_renderpass = VK_NULL_HANDLE;
 		}
 	}
+
+	if (g_cfg.video.vk.asynchronous_texture_streaming)
+	{
+		// We have to do this here, because we have to assume the CB will be dumped
+		auto& async_task_scheduler = g_fxo->get<vk::AsyncTaskScheduler>();
+
+		if (async_task_scheduler.is_recording())
+		{
+			if (async_task_scheduler.is_host_mode())
+			{
+				flush_command_queue();
+				ensure(!async_task_scheduler.is_recording());
+			}
+			else
+			{
+				// Sync any async scheduler tasks
+				if (auto ev = async_task_scheduler.get_primary_sync_label())
+				{
+					ev->gpu_wait(*m_current_command_buffer);
+				}
+			}
+		}
+	}
 }
 
 bool VKGSRender::bind_texture_env()
@@ -1029,12 +1052,6 @@ void VKGSRender::end()
 	load_program_env();
 	m_frame_stats.setup_time += m_profiler.duration();
 
-	// Sync any async scheduler tasks
-	if (auto ev = g_fxo->get<vk::async_scheduler_thread>().get_primary_sync_label())
-	{
-		ev->gpu_wait(*m_current_command_buffer);
-	}
-
 	for (int binding_attempts = 0; binding_attempts < 3; binding_attempts++)
 	{
 		bool out_of_memory;
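The two branches added to load_texture_env encode the core idea of the patch: in 'Safe' (host) mode, cross-queue ordering is enforced at submit boundaries with semaphores, which is what the Vulkan spec actually guarantees, while 'Fast' mode keeps the old intra-command-buffer event wait, which depends on driver behavior that the spec does not promise across queues. Reduced to its essence (a sketch, not the literal code):

	if (scheduler.is_host_mode())
	{
		// 'Safe': close and submit the pending CBs now; ordering against the
		// transfer queue is expressed with VkSemaphores at submission time.
		flush_command_queue();
	}
	else if (auto ev = scheduler.get_primary_sync_label())
	{
		// 'Fast': record a wait on an event signalled from the transfer queue.
		// Cross-queue VkEvent usage is not spec-guaranteed; hence the mode split.
		ev->gpu_wait(*m_current_command_buffer);
	}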
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index dbce568054..959bf73e70 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -575,18 +575,13 @@ VKGSRender::VKGSRender() : GSRender()
 		{
 		case vk::driver_vendor::NVIDIA:
 			if (auto chip_family = vk::get_chip_family();
-				chip_family == vk::chip_class::NV_kepler ||
-				chip_family == vk::chip_class::NV_maxwell)
+				chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
 			{
-				rsx_log.error("Older NVIDIA cards do not meet requirements for asynchronous compute due to some driver fakery.");
-				backend_config.supports_asynchronous_compute = false;
-			}
-			else // Workaround. Remove once the async decoder is re-written
-			{
-				// NVIDIA 471 and newer are completely borked. Queue priority is not observed and any queue waiting on another just causes deadlock.
-				rsx_log.error("NVIDIA GPUs are incompatible with the current implementation of asynchronous texture decoding.");
-				backend_config.supports_asynchronous_compute = false;
+				rsx_log.warning("Older NVIDIA cards do not meet requirements for true asynchronous compute due to some driver fakery.");
 			}
+
+			rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
+			g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
 			break;
 #if !defined(_WIN32)
 		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
@@ -614,7 +609,7 @@ VKGSRender::VKGSRender() : GSRender()
 		if (backend_config.supports_asynchronous_compute)
 		{
 			// Run only if async compute can be used.
-			g_fxo->init<vk::async_scheduler_thread>("Vulkan Async Scheduler"sv);
+			g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
 		}
 	}
 }
@@ -627,21 +622,24 @@ VKGSRender::~VKGSRender()
 		return;
 	}
 
-	// Globals. TODO: Refactor lifetime management
-	if (backend_config.supports_asynchronous_compute)
+	// Flush DMA queue
+	while (!g_fxo->get<rsx::dma_manager>().sync())
 	{
-		g_fxo->get<vk::async_scheduler_thread>().kill();
+		do_local_task(rsx::FIFO_state::lock_wait);
 	}
 
 	//Wait for device to finish up with resources
 	vkDeviceWaitIdle(*m_device);
 
+	// Globals. TODO: Refactor lifetime management
+	if (backend_config.supports_asynchronous_compute)
+	{
+		g_fxo->get<vk::AsyncTaskScheduler>().destroy();
+	}
+
 	// Clear flush requests
 	m_flush_requests.clear_pending_flag();
 
-	// Texture cache
-	m_texture_cache.destroy();
-
 	// Shaders
 	vk::destroy_pipe_compiler();      // Ensure no pending shaders being compiled
 	vk::finalize_compiler_context();  // Shut down the glslang compiler
@@ -2064,9 +2062,6 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 	const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
 	const VkBool32 force_flush = !sync_success;
 
-	// Flush any asynchronously scheduled jobs
-	g_fxo->get<vk::async_scheduler_thread>().flush(force_flush);
-
 	if (vk::test_status_interrupt(vk::heap_dirty))
 	{
 		if (m_attrib_ring_info.is_dirty() ||
@@ -2096,8 +2091,8 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 
 			m_secondary_command_buffer.end();
 
-			m_secondary_command_buffer.submit(m_device->get_graphics_queue(),
-				VK_NULL_HANDLE, VK_NULL_HANDLE, VK_NULL_HANDLE, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, force_flush);
+			vk::queue_submit_t submit_info{ m_device->get_graphics_queue(), nullptr };
+			m_secondary_command_buffer.submit(submit_info, force_flush);
 		}
 
 		vk::clear_status_interrupt(vk::heap_dirty);
@@ -2128,8 +2123,54 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 	m_current_command_buffer->end();
 	m_current_command_buffer->tag();
 
-	m_current_command_buffer->submit(m_device->get_graphics_queue(),
-		wait_semaphore, signal_semaphore, pFence, pipeline_stage_flags, force_flush);
+	// Flush any asynchronously scheduled jobs
+	// So this is a bit trippy, but, in this case, the primary CB contains the 'release' operations, not the acquire ones.
+	// The CB that comes in after this submit will acquire the yielded resources automatically.
+	// This means the primary CB is the precursor to the async CB not the other way around.
+	// Async CB should wait for the primary CB to signal.
+	vk::queue_submit_t primary_submit_info{ m_device->get_graphics_queue(), pFence };
+	vk::queue_submit_t secondary_submit_info{};
+
+	if (wait_semaphore)
+	{
+		primary_submit_info.wait_on(wait_semaphore, pipeline_stage_flags);
+	}
+
+	if (const auto wait_sema = std::exchange(m_dangling_semaphore_signal, VK_NULL_HANDLE))
+	{
+		// TODO: Sync on VS stage
+		primary_submit_info.wait_on(wait_sema, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
+	}
+
+	auto& async_scheduler = g_fxo->get<vk::AsyncTaskScheduler>();
+	const bool require_secondary_flush = async_scheduler.is_recording();
+
+	if (async_scheduler.is_recording())
+	{
+		if (async_scheduler.is_host_mode())
+		{
+			// Inject dependency chain using semaphores.
+			// HEAD = externally synchronized.
+			// TAIL = insert dangling wait, from the async CB to the next CB down.
+			m_dangling_semaphore_signal = *async_scheduler.get_sema();
+			secondary_submit_info.queue_signal(m_dangling_semaphore_signal);
+
+			// Delay object destruction by one cycle
+			vk::get_resource_manager()->push_down_current_scope();
+		}
+	}
+
+	if (signal_semaphore)
+	{
+		primary_submit_info.queue_signal(signal_semaphore);
+	}
+
+	m_current_command_buffer->submit(primary_submit_info, force_flush);
+
+	if (require_secondary_flush)
+	{
+		async_scheduler.flush(secondary_submit_info, force_flush);
+	}
 
 	if (force_flush)
 	{
@@ -2367,7 +2408,7 @@ void VKGSRender::renderctl(u32 request_code, void* args)
 	{
 	case vk::rctrl_queue_submit:
 	{
-		const auto packet = reinterpret_cast<vk::submit_packet*>(args);
+		const auto packet = reinterpret_cast<vk::queue_submit_t*>(args);
 		vk::queue_submit(packet);
 		free(packet);
 		break;
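The 'dangling semaphore' chain built in close_and_submit_command_buffer is easiest to read as a timeline. A sketch with placeholder handles (chain_sema, gfx_queue, next_fence are illustrative, not names from the patch):

	// Submit N   (graphics): records queue_release ops for images it owns.
	// Submit N'  (transfer): acquires them, uploads, signals chain_sema.
	// Submit N+1 (graphics): waits on chain_sema before fragment shading.
	vk::queue_submit_t async_submit{};
	async_submit.queue_signal(chain_sema); // tail: m_dangling_semaphore_signal

	vk::queue_submit_t next_primary{ gfx_queue, next_fence };
	next_primary.wait_on(chain_sema, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);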
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index be425a9c5a..847b6abe3a 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -429,6 +429,7 @@ private:
 	u32 m_current_cb_index = 0;
 	std::array<vk::command_buffer_chunk, VK_MAX_ASYNC_CB_COUNT> m_primary_cb_list;
 	vk::command_buffer_chunk* m_current_command_buffer = nullptr;
+	VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;
 
 	VkDescriptorSetLayout descriptor_layouts;
 	VkPipelineLayout pipeline_layout;
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h
index feb0f0fd59..7fdf210848 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.h
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.h
@@ -31,7 +31,7 @@ namespace vk
 	class image;
 	class instance;
 	class render_device;
-	struct submit_packet;
+	struct queue_submit_t;
 
 	enum runtime_state
 	{
@@ -53,7 +53,7 @@ namespace vk
 	// Sync helpers around vkQueueSubmit
 	void acquire_global_submit_lock();
 	void release_global_submit_lock();
-	void queue_submit(const vk::submit_packet* packet);
+	void queue_submit(const vk::queue_submit_t* packet);
 
 	template <typename T>
 	T* get_compute_task();
diff --git a/rpcs3/Emu/RSX/VK/VKResourceManager.h b/rpcs3/Emu/RSX/VK/VKResourceManager.h
index 4538ad63bc..9c624fe2c3 100644
--- a/rpcs3/Emu/RSX/VK/VKResourceManager.h
+++ b/rpcs3/Emu/RSX/VK/VKResourceManager.h
@@ -194,6 +194,11 @@ namespace vk
 			dispose(ptr);
 		}
 
+		void push_down_current_scope()
+		{
+			get_current_eid_scope().eid++;
+		}
+
 		void eid_completed(u64 eid)
 		{
 			while (!m_eid_map.empty())
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index b035b79473..20e0e9c0df 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -811,7 +811,7 @@ namespace vk
 		const vk::command_buffer* pcmd = nullptr;
 		if (flags & image_upload_options::upload_contents_async)
 		{
-			auto async_cmd = g_fxo->get<vk::async_scheduler_thread>().get_current();
+			auto async_cmd = g_fxo->get<vk::AsyncTaskScheduler>().get_current();
 			async_cmd->begin();
 			pcmd = async_cmd;
 
@@ -832,9 +832,20 @@ namespace vk
 
 		ensure(pcmd);
 
+		// Queue transfer stuff. Must release from primary if owned and acquire in secondary.
+		const bool need_queue_xfer = dst_image->current_layout != VK_IMAGE_LAYOUT_UNDEFINED && primary_cb.get_queue_family() != pcmd->get_queue_family();
+		if (need_queue_xfer)
+		{
+			dst_image->queue_release(primary_cb, pcmd->get_queue_family(), dst_image->current_layout);
+		}
+
 		if (flags & image_upload_options::initialize_image_layout)
 		{
-			dst_image->change_layout(*pcmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, pcmd->get_queue_family());
+			dst_image->change_layout(*pcmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+		}
+		else if (need_queue_xfer)
+		{
+			dst_image->queue_acquire(*pcmd, dst_image->current_layout);
 		}
 
 		return *pcmd;
@@ -1119,6 +1130,12 @@ namespace vk
 		{
 			vkCmdCopyBufferToImage(cmd2, upload_buffer->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
 		}
+
+		if (cmd2.get_queue_family() != cmd.get_queue_family())
+		{
+			// Release from async chain, the primary chain will acquire later
+			dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
+		}
 	}
 
 	void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
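queue_acquire/queue_release follow the Vulkan queue family ownership transfer protocol: the releasing queue records a barrier whose destination access scope is irrelevant, and the acquiring queue records the matching barrier with the source scope irrelevant; both halves are required for the image contents to remain defined. A sketch of the release half (gfx_family/xfer_family and the handles are placeholders; the patch drives this through change_image_layout with access-mask filters instead of raw barriers):

	VkImageMemoryBarrier barrier{};
	barrier.sType               = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
	barrier.oldLayout           = current_layout;
	barrier.newLayout           = current_layout;  // ownership transfer only
	barrier.srcQueueFamilyIndex = gfx_family;      // releasing family
	barrier.dstQueueFamilyIndex = xfer_family;     // acquiring family
	barrier.srcAccessMask       = VK_ACCESS_SHADER_READ_BIT;
	barrier.dstAccessMask       = 0;               // masked off, as queue_release does
	barrier.image               = image_handle;
	barrier.subresourceRange    = full_range;

	vkCmdPipelineBarrier(gfx_cmd, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
		VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier);

The acquire half records the same barrier on the destination queue with srcAccessMask zeroed instead, matching image::queue_acquire above.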
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
index 8b82ea37d4..e13807020f 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
@@ -935,8 +935,9 @@ namespace vk
 			}
 		}
 
+		const rsx::flags32_t create_flags = g_fxo->get<vk::AsyncTaskScheduler>().is_host_mode() ? texture_create_flags::do_not_reuse : 0;
 		auto section = create_new_texture(cmd, rsx_range, width, height, depth, mipmaps, pitch, gcm_format, context, type, swizzled,
-			rsx::component_order::default_, 0);
+			rsx::component_order::default_, create_flags);
 
 		auto image = section->get_raw_texture();
 		image->set_debug_name(fmt::format("Raw Texture @0x%x", rsx_range.start));
@@ -950,8 +951,12 @@ namespace vk
 			input_swizzled = false;
 		}
 
-		rsx::flags32_t upload_command_flags = initialize_image_layout |
-			(rsx::get_current_renderer()->get_backend_config().supports_asynchronous_compute ? upload_contents_async : upload_contents_inline);
+		rsx::flags32_t upload_command_flags = initialize_image_layout | upload_contents_inline;
+		if (context == rsx::texture_upload_context::shader_read &&
+			rsx::get_current_renderer()->get_backend_config().supports_asynchronous_compute)
+		{
+			upload_command_flags |= upload_contents_async;
+		}
 
 		const u16 layer_count = (type == rsx::texture_dimension_extended::texture_dimension_cubemap) ? 6 : 1;
 		vk::upload_image(cmd, image, subresource_layout, gcm_format, input_swizzled, layer_count, image->aspect(),
@@ -1086,11 +1091,40 @@ namespace vk
 			{
 				// Flush any pending async jobs in case of blockers
 				// TODO: Context-level manager should handle this logic
-				g_fxo->get<vk::async_scheduler_thread>().flush(VK_TRUE);
+				auto& async_scheduler = g_fxo->get<vk::AsyncTaskScheduler>();
+				vk::semaphore* async_sema = nullptr;
+
+				if (async_scheduler.is_recording())
+				{
+					if (async_scheduler.is_host_mode())
+					{
+						async_sema = async_scheduler.get_sema();
+					}
+					else
+					{
+						vk::queue_submit_t submit_info{};
+						async_scheduler.flush(submit_info, VK_TRUE);
+					}
+				}
 
 				// Primary access command queue, must restart it after
 				vk::fence submit_fence(*m_device);
-				cmd.submit(m_submit_queue, VK_NULL_HANDLE, VK_NULL_HANDLE, &submit_fence, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_TRUE);
+				vk::queue_submit_t submit_info{ m_submit_queue, &submit_fence };
+
+				if (async_sema)
+				{
+					submit_info.queue_signal(*async_sema);
+				}
+
+				cmd.submit(submit_info, VK_TRUE);
+
+				if (async_sema)
+				{
+					vk::queue_submit_t submit_info2{};
+					submit_info2.wait_on(*async_sema, VK_PIPELINE_STAGE_TRANSFER_BIT);
+					async_scheduler.flush(submit_info2, VK_FALSE);
+				}
 
 				vk::wait_for_fence(&submit_fence, GENERAL_WAIT_TIMEOUT);
 
@@ -1100,7 +1134,8 @@ namespace vk
 			else
 			{
 				// Auxiliary command queue with auto-restart capability
-				cmd.submit(m_submit_queue, VK_NULL_HANDLE, VK_NULL_HANDLE, VK_NULL_HANDLE, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_TRUE);
+				vk::queue_submit_t submit_info{ m_submit_queue, nullptr };
+				cmd.submit(submit_info, VK_TRUE);
 			}
 
 			ensure(cmd.flags == 0);
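Note the direction of the handshake above: the primary (graphics) submit signals the semaphore and the async (transfer) flush waits on it, so the async copies cannot begin before the work that released the images reaches the GPU. In outline, using the patch's own API:

	submit_info.queue_signal(*async_sema);                              // primary CB
	submit_info2.wait_on(*async_sema, VK_PIPELINE_STAGE_TRANSFER_BIT);  // async CB
	// vkQueueSubmit order: primary first, then the async flush.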
diff --git a/rpcs3/Emu/RSX/VK/vkutils/commands.cpp b/rpcs3/Emu/RSX/VK/vkutils/commands.cpp
index 44a89315ab..f419ddbdaa 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/commands.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/commands.cpp
@@ -6,7 +6,7 @@
 namespace vk
 {
 	// This queue flushing method to be implemented by the backend as behavior depends on config
-	void queue_submit(VkQueue queue, const VkSubmitInfo* info, fence* pfence, VkBool32 flush);
+	void queue_submit(const queue_submit_t& submit_info, VkBool32 flush);
 
 	void command_pool::create(vk::render_device& dev, u32 queue_family_id)
 	{
@@ -112,7 +112,7 @@ namespace vk
 		is_open = false;
 	}
 
-	void command_buffer::submit(VkQueue queue, VkSemaphore wait_semaphore, VkSemaphore signal_semaphore, fence* pfence, VkPipelineStageFlags pipeline_stage_flags, VkBool32 flush)
+	void command_buffer::submit(queue_submit_t& submit_info, VkBool32 flush)
 	{
 		if (is_open)
 		{
@@ -123,31 +123,14 @@ namespace vk
 		// Check for hanging queries to avoid driver hang
 		ensure((flags & cb_has_open_query) == 0); // "close and submit of commandbuffer with a hanging query!"
 
-		if (!pfence)
+		if (!submit_info.fence)
 		{
-			pfence = m_submit_fence;
-			is_pending = bool(pfence);
+			submit_info.fence = m_submit_fence;
+			is_pending = bool(submit_info.fence);
 		}
 
-		VkSubmitInfo infos = {};
-		infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-		infos.commandBufferCount = 1;
-		infos.pCommandBuffers = &commands;
-		infos.pWaitDstStageMask = &pipeline_stage_flags;
-
-		if (wait_semaphore)
-		{
-			infos.waitSemaphoreCount = 1;
-			infos.pWaitSemaphores = &wait_semaphore;
-		}
-
-		if (signal_semaphore)
-		{
-			infos.signalSemaphoreCount = 1;
-			infos.pSignalSemaphores = &signal_semaphore;
-		}
-
-		queue_submit(queue, &infos, pfence, flush);
+		submit_info.commands = this->commands;
+		queue_submit(submit_info, flush);
 		clear_flags();
 	}
 }
diff --git a/rpcs3/Emu/RSX/VK/vkutils/commands.h b/rpcs3/Emu/RSX/VK/vkutils/commands.h
index e70f1ef8cd..c0f3001869 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/commands.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/commands.h
@@ -25,6 +25,42 @@ namespace vk
 		operator VkCommandPool() const;
 	};
 
+	struct queue_submit_t
+	{
+		VkQueue queue = VK_NULL_HANDLE;
+		fence* fence = nullptr;
+		VkCommandBuffer commands = VK_NULL_HANDLE;
+		std::array<VkSemaphore, 4> wait_semaphores;
+		std::array<VkSemaphore, 4> signal_semaphores;
+		std::array<VkPipelineStageFlags, 4> wait_stages;
+		u32 wait_semaphores_count = 0;
+		u32 signal_semaphores_count = 0;
+
+		queue_submit_t() = default;
+		queue_submit_t(VkQueue queue_, vk::fence* fence_)
+			: queue(queue_), fence(fence_) {}
+
+		queue_submit_t(const queue_submit_t& other)
+		{
+			std::memcpy(this, &other, sizeof(queue_submit_t));
+		}
+
+		inline queue_submit_t& wait_on(VkSemaphore semaphore, VkPipelineStageFlags stage)
+		{
+			ensure(wait_semaphores_count < 4);
+			wait_semaphores[wait_semaphores_count] = semaphore;
+			wait_stages[wait_semaphores_count++] = stage;
+			return *this;
+		}
+
+		inline queue_submit_t& queue_signal(VkSemaphore semaphore)
+		{
+			ensure(signal_semaphores_count < 4);
+			signal_semaphores[signal_semaphores_count++] = semaphore;
+			return *this;
+		}
+	};
+
 	class command_buffer
 	{
 	private:
@@ -64,7 +100,7 @@ namespace vk
 		void begin();
 		void end();
-		void submit(VkQueue queue, VkSemaphore wait_semaphore, VkSemaphore signal_semaphore, fence* pfence, VkPipelineStageFlags pipeline_stage_flags, VkBool32 flush = VK_FALSE);
+		void submit(queue_submit_t& submit_info, VkBool32 flush = VK_FALSE);
 
 		// Properties
 		command_pool& get_command_pool() const
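A hypothetical end-to-end call site for the struct defined above; the queue and semaphore handles are placeholders, but the calls are the ones this patch introduces:

	vk::fence submit_fence(*m_device);
	vk::queue_submit_t submit_info{ queue, &submit_fence };
	submit_info
		.wait_on(upload_done_sema, VK_PIPELINE_STAGE_TRANSFER_BIT)
		.queue_signal(frame_done_sema);
	cmd.submit(submit_info, VK_TRUE); // fills .commands, forwards to vk::queue_submit()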
diff --git a/rpcs3/Emu/RSX/VK/vkutils/image.cpp b/rpcs3/Emu/RSX/VK/vkutils/image.cpp
index b2ed85dd8f..8304872854 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/image.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/image.cpp
@@ -179,20 +179,25 @@ namespace vk
 		return m_format_class;
 	}
 
-	void image::push_layout(VkCommandBuffer cmd, VkImageLayout layout)
+	void image::push_layout(const command_buffer& cmd, VkImageLayout layout)
 	{
+		ensure(current_queue_family == VK_QUEUE_FAMILY_IGNORED || current_queue_family == cmd.get_queue_family());
+
 		m_layout_stack.push(current_layout);
 		change_image_layout(cmd, this, layout);
 	}
 
-	void image::push_barrier(VkCommandBuffer cmd, VkImageLayout layout)
+	void image::push_barrier(const command_buffer& cmd, VkImageLayout layout)
 	{
+		ensure(current_queue_family == VK_QUEUE_FAMILY_IGNORED || current_queue_family == cmd.get_queue_family());
+
 		m_layout_stack.push(current_layout);
 		insert_texture_barrier(cmd, this, layout);
 	}
 
-	void image::pop_layout(VkCommandBuffer cmd)
+	void image::pop_layout(const command_buffer& cmd)
 	{
+		ensure(current_queue_family == VK_QUEUE_FAMILY_IGNORED || current_queue_family == cmd.get_queue_family());
 		ensure(!m_layout_stack.empty());
 
 		auto layout = m_layout_stack.top();
@@ -200,37 +205,48 @@ namespace vk
 		change_image_layout(cmd, this, layout);
 	}
 
+	void image::queue_acquire(const command_buffer& cmd, VkImageLayout new_layout)
+	{
+		ensure(m_layout_stack.empty());
+		ensure(current_queue_family != cmd.get_queue_family());
+
+		VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() };
+		change_image_layout(cmd, value, current_layout, new_layout, range, current_queue_family, cmd.get_queue_family(), 0u, ~0u);
+
+		current_layout = new_layout;
+		current_queue_family = cmd.get_queue_family();
+	}
+
+	void image::queue_release(const command_buffer& src_queue_cmd, u32 dst_queue_family, VkImageLayout new_layout)
+	{
+		ensure(current_queue_family == src_queue_cmd.get_queue_family());
+		ensure(m_layout_stack.empty());
+
+		VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() };
+		change_image_layout(src_queue_cmd, value, current_layout, new_layout, range, current_queue_family, dst_queue_family, ~0u, 0u);
+	}
+
 	void image::change_layout(const command_buffer& cmd, VkImageLayout new_layout)
 	{
-		if (current_layout == new_layout)
+		// This is implicitly an acquire op
+		if (const auto new_queue_family = cmd.get_queue_family();
+			current_queue_family == VK_QUEUE_FAMILY_IGNORED)
+		{
+			current_queue_family = new_queue_family;
+		}
+		else if (current_queue_family != new_queue_family)
+		{
+			queue_acquire(cmd, new_layout);
 			return;
+		}
+
+		if (current_layout == new_layout)
+		{
+			return;
+		}
 
 		ensure(m_layout_stack.empty());
 		change_image_layout(cmd, this, new_layout);
-	}
 
-	void image::change_layout(const command_buffer& cmd, VkImageLayout new_layout, u32 new_queue_family)
-	{
-		if (current_layout == new_layout && current_queue_family == new_queue_family)
-		{
-			// Nothing to do
-			return;
-		}
-
-		ensure(m_layout_stack.empty());
-		u32 dst_queue = new_queue_family;
-
-		if (current_queue_family == VK_QUEUE_FAMILY_IGNORED)
-		{
-			// Implicit acquisition
-			dst_queue = VK_QUEUE_FAMILY_IGNORED;
-		}
-
-		VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() };
-		change_image_layout(cmd, value, current_layout, new_layout, range, current_queue_family, dst_queue);
-
-		current_layout = new_layout;
-		current_queue_family = new_queue_family;
+		current_queue_family = cmd.get_queue_family();
 	}
 
 	void image::set_debug_name(const std::string& name)
diff --git a/rpcs3/Emu/RSX/VK/vkutils/image.h b/rpcs3/Emu/RSX/VK/vkutils/image.h
index 41957bb62b..a0ec374751 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/image.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/image.h
@@ -77,11 +77,14 @@ namespace vk
 		rsx::format_class format_class() const;
 
 		// Pipeline management
-		void push_layout(VkCommandBuffer cmd, VkImageLayout layout);
-		void push_barrier(VkCommandBuffer cmd, VkImageLayout layout);
-		void pop_layout(VkCommandBuffer cmd);
+		void push_layout(const command_buffer& cmd, VkImageLayout layout);
+		void push_barrier(const command_buffer& cmd, VkImageLayout layout);
+		void pop_layout(const command_buffer& cmd);
 		void change_layout(const command_buffer& cmd, VkImageLayout new_layout);
-		void change_layout(const command_buffer& cmd, VkImageLayout new_layout, u32 new_queue_family);
+
+		// Queue transfer
+		void queue_acquire(const command_buffer& cmd, VkImageLayout new_layout);
+		void queue_release(const command_buffer& src_queue_cmd, u32 dst_queue_family, VkImageLayout new_layout);
 
 		// Debug utils
 		void set_debug_name(const std::string& name);
diff --git a/rpcs3/Emu/RSX/VK/vkutils/image_helpers.cpp b/rpcs3/Emu/RSX/VK/vkutils/image_helpers.cpp
index cf5131b205..137e9523d0 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/image_helpers.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/image_helpers.cpp
@@ -56,7 +56,7 @@ namespace vk
 	}
 
 	void change_image_layout(VkCommandBuffer cmd, VkImage image, VkImageLayout current_layout, VkImageLayout new_layout, const VkImageSubresourceRange& range,
-		u32 src_queue_family, u32 dst_queue_family)
+		u32 src_queue_family, u32 dst_queue_family, u32 src_access_mask_bits, u32 dst_access_mask_bits)
 	{
 		if (vk::is_renderpass_open(cmd))
 		{
@@ -196,6 +196,12 @@ namespace vk
 			break; //TODO Investigate what happens here
 		}
 
+		barrier.srcAccessMask &= src_access_mask_bits;
+		barrier.dstAccessMask &= dst_access_mask_bits;
+
+		if (!barrier.srcAccessMask) src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+		if (!barrier.dstAccessMask) dst_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+
 		vkCmdPipelineBarrier(cmd, src_stage, dst_stage, 0, 0, nullptr, 0, nullptr, 1, &barrier);
 	}
diff --git a/rpcs3/Emu/RSX/VK/vkutils/image_helpers.h b/rpcs3/Emu/RSX/VK/vkutils/image_helpers.h
index 639ed0070a..632c3333f3 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/image_helpers.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/image_helpers.h
@@ -10,7 +10,9 @@ namespace vk
 	VkComponentMapping apply_swizzle_remap(const std::array<VkComponentSwizzle, 4>& base_remap, const std::pair<std::array<bool, 4>, std::array<u8, 4>>& remap_vector);
 
 	void change_image_layout(VkCommandBuffer cmd, VkImage image, VkImageLayout current_layout, VkImageLayout new_layout, const VkImageSubresourceRange& range,
-		u32 src_queue_family = VK_QUEUE_FAMILY_IGNORED, u32 dst_queue_family = VK_QUEUE_FAMILY_IGNORED);
+		u32 src_queue_family = VK_QUEUE_FAMILY_IGNORED, u32 dst_queue_family = VK_QUEUE_FAMILY_IGNORED,
+		u32 src_access_mask_bits = 0xFFFFFFFF, u32 dst_access_mask_bits = 0xFFFFFFFF);
+
 	void change_image_layout(VkCommandBuffer cmd, vk::image* image, VkImageLayout new_layout, const VkImageSubresourceRange& range);
 	void change_image_layout(VkCommandBuffer cmd, vk::image* image, VkImageLayout new_layout);
 }
diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp
index 292a2833f6..fa05e8604a 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp
+++ b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp
@@ -53,9 +53,27 @@ namespace vk
 		return (handle != VK_NULL_HANDLE);
 	}
 
-	event::event(const render_device& dev, sync_domain domain)
+	semaphore::semaphore(const render_device& dev)
+		: m_device(dev)
+	{
+		VkSemaphoreCreateInfo info{};
+		info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+		CHECK_RESULT(vkCreateSemaphore(m_device, &info, nullptr, &m_handle));
+	}
+
+	semaphore::~semaphore()
+	{
+		vkDestroySemaphore(m_device, m_handle, nullptr);
+	}
+
+	semaphore::operator VkSemaphore() const
+	{
+		return m_handle;
+	}
+
+	event::event(const render_device& dev, sync_domain domain)
+		: m_device(dev)
 	{
-		m_device = dev;
 		if (domain == sync_domain::gpu || dev.gpu().get_driver_vendor() != driver_vendor::AMD)
 		{
 			VkEventCreateInfo info
diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.h b/rpcs3/Emu/RSX/VK/vkutils/sync.h
index 807546aea5..a1d0049ab8 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/sync.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -24,6 +24,7 @@ namespace vk
 
 		fence(VkDevice dev);
 		~fence();
+		fence(const fence&) = delete;
 
 		void reset();
 		void signal_flushed();
@@ -43,6 +44,7 @@ namespace vk
 	public:
 		event(const render_device& dev, sync_domain domain);
 		~event();
+		event(const event&) = delete;
 
 		void signal(const command_buffer& cmd, VkPipelineStageFlags stages, VkAccessFlags access);
 		void host_signal() const;
@@ -51,6 +53,21 @@ namespace vk
 		void reset() const;
 	};
 
+	class semaphore
+	{
+		VkSemaphore m_handle = VK_NULL_HANDLE;
+		VkDevice m_device = VK_NULL_HANDLE;
+
+		semaphore() = default;
+
+	public:
+		semaphore(const render_device& dev);
+		~semaphore();
+		semaphore(const semaphore&) = delete;
+
+		operator VkSemaphore() const;
+	};
+
 	VkResult wait_for_fence(fence* pFence, u64 timeout = 0ull);
 	VkResult wait_for_event(event* pEvent, u64 timeout = 0ull);
 }
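The new mask parameters let one helper express both halves of an ownership transfer: zeroing a mask strips that side's access scope (and drops its pipeline stage to TOP_OF_PIPE). This is how image::queue_release and image::queue_acquire invoke it, shown here with illustrative argument names:

	// Release, recorded on the source queue: keep src scope, strip dst.
	change_image_layout(src_cmd, img, layout, layout, range, src_family, dst_family, ~0u, 0u);
	// Acquire, recorded on the destination queue: strip src, keep dst scope.
	change_image_layout(dst_cmd, img, layout, layout, range, src_family, dst_family, 0u, ~0u);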
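The semaphore wrapper is deliberately non-copyable, since copying would lead to double destruction of the owned VkSemaphore. Typical lifetime, as in the scheduler's pool above (sketch):

	auto sema = std::make_unique<vk::semaphore>(*pdev); // vkCreateSemaphore in ctor
	submit_info.queue_signal(*sema);                    // implicit operator VkSemaphore()
	// vkDestroySemaphore runs when the pool entry is destroyed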
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index c1d2b7297c..eae614f662 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -168,7 +168,7 @@ struct cfg_root : cfg::node
 			cfg::_bool asynchronous_texture_streaming{ this, "Asynchronous Texture Streaming 2", false };
 			cfg::_bool fsr_upscaling{ this, "Enable FidelityFX Super Resolution Upscaling", false, true };
 			cfg::uint<0, 100> rcas_sharpening_intensity{ this, "FidelityFX CAS Sharpening Intensity", 50, true };
-			cfg::_enum<vk_gpu_scheduler_mode> asynchronous_scheduler{ this, "Asynchronous Queue Scheduler", vk_gpu_scheduler_mode::device };
+			cfg::_enum<vk_gpu_scheduler_mode> asynchronous_scheduler{ this, "Asynchronous Queue Scheduler", vk_gpu_scheduler_mode::safe };
 		} vk{ this };
 
diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp
index 9a4d080cd9..47696f721a 100644
--- a/rpcs3/Emu/system_config_types.cpp
+++ b/rpcs3/Emu/system_config_types.cpp
@@ -521,8 +521,8 @@ void fmt_class_string<vk_gpu_scheduler_mode>::format(std::string& out, u64 arg)
 	{
 		switch (value)
 		{
-		case vk_gpu_scheduler_mode::host: return "Host";
-		case vk_gpu_scheduler_mode::device: return "Device";
+		case vk_gpu_scheduler_mode::safe: return "Safe";
+		case vk_gpu_scheduler_mode::fast: return "Fast";
 		}
 
 		return unknown;
diff --git a/rpcs3/Emu/system_config_types.h b/rpcs3/Emu/system_config_types.h
index 4cef275c43..6b6e2a712f 100644
--- a/rpcs3/Emu/system_config_types.h
+++ b/rpcs3/Emu/system_config_types.h
@@ -228,8 +228,8 @@ enum class shader_mode
 
 enum class vk_gpu_scheduler_mode
 {
-	host,
-	device
+	safe,
+	fast
 };
 
 enum class thread_scheduler_mode