diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 274365f4cb..1d083fce7c 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -3,6 +3,7 @@
 #include "../Overlays/Shaders/shader_loading_dialog_native.h"
 #include "GLGSRender.h"
 #include "GLCompute.h"
+#include "GLDMA.h"
 #include "Emu/Memory/vm_locking.h"
 #include "Emu/RSX/rsx_methods.h"
 
@@ -180,6 +181,20 @@ void GLGSRender::on_init_thread()
 		backend_config.supports_normalized_barycentrics = false;
 	}
 
+	if (gl_caps.AMD_pinned_memory)
+	{
+		backend_config.supports_host_gpu_labels = true;
+
+		if (g_cfg.video.host_label_synchronization)
+		{
+			m_host_gpu_context_data = std::make_unique<gl::buffer>();
+			m_host_gpu_context_data->create(gl::buffer::target::array, 4096);
+
+			auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::read));
+			m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
+		}
+	}
+
 	// Use industry standard resource alignment values as defaults
 	m_uniform_buffer_offset_align = 256;
 	m_min_texbuffer_alignment = 256;
@@ -397,6 +412,7 @@ void GLGSRender::on_exit()
 	// TODO: Move these
 	gl::destroy_compute_tasks();
 	gl::destroy_overlay_passes();
+	gl::clear_dma_resources();
 
 	gl::destroy_global_texture_resources();
 
@@ -407,6 +423,9 @@ void GLGSRender::on_exit()
 	m_prog_buffer.clear();
 	m_rtts.destroy();
 
+	m_host_dma_ctrl.reset();
+	m_host_gpu_context_data.reset();
+
 	for (auto &fbo : m_framebuffer_cache)
 	{
 		fbo.remove();
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index c339f7dc39..8ea87f8e5d 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -128,7 +128,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 
 	GLProgramBuffer m_prog_buffer;
 
-	//buffer
+	// Draw Buffers
 	gl::fbo* m_draw_fbo = nullptr;
 	std::list<gl::framebuffer_holder> m_framebuffer_cache;
 	std::unique_ptr<gl::texture> m_flip_tex_color[2];
@@ -137,7 +137,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 	std::unique_ptr<gl::upscaler> m_upscaler;
 	output_scaling_mode m_output_scaling = output_scaling_mode::bilinear;
 
-	//vaos are mandatory for core profile
+	// VAOs are mandatory for core profile
 	gl::vao m_vao;
 
 	shared_mutex m_sampler_mutex;
@@ -150,6 +150,9 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 	// Occlusion query type, can be SAMPLES_PASSED or ANY_SAMPLES_PASSED
 	GLenum m_occlusion_type = GL_ANY_SAMPLES_PASSED;
 
+	// Host context for GPU-driven work
+	std::unique_ptr<gl::buffer> m_host_gpu_context_data;
+
 public:
 	u64 get_cycles() final;
 
diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp b/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp
new file mode 100644
index 0000000000..d86b03712a
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp
@@ -0,0 +1,67 @@
+#include "stdafx.h"
+#include "RSXDMAWriter.h"
+
+#include "Utilities/Thread.h"
+#include <util/asm.hpp>
+
+namespace rsx
+{
+	void RSXDMAWriter::update()
+	{
+		if (m_dispatch_handlers.empty())
+		{
+			m_job_queue.clear();
+			return;
+		}
+
+		while (!m_job_queue.empty())
+		{
+			const auto job = m_job_queue.front();
+
+			if (const auto dispatch = m_dispatch_handlers.find(job.dispatch_class);
+				dispatch == m_dispatch_handlers.end() || dispatch->second.handler(m_host_context_ptr, &job))
+			{
+				// No handler registered, or the callback consumed the job
+				m_job_queue.pop_front();
+				continue;
+			}
+
+			// A dispatcher was found but rejected the job. Stop; we'll try again later.
+			break;
+		}
+	}
+
+	void RSXDMAWriter::register_handler(host_dispatch_handler_t handler)
+	{
+		m_dispatch_handlers[handler.dispatch_class] = handler;
+	}
+
+	void RSXDMAWriter::deregister_handler(int dispatch_class)
+	{
+		m_dispatch_handlers.erase(dispatch_class);
+	}
+
+	void RSXDMAWriter::enqueue(const host_gpu_write_op_t& request)
+	{
+		m_job_queue.push_back(request);
+	}
+
+	void RSXDMAWriter::drain_label_queue()
+	{
+		if (!m_host_context_ptr)
+		{
+			return;
+		}
+
+		// FIXME: This is a busy wait; consider yielding to improve responsiveness on weak devices.
+		while (!m_host_context_ptr->in_flight_commands_completed())
+		{
+			utils::pause();
+
+			if (thread_ctrl::state() == thread_state::aborting)
+			{
+				break;
+			}
+		}
+	}
+}
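For orientation, here is a minimal usage sketch of the new queue API. It is not part of the patch; the dispatch-class tag and the handler body are hypothetical, chosen only to show the consume/reject contract of update():

#include "Emu/RSX/Host/RSXDMAWriter.h"

constexpr int dispatch_label_write = 1; // hypothetical dispatch class

void example_pump(rsx::RSXDMAWriter& dma)
{
	rsx::host_dispatch_handler_t on_label_write{};
	on_label_write.dispatch_class = dispatch_label_write;
	on_label_write.handler = [](const volatile rsx::host_gpu_context_t* ctx, const rsx::host_gpu_write_op_t* /*op*/)
	{
		// Returning true consumes the job; returning false leaves it at the
		// head of the queue and ends this update() pass (in-order retry).
		return ctx->texture_loads_completed();
	};
	dma.register_handler(on_label_write);

	dma.enqueue({ .dispatch_class = dispatch_label_write, .userdata = nullptr });
	dma.update(); // pops jobs in FIFO order until one is rejected
}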
diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.h b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
new file mode 100644
index 0000000000..18d232bfda
--- /dev/null
+++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <util/types.hpp>
+
+#include <deque>
+#include <functional>
+#include <unordered_map>
+
+namespace rsx
+{
+	struct host_gpu_context_t
+	{
+		u64 magic = 0xCAFEBABE;
+		u64 event_counter = 0;
+		u64 texture_load_request_event = 0;
+		u64 texture_load_complete_event = 0;
+		u64 last_label_acquire_event = 0;
+		u64 last_label_release2_event = 0;
+		u64 commands_complete_event = 0;
+
+		inline u64 inc_counter() volatile
+		{
+			// Workaround for the volatile-increment warning. The GPU can see this value directly, but we do not currently modify it on the device.
+			event_counter = event_counter + 1;
+			return event_counter;
+		}
+
+		inline bool in_flight_commands_completed() const volatile
+		{
+			return last_label_release2_event == commands_complete_event;
+		}
+
+		inline bool texture_loads_completed() const volatile
+		{
+			// Returns true when all outstanding texture load requests are done.
+			return texture_load_complete_event == texture_load_request_event;
+		}
+
+		inline bool has_unflushed_texture_loads() const volatile
+		{
+			return texture_load_request_event > last_label_release2_event;
+		}
+
+		inline u64 on_texture_load_acquire() volatile
+		{
+			texture_load_request_event = inc_counter();
+			return texture_load_request_event;
+		}
+
+		inline void on_texture_load_release() volatile
+		{
+			// Normally released by the host device, but implemented nonetheless for the software fallback
+			texture_load_complete_event = texture_load_request_event;
+		}
+
+		inline u64 on_label_acquire() volatile
+		{
+			last_label_acquire_event = inc_counter();
+			return last_label_acquire_event;
+		}
+
+		inline void on_label_release() volatile
+		{
+			last_label_release2_event = last_label_acquire_event;
+		}
+
+		inline bool needs_label_release() const volatile
+		{
+			return last_label_acquire_event > last_label_release2_event;
+		}
+	};
+
+	struct host_gpu_write_op_t
+	{
+		int dispatch_class = 0;
+		void* userdata = nullptr;
+	};
+
+	struct host_dispatch_handler_t
+	{
+		int dispatch_class = 0;
+		std::function<bool(const volatile host_gpu_context_t*, const host_gpu_write_op_t*)> handler;
+	};
+
+	class RSXDMAWriter
+	{
+	public:
+		RSXDMAWriter(void* mem)
+			: m_host_context_ptr(new (mem) host_gpu_context_t)
+		{}
+
+		RSXDMAWriter(host_gpu_context_t* pctx)
+			: m_host_context_ptr(pctx)
+		{}
+
+		void update();
+
+		void register_handler(host_dispatch_handler_t handler);
+		void deregister_handler(int dispatch_class);
+
+		void enqueue(const host_gpu_write_op_t& request);
+		void drain_label_queue();
+
+		volatile host_gpu_context_t* host_ctx() const
+		{
+			return m_host_context_ptr;
+		}
+
+	private:
+		std::unordered_map<int, host_dispatch_handler_t> m_dispatch_handlers;
+		volatile host_gpu_context_t* m_host_context_ptr = nullptr;
+
+		std::deque<host_gpu_write_op_t> m_job_queue;
+	};
+}
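The context object is effectively a ticket ledger: each acquire draws a fresh ticket from the monotonic event_counter, and completion means the device (or the software fallback) has echoed that ticket into the matching *_complete slot, so equality of the two slots reads as "nothing outstanding". A minimal illustration, assuming only the header above:

#include "Emu/RSX/Host/RSXDMAWriter.h"

void ticket_protocol_example()
{
	rsx::host_gpu_context_t ctx;

	// Texture upload path: draw a ticket, then wait for it to be echoed back.
	const u64 ticket = ctx.on_texture_load_acquire(); // request = ++event_counter
	// Here texture_loads_completed() == false: request != complete.
	ctx.on_texture_load_release();                    // software fallback echoes the ticket
	// Now texture_loads_completed() == true again.

	// GCM labels follow the same acquire/release pairing.
	ctx.on_label_acquire();
	// needs_label_release() == true until the backend submits the release.
	ctx.on_label_release();
	(void)ticket;
}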
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 37ed9bac6f..c90b8a2079 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -1162,6 +1162,7 @@ namespace rsx
 
 			// Update other sub-units
 			zcull_ctrl->update(this);
+			if (m_host_dma_ctrl) m_host_dma_ctrl->update();
 		}
 
 		// Execute FIFO queue
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 3f7b3842f1..514d5f69bb 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -35,6 +35,8 @@
 
 #include "NV47/FW/GRAPH_backend.h"
 
+#include "Host/RSXDMAWriter.h"
+
 extern atomic_t<bool> g_user_asked_for_frame_capture;
 extern atomic_t<bool> g_disable_frame_limit;
 extern rsx::frame_trace_data frame_debug;
@@ -212,6 +214,9 @@ namespace rsx
 		// Context
 		context* m_ctx = nullptr;
 
+		// Host DMA
+		std::unique_ptr<RSXDMAWriter> m_host_dma_ctrl;
+
 	public:
 		atomic_t<u64> new_get_put = u64{umax};
 		u32 restore_point = 0;
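The Vulkan changes below hand RSXDMAWriter the raw mapping of the host object buffer; the void* constructor placement-news a fresh context into it, so the old manual ensure(magic == 0xCAFEBABE) check now holds by construction. A host-memory-only sketch, where the backing array is a hypothetical stand-in for the mapped allocation:

#include <cstddef>
#include "Emu/RSX/Host/RSXDMAWriter.h"

void placement_example()
{
	alignas(rsx::host_gpu_context_t) std::byte backing[sizeof(rsx::host_gpu_context_t)]{};

	rsx::RSXDMAWriter dma(static_cast<void*>(backing));
	// dma.host_ctx()->magic == 0xCAFEBABE holds by construction.
}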
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index fb5c245b87..6352c9c58e 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -867,8 +867,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
 			VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0, VMM_ALLOCATION_POOL_SYSTEM);
 
-		m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
-		ensure(m_host_data_ptr->magic == 0xCAFEBABE);
+		m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(m_host_object_data->map(0, 0x10000));
 	}
 	else
 	{
@@ -1784,6 +1783,11 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
 	m_current_command_buffer->begin();
 }
 
+std::pair<volatile vk::host_data_t*, VkBuffer> VKGSRender::map_host_object_data() const
+{
+	return { m_host_dma_ctrl->host_ctx(), m_host_object_data->value };
+}
+
 bool VKGSRender::release_GCM_label(u32 address, u32 args)
 {
 	if (!backend_config.supports_host_gpu_labels)
@@ -1791,25 +1795,13 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 		return false;
 	}
 
-	auto drain_label_queue = [this]()
-	{
-		while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
-		{
-			utils::pause();
-
-			if (thread_ctrl::state() == thread_state::aborting)
-			{
-				break;
-			}
-		}
-	};
-
-	ensure(m_host_data_ptr);
-	if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
+	auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
+
+	if (host_ctx->texture_loads_completed())
 	{
 		// All texture loads already seen by the host GPU
 		// Wait for all previously submitted labels to be flushed
-		drain_label_queue();
+		m_host_dma_ctrl->drain_label_queue();
 		return false;
 	}
@@ -1821,13 +1813,13 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 		// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
 		// Take the L and try the fallback.
 		rsx_log.warning("Host label update at 0x%x was not possible.", address);
-		drain_label_queue();
+		m_host_dma_ctrl->drain_label_queue();
 		return false;
 	}
 
-	m_host_data_ptr->last_label_release_event = m_host_data_ptr->inc_counter();
+	const auto release_event_id = host_ctx->on_label_acquire();
 
-	if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
+	if (host_ctx->has_unflushed_texture_loads())
 	{
 		if (vk::is_renderpass_open(*m_current_command_buffer))
 		{
@@ -1842,14 +1834,15 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 		auto cmd = m_secondary_cb_list.next();
 		cmd->begin();
 		vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
-		vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
+		vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, &release_event_id);
 		cmd->end();
 
 		vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
 		cmd->submit(submit_info);
 
-		m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
+		host_ctx->on_label_release();
 	}
+
 	return true;
 }
 
@@ -2516,15 +2509,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
 	}
 
-	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
+	if (m_host_dma_ctrl && m_host_dma_ctrl->host_ctx()->needs_label_release())
 	{
 		vkCmdUpdateBuffer(*m_current_command_buffer,
 			m_host_object_data->value,
 			::offset32(&vk::host_data_t::commands_complete_event),
 			sizeof(u64),
-			const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
+			const_cast<u64*>(&m_host_dma_ctrl->host_ctx()->last_label_acquire_event));
 
-		m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
+		m_host_dma_ctrl->host_ctx()->on_label_release();
 	}
 
 	m_current_command_buffer->end();
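Because the context sits at offset 0 of m_host_object_data, ::offset32(&vk::host_data_t::commands_complete_event) resolves to the field's offset inside rsx::host_gpu_context_t, which is the byte offset the device-side vkCmdUpdateBuffer writes target. A quick sanity check under that assumption (mine, not from the patch):

#include <cstddef>
#include "Emu/RSX/Host/RSXDMAWriter.h"

// commands_complete_event is the seventh u64 field, so six u64s precede it.
static_assert(offsetof(rsx::host_gpu_context_t, commands_complete_event) == 6 * sizeof(u64));
static_assert(sizeof(rsx::host_gpu_context_t) == 7 * sizeof(u64)); // all-u64 layout, no padding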
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index f1ae833938..92627b99ef 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -1,6 +1,4 @@
 #pragma once
-#include "Emu/RSX/GSRender.h"
-#include "Emu/Cell/timers.hpp"
 
 #include "upscalers/upscaling.h"
 
@@ -19,15 +17,23 @@
 #include "VKFramebuffer.h"
 #include "VKShaderInterpreter.h"
 #include "VKQueryPool.h"
-#include "../GCM.h"
 
 #include "util/asm.hpp"
 
+#include "Emu/RSX/GCM.h"
+#include "Emu/RSX/GSRender.h"
+#include "Emu/RSX/Host/RSXDMAWriter.h"
+
 #include <thread>
 #include <optional>
 
 using namespace vk::vmm_allocation_pool_; // clang workaround.
 using namespace vk::upscaling_flags_;     // ditto
 
+namespace vk
+{
+	using host_data_t = rsx::host_gpu_context_t;
+}
+
 class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 {
 private:
@@ -118,7 +124,6 @@ private:
 	vk::command_buffer_chain m_primary_cb_list;
 	vk::command_buffer_chunk* m_current_command_buffer = nullptr;
 
-	volatile vk::host_data_t* m_host_data_ptr = nullptr;
 	std::unique_ptr<vk::buffer> m_host_object_data;
 
 	vk::descriptor_pool m_descriptor_pool;
@@ -274,7 +279,7 @@ public:
 	void end_conditional_rendering() override;
 
 	// Host sync object
-	inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
+	std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() const;
 
 	// GRAPH backend
 	void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
index e1893626ab..b4d999e07c 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
@@ -6,6 +6,7 @@
 
 #include "Emu/RSX/Common/simple_array.hpp"
 #include "Emu/RSX/rsx_utils.h"
+#include "Emu/RSX/rsx_cache.h"
 #include "Utilities/mutex.h"
 #include "util/asm.hpp"
 
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index 3353dcf341..80630656c0 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -1246,8 +1246,8 @@ namespace vk
 			// Queue a sync update on the CB doing the load
 			auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
 			ensure(host_data);
-			const auto event_id = host_data->inc_counter();
-			host_data->texture_load_request_event = event_id;
+
+			const auto event_id = host_data->on_texture_load_acquire();
 			vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
 		}
 	}
diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.h b/rpcs3/Emu/RSX/VK/vkutils/sync.h
index a91cafebab..9177e84113 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/sync.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -18,25 +18,6 @@ namespace vk
 		gpu = 1
 	};
 
-	struct host_data_t // Pick a better name
-	{
-		u64 magic = 0xCAFEBABE;
-		u64 event_counter = 0;
-		u64 texture_load_request_event = 0;
-		u64 texture_load_complete_event = 0;
-		u64 last_label_release_event = 0;
-		u64 last_label_submit_event = 0;
-		u64 commands_complete_event = 0;
-		u64 last_label_request_timestamp = 0;
-
-		inline u64 inc_counter() volatile
-		{
-			// Workaround for volatile increment warning. GPU can see this value directly, but currently we do not modify it on the device.
-			event_counter = event_counter + 1;
-			return event_counter;
-		}
-	};
-
 	struct fence
 	{
 		atomic_t<bool> flushed = false;
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index 8119df399e..580a403653 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -104,6 +104,7 @@
+    <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
@@ -617,6 +618,7 @@
+    <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index df52d9c397..7a2bf3ed16 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -1300,6 +1300,9 @@
       <Filter>Emu\GPU\RSX\NV47\FW</Filter>
     </ClCompile>
+    <ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClCompile>
@@ -2620,6 +2623,9 @@
       <Filter>Emu\GPU\RSX\Utils</Filter>
     </ClInclude>
+    <ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h">
+      <Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
+    </ClInclude>
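Taken together, the per-frame flow the patch aims at looks roughly like this. example_frame_tick is a hypothetical composite of the call sites above, not code from the patch:

#include "Emu/RSX/Host/RSXDMAWriter.h"

void example_frame_tick(rsx::RSXDMAWriter& dma)
{
	// RSXThread pumps queued host-write jobs once per FIFO iteration.
	dma.update();

	volatile rsx::host_gpu_context_t* ctx = dma.host_ctx();

	// At submit time the backend flushes any pending label ticket: it queues a
	// device-side write of last_label_acquire_event into commands_complete_event,
	// then marks the ticket released on the CPU.
	if (ctx->needs_label_release())
	{
		// ... vkCmdUpdateBuffer / pinned-memory write is recorded here ...
		ctx->on_label_release();
	}

	// Whoever must observe the final label value spins until the device has
	// echoed every released ticket back (see drain_label_queue above).
	dma.drain_label_queue();
}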