From 472cfebc39b37f725b9e7b6dfa66200aa5581caf Mon Sep 17 00:00:00 2001 From: psucien Date: Sat, 14 Dec 2024 17:18:46 +0100 Subject: [PATCH] better asc ring context handling --- src/core/debug_state.cpp | 13 ++- src/core/debug_state.h | 4 +- src/video_core/amdgpu/liverpool.cpp | 108 ++++++++++-------- src/video_core/amdgpu/liverpool.h | 15 ++- .../renderer_vulkan/vk_pipeline_cache.cpp | 11 +- .../renderer_vulkan/vk_rasterizer.cpp | 6 +- .../renderer_vulkan/vk_shader_hle.cpp | 15 ++- .../renderer_vulkan/vk_shader_hle.h | 5 +- 8 files changed, 98 insertions(+), 79 deletions(-) diff --git a/src/core/debug_state.cpp b/src/core/debug_state.cpp index c68fd469d..13e0049db 100644 --- a/src/core/debug_state.cpp +++ b/src/core/debug_state.cpp @@ -15,6 +15,7 @@ using namespace DebugStateType; DebugStateImpl& DebugState = *Common::Singleton::Instance(); +extern std::unique_ptr liverpool; static ThreadID ThisThreadID() { #ifdef _WIN32 @@ -142,8 +143,7 @@ void DebugStateImpl::PushQueueDump(QueueDump dump) { frame.queues.push_back(std::move(dump)); } -void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, - const AmdGpu::Liverpool::Regs& regs, bool is_compute) { +void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, bool is_compute) { std::scoped_lock lock{frame_dump_list_mutex}; const auto it = waiting_reg_dumps.find(header_addr); if (it == waiting_reg_dumps.end()) { @@ -153,18 +153,19 @@ void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, waiting_reg_dumps.erase(it); waiting_reg_dumps_dbg.erase(waiting_reg_dumps_dbg.find(header_addr)); auto& dump = frame.regs[header_addr - base_addr]; - dump.regs = regs; + dump.regs = liverpool->regs; if (is_compute) { dump.is_compute = true; - const auto& cs = dump.regs.cs_program; + auto& cs = dump.regs.cs_program; + cs = liverpool->GetCsRegs(); dump.cs_data = PipelineComputerProgramDump{ .cs_program = cs, .code = std::vector{cs.Code().begin(), cs.Code().end()}, }; } else { for (int i = 0; i < RegDump::MaxShaderStages; i++) { - if (regs.stage_enable.IsStageEnabled(i)) { - auto stage = regs.ProgramForStage(i); + if (dump.regs.stage_enable.IsStageEnabled(i)) { + auto stage = dump.regs.ProgramForStage(i); if (stage->address_lo != 0) { auto code = stage->Code(); dump.stages[i] = PipelineShaderProgramDump{ diff --git a/src/core/debug_state.h b/src/core/debug_state.h index 0db5bc468..f8370ab2d 100644 --- a/src/core/debug_state.h +++ b/src/core/debug_state.h @@ -11,7 +11,6 @@ #include #include "common/types.h" -#include "video_core/amdgpu/liverpool.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #ifdef _WIN32 @@ -203,8 +202,7 @@ public: void PushQueueDump(QueueDump dump); - void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, - const AmdGpu::Liverpool::Regs& regs, bool is_compute = false); + void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, bool is_compute = false); void CollectShader(const std::string& name, Shader::LogicalStage l_stage, vk::ShaderModule module, std::span spv, diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 2d7aa4f3f..3a917da13 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -29,32 +29,24 @@ static_assert(Liverpool::NumComputeRings <= MAX_NAMES); static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES); -#define YIELD_CE() \ - mapped_queues[GfxQueueId].cs_state = regs.cs_program; \ +#define YIELD(name) \ FIBER_EXIT; \ co_yield {}; \ - FIBER_ENTER(ccb_task_name); \ - regs.cs_program = mapped_queues[GfxQueueId].cs_state + FIBER_ENTER(name); -#define YIELD_GFX \ - mapped_queues[GfxQueueId].cs_state = regs.cs_program; \ - FIBER_EXIT; \ - co_yield {}; \ - FIBER_ENTER(dcb_task_name); \ - regs.cs_program = mapped_queues[GfxQueueId].cs_state; - -#define YIELD_ASC(id) \ - mapped_queues[id + 1].cs_state = regs.cs_program; \ - FIBER_EXIT; \ - co_yield {}; \ - FIBER_ENTER(acb_task_name[id]); \ - regs.cs_program = mapped_queues[id + 1].cs_state; +#define YIELD_CE() YIELD(ccb_task_name) +#define YIELD_GFX() YIELD(dcb_task_name) +#define YIELD_ASC(id) YIELD(acb_task_name[id]) #define RESUME(task, name) \ FIBER_EXIT; \ task.handle.resume(); \ FIBER_ENTER(name); +#define RESUME_CE(task) RESUME(task, ccb_task_name) +#define RESUME_GFX(task) RESUME(task, dcb_task_name) +#define RESUME_ASC(task, id) RESUME(task, acb_task_name[id]) + std::array Liverpool::ConstantEngine::constants_heap; static std::span NextPacket(std::span span, size_t offset) { @@ -198,10 +190,11 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { const auto* indirect_buffer = reinterpret_cast(header); auto task = ProcessCeUpdate({indirect_buffer->Address(), indirect_buffer->ib_size}); + RESUME_CE(task); while (!task.handle.done()) { YIELD_CE(); - RESUME(task, ccb_task_name); + RESUME_CE(task); } break; } @@ -228,6 +221,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(dcb.data()); @@ -404,7 +398,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index->draw_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header)); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); @@ -421,7 +415,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index_off->draw_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header)); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); @@ -437,7 +431,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index->draw_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header)); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); @@ -453,7 +447,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header), regs); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header)); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); @@ -470,7 +464,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header), regs); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header)); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); @@ -487,7 +481,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spandata_offset; const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header)); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); @@ -502,15 +496,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - regs.cs_program.dim_x = dispatch_direct->dim_x; - regs.cs_program.dim_y = dispatch_direct->dim_y; - regs.cs_program.dim_z = dispatch_direct->dim_z; - regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; + SaveDispatchContext(); + auto& cs_program = GetCsRegs(); + cs_program.dim_x = dispatch_direct->dim_x; + cs_program.dim_y = dispatch_direct->dim_y; + cs_program.dim_z = dispatch_direct->dim_z; + cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs, - true); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header), true); } - if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + if (rasterizer && (cs_program.dispatch_initiator & 1)) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address)); rasterizer->DispatchDirect(); @@ -521,14 +516,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + SaveDispatchContext(); + auto& cs_program = GetCsRegs(); const auto offset = dispatch_indirect->data_offset; const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr; const auto size = sizeof(PM4CmdDispatchIndirect::GroupDimensions); if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs, - true); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header), true); } - if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + if (rasterizer && (cs_program.dispatch_initiator & 1)) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin( fmt::format("dcb:{}:DispatchIndirect", cmd_address)); @@ -641,7 +637,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); while (!rewind->Valid()) { - YIELD_GFX; + YIELD_GFX(); } break; } @@ -657,7 +653,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanWaitVoLabel([&] { return wait_reg_mem->Test(); }); } while (!wait_reg_mem->Test()) { - YIELD_GFX; + YIELD_GFX(); } break; } @@ -665,11 +661,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); auto task = ProcessGraphics( {indirect_buffer->Address(), indirect_buffer->ib_size}, {}); - RESUME(task, dcb_task_name); + RESUME_GFX(task); while (!task.handle.done()) { - YIELD_GFX; - RESUME(task, dcb_task_name); + YIELD_GFX(); + RESUME_GFX(task); } break; } @@ -679,7 +675,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span acb, u32 vqid) { const auto* indirect_buffer = reinterpret_cast(header); auto task = ProcessCompute( {indirect_buffer->Address(), indirect_buffer->ib_size}, vqid); - RESUME(task, acb_task_name[vqid]); + RESUME_ASC(task, vqid); while (!task.handle.done()) { YIELD_ASC(vqid); - RESUME(task, acb_task_name[vqid]); + RESUME_ASC(task, vqid); } break; } @@ -781,20 +777,32 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { } case PM4ItOpcode::SetShReg: { const auto* set_data = reinterpret_cast(header); - std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, - (count - 1) * sizeof(u32)); + const auto set_size = (count - 1) * sizeof(u32); + + if (set_data->reg_offset >= 0x200 && + set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) { + ASSERT(set_size <= sizeof(ComputeProgram)); + auto* addr = + reinterpret_cast(&asc_sh_regs[vqid]) + (set_data->reg_offset - 0x200); + std::memcpy(addr, header + 2, set_size); + } else { + std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, + set_size); + } break; } case PM4ItOpcode::DispatchDirect: { const auto* dispatch_direct = reinterpret_cast(header); - regs.cs_program.dim_x = dispatch_direct->dim_x; - regs.cs_program.dim_y = dispatch_direct->dim_y; - regs.cs_program.dim_z = dispatch_direct->dim_z; - regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; + SaveDispatchContext(vqid); + auto& cs_program = GetCsRegs(); + cs_program.dim_x = dispatch_direct->dim_x; + cs_program.dim_y = dispatch_direct->dim_y; + cs_program.dim_z = dispatch_direct->dim_z; + cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs, true); + DebugState.PushRegsDump(base_addr, reinterpret_cast(header), true); } - if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + if (rasterizer && (cs_program.dispatch_initiator & 1)) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address)); rasterizer->DispatchDirect(); diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 63849c986..9f677391d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -1279,6 +1279,7 @@ struct Liverpool { }; Regs regs{}; + std::array asc_sh_regs{}; // See for a comment in context reg parsing code union CbDbExtent { @@ -1343,6 +1344,10 @@ public: gfx_queue.dcb_buffer.reserve(GfxReservedSize); } + inline ComputeProgram& GetCsRegs() { + return *curr_cs_regs; + } + struct AscQueueInfo { VAddr map_addr; u32* read_addr; @@ -1393,6 +1398,14 @@ private: void Process(std::stop_token stoken); + inline void SaveDispatchContext() { + curr_cs_regs = ®s.cs_program; + } + + inline void SaveDispatchContext(u32 vqid) { + curr_cs_regs = &asc_sh_regs[vqid]; + } + struct GpuQueue { std::mutex m_access{}; std::atomic dcb_buffer_offset; @@ -1400,7 +1413,6 @@ private: std::vector dcb_buffer; std::vector ccb_buffer; std::queue submits{}; - ComputeProgram cs_state{}; VAddr indirect_args_addr{}; }; std::array mapped_queues{}; @@ -1433,6 +1445,7 @@ private: std::mutex submit_mutex; std::condition_variable_any submit_cv; std::queue> command_queue{}; + ComputeProgram* curr_cs_regs{®s.cs_program}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 58473496f..ddf34465c 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -172,10 +172,10 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_ } break; } - case Stage::Compute: { - const auto& cs_pgm = regs.cs_program; + case Shader::Stage::Compute: { + const auto& cs_pgm = liverpool->GetCsRegs(); info.num_user_data = cs_pgm.settings.num_user_regs; - info.num_allocated_vgprs = regs.cs_program.settings.num_vgprs * 4; + info.num_allocated_vgprs = cs_pgm.settings.num_vgprs * 4; info.cs_info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, cs_pgm.num_thread_z.full}; info.cs_info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1), @@ -476,8 +476,8 @@ bool PipelineCache::RefreshGraphicsKey() { bool PipelineCache::RefreshComputeKey() { Shader::Backend::Bindings binding{}; - const auto* cs_pgm = &liverpool->regs.cs_program; - const auto cs_params = Liverpool::GetParams(*cs_pgm); + const auto& cs_pgm = liverpool->GetCsRegs(); + const auto cs_params = Liverpool::GetParams(cs_pgm); std::tie(infos[0], modules[0], fetch_shader, compute_key.value) = GetProgram(Shader::Stage::Compute, LogicalStage::Compute, cs_params, binding); return true; @@ -529,6 +529,7 @@ PipelineCache::Result PipelineCache::GetProgram(Stage stage, LogicalStage l_stag return std::make_tuple(&program->info, module, spec.fetch_shader_data, HashCombine(params.hash, 0)); } + it_pgm.value()->info.user_data = params.user_data; auto& program = it_pgm.value(); auto& info = program->info; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index fef4c7ec5..d0a234467 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -317,14 +317,14 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 void Rasterizer::DispatchDirect() { RENDERER_TRACE; - const auto& cs_program = liverpool->regs.cs_program; + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { return; } const auto& cs = pipeline->GetStage(Shader::LogicalStage::Compute); - if (ExecuteShaderHLE(cs, liverpool->regs, *this)) { + if (ExecuteShaderHLE(cs, *this)) { return; } @@ -344,7 +344,7 @@ void Rasterizer::DispatchDirect() { void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { RENDERER_TRACE; - const auto& cs_program = liverpool->regs.cs_program; + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { return; diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.cpp b/src/video_core/renderer_vulkan/vk_shader_hle.cpp index b863dce21..0b5d32274 100644 --- a/src/video_core/renderer_vulkan/vk_shader_hle.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_hle.cpp @@ -5,16 +5,16 @@ #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_hle.h" -#include "vk_rasterizer.h" +extern std::unique_ptr liverpool; namespace Vulkan { static constexpr u64 COPY_SHADER_HASH = 0xfefebf9f; -bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs, - Rasterizer& rasterizer) { +bool ExecuteCopyShaderHLE(const Shader::Info& info, Rasterizer& rasterizer) { auto& scheduler = rasterizer.GetScheduler(); auto& buffer_cache = rasterizer.GetBufferCache(); + const auto& cs_program = liverpool->GetCsRegs(); // Copy shader defines three formatted buffers as inputs: control, source, and destination. const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info); @@ -34,9 +34,9 @@ bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Reg static std::vector copies; copies.clear(); - copies.reserve(regs.cs_program.dim_x); + copies.reserve(cs_program.dim_x); - for (u32 i = 0; i < regs.cs_program.dim_x; i++) { + for (u32 i = 0; i < cs_program.dim_x; i++) { const auto& [dst_idx, src_idx, end] = ctl_buf[i]; const u32 local_dst_offset = dst_idx * buf_stride; const u32 local_src_offset = src_idx * buf_stride; @@ -121,11 +121,10 @@ bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Reg return true; } -bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs, - Rasterizer& rasterizer) { +bool ExecuteShaderHLE(const Shader::Info& info, Rasterizer& rasterizer) { switch (info.pgm_hash) { case COPY_SHADER_HASH: - return ExecuteCopyShaderHLE(info, regs, rasterizer); + return ExecuteCopyShaderHLE(info, rasterizer); default: return false; } diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.h b/src/video_core/renderer_vulkan/vk_shader_hle.h index fda9b1735..975c2a12d 100644 --- a/src/video_core/renderer_vulkan/vk_shader_hle.h +++ b/src/video_core/renderer_vulkan/vk_shader_hle.h @@ -3,7 +3,7 @@ #pragma once -#include "video_core/amdgpu/liverpool.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" namespace Shader { struct Info; @@ -14,7 +14,6 @@ namespace Vulkan { class Rasterizer; /// Attempts to execute a shader using HLE if possible. -bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs, - Rasterizer& rasterizer); +bool ExecuteShaderHLE(const Shader::Info& info, Rasterizer& rasterizer); } // namespace Vulkan