diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index 1d553dc56..a58b2778f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -152,4 +152,20 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co
     return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicExchange);
 }
 
+Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
+    auto& buffer = ctx.buffers[binding];
+    const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value,
+                                     ctx.ConstU32(gds_addr));
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
+}
+
+Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
+    auto& buffer = ctx.buffers[binding];
+    const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value,
+                                     ctx.ConstU32(gds_addr));
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);
+}
+
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 3c5211aef..e506ced3a 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -397,12 +397,13 @@ Id EmitImageAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords,
 Id EmitImageAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
 Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
 Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
-
 Id EmitLaneId(EmitContext& ctx);
 Id EmitWarpId(EmitContext& ctx);
 Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index);
 Id EmitReadFirstLane(EmitContext& ctx, Id value);
 Id EmitReadLane(EmitContext& ctx, Id value, u32 lane);
 Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane);
+Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding);
+Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding);
 
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index c0f0fa274..a20f5c89a 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -43,6 +43,10 @@ void Translator::EmitDataShare(const GcnInst& inst) {
         return DS_MIN_U32(inst, false, true);
     case Opcode::DS_MAX_RTN_U32:
         return DS_MAX_U32(inst, false, true);
+    case Opcode::DS_APPEND:
+        return DS_APPEND(inst);
+    case Opcode::DS_CONSUME:
+        return DS_CONSUME(inst);
     default:
         LogMissingOpcode(inst);
     }
@@ -192,4 +196,18 @@ void Translator::V_WRITELANE_B32(const GcnInst& inst) {
     ir.SetVectorReg(dst, ir.WriteLane(old_value, value, lane));
 }
 
+void Translator::DS_APPEND(const GcnInst& inst) {
+    const u32 inst_offset = inst.control.ds.offset0;
+    const IR::U32 gds_offset = ir.IAdd(m0_value, ir.Imm32(inst_offset));
+    const IR::U32 prev = ir.DataAppend(gds_offset);
+    SetDst(inst.dst[0], prev);
+}
+
+void Translator::DS_CONSUME(const GcnInst& inst) {
+    const u32 inst_offset = inst.control.ds.offset0;
+    const IR::U32 gds_offset = ir.IAdd(m0_value, ir.Imm32(inst_offset));
+    const IR::U32 prev = ir.DataConsume(gds_offset);
+    SetDst(inst.dst[0], prev);
+}
+
 } // namespace Shader::Gcn
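Note on the translators and emitters above: DS_APPEND and DS_CONSUME each return the pre-operation value of a 32-bit counter held in GDS at m0 + offset0, so they lower cleanly to OpAtomicIIncrement/OpAtomicIDecrement on one dword of the GDS-backed storage buffer. On real hardware the counter is bumped once per wavefront and each lane derives its slot via v_mbcnt; this backend instead increments per invocation, which already hands every lane a distinct index (see the V_MBCNT handling in vector_alu.cpp below). A minimal host-side model of the counter semantics, for illustration only (GdsModel and its members are hypothetical names, not emulator code):

    #include <array>
    #include <atomic>
    #include <cstdint>

    struct GdsModel {
        // 64 KiB of GDS viewed as dword counters; GDS is byte-addressed,
        // so a dword-aligned byte address maps to index byte_addr / 4.
        std::array<std::atomic<uint32_t>, 64 * 1024 / 4> words{};

        uint32_t Append(uint32_t byte_addr) { // DS_APPEND: returns pre-increment value
            return words[byte_addr / 4].fetch_add(1, std::memory_order_relaxed);
        }
        uint32_t Consume(uint32_t byte_addr) { // DS_CONSUME: returns pre-decrement value
            return words[byte_addr / 4].fetch_sub(1, std::memory_order_relaxed);
        }
    };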
diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
index af258cd19..adc127f12 100644
--- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
@@ -73,9 +73,13 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
     case Opcode::S_SUB_I32:
         return S_SUB_U32(inst);
     case Opcode::S_MIN_U32:
-        return S_MIN_U32(inst);
+        return S_MIN_U32(false, inst);
+    case Opcode::S_MIN_I32:
+        return S_MIN_U32(true, inst);
     case Opcode::S_MAX_U32:
-        return S_MAX_U32(inst);
+        return S_MAX_U32(false, inst);
+    case Opcode::S_MAX_I32:
+        return S_MAX_U32(true, inst);
     case Opcode::S_WQM_B64:
         break;
     default:
@@ -533,18 +537,18 @@ void Translator::S_ADDC_U32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), carry));
 }
 
-void Translator::S_MAX_U32(const GcnInst& inst) {
+void Translator::S_MAX_U32(bool is_signed, const GcnInst& inst) {
     const IR::U32 src0{GetSrc(inst.src[0])};
     const IR::U32 src1{GetSrc(inst.src[1])};
-    const IR::U32 result = ir.UMax(src0, src1);
+    const IR::U32 result = ir.IMax(src0, src1, is_signed);
     SetDst(inst.dst[0], result);
     ir.SetScc(ir.IEqual(result, src0));
 }
 
-void Translator::S_MIN_U32(const GcnInst& inst) {
+void Translator::S_MIN_U32(bool is_signed, const GcnInst& inst) {
     const IR::U32 src0{GetSrc(inst.src[0])};
     const IR::U32 src1{GetSrc(inst.src[1])};
-    const IR::U32 result = ir.UMin(src0, src1);
+    const IR::U32 result = ir.IMin(src0, src1, is_signed);
     SetDst(inst.dst[0], result);
     ir.SetScc(ir.IEqual(result, src0));
 }
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index d6887818d..e4be298ea 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -101,8 +101,8 @@ public:
     void S_ADDC_U32(const GcnInst& inst);
     void S_MULK_I32(const GcnInst& inst);
     void S_ADDK_I32(const GcnInst& inst);
-    void S_MAX_U32(const GcnInst& inst);
-    void S_MIN_U32(const GcnInst& inst);
+    void S_MAX_U32(bool is_signed, const GcnInst& inst);
+    void S_MIN_U32(bool is_signed, const GcnInst& inst);
     void S_CMPK(ConditionOp cond, bool is_signed, const GcnInst& inst);
 
     // Scalar Memory
@@ -173,7 +173,7 @@ public:
     void V_BCNT_U32_B32(const GcnInst& inst);
     void V_COS_F32(const GcnInst& inst);
     void V_MAX3_F32(const GcnInst& inst);
-    void V_MAX3_U32(const GcnInst& inst);
+    void V_MAX3_U32(bool is_signed, const GcnInst& inst);
     void V_CVT_I32_F32(const GcnInst& inst);
     void V_MIN_I32(const GcnInst& inst);
     void V_MUL_LO_U32(const GcnInst& inst);
@@ -217,6 +217,8 @@ public:
     void V_READFIRSTLANE_B32(const GcnInst& inst);
     void V_READLANE_B32(const GcnInst& inst);
     void V_WRITELANE_B32(const GcnInst& inst);
+    void DS_APPEND(const GcnInst& inst);
+    void DS_CONSUME(const GcnInst& inst);
     void S_BARRIER();
 
     // MIMG
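The collapsed handlers above assume IR helpers IMax/IMin that take an is_signed flag, letting one translator serve both the unsigned and signed opcodes. The SCC update mirrors the GCN scalar min/max rule: SCC is set when the first operand is the one selected. A compact model of the pattern (a sketch; SMax is an illustrative name, not emulator code):

    #include <algorithm>
    #include <cstdint>

    // is_signed picks the comparison; scc = (result == src0) reports that
    // src0 "won" the max, matching ir.SetScc(ir.IEqual(result, src0)) above.
    uint32_t SMax(bool is_signed, uint32_t src0, uint32_t src1, bool& scc) {
        const uint32_t result =
            is_signed ? static_cast<uint32_t>(std::max(static_cast<int32_t>(src0),
                                                       static_cast<int32_t>(src1)))
                      : std::max(src0, src1);
        scc = (result == src0);
        return result;
    }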
diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp
index 2024b7067..e71d965c6 100644
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@@ -227,7 +227,9 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
     case Opcode::V_MAX3_F32:
         return V_MAX3_F32(inst);
     case Opcode::V_MAX3_U32:
-        return V_MAX3_U32(inst);
+        return V_MAX3_U32(false, inst);
+    case Opcode::V_MAX3_I32:
+        return V_MAX3_U32(true, inst);
     case Opcode::V_TRUNC_F32:
         return V_TRUNC_F32(inst);
     case Opcode::V_CEIL_F32:
@@ -831,11 +833,11 @@ void Translator::V_MAX3_F32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2)));
 }
 
-void Translator::V_MAX3_U32(const GcnInst& inst) {
+void Translator::V_MAX3_U32(bool is_signed, const GcnInst& inst) {
     const IR::U32 src0{GetSrc(inst.src[0])};
     const IR::U32 src1{GetSrc(inst.src[1])};
     const IR::U32 src2{GetSrc(inst.src[2])};
-    SetDst(inst.dst[0], ir.UMax(src0, ir.UMax(src1, src2)));
+    SetDst(inst.dst[0], ir.IMax(src0, ir.IMax(src1, src2, is_signed), is_signed));
 }
 
 void Translator::V_CVT_I32_F32(const GcnInst& inst) {
@@ -967,14 +969,28 @@ void Translator::V_FFBL_B32(const GcnInst& inst) {
 }
 
 void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
-    const IR::U32 src0{GetSrc(inst.src[0])};
-    const IR::U32 src1{GetSrc(inst.src[1])};
     if (!is_low) {
-        ASSERT(src0.IsImmediate() && src0.U32() == ~0U && src1.IsImmediate() && src1.U32() == 0U);
-        return;
+        // v_mbcnt_hi_u32_b32 v2, -1, 0
+        if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193 &&
+            inst.src[1].field == OperandField::ConstZero) {
+            return;
+        }
+        // v_mbcnt_hi_u32_b32 v20, exec_hi, 0
+        if (inst.src[0].field == OperandField::ExecHi) {
+            return;
+        }
+    } else {
+        // v_mbcnt_lo_u32_b32 v2, -1, vX
+        // Used together with the hi form above to fetch the lane id in non-compute stages.
+        if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193) {
+            SetDst(inst.dst[0], ir.LaneId());
+        }
+        // v_mbcnt_lo_u32_b32 v20, exec_lo, vX
+        // Used together with the hi form above for append buffer indexing.
+        if (inst.src[0].field == OperandField::ExecLo) {
+            SetDst(inst.dst[0], ir.Imm32(0));
+        }
     }
-    ASSERT(src0.IsImmediate() && src0.U32() == ~0U);
-    SetDst(inst.dst[0], ir.LaneId());
 }
 
 void Translator::V_BFM_B32(const GcnInst& inst) {
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index e86b0ddfb..562eb4af3 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -514,6 +514,15 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::VectorReg vdata{inst.src[1].code};
     const IR::ScalarReg srsrc{inst.src[2].code * 4};
+    const IR::Value address = [&] -> IR::Value {
+        if (mubuf.idxen && mubuf.offen) {
+            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
+        }
+        if (mubuf.idxen || mubuf.offen) {
+            return ir.GetVectorReg(vaddr);
+        }
+        return {};
+    }();
     const IR::U32 soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
 
@@ -523,7 +532,6 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     info.offset_enable.Assign(mubuf.offen);
 
     IR::Value vdata_val = ir.GetVectorReg(vdata);
-    const IR::U32 address = ir.GetVectorReg(vaddr);
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(srsrc), ir.GetScalarReg(srsrc + 1),
                               ir.GetScalarReg(srsrc + 2), ir.GetScalarReg(srsrc + 3));
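For context on the V_MBCNT rewrite above: v_mbcnt_{lo,hi} computes the number of set bits of src0, masked to the lanes below the executing lane, plus src1. With src0 = -1 (inline constant code 193) the lo/hi pair therefore yields the lane index, which the translator folds to ir.LaneId(). With src0 = exec_lo it yields the wave-local slot offset used for append-buffer indexing, which can fold to 0 here because the backend's per-invocation atomic increment (EmitDataAppend) already gives every lane a distinct counter value. A reference model of the low half, as a sketch with hypothetical names:

    #include <bit>
    #include <cstdint>

    // D = popcount(src0 & lanes_below(lane_id)) + src1, for lanes 0..31.
    uint32_t MbcntLo(uint32_t src0, uint32_t src1, uint32_t lane_id) {
        const uint32_t below_mask = (lane_id == 0) ? 0u : (~0u >> (32 - lane_id));
        return std::popcount(src0 & below_mask) + src1;
    }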
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index cdc17304c..0184a7f63 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -37,12 +37,13 @@ struct BufferResource {
     u32 dword_offset;
     IR::Type used_types;
     AmdGpu::Buffer inline_cbuf;
+    bool is_gds_buffer{};
    bool is_instance_data{};
     bool is_written{};
 
     bool IsStorage(AmdGpu::Buffer buffer) const noexcept {
         static constexpr size_t MaxUboSize = 65536;
-        return buffer.GetSize() > MaxUboSize || is_written;
+        return buffer.GetSize() > MaxUboSize || is_written || is_gds_buffer;
     }
 
     constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept;
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 867eac5c7..7e52cfb5f 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -409,6 +409,14 @@ void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, con
     Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
 }
 
+U32 IREmitter::DataAppend(const U32& counter) {
+    return Inst<U32>(Opcode::DataAppend, counter, Imm32(0));
+}
+
+U32 IREmitter::DataConsume(const U32& counter) {
+    return Inst<U32>(Opcode::DataConsume, counter, Imm32(0));
+}
+
 U32 IREmitter::LaneId() {
     return Inst<U32>(Opcode::LaneId);
 }
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index 1cecd519d..01e71893c 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -120,6 +120,8 @@ public:
     [[nodiscard]] Value BufferAtomicSwap(const Value& handle, const Value& address,
                                          const Value& value, BufferInstInfo info);
 
+    [[nodiscard]] U32 DataAppend(const U32& counter);
+    [[nodiscard]] U32 DataConsume(const U32& counter);
     [[nodiscard]] U32 LaneId();
     [[nodiscard]] U32 WarpId();
     [[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index);
diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp
index 148383255..601c453d9 100644
--- a/src/shader_recompiler/ir/microinstruction.cpp
+++ b/src/shader_recompiler/ir/microinstruction.cpp
@@ -67,6 +67,8 @@ bool Inst::MayHaveSideEffects() const noexcept {
     case Opcode::BufferAtomicOr32:
     case Opcode::BufferAtomicXor32:
     case Opcode::BufferAtomicSwap32:
+    case Opcode::DataAppend:
+    case Opcode::DataConsume:
     case Opcode::WriteSharedU128:
     case Opcode::WriteSharedU64:
     case Opcode::WriteSharedU32:
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 994ccef24..4b922d55b 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -342,3 +342,5 @@ OPCODE(QuadShuffle, U32, U32, U32 )
 OPCODE(ReadFirstLane, U32, U32, )
 OPCODE(ReadLane, U32, U32, U32 )
 OPCODE(WriteLane, U32, U32, U32, U32 )
+OPCODE(DataAppend, U32, U32, U32 )
+OPCODE(DataConsume, U32, U32, U32 )
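Marking DataAppend/DataConsume as side-effecting matters: a shader may ignore the returned index, and without this entry dead-code elimination would drop the atomic and the counter would silently stop advancing. The guard such a pass typically applies looks like the following sketch (a sketch of the pattern only, not the exact shadPS4 pass):

    // Instructions survive DCE if their value is used OR they touch state.
    void DeadCodeSweep(IR::Block& block) {
        for (auto it = block.end(); it != block.begin();) {
            IR::Inst& inst = *--it;
            if (!inst.HasUses() && !inst.MayHaveSideEffects()) {
                inst.Invalidate(); // safe: no observers, no memory/counter effects
            }
        }
    }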
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 8991bdcc8..874ff9abb 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -64,6 +64,11 @@ bool IsBufferInstruction(const IR::Inst& inst) {
     }
 }
 
+bool IsDataRingInstruction(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::DataAppend ||
+           inst.GetOpcode() == IR::Opcode::DataConsume;
+}
+
 bool IsTextureBufferInstruction(const IR::Inst& inst) {
     return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
            inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
@@ -183,6 +188,10 @@ public:
 
     u32 Add(const BufferResource& desc) {
         const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
+            // Only one GDS binding can exist.
+            if (desc.is_gds_buffer && existing.is_gds_buffer) {
+                return true;
+            }
             return desc.sgpr_base == existing.sgpr_base &&
                    desc.dword_offset == existing.dword_offset &&
                    desc.inline_cbuf == existing.inline_cbuf;
@@ -600,6 +609,50 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
     }
 }
 
+void PatchDataRingInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
+    // Insert a GDS binding in the shader if it doesn't exist already.
+    // The buffer is used for append/consume counters.
+    constexpr static AmdGpu::Buffer GdsSharp{.base_address = 1};
+    const u32 binding = descriptors.Add(BufferResource{
+        .used_types = IR::Type::U32,
+        .inline_cbuf = GdsSharp,
+        .is_gds_buffer = true,
+        .is_written = true,
+    });
+
+    const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
+        if (inst->GetOpcode() == IR::Opcode::GetUserData) {
+            return inst;
+        }
+        return std::nullopt;
+    };
+
+    // Attempt to deduce the GDS address of the counter at compile time.
+    const u32 gds_addr = [&] {
+        const IR::Value& gds_offset = inst.Arg(0);
+        if (gds_offset.IsImmediate()) {
+            // Nothing to do, the offset is known.
+            return gds_offset.U32() & 0xFFFF;
+        }
+        const auto result = IR::BreadthFirstSearch(&inst, pred);
+        ASSERT_MSG(result, "Unable to track M0 source");
+
+        // M0 must be set by some user data register.
+        const IR::Inst* prod = gds_offset.InstRecursive();
+        const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg());
+        u32 m0_val = info.user_data[ud_reg];
+        if (prod->GetOpcode() == IR::Opcode::IAdd32) {
+            m0_val += prod->Arg(1).U32();
+        }
+        return m0_val & 0xFFFF;
+    }();
+
+    // Patch the instruction: dword index of the counter and the GDS binding.
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    inst.SetArg(0, ir.Imm32(gds_addr >> 2));
+    inst.SetArg(1, ir.Imm32(binding));
+}
+
 void ResourceTrackingPass(IR::Program& program) {
     // Iterate resource instructions and patch them after finding the sharp.
     auto& info = program.info;
@@ -616,6 +669,10 @@ void ResourceTrackingPass(IR::Program& program) {
         }
         if (IsImageInstruction(inst)) {
             PatchImageInstruction(*block, inst, info, descriptors);
+            continue;
+        }
+        if (IsDataRingInstruction(inst)) {
+            PatchDataRingInstruction(*block, inst, info, descriptors);
         }
     }
 }
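The `gds_addr >> 2` at the end converts the byte address tracked through M0 into the dword index that EmitDataAppend/EmitDataConsume feed to OpAccessChain. A worked example of the arithmetic, with values chosen purely for illustration:

    #include <cstdint>

    // If m0 carries base 0x0100 from user data and the DS instruction encodes
    // offset0 = 8, the counter lives at GDS byte 0x0108, i.e. dword 0x42.
    constexpr uint32_t PatchedGdsDword(uint32_t m0_base, uint32_t inst_offset) {
        return ((m0_base + inst_offset) & 0xFFFF) >> 2; // byte address -> dword index
    }
    static_assert(PatchedGdsDword(0x0100, 8) == 0x42);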
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index cee30f755..56c325195 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -474,6 +474,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
         case PM4ItOpcode::DmaData: {
             const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
+            if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {
+                rasterizer->InlineDataToGds(dma_data->dst_addr_lo, dma_data->data);
+            }
             break;
         }
         case PM4ItOpcode::WriteData: {
diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h
index 58ade221b..6973efbce 100644
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -350,6 +350,17 @@ struct PM4CmdEventWriteEop {
     }
 };
 
+enum class DmaDataDst : u32 {
+    Memory = 0,
+    Gds = 1,
+};
+
+enum class DmaDataSrc : u32 {
+    Memory = 0,
+    Gds = 1,
+    Data = 2,
+};
+
 struct PM4DmaData {
     PM4Type3Header header;
     union {
         BitField<12, 1, u32> src_atc;
         BitField<13, 2, u32> src_cache_policy;
         BitField<15, 1, u32> src_volatile;
-        BitField<20, 2, u32> dst_sel;
+        BitField<20, 2, DmaDataDst> dst_sel;
         BitField<24, 1, u32> dst_atc;
         BitField<25, 2, u32> dst_cache_policy;
         BitField<27, 1, u32> dst_volatile;
-        BitField<29, 2, u32> src_sel;
+        BitField<29, 2, DmaDataSrc> src_sel;
         BitField<31, 1, u32> cp_sync;
     };
     union {
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 89032e990..fa9a8dde5 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -15,6 +15,7 @@
 namespace VideoCore {
 
 static constexpr size_t NumVertexBuffers = 32;
+static constexpr size_t GdsBufferSize = 64_KB;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 64_MB;
 
@@ -25,7 +26,10 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       texture_cache{texture_cache_}, tracker{tracker_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
+      gds_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, GdsBufferSize},
       memory_tracker{&tracker} {
+    Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
+
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, ReadFlags, 1));
 }
@@ -232,6 +236,13 @@ u32 BufferCache::BindIndexBuffer(bool& is_indexed, u32 index_offset) {
     return regs.num_indices;
 }
 
+void BufferCache::InlineDataToGds(u32 gds_offset, u32 value) {
+    ASSERT_MSG(gds_offset % 4 == 0, "GDS offset must be dword aligned");
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.updateBuffer(gds_buffer.Handle(), gds_offset, sizeof(u32), &value);
+}
+
 std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
                                                   bool is_texel_buffer) {
     static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
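The typed selectors make the intent in liverpool.cpp readable: the CP DMA packet only takes the GDS path when an immediate dword (src_sel = Data) targets GDS (dst_sel = Gds), which is how games reset append/consume counters. A standalone check of the bit layout encoded by the BitFields above (enums repeated so the snippet compiles on its own):

    #include <cstdint>

    enum class DmaDataDst : uint32_t { Memory = 0, Gds = 1 };
    enum class DmaDataSrc : uint32_t { Memory = 0, Gds = 1, Data = 2 };

    // dst_sel sits at bits [21:20] of the control dword, src_sel at [30:29].
    constexpr DmaDataDst DstSel(uint32_t dw) { return static_cast<DmaDataDst>((dw >> 20) & 3u); }
    constexpr DmaDataSrc SrcSel(uint32_t dw) { return static_cast<DmaDataSrc>((dw >> 29) & 3u); }

    static_assert(DstSel(1u << 20) == DmaDataDst::Gds);
    static_assert(SrcSel(2u << 29) == DmaDataSrc::Data);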
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index b38b00f07..cd6ea28fc 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -57,6 +57,11 @@ public:
                 PageManager& tracker);
     ~BufferCache();
 
+    /// Returns a pointer to the GDS device-local buffer.
+    [[nodiscard]] const Buffer* GetGdsBuffer() const noexcept {
+        return &gds_buffer;
+    }
+
     /// Invalidates any buffer in the logical page range.
     void InvalidateMemory(VAddr device_addr, u64 size);
 
@@ -66,6 +71,9 @@ public:
     /// Bind host index buffer for the current draw.
     u32 BindIndexBuffer(bool& is_indexed, u32 index_offset);
 
+    /// Writes a value to the GDS buffer.
+    void InlineDataToGds(u32 gds_offset, u32 value);
+
     /// Obtains a buffer for the specified region.
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
                                                        bool is_texel_buffer = false);
@@ -130,6 +138,7 @@ private:
     PageManager& tracker;
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
+    Buffer gds_buffer;
     std::mutex mutex;
     Common::SlotVector<Buffer> slot_buffers;
     MemoryTracker memory_tracker;
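One detail worth calling out before the compute-pipeline hunk below: ObtainBuffer can return an offset that violates the device's minStorageBufferOffsetAlignment/minUniformBufferOffsetAlignment, so the binding is aligned down and the byte remainder is forwarded through push constants for the shader to re-add. The arithmetic in isolation (a sketch; AlignBinding and Adjusted are made-up names):

    #include <cstdint>

    struct Adjusted {
        uint32_t offset_aligned; // goes into the descriptor's buffer offset
        uint32_t adjust;         // goes into push constants (must be a multiple of 4)
    };

    constexpr Adjusted AlignBinding(uint32_t offset, uint32_t alignment) {
        const uint32_t aligned = offset & ~(alignment - 1u); // AlignDown, power-of-two alignment
        return {aligned, offset - aligned};
    }
    static_assert(AlignBinding(68, 64).offset_aligned == 64 && AlignBinding(68, 64).adjust == 4);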
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index b87d3c915..b96abedb4 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -109,37 +109,43 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
 
     u32 binding{};
     for (const auto& desc : info->buffers) {
-        const auto vsharp = desc.GetSharp(*info);
-        const bool is_storage = desc.IsStorage(vsharp);
-        const VAddr address = vsharp.base_address;
-        // Most of the time when a metadata is updated with a shader it gets cleared. It means we
-        // can skip the whole dispatch and update the tracked state instead. Also, it is not
-        // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we will
-        // need its full emulation anyways. For cases of metadata read a warning will be logged.
-        if (desc.is_written) {
-            if (texture_cache.TouchMeta(address, true)) {
-                LOG_TRACE(Render_Vulkan, "Metadata update skipped");
-                return false;
-            }
+        bool is_storage = true;
+        if (desc.is_gds_buffer) {
+            auto* vk_buffer = buffer_cache.GetGdsBuffer();
+            buffer_infos.emplace_back(vk_buffer->Handle(), 0, vk_buffer->SizeBytes());
         } else {
-            if (texture_cache.IsMeta(address)) {
-                LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
+            const auto vsharp = desc.GetSharp(*info);
+            is_storage = desc.IsStorage(vsharp);
+            const VAddr address = vsharp.base_address;
+            // Most of the time when metadata is updated by a shader it gets cleared, meaning we
+            // can skip the whole dispatch and update the tracked state instead. It is also not
+            // intended to be consumed; in the rare cases it is (e.g. HTile introspection, CRAA),
+            // full emulation will be needed anyway. Metadata reads only log a warning.
+            if (desc.is_written) {
+                if (texture_cache.TouchMeta(address, true)) {
+                    LOG_TRACE(Render_Vulkan, "Metadata update skipped");
+                    return false;
+                }
+            } else {
+                if (texture_cache.IsMeta(address)) {
+                    LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
+                }
             }
+            const u32 size = vsharp.GetSize();
+            if (desc.is_written) {
+                texture_cache.InvalidateMemory(address, size);
+            }
+            const u32 alignment =
+                is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment();
+            const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(address, size, desc.is_written);
+            const u32 offset_aligned = Common::AlignDown(offset, alignment);
+            const u32 adjust = offset - offset_aligned;
+            if (adjust != 0) {
+                ASSERT(adjust % 4 == 0);
+                push_data.AddOffset(binding, adjust);
+            }
+            buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust);
         }
-        const u32 size = vsharp.GetSize();
-        if (desc.is_written) {
-            texture_cache.InvalidateMemory(address, size);
-        }
-        const u32 alignment =
-            is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment();
-        const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(address, size, desc.is_written);
-        const u32 offset_aligned = Common::AlignDown(offset, alignment);
-        const u32 adjust = offset - offset_aligned;
-        if (adjust != 0) {
-            ASSERT(adjust % 4 == 0);
-            push_data.AddOffset(binding, adjust);
-        }
-        buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust);
         set_writes.push_back({
             .dstSet = VK_NULL_HANDLE,
             .dstBinding = binding++,
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 4419b0f81..23a75aba4 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -297,6 +297,9 @@ bool PipelineCache::RefreshGraphicsKey() {
         if (stage != Shader::Stage::Vertex && stage != Shader::Stage::Fragment) {
             return false;
         }
+        if (auto* pgm = regs.ProgramForStage(3); regs.stage_enable.IsStageEnabled(3) && pgm->Address() != 0) {
+            return false;
+        }
         std::tie(infos[i], modules[i], key.stage_hashes[i]) = GetProgram(stage, params, binding);
     }
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 9f72d0448..fbe1eb548 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -251,6 +251,10 @@ void Rasterizer::BeginRendering() {
     scheduler.BeginRendering(state);
 }
 
+void Rasterizer::InlineDataToGds(u32 gds_offset, u32 value) {
+    buffer_cache.InlineDataToGds(gds_offset, value);
+}
+
 void Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
     buffer_cache.InvalidateMemory(addr, size);
     texture_cache.InvalidateMemory(addr, size);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 43ab4756d..6a20b5e86 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -41,6 +41,7 @@ public:
     void ScopeMarkerEnd();
     void ScopedMarkerInsert(const std::string_view& str);
 
+    void InlineDataToGds(u32 gds_offset, u32 value);
     void InvalidateMemory(VAddr addr, u64 size);
     void MapMemory(VAddr addr, u64 size);
    void UnmapMemory(VAddr addr, u64 size);