shader_recompiler: Lower non-compute shared memory into spare VGPRs. (#2403)
This commit is contained in:
parent ebe2aadb4c
commit 6e12642151
12 changed files with 85 additions and 94 deletions
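In broad strokes, the change assumes that when a non-compute shader touches shared memory it is only spilling registers to fixed, 256-byte-aligned offsets, so each distinct offset can be remapped onto a spare VGPR and no real shared-memory backing is needed for those stages. A rough, self-contained sketch of that offset-to-register mapping (the names and the starting register number here are illustrative only, not the emulator's own types or values):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <unordered_map>

// Toy model of the lowering: every distinct shared-memory offset a non-compute
// shader touches is assigned the next unused vector register (VGPR).
int main() {
    uint32_t next_vgpr = 40; // illustrative: first VGPR not already used by the shader
    std::unordered_map<uint32_t, uint32_t> slots; // shared-memory byte offset -> VGPR index

    const auto vgpr_for_offset = [&](uint32_t offset) -> uint32_t {
        assert(offset % 256 == 0); // spill slots are 256 bytes apart: 4 bytes x 64 threads
        const auto it = slots.find(offset);
        if (it != slots.end()) {
            return it->second;
        }
        assert(next_vgpr < 256 && "ran out of spare VGPRs");
        return slots.emplace(offset, next_vgpr++).first->second;
    };

    // A DS write at offset 0 and a later DS read at offset 0 resolve to the same
    // register, so the shared-memory access never has to touch real memory.
    std::printf("offset 0   -> v%u\n", vgpr_for_offset(0));
    std::printf("offset 256 -> v%u\n", vgpr_for_offset(256));
    std::printf("offset 0   -> v%u (same register as the first write)\n", vgpr_for_offset(0));
}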
@@ -120,10 +120,8 @@ Id EmitUndefU32(EmitContext& ctx);
 Id EmitUndefU64(EmitContext& ctx);
 Id EmitLoadSharedU32(EmitContext& ctx, Id offset);
 Id EmitLoadSharedU64(EmitContext& ctx, Id offset);
-Id EmitLoadSharedU128(EmitContext& ctx, Id offset);
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value);
 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value);
-void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value);

@@ -38,24 +38,6 @@ Id EmitLoadSharedU64(EmitContext& ctx, Id offset) {
     }
 }
 
-Id EmitLoadSharedU128(EmitContext& ctx, Id offset) {
-    const Id shift_id{ctx.ConstU32(2U)};
-    const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    std::array<Id, 4> values{};
-    for (u32 i = 0; i < 4; ++i) {
-        const Id index{i == 0 ? base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))};
-        if (ctx.info.has_emulated_shared_memory) {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, index)};
-            values[i] = ctx.OpLoad(ctx.U32[1], pointer);
-        } else {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
-            values[i] = ctx.OpLoad(ctx.U32[1], pointer);
-        }
-    }
-    return ctx.OpCompositeConstruct(ctx.U32[4], values);
-}
-
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(2U)};
     const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};

@@ -88,20 +70,4 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) {
     }
 }
 
-void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value) {
-    const Id shift{ctx.ConstU32(2U)};
-    const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    for (u32 i = 0; i < 4; ++i) {
-        const Id index{i == 0 ? base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))};
-        if (ctx.info.has_emulated_shared_memory) {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, index)};
-            ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i));
-        } else {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
-            ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i));
-        }
-    }
-}
-
 } // namespace Shader::Backend::SPIRV

@@ -813,6 +813,8 @@ void EmitContext::DefineSharedMemory() {
     if (!info.uses_shared) {
         return;
     }
+    ASSERT(info.stage == Stage::Compute);
+
     const u32 max_shared_memory_size = profile.max_shared_memory_size;
     u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
     if (shared_memory_size == 0) {

@@ -233,7 +233,8 @@ struct Info {
     }
 
     void AddBindings(Backend::Bindings& bnd) const {
-        const auto total_buffers = buffers.size() + (has_readconst ? 1 : 0);
+        const auto total_buffers =
+            buffers.size() + (has_readconst ? 1 : 0) + (has_emulated_shared_memory ? 1 : 0);
         bnd.buffer += total_buffers;
         bnd.unified += total_buffers + images.size() + samplers.size();
         bnd.user_data += ud_mask.NumRegs();

@@ -308,8 +308,6 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
        return Inst<U32>(Opcode::LoadSharedU32, offset);
    case 64:
        return Inst(Opcode::LoadSharedU64, offset);
-   case 128:
-       return Inst(Opcode::LoadSharedU128, offset);
    default:
        UNREACHABLE_MSG("Invalid bit size {}", bit_size);
    }

@@ -323,9 +321,6 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset)
    case 64:
        Inst(Opcode::WriteSharedU64, offset, value);
        break;
-   case 128:
-       Inst(Opcode::WriteSharedU128, offset, value);
-       break;
    default:
        UNREACHABLE_MSG("Invalid bit size {}", bit_size);
    }

@@ -78,7 +78,6 @@ bool Inst::MayHaveSideEffects() const noexcept {
    case Opcode::BufferAtomicSwap32:
    case Opcode::DataAppend:
    case Opcode::DataConsume:
-   case Opcode::WriteSharedU128:
    case Opcode::WriteSharedU64:
    case Opcode::WriteSharedU32:
    case Opcode::SharedAtomicIAdd32:

@@ -32,10 +32,8 @@ OPCODE(EmitPrimitive, Void,
 // Shared memory operations
 OPCODE(LoadSharedU32, U32, U32, )
 OPCODE(LoadSharedU64, U32x2, U32, )
-OPCODE(LoadSharedU128, U32x4, U32, )
 OPCODE(WriteSharedU32, Void, U32, U32, )
 OPCODE(WriteSharedU64, Void, U32, U32x2, )
-OPCODE(WriteSharedU128, Void, U32, U32x4, )
 
 // Shared atomic operations
 OPCODE(SharedAtomicIAdd32, U32, U32, U32, )

@@ -225,10 +225,8 @@ private:
        switch (use.user->GetOpcode()) {
        case IR::Opcode::LoadSharedU32:
        case IR::Opcode::LoadSharedU64:
-       case IR::Opcode::LoadSharedU128:
        case IR::Opcode::WriteSharedU32:
-       case IR::Opcode::WriteSharedU64:
-       case IR::Opcode::WriteSharedU128: {
+       case IR::Opcode::WriteSharedU64: {
            u32 counter = inst->Flags<u32>();
            inst->SetFlags<u32>(counter + inc);
            // Stop here

@@ -435,12 +433,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
            }
 
            case IR::Opcode::WriteSharedU32:
-           case IR::Opcode::WriteSharedU64:
-           case IR::Opcode::WriteSharedU128: {
+           case IR::Opcode::WriteSharedU64: {
                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-               const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32
-                                          ? 1
-                                          : (opcode == IR::Opcode::WriteSharedU64 ? 2 : 4);
+               const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2;
                const IR::U32 addr{inst.Arg(0)};
                const IR::U32 data{inst.Arg(1).Resolve()};
 

@@ -480,15 +475,12 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
                break;
            }
 
-           case IR::Opcode::LoadSharedU32: {
-           case IR::Opcode::LoadSharedU64:
-           case IR::Opcode::LoadSharedU128:
+           case IR::Opcode::LoadSharedU32:
+           case IR::Opcode::LoadSharedU64: {
                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
                const IR::U32 addr{inst.Arg(0)};
                const AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info);
-               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32
-                                          ? 1
-                                          : (opcode == IR::Opcode::LoadSharedU64 ? 2 : 4);
+               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32 ? 1 : 2;
                ASSERT_MSG(region == AttributeRegion::InputCP ||
                               region == AttributeRegion::OutputCP,
                           "Unhandled read of patchconst attribute in hull shader");

@@ -562,14 +554,11 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
            const auto opcode = inst.GetOpcode();
            switch (inst.GetOpcode()) {
-           case IR::Opcode::LoadSharedU32: {
-           case IR::Opcode::LoadSharedU64:
-           case IR::Opcode::LoadSharedU128:
+           case IR::Opcode::LoadSharedU32:
+           case IR::Opcode::LoadSharedU64: {
                const IR::U32 addr{inst.Arg(0)};
                AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info);
-               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32
-                                          ? 1
-                                          : (opcode == IR::Opcode::LoadSharedU64 ? 2 : 4);
+               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32 ? 1 : 2;
                const auto GetInput = [&](IR::U32 addr, u32 off_dw) -> IR::F32 {
                    if (region == AttributeRegion::OutputCP) {
                        return ReadTessControlPointAttribute(

@@ -611,10 +600,8 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
            switch (inst.GetOpcode()) {
            case IR::Opcode::LoadSharedU32:
            case IR::Opcode::LoadSharedU64:
-           case IR::Opcode::LoadSharedU128:
            case IR::Opcode::WriteSharedU32:
-           case IR::Opcode::WriteSharedU64:
-           case IR::Opcode::WriteSharedU128: {
+           case IR::Opcode::WriteSharedU64: {
                IR::Value addr = inst.Arg(0);
                auto read_const_buffer = IR::BreadthFirstSearch(
                    addr, [](IR::Inst* maybe_tess_const) -> std::optional<IR::Inst*> {

@@ -20,7 +20,7 @@ void FlattenExtendedUserdataPass(IR::Program& program);
 void ResourceTrackingPass(IR::Program& program);
 void CollectShaderInfoPass(IR::Program& program);
 void LowerBufferFormatToRaw(IR::Program& program);
-void LowerSharedMemToRegisters(IR::Program& program);
+void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info);
 void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
                            Stage stage);
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);

@@ -1,38 +1,81 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <boost/container/small_vector.hpp>
+#include <unordered_map>
+
 #include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/program.h"
 
 namespace Shader::Optimization {
 
-void LowerSharedMemToRegisters(IR::Program& program) {
-    boost::container::small_vector<IR::Inst*, 8> ds_writes;
-    Info& info{program.info};
+static bool IsSharedMemoryInst(const IR::Inst& inst) {
+    const auto opcode = inst.GetOpcode();
+    return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 ||
+           opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64;
+}
+
+static u32 GetSharedMemImmOffset(const IR::Inst& inst) {
+    const auto* address = inst.Arg(0).InstRecursive();
+    ASSERT(address->GetOpcode() == IR::Opcode::IAdd32);
+    const auto ir_offset = address->Arg(1);
+    ASSERT_MSG(ir_offset.IsImmediate());
+    const auto offset = ir_offset.U32();
+    // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between
+    // each register to account for 4 bytes per register times 64 threads per group. Ensure that
+    // this assumption holds, as if it does not this approach may need to be revised.
+    ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
+    return offset;
+}
+
+static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadSharedU32:
+        inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr));
+        break;
+    case IR::Opcode::LoadSharedU64:
+        inst.ReplaceUsesWithAndRemove(
+            ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1)));
+        break;
+    case IR::Opcode::WriteSharedU32:
+        ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)});
+        inst.Invalidate();
+        break;
+    case IR::Opcode::WriteSharedU64: {
+        const auto value = inst.Arg(1);
+        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)});
+        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 1)});
+        inst.Invalidate();
+        break;
+    }
+    default:
+        UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode());
+    }
+}
+
+void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) {
+    u32 next_vgpr_num = runtime_info.num_allocated_vgprs;
+    std::unordered_map<u32, IR::VectorReg> vgpr_map;
+    const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) {
+        const auto [it, is_new] = vgpr_map.try_emplace(offset);
+        if (is_new) {
+            ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
+            const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
+            it->second = new_vgpr;
+        }
+        return it->second;
+    };
+
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
-            const auto opcode = inst.GetOpcode();
-            if (opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64) {
-                ds_writes.emplace_back(&inst);
+            if (!IsSharedMemoryInst(inst)) {
                 continue;
             }
-            if (opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64) {
-                // Search for write instruction with same offset
-                const IR::Inst* prod = inst.Arg(0).InstRecursive();
-                const auto it = std::ranges::find_if(ds_writes, [&](const IR::Inst* write) {
-                    const IR::Inst* write_prod = write->Arg(0).InstRecursive();
-                    return write_prod->Arg(1).U32() == prod->Arg(1).U32();
-                });
-                ASSERT(it != ds_writes.end());
-                // Replace data read with value written.
-                inst.ReplaceUsesWithAndRemove((*it)->Arg(1));
-            }
+            const auto offset = GetSharedMemImmOffset(inst);
+            const auto vgpr = get_vgpr(offset);
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            ConvertSharedMemToVgpr(ir, inst, vgpr);
         }
     }
-    // We should have eliminated everything. Invalidate data write instructions.
-    for (const auto inst : ds_writes) {
-        inst->Invalidate();
-    }
 }
 
 } // namespace Shader::Optimization

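In ConvertSharedMemToVgpr above, a 64-bit shared-memory load is rebuilt as a CompositeConstruct of two adjacent registers, vgpr and vgpr + 1, so a 64-bit slot effectively becomes a pair of consecutive 32-bit VGPRs. A minimal stand-alone model of that register-pair view (plain integers standing in for the register file; the helper names are made up for illustration):

#include <array>
#include <cassert>
#include <cstdint>

// Stand-in for a VGPR file: 256 lane registers of 32 bits each.
using Vgprs = std::array<uint32_t, 256>;

// A 64-bit value stored to a slot occupies two 32-bit registers:
// low dword in the base register, high dword in the next one.
void write_u64(Vgprs& v, uint32_t base_vgpr, uint64_t value) {
    v[base_vgpr] = static_cast<uint32_t>(value);
    v[base_vgpr + 1] = static_cast<uint32_t>(value >> 32);
}

// The matching 64-bit read recombines the same register pair, mirroring the
// CompositeConstruct of vgpr and vgpr + 1 in the load path of the pass above.
uint64_t read_u64(const Vgprs& v, uint32_t base_vgpr) {
    return static_cast<uint64_t>(v[base_vgpr]) |
           (static_cast<uint64_t>(v[base_vgpr + 1]) << 32);
}

int main() {
    Vgprs v{};
    write_u64(v, 40, 0x1122334455667788ull);
    assert(read_u64(v, 40) == 0x1122334455667788ull);
    return 0;
}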
@@ -65,6 +65,10 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     // Run optimization passes
     const auto stage = program.info.stage;
 
+    if (stage == Stage::Fragment) {
+        // Before SSA pass, as it will rewrite to VGPR load/store.
+        Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info);
+    }
     Shader::Optimization::SsaRewritePass(program.post_order_blocks);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     if (info.l_stage == LogicalStage::TessellationControl) {

@@ -82,9 +86,6 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     }
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::RingAccessElimination(program, runtime_info, stage);
-    if (stage != Stage::Compute) {
-        Shader::Optimization::LowerSharedMemToRegisters(program);
-    }
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);

@@ -535,6 +535,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
            .descriptorType = vk::DescriptorType::eStorageBuffer,
            .pBufferInfo = &buffer_infos.back(),
        });
+       ++binding.buffer;
    }
 
    // Bind the flattened user data buffer as a UBO so it's accessible to the shader