shader_recompiler: Improve divergence handling and readlane elimination (#2667)

* control_flow_graph: Improve divergence handling

* recompiler: Simplify optimization passes

Removes a redundant constant propagation and cleans up the passes a little.
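A rough sketch of the resulting pipeline (condensed from the recompiler.cpp hunk below; tessellation-only passes elided):

    Shader::Optimization::SsaRewritePass(program.post_order_blocks);
    Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
    Shader::Optimization::IdentityRemovalPass(program.blocks);
    // Tessellation stages run TessellationPreprocess and the hull/domain
    // transforms here; TessellationPreprocess now invokes constant
    // propagation itself instead of the caller running it repeatedly.
    Shader::Optimization::RingAccessElimination(program, runtime_info);
    Shader::Optimization::ReadLaneEliminationPass(program);
    Shader::Optimization::FlattenExtendedUserdataPass(program);
    Shader::Optimization::ResourceTrackingPass(program);
    Shader::Optimization::LowerBufferFormatToRaw(program);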

* ir_passes: Add new readlane elimination pass

The algorithm has grown complex enough that it deserves its own pass. The old implementation could only handle a single phi level properly; this one should be able to eliminate the vast majority of remaining lane cases. It first traverses the phi tree to ensure that every phi source can be rewritten into an expected value, then performs elimination by recursively duplicating the phi nodes at each step, in order to preserve control flow.
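As an illustrative sketch (value names are hypothetical), the pass turns a pattern like

    %w0 = WriteLane %init, %a, lane=1   // in predecessor block A
    %w1 = WriteLane %init, %b, lane=1   // in predecessor block B
    %p  = Phi(%w0, %w1)                 // in the merge block
    %r  = ReadLane %p, lane=1

into a duplicated phi over the written values, so the ReadLane disappears:

    %p' = Phi(%a, %b)
    // all uses of %r are replaced with %p'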

* clang format

* control_flow_graph: Remove debug code
TheTurtle 2025-03-23 00:35:42 +02:00 committed by GitHub
parent a80c4a7f48
commit 1f9ac53c28
9 changed files with 211 additions and 139 deletions

View file

@@ -771,6 +771,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/ir/passes/identity_removal_pass.cpp
src/shader_recompiler/ir/passes/ir_passes.h
src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp
src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
src/shader_recompiler/ir/passes/ring_access_elimination.cpp
src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -1121,7 +1122,6 @@ cmrc_add_resource_library(embedded-resources
src/images/gold.png
src/images/platinum.png
src/images/silver.png)
target_link_libraries(shadps4 PRIVATE res::embedded)
# ImGui resources

View file

@@ -2,6 +2,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <unordered_map>
#include "common/assert.h"
#include "shader_recompiler/frontend/control_flow_graph.h"
@@ -39,9 +40,6 @@ static IR::Condition MakeCondition(const GcnInst& inst) {
return IR::Condition::Execz;
case Opcode::S_CBRANCH_EXECNZ:
return IR::Condition::Execnz;
case Opcode::S_AND_SAVEEXEC_B64:
case Opcode::S_ANDN2_B64:
return IR::Condition::Execnz;
default:
return IR::Condition::True;
}
@@ -76,9 +74,9 @@ CFG::CFG(Common::ObjectPool<Block>& block_pool_, std::span<const GcnInst> inst_l
index_to_pc.resize(inst_list.size() + 1);
labels.reserve(LabelReserveSize);
EmitLabels();
EmitDivergenceLabels();
EmitBlocks();
LinkBlocks();
SplitDivergenceScopes();
}
void CFG::EmitLabels() {
@@ -112,7 +110,7 @@ void CFG::EmitLabels() {
std::ranges::sort(labels);
}
void CFG::EmitDivergenceLabels() {
void CFG::SplitDivergenceScopes() {
const auto is_open_scope = [](const GcnInst& inst) {
// An open scope instruction is an instruction that modifies EXEC
// but also saves the previous value to restore later. This indicates
@@ -136,64 +134,97 @@ void CFG::EmitDivergenceLabels() {
(inst.opcode == Opcode::S_ANDN2_B64 && inst.dst[0].field == OperandField::ExecLo);
};
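// Illustrative GCN shape of such a scope (not taken from a real shader):
//   s_and_saveexec_b64 s[0:1], vcc    ; open: save EXEC, then EXEC &= VCC
//   v_add_f32 v0, v1, v2              ; executed only by the active lanes
//   s_mov_b64 exec, s[0:1]            ; close: restore the saved EXEC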
// Since we will be adding new labels, avoid iterating those as well.
const size_t end_size = labels.size();
for (u32 l = 0; l < end_size; l++) {
const Label start = labels[l];
// Stop if we reached end of existing labels.
if (l == end_size - 1) {
break;
}
const Label end = labels[l + 1];
const size_t end_index = GetIndex(end);
for (auto blk = blocks.begin(); blk != blocks.end(); blk++) {
auto next_blk = std::next(blk);
s32 curr_begin = -1;
s32 last_exec_idx = -1;
for (size_t index = GetIndex(start); index < end_index; index++) {
for (size_t index = blk->begin_index; index <= blk->end_index; index++) {
const auto& inst = inst_list[index];
if (curr_begin != -1) {
// Keep note of the last instruction that does not ignore exec, so we know where
// to end the divergence block without impacting trailing instructions that do.
if (!IgnoresExecMask(inst)) {
last_exec_idx = index;
}
// Consider a close scope on certain instruction types or at the last instruction
// before the next label.
if (is_close_scope(inst) || index == end_index - 1) {
// Only insert a scope if, since the open-scope instruction, there is at least
// one instruction that does not ignore exec.
if (index - curr_begin > 1 && last_exec_idx != -1) {
// Add a label to the instruction right after the open scope call.
// It is the start of a new basic block.
const auto& save_inst = inst_list[curr_begin];
AddLabel(index_to_pc[curr_begin] + save_inst.length);
// Add a label to the close scope instruction.
// There are 3 cases where we need to close a scope.
// * Close scope instruction inside the block
// * Close scope instruction at the end of the block (cbranch or endpgm)
// * Normal instruction at the end of the block
// If the instruction we want to close the scope at is at the end of the
// block, we do not need to insert a new label.
if (last_exec_idx != end_index - 1) {
// Add the label after the last instruction affected by exec.
const auto& last_exec_inst = inst_list[last_exec_idx];
AddLabel(index_to_pc[last_exec_idx] + last_exec_inst.length);
}
}
// Reset scope begin.
const bool is_close = is_close_scope(inst);
if ((is_close || index == blk->end_index) && curr_begin != -1) {
// If there are no instructions inside the scope, don't do anything.
if (index - curr_begin == 1) {
curr_begin = -1;
continue;
}
// If all instructions in the scope ignore exec masking, we shouldn't insert a
// scope.
const auto start = inst_list.begin() + curr_begin + 1;
if (!std::ranges::all_of(start, inst_list.begin() + index, IgnoresExecMask)) {
// Determine the first instruction affected by the exec mask.
do {
++curr_begin;
} while (IgnoresExecMask(inst_list[curr_begin]));
// Determine the last instruction affected by the exec mask.
s32 curr_end = index;
while (IgnoresExecMask(inst_list[curr_end])) {
--curr_end;
}
// Create a new block for the divergence scope.
Block* block = block_pool.Create();
block->begin = index_to_pc[curr_begin];
block->end = index_to_pc[curr_end];
block->begin_index = curr_begin;
block->end_index = curr_end;
block->end_inst = inst_list[curr_end];
blocks.insert_before(next_blk, *block);
// If we are inside the parent block, make an epilogue block and jump to it.
if (curr_end != blk->end_index) {
Block* epi_block = block_pool.Create();
epi_block->begin = index_to_pc[curr_end + 1];
epi_block->end = blk->end;
epi_block->begin_index = curr_end + 1;
epi_block->end_index = blk->end_index;
epi_block->end_inst = blk->end_inst;
epi_block->cond = blk->cond;
epi_block->end_class = blk->end_class;
epi_block->branch_true = blk->branch_true;
epi_block->branch_false = blk->branch_false;
blocks.insert_before(next_blk, *epi_block);
// Have divergence block always jump to epilogue block.
block->cond = IR::Condition::True;
block->branch_true = epi_block;
block->branch_false = nullptr;
// If the parent block fails to enter the divergence block,
// make it jump to the epilogue too.
blk->branch_false = epi_block;
} else {
// No epilogue block is needed since the divergence block
// also ends the parent block. Inherit the end condition.
auto& parent_blk = *blk;
ASSERT(blk->cond == IR::Condition::True && blk->branch_true);
block->cond = IR::Condition::True;
block->branch_true = blk->branch_true;
block->branch_false = nullptr;
// If the parent block didn't enter the divergence scope,
// have it jump directly to the next one.
blk->branch_false = blk->branch_true;
}
// Shrink parent block to end right before curr_begin
// and make it jump to divergence block
--curr_begin;
blk->end = index_to_pc[curr_begin];
blk->end_index = curr_begin;
blk->end_inst = inst_list[curr_begin];
blk->cond = IR::Condition::Execnz;
blk->end_class = EndClass::Branch;
blk->branch_true = block;
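// Final shape for a scope in the middle of a block (sketch):
//   blk --Execnz--> block (divergence) --True--> epi_block --> ...
//   blk --Execz--------------------------------> epi_block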
}
// Reset scope begin.
curr_begin = -1;
}
// Mark a potential start of an exec scope.
if (is_open_scope(inst)) {
curr_begin = index;
last_exec_idx = -1;
}
}
}
// Sort labels to make sure block insertion is correct.
std::ranges::sort(labels);
}
void CFG::EmitBlocks() {
@@ -234,22 +265,6 @@ void CFG::LinkBlocks() {
for (auto it = blocks.begin(); it != blocks.end(); it++) {
auto& block = *it;
const auto end_inst{block.end_inst};
// Handle divergence block inserted here.
if (end_inst.opcode == Opcode::S_AND_SAVEEXEC_B64 ||
end_inst.opcode == Opcode::S_ANDN2_B64 || end_inst.IsCmpx()) {
// Blocks are stored ordered by address in the set
auto next_it = std::next(it);
auto* target_block = &(*next_it);
++target_block->num_predecessors;
block.branch_true = target_block;
auto merge_it = std::next(next_it);
auto* merge_block = &(*merge_it);
++merge_block->num_predecessors;
block.branch_false = merge_block;
block.end_class = EndClass::Branch;
continue;
}
// If the block doesn't end with a branch we simply
// need to link with the next block.

View file

@@ -57,9 +57,9 @@ public:
private:
void EmitLabels();
void EmitDivergenceLabels();
void EmitBlocks();
void LinkBlocks();
void SplitDivergenceScopes();
void AddLabel(Label address) {
const auto it = std::ranges::find(labels, address);

View file

@@ -251,54 +251,6 @@ void FoldCmpClass(IR::Block& block, IR::Inst& inst) {
}
}
void FoldReadLane(IR::Block& block, IR::Inst& inst) {
const u32 lane = inst.Arg(1).U32();
IR::Inst* prod = inst.Arg(0).InstRecursive();
const auto search_chain = [lane](const IR::Inst* prod) -> IR::Value {
while (prod->GetOpcode() == IR::Opcode::WriteLane) {
if (prod->Arg(2).U32() == lane) {
return prod->Arg(1);
}
prod = prod->Arg(0).InstRecursive();
}
return {};
};
if (prod->GetOpcode() == IR::Opcode::WriteLane) {
if (const IR::Value value = search_chain(prod); !value.IsEmpty()) {
inst.ReplaceUsesWith(value);
}
return;
}
if (prod->GetOpcode() == IR::Opcode::Phi) {
boost::container::small_vector<IR::Value, 2> phi_args;
for (size_t arg_index = 0; arg_index < prod->NumArgs(); ++arg_index) {
const IR::Inst* arg{prod->Arg(arg_index).InstRecursive()};
if (arg->GetOpcode() != IR::Opcode::WriteLane) {
return;
}
const IR::Value value = search_chain(arg);
if (value.IsEmpty()) {
continue;
}
phi_args.emplace_back(value);
}
if (std::ranges::all_of(phi_args, [&](IR::Value value) { return value == phi_args[0]; })) {
inst.ReplaceUsesWith(phi_args[0]);
return;
}
const auto insert_point = IR::Block::InstructionList::s_iterator_to(*prod);
IR::Inst* const new_phi{&*block.PrependNewInst(insert_point, IR::Opcode::Phi)};
new_phi->SetFlags(IR::Type::U32);
for (size_t arg_index = 0; arg_index < phi_args.size(); arg_index++) {
new_phi->AddPhiOperand(prod->PhiBlock(arg_index), phi_args[arg_index]);
}
inst.ReplaceUsesWith(IR::Value{new_phi});
}
}
void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::IAdd32:
@@ -408,8 +360,6 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
case IR::Opcode::SelectF32:
case IR::Opcode::SelectF64:
return FoldSelect(inst);
case IR::Opcode::ReadLane:
return FoldReadLane(block, inst);
case IR::Opcode::FPNeg32:
FoldWhenAllImmediates(inst, [](f32 a) { return -a; });
return;

View file

@@ -1,11 +1,13 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/attribute.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/ir_emitter.h"
#include "shader_recompiler/ir/opcodes.h"
#include "shader_recompiler/ir/passes/ir_passes.h"
#include "shader_recompiler/ir/pattern_matching.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/runtime_info.h"
@@ -734,6 +736,8 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
}
}
}
ConstantPropagationPass(program.post_order_blocks);
}
} // namespace Shader::Optimization

View file

@@ -17,11 +17,11 @@ void IdentityRemovalPass(IR::BlockList& program);
void DeadCodeEliminationPass(IR::Program& program);
void ConstantPropagationPass(IR::BlockList& program);
void FlattenExtendedUserdataPass(IR::Program& program);
void ReadLaneEliminationPass(IR::Program& program);
void ResourceTrackingPass(IR::Program& program);
void CollectShaderInfoPass(IR::Program& program);
void LowerBufferFormatToRaw(IR::Program& program);
void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
Stage stage);
void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info);
void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);
void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);

View file

@@ -0,0 +1,115 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/ir/program.h"
namespace Shader::Optimization {
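// SearchChain walks a WriteLane chain: it returns either the WriteLane that
// writes `lane`, or the first producer in the chain that is not a WriteLane.
// e.g. (hypothetical IR) for WriteLane(WriteLane(%v, %x, lane=3), %y, lane=5)
// with lane == 3, it returns the inner WriteLane, whose Arg(1) is %x.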
static IR::Inst* SearchChain(IR::Inst* inst, u32 lane) {
while (inst->GetOpcode() == IR::Opcode::WriteLane) {
if (inst->Arg(2).U32() == lane) {
// We found a possible write lane source, return it.
return inst;
}
inst = inst->Arg(0).InstRecursive();
}
return inst;
}
static bool IsPossibleToEliminate(IR::Inst* inst, u32 lane) {
// Breadth-first search visiting the rightmost arguments first
boost::container::small_vector<IR::Inst*, 16> visited;
std::queue<IR::Inst*> queue;
queue.push(inst);
while (!queue.empty()) {
// Pop one instruction from the queue
IR::Inst* inst{queue.front()};
queue.pop();
// If it's a WriteLane search for possible candidates
if (inst = SearchChain(inst, lane); inst->GetOpcode() == IR::Opcode::WriteLane) {
// We found a possible write lane source, stop looking here.
continue;
}
// If there are other instructions in between that use the value, we can't eliminate.
if (inst->GetOpcode() != IR::Opcode::ReadLane && inst->GetOpcode() != IR::Opcode::Phi) {
return false;
}
// Visit the rightmost arguments first
for (size_t arg = inst->NumArgs(); arg--;) {
auto arg_value{inst->Arg(arg)};
if (arg_value.IsImmediate()) {
continue;
}
// Queue instruction if it hasn't been visited
IR::Inst* arg_inst{arg_value.InstRecursive()};
if (std::ranges::find(visited, arg_inst) == visited.end()) {
visited.push_back(arg_inst);
queue.push(arg_inst);
}
}
}
return true;
}
using PhiMap = std::unordered_map<IR::Inst*, IR::Inst*>;
static IR::Value GetRealValue(PhiMap& phi_map, IR::Inst* inst, u32 lane) {
// If this is a WriteLane op search the chain for a possible candidate.
if (inst = SearchChain(inst, lane); inst->GetOpcode() == IR::Opcode::WriteLane) {
return inst->Arg(1);
}
// If this is a phi, duplicate it and populate its arguments with real values.
if (inst->GetOpcode() == IR::Opcode::Phi) {
// We are in a phi cycle, use the already duplicated phi.
const auto [it, is_new_phi] = phi_map.try_emplace(inst);
if (!is_new_phi) {
return IR::Value{it->second};
}
// Create new phi and insert it right before the old one.
const auto insert_point = IR::Block::InstructionList::s_iterator_to(*inst);
IR::Block* block = inst->GetParent();
IR::Inst* new_phi{&*block->PrependNewInst(insert_point, IR::Opcode::Phi)};
new_phi->SetFlags(IR::Type::U32);
it->second = new_phi;
// Gather all arguments.
for (size_t arg_index = 0; arg_index < inst->NumArgs(); arg_index++) {
IR::Inst* arg_prod = inst->Arg(arg_index).InstRecursive();
const IR::Value arg = GetRealValue(phi_map, arg_prod, lane);
new_phi->AddPhiOperand(inst->PhiBlock(arg_index), arg);
}
return IR::Value{new_phi};
}
UNREACHABLE();
}
void ReadLaneEliminationPass(IR::Program& program) {
PhiMap phi_map;
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (inst.GetOpcode() != IR::Opcode::ReadLane) {
continue;
}
const u32 lane = inst.Arg(1).U32();
IR::Inst* prod = inst.Arg(0).InstRecursive();
// Check the simple case with no control flow or phis involved
if (prod = SearchChain(prod, lane); prod->GetOpcode() == IR::Opcode::WriteLane) {
inst.ReplaceUsesWith(prod->Arg(1));
continue;
}
// Traverse the phi tree to see if it's possible to eliminate
if (prod->GetOpcode() == IR::Opcode::Phi && IsPossibleToEliminate(prod, lane)) {
inst.ReplaceUsesWith(GetRealValue(phi_map, prod, lane));
phi_map.clear();
}
}
}
}
} // namespace Shader::Optimization

View file

@@ -11,8 +11,7 @@
namespace Shader::Optimization {
void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
Stage stage) {
void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info) {
auto& info = program.info;
const auto& ForEachInstruction = [&](auto func) {
@@ -24,7 +23,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
}
};
switch (stage) {
switch (program.info.stage) {
case Stage::Local: {
ForEachInstruction([=](IR::IREmitter& ir, IR::Inst& inst) {
const auto opcode = inst.GetOpcode();

View file

@@ -1,9 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/frontend/control_flow_graph.h"
#include "shader_recompiler/frontend/decode.h"
#include "shader_recompiler/frontend/structured_control_flow.h"
@@ -63,26 +60,18 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front());
// Run optimization passes
const auto stage = program.info.stage;
Shader::Optimization::SsaRewritePass(program.post_order_blocks);
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::IdentityRemovalPass(program.blocks);
if (info.l_stage == LogicalStage::TessellationControl) {
// Tess passes require previous const prop passes for now (for simplicity). TODO: allow
// fine-grained folding, or opportunistic folding when we set an operand to an immediate.
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::TessellationPreprocess(program, runtime_info);
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::HullShaderTransform(program, runtime_info);
} else if (info.l_stage == LogicalStage::TessellationEval) {
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::TessellationPreprocess(program, runtime_info);
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::DomainShaderTransform(program, runtime_info);
}
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::RingAccessElimination(program, runtime_info, stage);
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::RingAccessElimination(program, runtime_info);
Shader::Optimization::ReadLaneEliminationPass(program);
Shader::Optimization::FlattenExtendedUserdataPass(program);
Shader::Optimization::ResourceTrackingPass(program);
Shader::Optimization::LowerBufferFormatToRaw(program);