From 3fde455932e9d210adb6afe9b40ba2fac4e538b0 Mon Sep 17 00:00:00 2001 From: Malcolm Jestadt Date: Tue, 26 Oct 2021 04:56:47 -0400 Subject: [PATCH] SPU LLVM: Optimize branch following ORX - test the input of ORX directly for zeroes, instead of the result --- rpcs3/Emu/Cell/SPURecompiler.cpp | 91 +++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 8 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index dda1870511..eaf7f7e055 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -6889,12 +6889,23 @@ public: set_vr(op.rt, pshufb(a, sh)); } + template + static llvm_calli orx(T&& a) + { + return {"spu_orx", {std::forward(a)}}; + } + void ORX(spu_opcode_t op) { - const auto a = get_vr(op.ra); - const auto x = zshuffle(a, 2, 3, 0, 1) | a; - const auto y = zshuffle(x, 1, 0, 3, 2) | x; - set_vr(op.rt, zshuffle(y, 4, 4, 4, 3)); + register_intrinsic("spu_orx", [&](llvm::CallInst* ci) + { + const auto a = value(ci->getOperand(0)); + const auto x = zshuffle(a, 2, 3, 0, 1) | a; + const auto y = zshuffle(x, 1, 0, 3, 2) | x; + return zshuffle(y, 4, 4, 4, 3); + }); + + set_vr(op.rt, orx(get_vr(op.ra))); } void CBD(spu_opcode_t op) @@ -9234,7 +9245,7 @@ public: const auto rt = get_vr(op.rt); - // Checking for zero doeesn't care about the order of the bytes, + // Checking for zero doesn't care about the order of the bytes, // so load the data before it's byteswapped if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) { @@ -9246,6 +9257,22 @@ public: return; } + const auto ox = get_vr(op.rt); + + // If rt matches orx(...), test the orx input for zero directly instead of extracting the orx result; a and b are elements 0 and 1 of the bitcast input, so (a | b) == 0 checks both at once (original note: testable with ptest) + if (auto [ok, as] = match_expr(ox, orx(match())); ok) + { + m_block->block_end = m_ir->GetInsertBlock(); + const auto a = extract(bitcast(as), 0); + const auto b = extract(bitcast(as), 1); + const auto cond = eval((a | b) == 0); + const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); + const 
auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block_next()); + return; + } + + // Check sign bit instead (optimization) if (match_vr(op.rt, [&](auto c, auto MP) { @@ -9279,7 +9306,7 @@ public: const auto rt = get_vr(op.rt); - // Checking for zero doeesn't care about the order of the bytes, + // Checking for zero doesn't care about the order of the bytes, // so load the data before it's byteswapped if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) { @@ -9291,6 +9318,21 @@ public: return; } + const auto ox = get_vr(op.rt); + + // If rt matches orx(...), test the orx input for non-zero directly instead of extracting the orx result; a and b are elements 0 and 1 of the bitcast input, so (a | b) != 0 checks both at once (original note: testable with ptest) + if (auto [ok, as] = match_expr(ox, orx(match())); ok) + { + m_block->block_end = m_ir->GetInsertBlock(); + const auto a = extract(bitcast(as), 0); + const auto b = extract(bitcast(as), 1); + const auto cond = eval((a | b) != 0); + const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block_next()); + return; + } + // Check sign bit instead (optimization) if (match_vr(op.rt, [&](auto c, auto MP) @@ -9514,7 +9556,7 @@ public: const auto rt = get_vr(op.rt); - // Checking for zero doeesn't care about the order of the bytes, + // Checking for zero doesn't care about the order of the bytes, // so load the data before it's byteswapped if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) { @@ -9527,6 +9569,23 @@ public: } } + const auto ox = get_vr(op.rt); + + // If rt matches orx(...), test the orx input for zero directly instead of extracting the orx result; a and b are elements 0 and 1 of the bitcast input, so (a | b) == 0 checks both at once (original note: testable with ptest) + if (auto [ok, as] = match_expr(ox, orx(match())); ok) + { + if (target != m_pos + 4) + { + m_block->block_end = m_ir->GetInsertBlock(); + const auto a = extract(bitcast(as), 0); + const auto b = extract(bitcast(as), 1); + const auto cond = eval((a | b) == 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); + return; + } + } + + 
// Check sign bit instead (optimization) if (match_vr(op.rt, [&](auto c, auto MP) { @@ -9573,7 +9632,7 @@ public: const auto rt = get_vr(op.rt); - // Checking for zero doeesn't care about the order of the bytes, + // Checking for zero doesn't care about the order of the bytes, // so load the data before it's byteswapped if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) { @@ -9586,6 +9645,22 @@ public: } } + const auto ox = get_vr(op.rt); + + // If rt matches orx(...), test the orx input for non-zero directly instead of extracting the orx result; a and b are elements 0 and 1 of the bitcast input, so (a | b) != 0 checks both at once (original note: testable with ptest) + if (auto [ok, as] = match_expr(ox, orx(match())); ok) + { + if (target != m_pos + 4) + { + m_block->block_end = m_ir->GetInsertBlock(); + const auto a = extract(bitcast(as), 0); + const auto b = extract(bitcast(as), 1); + const auto cond = eval((a | b) != 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); + return; + } + } + // Check sign bit instead (optimization) if (match_vr(op.rt, [&](auto c, auto MP) {