mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-20 03:25:16 +00:00
SPU LLVM: Optimize branch following ORX
- test the input of ORX directly for zeroes, instead of the result
This commit is contained in:
parent
ba727e13ae
commit
3fde455932
1 changed file with 83 additions and 8 deletions
|
@ -6889,12 +6889,23 @@ public:
|
|||
set_vr(op.rt, pshufb(a, sh));
|
||||
}
|
||||
|
||||
// Builds an llvm_calli expression named "spu_orx" with result type u32[4],
// taking `a` (perfectly forwarded) as its single operand.
// The same "spu_orx" name is handed to register_intrinsic() inside ORX, and
// the branch hunks further down use orx(match<u32[4]>()) with match_expr()
// to recover the operand that was passed in here.
// NOTE(review): assumes llvm_calli only records the call and that the body
// registered under "spu_orx" is substituted later - confirm against the
// llvm_calli / register_intrinsic definitions, which are not visible here.
template <typename T>
|
||||
static llvm_calli<u32[4], T> orx(T&& a)
|
||||
{
|
||||
return {"spu_orx", {std::forward<T>(a)}};
|
||||
}
|
||||
|
||||
// ORX handler: reads register op.ra, ORs shuffled copies of it together and
// writes a u32[4] result to op.rt via set_vr().
// NOTE(review): this span comes from a diff view without +/- markers. The
// hunk header (-6889,12 +6889,23) suggests the first four statements below
// (get_vr .. set_vr with zshuffle) are the removed pre-change body and the
// register_intrinsic()/orx() form is the added replacement - confirm against
// the actual file before relying on this listing.
void ORX(spu_opcode_t op)
|
||||
{
|
||||
const auto a = get_vr(op.ra);
|
||||
// Two shuffle+OR steps: presumably zshuffle(v, i0, i1, i2, i3) picks element
// i_k of v for lane k, so after both steps every lane of y would hold the OR
// of all four input elements - TODO confirm zshuffle semantics.
const auto x = zshuffle(a, 2, 3, 0, 1) | a;
|
||||
const auto y = zshuffle(x, 1, 0, 3, 2) | x;
|
||||
// Index 4 is outside the 0..3 range used above; presumably it selects a zero
// lane so only one lane carries the combined value - verify.
set_vr(op.rt, zshuffle(y, 4, 4, 4, 3));
|
||||
// Registers the expansion for calls named "spu_orx": the callback receives the
// call instruction, reads its operand 0 as u32[4] and returns the same
// shuffle/OR sequence as above.
register_intrinsic("spu_orx", [&](llvm::CallInst* ci)
|
||||
{
|
||||
const auto a = value<u32[4]>(ci->getOperand(0));
|
||||
const auto x = zshuffle(a, 2, 3, 0, 1) | a;
|
||||
const auto y = zshuffle(x, 1, 0, 3, 2) | x;
|
||||
return zshuffle(y, 4, 4, 4, 3);
|
||||
});
|
||||
|
||||
// Store the result as an orx() call on the source register, so that the
// branch hunks shown later can match_expr() against orx(...) and test the
// input directly.
set_vr(op.rt, orx(get_vr(op.ra)));
|
||||
}
|
||||
|
||||
void CBD(spu_opcode_t op)
|
||||
|
@ -9234,7 +9245,7 @@ public:
|
|||
|
||||
const auto rt = get_vr<u8[16]>(op.rt);
|
||||
|
||||
// Checking for zero doeesn't care about the order of the bytes,
|
||||
// Checking for zero doesn't care about the order of the bytes,
|
||||
// so load the data before it's byteswapped
|
||||
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
|
@ -9246,6 +9257,22 @@ public:
|
|||
return;
|
||||
}
|
||||
|
||||
const auto ox = get_vr<u32[4]>(op.rt);
|
||||
|
||||
// Instead of extracting the value generated by orx, just test the input to orx with ptest
|
||||
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
|
||||
{
|
||||
m_block->block_end = m_ir->GetInsertBlock();
|
||||
const auto a = extract(bitcast<u64[2]>(as), 0);
|
||||
const auto b = extract(bitcast<u64[2]>(as), 1);
|
||||
const auto cond = eval((a | b) == 0);
|
||||
const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
|
||||
const auto target = add_block_indirect(op, addr);
|
||||
m_ir->CreateCondBr(cond.value, target, add_block_next());
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Check sign bit instead (optimization)
|
||||
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
|
||||
{
|
||||
|
@ -9279,7 +9306,7 @@ public:
|
|||
|
||||
const auto rt = get_vr<u8[16]>(op.rt);
|
||||
|
||||
// Checking for zero doeesn't care about the order of the bytes,
|
||||
// Checking for zero doesn't care about the order of the bytes,
|
||||
// so load the data before it's byteswapped
|
||||
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
|
@ -9291,6 +9318,21 @@ public:
|
|||
return;
|
||||
}
|
||||
|
||||
const auto ox = get_vr<u32[4]>(op.rt);
|
||||
|
||||
// Instead of extracting the value generated by orx, just test the input to orx with ptest
|
||||
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
|
||||
{
|
||||
m_block->block_end = m_ir->GetInsertBlock();
|
||||
const auto a = extract(bitcast<u64[2]>(as), 0);
|
||||
const auto b = extract(bitcast<u64[2]>(as), 1);
|
||||
const auto cond = eval((a | b) != 0);
|
||||
const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
|
||||
const auto target = add_block_indirect(op, addr);
|
||||
m_ir->CreateCondBr(cond.value, target, add_block_next());
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// Check sign bit instead (optimization)
|
||||
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
|
||||
|
@ -9514,7 +9556,7 @@ public:
|
|||
|
||||
const auto rt = get_vr<u8[16]>(op.rt);
|
||||
|
||||
// Checking for zero doeesn't care about the order of the bytes,
|
||||
// Checking for zero doesn't care about the order of the bytes,
|
||||
// so load the data before it's byteswapped
|
||||
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
|
@ -9527,6 +9569,23 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
const auto ox = get_vr<u32[4]>(op.rt);
|
||||
|
||||
// Instead of extracting the value generated by orx, just test the input to orx with ptest
|
||||
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
|
||||
{
|
||||
if (target != m_pos + 4)
|
||||
{
|
||||
m_block->block_end = m_ir->GetInsertBlock();
|
||||
const auto a = extract(bitcast<u64[2]>(as), 0);
|
||||
const auto b = extract(bitcast<u64[2]>(as), 1);
|
||||
const auto cond = eval((a | b) == 0);
|
||||
m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Check sign bit instead (optimization)
|
||||
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
|
||||
{
|
||||
|
@ -9573,7 +9632,7 @@ public:
|
|||
|
||||
const auto rt = get_vr<u8[16]>(op.rt);
|
||||
|
||||
// Checking for zero doeesn't care about the order of the bytes,
|
||||
// Checking for zero doesn't care about the order of the bytes,
|
||||
// so load the data before it's byteswapped
|
||||
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
|
@ -9586,6 +9645,22 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
const auto ox = get_vr<u32[4]>(op.rt);
|
||||
|
||||
// Instead of extracting the value generated by orx, just test the input to orx with ptest
|
||||
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
|
||||
{
|
||||
if (target != m_pos + 4)
|
||||
{
|
||||
m_block->block_end = m_ir->GetInsertBlock();
|
||||
const auto a = extract(bitcast<u64[2]>(as), 0);
|
||||
const auto b = extract(bitcast<u64[2]>(as), 1);
|
||||
const auto cond = eval((a | b) != 0);
|
||||
m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Check sign bit instead (optimization)
|
||||
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue