SPU LLVM: Optimize branch following ORX

- test the input of ORX directly for zeroes, instead of the result
This commit is contained in:
Malcolm Jestadt 2021-10-26 04:56:47 -04:00 committed by Ivan
parent ba727e13ae
commit 3fde455932

View file

@ -6889,12 +6889,23 @@ public:
set_vr(op.rt, pshufb(a, sh));
}
// Wraps the operand in an llvm_calli<u32[4], T> expression named "spu_orx".
// ORX() registers an intrinsic under the same name, and other code in this file
// passes orx(match<u32[4]>()) to match_expr() to recognise this expression.
template <typename T>
static llvm_calli<u32[4], T> orx(T&& a)
{
return {"spu_orx", {std::forward<T>(a)}};
}
// ORX: OR the four 32-bit words of register ra together and store the result in rt.
// The reduction is emitted as the named "spu_orx" call built by orx() rather than
// inline, so other code can match_expr() a register value against orx(...) and
// work with the ORX input directly instead of the reduced value.
void ORX(spu_opcode_t op)
{
	register_intrinsic("spu_orx", [&](llvm::CallInst* ci)
	{
		const auto a = value<u32[4]>(ci->getOperand(0));

		// Two shuffle+OR steps: after the first, lane i holds a[i] | a[i ^ 2];
		// after the second, every lane holds the OR of all four words.
		const auto x = zshuffle(a, 2, 3, 0, 1) | a;
		const auto y = zshuffle(x, 1, 0, 3, 2) | x;

		// Keep lane 3 of the reduced vector in the last lane; index 4 presumably
		// selects a zero lane (zshuffle is defined elsewhere - TODO confirm).
		return zshuffle(y, 4, 4, 4, 3);
	});

	set_vr(op.rt, orx(get_vr(op.ra)));
}
void CBD(spu_opcode_t op)
@ -9234,7 +9245,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{
@ -9246,6 +9257,22 @@ public:
return;
}
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) == 0);
const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
const auto target = add_block_indirect(op, addr);
m_ir->CreateCondBr(cond.value, target, add_block_next());
return;
}
// Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
{
@ -9279,7 +9306,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{
@ -9291,6 +9318,21 @@ public:
return;
}
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) != 0);
const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
const auto target = add_block_indirect(op, addr);
m_ir->CreateCondBr(cond.value, target, add_block_next());
return;
}
// Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
@ -9514,7 +9556,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{
@ -9527,6 +9569,23 @@ public:
}
}
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
if (target != m_pos + 4)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) == 0);
m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
return;
}
}
// Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
{
@ -9573,7 +9632,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{
@ -9586,6 +9645,22 @@ public:
}
}
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
if (target != m_pos + 4)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) != 0);
m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
return;
}
}
// Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
{