diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 980f0f25fd..6b6681f554 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -7457,8 +7457,10 @@ public: { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); - const auto ca = eval(clamp_smax(a)); - const auto cb = eval(clamp_smax(b)); + const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); + const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); + const auto ca = eval(bitcast(bitcast(a) & mb)); + const auto cb = eval(bitcast(bitcast(b) & ma)); set_vr(op.rt, ca * cb); } else @@ -7525,8 +7527,6 @@ public: value_t fma32x4(value_t a, value_t b, value_t c) { value_t r; - const auto ca = eval(clamp_smax(a)); - const auto cb = eval(clamp_smax(b)); // Optimization: Emit only a floating multiply if the addend is zero // This is odd since SPU code could just use the FM instruction, but it seems common enough @@ -7536,20 +7536,41 @@ public: if (is_spu_float_zero(data)) { - r = eval(ca * cb); + r = eval(a * b); return r; } } + if (auto cv = llvm::dyn_cast(b.value)) + { + v128 data = get_const_vector(cv, m_pos, 4000); + + if (is_spu_float_zero(data)) + { + // Just return the added value if either a or b is 0 + return c; + } + } + + if (auto cv = llvm::dyn_cast(a.value)) + { + v128 data = get_const_vector(cv, m_pos, 4000); + + if (is_spu_float_zero(data)) + { + return c; + } + } + if (m_use_fma) { - r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {ca.value, cb.value, c.value}); + r.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fma), {a.value, b.value, c.value}); return r; } // Convert to doubles - const auto xa = m_ir->CreateFPExt(ca.value, get_type()); - const auto xb = m_ir->CreateFPExt(cb.value, get_type()); + const auto xa = m_ir->CreateFPExt(a.value, get_type()); + const auto xb = m_ir->CreateFPExt(b.value, get_type()); const auto xc = m_ir->CreateFPExt(c.value, get_type()); const auto xr = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::fmuladd), {xa, xb, xc}); r.value = m_ir->CreateFPTrunc(xr, get_type()); @@ -7562,9 +7583,13 @@ public: if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt4, fmuladd(eval(-get_vr(op.ra)), get_vr(op.rb), get_vr(op.rc))); else if (g_cfg.core.spu_approx_xfloat) - set_vr(op.rt4, fma32x4(eval(-get_vr(op.ra)), get_vr(op.rb), get_vr(op.rc))); + { + const auto a = eval(clamp_smax(get_vr(op.ra))); + const auto b = eval(clamp_smax(get_vr(op.rb))); + set_vr(op.rt4, fma32x4(eval(-(a)), (b), get_vr(op.rc))); + } else - set_vr(op.rt4, get_vr(op.rc) - get_vr(op.ra) * get_vr(op.rb)); + set_vr(op.rt4, fma32x4(eval(-get_vr(op.ra)), get_vr(op.rb), get_vr(op.rc))); } void FMA(spu_opcode_t op) @@ -7573,9 +7598,17 @@ public: if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt4, fmuladd(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); else if (g_cfg.core.spu_approx_xfloat) - set_vr(op.rt4, fma32x4(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); + { + const auto a = get_vr(op.ra); + const auto b = get_vr(op.rb); + const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); + const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); + const auto ca = eval(bitcast(bitcast(a) & mb)); + const auto cb = eval(bitcast(bitcast(b) & ma)); + set_vr(op.rt4, fma32x4((ca), (cb), get_vr(op.rc))); + } else - set_vr(op.rt4, get_vr(op.ra) * get_vr(op.rb) + get_vr(op.rc)); + set_vr(op.rt4, fma32x4(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); } void FMS(spu_opcode_t op) @@ -7584,9 +7617,13 @@ public: if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt4, fmuladd(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); else if (g_cfg.core.spu_approx_xfloat) - set_vr(op.rt4, fma32x4(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); + { + const auto a = eval(clamp_smax(get_vr(op.ra))); + const auto b = eval(clamp_smax(get_vr(op.rb))); + set_vr(op.rt4, fma32x4((a), (b), eval(-get_vr(op.rc)))); + } else - set_vr(op.rt4, get_vr(op.ra) * get_vr(op.rb) - get_vr(op.rc)); + set_vr(op.rt4, fma32x4(get_vr(op.ra), get_vr(op.rb), eval(-get_vr(op.rc)))); } void FI(spu_opcode_t op)