diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index 254ada8636..35234b063b 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -282,6 +282,21 @@ void cpu_thread::operator()() thread_ctrl::set_native_priority(-1); } + if (id_type() == 2) + { + // force input/output denormals to zero for SPU threads (FTZ/DAZ) + _mm_setcsr( _mm_getcsr() | 0x8040 ); /* 0x8000 is the MXCSR flush-to-zero bit, 0x0040 the denormals-are-zero bit; the other MXCSR bits are kept as read */ + + volatile u32 a = 0x1fc00000; /* bit pattern of the float 1.5 * 2^-64; presumably volatile keeps the check below from being folded at compile time - TODO confirm */ + __m128 b = _mm_castsi128_ps(_mm_set1_epi32(const_cast(a))); + int c = _mm_cvtsi128_si32(_mm_castps_si128(_mm_mul_ps(b,b))); /* raw bits of the first element of b*b: 2.25 * 2^-128 is subnormal, so it reads back as 0 only when flushing is in effect */ + + if (c != 0) + { + LOG_FATAL(GENERAL,"could not disable denormals"); + } + } + if (id_type() == 1 && false) { g_fxo->get()->registered.push(id); diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 9567b914e5..71fe86c19f 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -7186,6 +7186,32 @@ public: set_vr(op.rt, -(get_vr(op.ra) * get_vr(op.rb) + get_vr(op.rt))); } + // clamping helpers + value_t clamp_positive_smax(value_t v) + { + return eval(bitcast(min(bitcast(v),splat(0x7f7fffff)))); /* 0x7f7fffff is the bit pattern of the largest finite float; NOTE(review): the integer element type of this min is not visible here - confirm the signedness of the compare */ + } + + value_t clamp_negative_smax(value_t v) + { + return eval(bitcast(min(bitcast(v),splat(0xff7fffff)))); /* 0xff7fffff is the bit pattern of the most negative finite float; NOTE(review): signedness of this min is likewise not visible - confirm */ + } + + value_t clamp_smax(value_t v) + { + return eval(clamp_negative_smax(clamp_positive_smax(v))); /* positive clamp is applied first, then the negative clamp */ + } + + // FMA favouring zeros + value_t xmuladd(value_t a, value_t b, value_t c) + { + const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); + const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); /* ma/mb: sign-extended result of comparing a/b against 0.0 */ + const auto ca = eval(bitcast(bitcast(a) & mb)); + const auto cb = eval(bitcast(bitcast(b) & ma)); /* a is ANDed with the mask derived from b and b with the mask derived from a, so a zero mask on one side clears the other factor before the fmuladd */ + return eval(fmuladd(ca, cb, c)); + } + void FREST(spu_opcode_t op) { // TODO @@ -7215,17 +7241,11 @@ public: const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); - // See FCMGT. 
if (g_cfg.core.spu_approx_xfloat) { - const auto ia = bitcast(fabs(a)); - const auto ib = bitcast(fabs(b)); - const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff)); - - // Use sign bits to invert abs values before comparison. - const auto ca = eval(ia ^ (bitcast(a) >> 31)); - const auto cb = eval(ib ^ (bitcast(b) >> 31)); - set_vr(op.rt, sext((ca > cb) & nz)); + const auto ca = eval(clamp_positive_smax(a)); + const auto cb = eval(clamp_negative_smax(b)); /* only a's positive side and b's negative side are clamped before the compare */ + set_vr(op.rt, sext(fcmp_ord(ca > cb))); } else { @@ -7241,23 +7261,17 @@ public: return; } - const auto a = get_vr(op.ra); - const auto b = get_vr(op.rb); - const auto abs_a = fabs(a); - const auto abs_b = fabs(b); + const auto a = eval(fabs(get_vr(op.ra))); + const auto b = eval(fabs(get_vr(op.rb))); /* both operands are replaced by their absolute values up front */ - // Actually, it's accurate and can be used as an alternative path for accurate xfloat. if (g_cfg.core.spu_approx_xfloat) { - // Compare abs values as integers, but return false if both are denormals or zeros. - const auto ia = bitcast(abs_a); - const auto ib = bitcast(abs_b); - const auto nz = eval((ia > 0x7fffff) | (ib > 0x7fffff)); - set_vr(op.rt, sext((ia > ib) & nz)); + const auto ca = eval(clamp_positive_smax(a)); /* only the left operand is clamped; b is compared as-is */ + set_vr(op.rt, sext(fcmp_ord(ca > b))); } else { - set_vr(op.rt, sext(fcmp_ord(abs_a > abs_b))); + set_vr(op.rt, sext(fcmp_ord(a > b))); } } @@ -7273,6 +7287,11 @@ public: { if (g_cfg.core.spu_accurate_xfloat) set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); + else if (g_cfg.core.spu_approx_xfloat) + { + const auto b = eval(clamp_smax(get_vr(op.rb))); // for #4478 + set_vr(op.rt, get_vr(op.ra) - b); /* only the subtrahend is clamped; ra is used unmodified */ + } else set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); } @@ -7285,21 +7304,11 @@ public: { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); - const auto m = eval(a * b); - const auto abs_a = bitcast(fabs(a)); - const auto abs_b = bitcast(fabs(b)); - const auto abs_m = bitcast(fabs(m)); - const auto sign_a = eval(bitcast(a) & 0x80000000); - const auto sign_b = eval(bitcast(b) & 0x80000000); - 
const auto smod_m = eval(bitcast(m) & 0x7fffffff); - const auto fmax_m = eval((sign_a ^ sign_b) | 0x7fffffff); - const auto nzero = eval((abs_a > 0x7fffff) & (abs_b > 0x7fffff) & (abs_m > 0x7fffff)); - - // If m produces Inf or NaN, flush it to max xfloat with appropriate sign - const auto clamp = select(smod_m > 0x7f7fffff, bitcast(fmax_m), m); - - // If a, b, or a * b is a denorm or zero, return zero - set_vr(op.rt, select(nzero, clamp, fsplat(0.))); + const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); + const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); + const auto ca = eval(bitcast(bitcast(a) & mb)); + const auto cb = eval(bitcast(bitcast(b) & ma)); /* same operand masking as xmuladd: each factor is ANDed with the mask derived from the other one */ + set_vr(op.rt, ca * cb); /* the product is stored directly, without the Inf/NaN clamp that the removed lines applied */ } else set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb));