From 201d54ee08b75c63851ad2b6b937d2e2bcbfa3b3 Mon Sep 17 00:00:00 2001
From: Eladash
Date: Thu, 14 May 2020 15:21:15 +0300
Subject: [PATCH] PPU interpreters: Implement AltiVec NaNs precedence and data preservation

NaN results of vector float operations now follow operand precedence:
the first NaN found among Va, Vb, Vc is returned with its quiet bit set
and its remaining bits preserved. Only when no operand is a NaN is a NaN
result replaced by the default quiet NaN (0x7FC00000).

- Add v128::eq32f / v128::eq64f ordered-equality helpers.
- Add vec_select_nan and make vec_handle_nan take the operands.
- Drop the scalar f32_handle_nan helper; the precise VMADDFP and
  VRFIM/VRFIN/VRFIP/VRFIZ compute into a temporary and then store
  vec_handle_nan(result, operands...) into the destination register.

---
 Utilities/BEType.h                | 10 ++++
 rpcs3/Emu/Cell/PPUInterpreter.cpp | 92 ++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/Utilities/BEType.h b/Utilities/BEType.h
index 0bb0e71409..966a44f550 100644
--- a/Utilities/BEType.h
+++ b/Utilities/BEType.h
@@ -312,6 +312,16 @@ union alignas(16) v128
 		return fromV(_mm_cmpeq_epi32(left.vi, right.vi));
 	}
 
+	static inline v128 eq32f(const v128& left, const v128& right)
+	{
+		return fromF(_mm_cmpeq_ps(left.vf, right.vf));
+	}
+
+	static inline v128 eq64f(const v128& left, const v128& right)
+	{
+		return fromD(_mm_cmpeq_pd(left.vd, right.vd));
+	}
+
 	bool operator==(const v128& right) const
 	{
 		return _u64[0] == right._u64[0] && _u64[1] == right._u64[1];
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index 63aa9c02b5..328799c209 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -360,30 +360,47 @@ public:
 const g_ppu_scale_table;
 
 constexpr u32 ppu_nan_u32 = 0x7FC00000u;
-const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
+static const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
+static const v128 ppu_vec_nans = v128::from32p(ppu_nan_u32);
+
+// NaNs production precedence: NaN from Va, Vb, Vc
+// and lastly the result of the operation in case none of the operands is a NaN
+// Signaling NaNs are 'quieted' (MSB of fraction is set) while the other data bits remain the same
+inline v128 vec_select_nan(v128 a)
+{
+	return a;
+}
+
+inline v128 vec_select_nan(v128 a, v128 b)
+{
+	const auto not_nan = v128::eq32f(a, a);
+	return (b & not_nan) | v128::andnot(not_nan, a | ppu_vec_nans);
+}
+
+template <typename... Args>
+inline v128 vec_select_nan(v128 a, v128 b, Args... args)
+{
+	return vec_select_nan(a, vec_select_nan(b, args...));
+}
+
+v128 vec_handle_nan(v128 result)
+{
+	const auto not_nan = v128::eq32f(result, result);
+	result = (result & not_nan) | v128::andnot(not_nan, ppu_vec_nans);
+
+	return result;
+}
 
 template <typename... Args>
 v128 vec_handle_nan(v128 result, Args... args)
 {
-	const auto is_nan = v128::fromF(_mm_cmpunord_ps(result.vf, result.vf));
-	const auto nans = v128::from32p(ppu_nan_u32);
-	result = (nans & is_nan) | v128::andnot(is_nan, result);
-
-	return result;
+	return vec_select_nan(args..., vec_handle_nan(result));
 }
 
 template <typename... Args>
 v128 vec_handle_nan(__m128 result, Args... args)
 {
-	return vec_handle_nan(v128::fromF(result));
-}
-
-template <typename... Args>
-float f32_handle_nan(float result, Args... args)
-{
-	if (std::isnan(result))
-		result = ppu_nan_f32;
-	return result;
+	return vec_handle_nan(v128::fromF(result), v128::fromF(args)...);
 }
 
 bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
@@ -945,23 +962,24 @@ bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 	const auto b = ppu.vr[op.vc].vf;
 	const auto c = ppu.vr[op.vb].vf;
 	const auto result = _mm_add_ps(_mm_mul_ps(a, b), c);
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b, c);
+	ppu.vr[op.vd] = vec_handle_nan(result);
 	return true;
 }
 
 bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto& a = ppu.vr[op.va]._f;
-	const auto& b = ppu.vr[op.vb]._f;
-	const auto& c = ppu.vr[op.vc]._f;
-	auto& d = ppu.vr[op.rd]._f;
+	const auto a = ppu.vr[op.va];
+	const auto b = ppu.vr[op.vb];
+	const auto c = ppu.vr[op.vc];
+	v128 d;
 
 	// TODO: Optimize
 	for (u32 i = 0; i < 4; i++)
 	{
-		d[i] = f32_handle_nan(f32(f64{a[i]} * f64{c[i]} + f64{b[i]}));
+		d._f[i] = f32(f64{a._f[i]} * f64{c._f[i]} + f64{b._f[i]});
 	}
 
+	ppu.vr[op.rd] = vec_handle_nan(d, a, b, c);
 	return true;
 }
 
@@ -1861,49 +1879,57 @@ bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)
 {
-	auto& d = ppu.vr[op.vd];
-	const auto& b = ppu.vr[op.vb];
+	const auto b = ppu.vr[op.vb];
+	v128 d;
 
 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = f32_handle_nan(std::floor(b._f[w]));
+		d._f[w] = std::floor(b._f[w]);
 	}
+
+	ppu.vr[op.vd] = vec_handle_nan(d, b);
 	return true;
 }
 
 bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op)
 {
-	auto& d = ppu.vr[op.vd];
-	const auto& b = ppu.vr[op.vb];
+	const auto b = ppu.vr[op.vb];
+	v128 d;
 
 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = f32_handle_nan(std::nearbyint(b._f[w]));
+		d._f[w] = std::nearbyint(b._f[w]);
 	}
+
+	ppu.vr[op.vd] = vec_handle_nan(d, b);
 	return true;
 }
 
 bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	auto& d = ppu.vr[op.vd];
-	const auto& b = ppu.vr[op.vb];
+	const auto b = ppu.vr[op.vb];
+	v128 d;
 
 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = f32_handle_nan(std::ceil(b._f[w]));
+		d._f[w] = std::ceil(b._f[w]);
 	}
+
+	ppu.vr[op.vd] = vec_handle_nan(d, b);
 	return true;
 }
 
 bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op)
 {
-	auto& d = ppu.vr[op.vd];
-	const auto& b = ppu.vr[op.vb];
+	const auto b = ppu.vr[op.vb];
+	v128 d;
 
 	for (uint w = 0; w < 4; w++)
 	{
-		d._f[w] = f32_handle_nan(std::truncf(b._f[w]));
+		d._f[w] = std::truncf(b._f[w]);
 	}
+
+	ppu.vr[op.vd] = vec_handle_nan(d, b);
 	return true;
 }
 