diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 1ee69203b4..6c88fa2abf 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -54,6 +54,7 @@ enum class ppu_exec_bit : u64 has_rc, set_sat, use_nj, + fix_nj, set_vnan, fix_vnan, set_fpcc, @@ -73,7 +74,7 @@ struct ppu_exec_select static ppu_intrp_func_t select(bs_t selected, F func) { // Make sure there is no flag duplication, otherwise skip flag - if constexpr (((Flags0 != Flag) && ...)) + if constexpr (((Flags0 != Flag) && ...) && (Flag != fix_vnan || ((Flags0 != set_vnan) && ...)) && (Flag != fix_nj || ((Flags0 != use_nj) && ...))) { // Test only relevant flags at runtime initialization (compile both variants) if (selected & Flag) @@ -766,10 +767,10 @@ inline v128 ppu_select_vnan(v128 a, v128 b, Vector128 auto... args) } // Flush denormals to zero if NJ is 1 -template +template inline v128 ppu_flush_denormal(const v128& mask, const v128& a) { - if constexpr (((Flags == use_nj) || ...)) + if constexpr (((Flags == use_nj) || ...) || (Result && ((Flags == fix_nj) || ...))) { return gv_andn(gv_shr32(gv_eq32(mask & a, gv_bcst32(0)), 1), a); } @@ -826,14 +827,14 @@ template auto MTVSCR() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](auto&& sat, auto&& nj, auto&& jm_mask, auto&& b) { const u32 vscr = b._u32[3]; if constexpr (((Flags == set_sat) || ...)) sat._u = vscr & 1; - if constexpr (((Flags == use_nj) || ...)) + if constexpr (((Flags == use_nj || Flags == fix_nj) || ...)) jm_mask = (vscr & 0x10000) ? 0x7f80'0000 : 0x7fff'ffff; nj = (vscr & 0x10000) != 0; }; @@ -860,14 +861,14 @@ template auto VADDFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& jm_mask) { const auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); - const auto a = ppu_flush_denormal(m, a_); - const auto b = ppu_flush_denormal(m, b_); - d = ppu_flush_denormal(m, ppu_set_vnan(gv_addfs(a, b), a, b)); + const auto a = ppu_flush_denormal(m, a_); + const auto b = ppu_flush_denormal(m, b_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_addfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -1508,15 +1509,15 @@ template auto VMADDFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](auto&& d, auto&& a_, auto&& b_, auto&& c_, auto&& jm_mask) { const auto m = gv_bcst32(jm_mask, &ppu_thread::jm_mask); - const auto a = ppu_flush_denormal(m, a_); - const auto b = ppu_flush_denormal(m, b_); - const auto c = ppu_flush_denormal(m, c_); - d = ppu_flush_denormal(m, ppu_set_vnan(gv_fmafs(a, c, b))); + const auto a = ppu_flush_denormal(m, a_); + const auto b = ppu_flush_denormal(m, b_); + const auto c = ppu_flush_denormal(m, c_); + d = ppu_flush_denormal(m, ppu_set_vnan(gv_fmafs(a, c, b))); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.vr[op.vc], ppu.jm_mask); @@ -1526,11 +1527,11 @@ template auto VMAXFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) { - d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); + d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_maxfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -1673,11 +1674,11 @@ template auto VMINFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](auto&& d, auto&& a, auto&& b, auto&& jm_mask) { - d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); + d = ppu_flush_denormal(gv_bcst32(jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(gv_minfs(a, b), a, b)); }; RETURN_(ppu.vr[op.vd], ppu.vr[op.va], ppu.vr[op.vb], ppu.jm_mask); @@ -2087,17 +2088,17 @@ template auto VNMSUBFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { // An odd case with (FLT_MIN, FLT_MIN, FLT_MIN) produces FLT_MIN instead of 0 const auto s = _mm_set1_ps(-0.0f); const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto a = ppu_flush_denormal(m, ppu.vr[op.va]); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); - const auto c = ppu_flush_denormal(m, ppu.vr[op.vc]); + const auto a = ppu_flush_denormal(m, ppu.vr[op.va]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto c = ppu_flush_denormal(m, ppu.vr[op.vc]); const auto r = _mm_xor_ps(gv_fmafs(a, c, _mm_xor_ps(b, s)), s); - ppu.vr[op.rd] = ppu_flush_denormal(m, ppu_set_vnan(r)); + ppu.vr[op.rd] = ppu_flush_denormal(m, ppu_set_vnan(r)); }; RETURN_(ppu, op); } @@ -2315,14 +2316,14 @@ template auto VREFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); const auto result = _mm_div_ps(a, b); - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); }; RETURN_(ppu, op); } @@ -2331,11 +2332,11 @@ template auto VRFIM() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); v128 d; for (uint w = 0; w < 4; w++) @@ -2343,7 +2344,7 @@ auto VRFIM() d._f[w] = std::floor(b._f[w]); } - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); }; RETURN_(ppu, op); } @@ -2352,7 +2353,7 @@ template auto VRFIN() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = ppu.vr[op.vb]; @@ -2363,7 +2364,7 @@ auto VRFIN() d._f[w] = std::nearbyint(b._f[w]); } - ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); + ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); }; RETURN_(ppu, op); } @@ -2372,11 +2373,11 @@ template auto VRFIP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); v128 d; for (uint w = 0; w < 4; w++) @@ -2384,7 +2385,7 @@ auto VRFIP() d._f[w] = std::ceil(b._f[w]); } - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(d, b)); }; RETURN_(ppu, op); } @@ -2393,7 +2394,7 @@ template auto VRFIZ() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto b = ppu.vr[op.vb]; @@ -2404,7 +2405,7 @@ auto VRFIZ() d._f[w] = std::truncf(b._f[w]); } - ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); + ppu.vr[op.vd] = ppu_flush_denormal(gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask), ppu_set_vnan(d, b)); }; RETURN_(ppu, op); } @@ -2470,14 +2471,14 @@ template auto VRSQRTEFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); const auto result = _mm_div_ps(a, _mm_sqrt_ps(b)); - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(result, a, b)); }; RETURN_(ppu, op); } @@ -2905,14 +2906,14 @@ template auto VSUBFP() { if constexpr (Build == 0xf1a6) - return ppu_exec_select::template select(); + return ppu_exec_select::template select(); static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const auto m = gv_bcst32(ppu.jm_mask, &ppu_thread::jm_mask); - const auto a = ppu_flush_denormal(m, ppu.vr[op.va]); - const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); + const auto a = ppu_flush_denormal(m, ppu.vr[op.va]); + const auto b = ppu_flush_denormal(m, ppu.vr[op.vb]); const auto r = gv_subfs(a, b); - ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(r, a, b)); + ppu.vr[op.vd] = ppu_flush_denormal(m, ppu_set_vnan(r, a, b)); }; RETURN_(ppu, op); } @@ -7583,6 +7584,8 @@ ppu_interpreter_rt_base::ppu_interpreter_rt_base() noexcept selected += set_sat; if (g_cfg.core.ppu_use_nj_bit) selected += use_nj; + if (g_cfg.core.ppu_llvm_nj_fixup) + selected += fix_nj; if (g_cfg.core.ppu_set_vnan) selected += set_vnan; if (g_cfg.core.ppu_fix_vnan) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index e44673f9c5..4e37419f5e 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -3071,13 +3071,14 @@ bool ppu_initialize(const ppu_module& info, bool check_only) non_win32, accurate_dfma, fixup_vnan, - accurate_jm, + fixup_nj_denormals, accurate_cache_line_stores, reservations_128_byte, greedy_mode, accurate_sat, accurate_fpcc, accurate_vnan, + accurate_nj_mode, __bitset_enum_max }; @@ -3091,8 +3092,8 @@ bool ppu_initialize(const ppu_module& info, bool check_only) settings += ppu_settings::accurate_dfma; if (g_cfg.core.ppu_fix_vnan) settings += ppu_settings::fixup_vnan; - if (g_cfg.core.ppu_use_nj_bit) - settings += ppu_settings::accurate_jm; + if (g_cfg.core.ppu_llvm_nj_fixup) + settings += ppu_settings::fixup_nj_denormals; if (has_dcbz == 2) settings += ppu_settings::accurate_cache_line_stores; if (g_cfg.core.ppu_128_reservations_loop_max_length) @@ -3104,7 +3105,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only) if (g_cfg.core.ppu_set_fpcc) settings += ppu_settings::accurate_fpcc, fmt::throw_exception("FPCC Not implemented"); if (g_cfg.core.ppu_set_vnan) - settings += ppu_settings::accurate_vnan, fmt::throw_exception("VNAN Not implemented"); + settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented"); + if (g_cfg.core.ppu_use_nj_bit) + settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented"); // Write version, hash, CPU, settings fmt::append(obj_name, "v5-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index e3aa3d5384..46d9fcf4cb 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -291,7 +291,7 @@ Value* PPUTranslator::VecHandleDenormal(Value* val) Value* PPUTranslator::VecHandleResult(Value* val) { val = g_cfg.core.ppu_fix_vnan ? VecHandleNan(val) : val; - val = g_cfg.core.ppu_use_nj_bit ? VecHandleDenormal(val) : val; + val = g_cfg.core.ppu_llvm_nj_fixup ? VecHandleDenormal(val) : val; return val; } @@ -649,7 +649,7 @@ void PPUTranslator::MTVSCR(ppu_opcode_t op) const auto vscr = m_ir->CreateExtractElement(GetVr(op.vb, VrType::vi32), m_ir->getInt32(m_is_be ? 3 : 0)); const auto nj = Trunc(m_ir->CreateLShr(vscr, 16), GetType()); RegStore(nj, m_nj); - if (g_cfg.core.ppu_use_nj_bit) + if (g_cfg.core.ppu_llvm_nj_fixup) RegStore(m_ir->CreateSelect(nj, m_ir->getInt32(0x7f80'0000), m_ir->getInt32(0x7fff'ffff)), m_jm_mask); if (g_cfg.core.ppu_set_sat_bit) RegStore(m_ir->CreateInsertElement(ConstantAggregateZero::get(GetType()), m_ir->CreateAnd(vscr, 1), m_ir->getInt32(0)), m_sat); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index aa989758a8..d6de518728 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -55,6 +55,7 @@ struct cfg_root : cfg::node cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false }; + cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling. cfg::_bool ppu_use_nj_bit{ this, "PPU Use Non-Java Mode Bit", false }; // Accuracy. If unset, ignore NJ flag completely. diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index 2cf1eae4fe..faf959b350 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -38,6 +38,7 @@ enum class emu_settings_type ClocksScale, PerformanceReport, FullWidthAVX512, + PPUNJFixup, AccurateDFMA, AccuratePPUSAT, AccuratePPUNJ, @@ -195,6 +196,7 @@ inline static const QMap settings_location = { emu_settings_type::PerformanceReport, { "Core", "Enable Performance Report"}}, { emu_settings_type::FullWidthAVX512, { "Core", "Full Width AVX-512"}}, { emu_settings_type::NumPPUThreads, { "Core", "PPU Threads"}}, + { emu_settings_type::PPUNJFixup, { "Core", "PPU LLVM Java Mode Handling"}}, { emu_settings_type::AccurateDFMA, { "Core", "Use Accurate DFMA"}}, { emu_settings_type::AccuratePPUSAT, { "Core", "PPU Set Saturation Bit"}}, { emu_settings_type::AccuratePPUNJ, { "Core", "PPU Use Non-Java Mode Bit"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index bde48085bf..92912ce068 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -1151,6 +1151,9 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std SubscribeTooltip(ui->accurateDFMA, tooltips.settings.accurate_dfma); ui->accurateDFMA->setDisabled(utils::has_fma3() || utils::has_fma4()); + m_emu_settings->EnhanceCheckBox(ui->ppuNJFixup, emu_settings_type::PPUNJFixup); + SubscribeTooltip(ui->ppuNJFixup, tooltips.settings.fixup_ppunj); + m_emu_settings->EnhanceCheckBox(ui->accuratePPUSAT, emu_settings_type::AccuratePPUSAT); SubscribeTooltip(ui->accuratePPUSAT, tooltips.settings.accurate_ppusat); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 01e6bf56ae..59b7206b72 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -2063,6 +2063,13 @@ + + + + PPU Non-Java Mode Fixup + + + diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index 73ed608ec0..e84796648a 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -70,6 +70,7 @@ public: const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility."); const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value."); const QString full_width_avx512 = tr("Enables the use of code with full width AVX-512.\nThis code can be executed much faster, but may cause a loss in performance if your CPU model experiences downclocking on wide AVX-512 loads.\nNote that AVX-512 instructions will be used regardless of this option, just at 128 and 256 bit width."); + const QString fixup_ppunj = tr("Legacy option. Fixup result vector values in Non-Java Mode in PPU LLVM.\nIf unsure, do not modify this setting."); const QString accurate_dfma = tr("Use accurate double-precision FMA instructions in PPU and SPU backends.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou shouldn't disable it if your CPU supports FMA."); const QString accurate_ppusat = tr("Accurately set Saturation Bit values in PPU backends.\nIf unsure, do not modify this setting."); const QString accurate_ppunj = tr("Respect Non-Java Mode Bit values for vector ops in PPU backends.\nIf unsure, do not modify this setting.");