diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 22771bc5ad..b31c009e70 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -25,6 +25,8 @@ const bool s_use_ssse3 = utils::has_ssse3(); +extern void do_cell_atomic_128_store(u32 addr, const void* to_write); + inline u64 dup32(u32 x) { return x | static_cast(x) << 32; } // Write values to CR field @@ -4435,11 +4437,10 @@ bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op) const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb]; const u32 addr0 = vm::cast(addr, HERE) & ~127; - if (g_cfg.core.spu_accurate_dma) + if (g_cfg.core.accurate_cache_line_stores) { - auto [res, rtime] = vm::reservation_lock(addr0, 128, vm::dma_lockb); - std::memset(vm::base(addr0), 0, 128); - res.release(rtime + 128); + alignas(64) static constexpr u8 zero_buf[128]{}; + do_cell_atomic_128_store(addr0, zero_buf); return true; } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index ebbdacce49..b10325128a 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -92,6 +92,7 @@ void fmt_class_string::format(std::string& out, u64 arg) constexpr ppu_decoder g_ppu_interpreter_precise; constexpr ppu_decoder g_ppu_interpreter_fast; +constexpr ppu_decoder g_ppu_itype; extern void ppu_initialize(); extern void ppu_initialize(const ppu_module& info); @@ -99,6 +100,8 @@ static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_pa extern void ppu_execute_syscall(ppu_thread& ppu, u64 code); static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op); +extern void do_cell_atomic_128_store(u32 addr, const void* to_write); + // Get pointer to executable cache template static T& ppu_ref(u32 addr) @@ -1420,6 +1423,7 @@ extern void ppu_initialize(const ppu_module& info) { "__lvrx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_lvrx) : reinterpret_cast(sse_cellbe_lvrx_v0) }, { "__stvlx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_stvlx) : reinterpret_cast(sse_cellbe_stvlx_v0) }, { "__stvrx", s_use_ssse3 ? reinterpret_cast(sse_cellbe_stvrx) : reinterpret_cast(sse_cellbe_stvrx_v0) }, + { "__dcbz", reinterpret_cast(+[](u32 addr){ alignas(64) static constexpr u8 z[128]{}; do_cell_atomic_128_store(addr, z); }) }, { "__resupdate", reinterpret_cast(vm::reservation_update) }, { "sys_config_io_event", reinterpret_cast(ppu_get_syscall(523)) }, }; @@ -1571,6 +1575,8 @@ extern void ppu_initialize(const ppu_module& info) u8 output[20]; sha1_starts(&ctx); + int has_dcbz = !!g_cfg.core.accurate_cache_line_stores; + for (const auto& func : part.funcs) { if (func.size == 0) @@ -1614,6 +1620,18 @@ extern void ppu_initialize(const ppu_module& info) addr = roff + 4; } + if (has_dcbz == 1) + { + for (u32 i = addr, end = block.second + block.first - 1; i <= end; i += 4) + { + if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ) + { + has_dcbz = 2; + break; + } + } + } + // Hash from addr to the end of the block sha1_update(&ctx, vm::_ptr(addr), block.second - (addr - block.first)); } @@ -1623,6 +1641,18 @@ extern void ppu_initialize(const ppu_module& info) continue; } + if (has_dcbz == 1) + { + for (u32 i = func.addr, end = func.addr + func.size - 1; i <= end; i += 4) + { + if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ) + { + has_dcbz = 2; + break; + } + } + } + sha1_update(&ctx, vm::_ptr(func.addr), func.size); } @@ -1641,6 +1671,7 @@ extern void ppu_initialize(const ppu_module& info) accurate_fma, accurate_ppu_vector_nan, java_mode_handling, + accurate_cache_line_stores, __bitset_enum_max }; @@ -1662,6 +1693,10 @@ extern void ppu_initialize(const ppu_module& info) { settings += ppu_settings::java_mode_handling; } + if (has_dcbz == 2) + { + settings += ppu_settings::accurate_cache_line_stores; + } // Write version, hash, CPU, settings fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 67e4af6ea8..97920f1034 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -3502,8 +3502,16 @@ void PPUTranslator::ICBI(ppu_opcode_t op) void PPUTranslator::DCBZ(ppu_opcode_t op) { - const auto ptr = GetMemory(m_ir->CreateAnd(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), -128), GetType()); - Call(GetType(), "llvm.memset.p0i8.i32", ptr, m_ir->getInt8(0), m_ir->getInt32(128), m_ir->getTrue()); + const auto addr = m_ir->CreateAnd(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), -128); + + if (g_cfg.core.accurate_cache_line_stores) + { + Call(GetType(), "__dcbz", addr); + } + else + { + Call(GetType(), "llvm.memset.p0i8.i32", GetMemory(addr, GetType()), m_ir->getInt8(0), m_ir->getInt32(128), m_ir->getTrue()); + } } void PPUTranslator::LWZ(ppu_opcode_t op) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index c79d021304..2b33b7d135 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -229,6 +229,8 @@ static FORCE_INLINE rsx::thread* get_rsx_if_needs_res_pause(u32 addr) extern u64 get_timebased_time(); extern u64 get_system_time(); +void do_cell_atomic_128_store(u32 addr, const void* to_write); + extern thread_local u64 g_tls_fault_spu; namespace spu @@ -606,7 +608,7 @@ const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto spu_putlluc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; @@ -701,7 +703,7 @@ const auto spu_putlluc_tx = build_function_asm(cpu_flag::wait)); + c.lock().bts(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast(cpu_flag::wait)); // Touch memory if transaction failed without RETRY flag on the first attempt c.cmp(x86::r12, 1); @@ -1474,6 +1476,19 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) { size0 = std::min(128 - (eal & 127), std::min(size, 128)); + if (size0 == 128u && g_cfg.core.accurate_cache_line_stores) + { + // As atomic as PUTLLUC + do_cell_atomic_128_store(eal, src); + + if (size == size0) + { + break; + } + + continue; + } + // Lock each cache line execlusively auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb); @@ -1937,6 +1952,80 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) } } +void do_cell_atomic_128_store(u32 addr, const void* to_write) +{ + using rdata_t = decltype(spu_thread::rdata); + const auto cpu = get_current_cpu_thread(); + + if (g_use_rtm) [[likely]] + { + const u32 result = spu_putlluc_tx(addr, to_write, cpu); + + const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr; + + if (render) render->pause(); + + if (result == 2) + { + cpu_thread::suspend_all cpu_lock(cpu); + + if (vm::reservation_acquire(addr, 128) & 64) + { + // Wait for PUTLLC to complete + while (vm::reservation_acquire(addr, 128) & 63) + { + busy_wait(100); + } + + mov_rdata(vm::_ref(addr), *static_cast(to_write)); + vm::reservation_acquire(addr, 128) += 64; + } + } + else if (result == 0) + { + cpu_thread::suspend_all cpu_lock(cpu); + + while (vm::reservation_acquire(addr, 128).bts(std::countr_zero(vm::putlluc_lockb))) + { + busy_wait(100); + } + + while (vm::reservation_acquire(addr, 128) & 63) + { + busy_wait(100); + } + + mov_rdata(vm::_ref(addr), *static_cast(to_write)); + vm::reservation_acquire(addr, 128) += 64; + } + + if (render) render->unpause(); + static_cast(cpu->test_stopped()); + } + else + { + auto& data = vm::_ref(addr); + auto [res, time0] = vm::reservation_lock(addr, 128); + + *reinterpret_cast*>(&data) += 0; + + const auto render = get_rsx_if_needs_res_pause(addr); + + if (render) render->pause(); + + auto& super_data = *vm::get_super_ptr(addr); + { + // Full lock (heavyweight) + // TODO: vm::check_addr + vm::writer_lock lock(addr); + mov_rdata(super_data, *static_cast(to_write)); + res.release(time0 + 128); + } + + if (render) render->unpause(); + } +} + void spu_thread::do_putlluc(const spu_mfc_cmd& args) { const u32 addr = args.eal & -128; @@ -1955,77 +2044,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args) // Failure, fallback to the main implementation } - const auto& to_write = _ref(args.lsa & 0x3ff80); - - // Store unconditionally - if (g_use_rtm) [[likely]] - { - const u32 result = spu_putlluc_tx(addr, to_write.data(), this); - - const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr; - - if (render) render->pause(); - - if (result == 2) - { - cpu_thread::suspend_all cpu_lock(this); - - if (vm::reservation_acquire(addr, 128) & 64) - { - // Wait for PUTLLC to complete - while (vm::reservation_acquire(addr, 128) & 63) - { - busy_wait(100); - } - - mov_rdata(vm::_ref(addr), to_write); - vm::reservation_acquire(addr, 128) += 64; - } - } - else if (result == 0) - { - cpu_thread::suspend_all cpu_lock(this); - - while (vm::reservation_acquire(addr, 128).bts(std::countr_zero(vm::putlluc_lockb))) - { - busy_wait(100); - } - - while (vm::reservation_acquire(addr, 128) & 63) - { - busy_wait(100); - } - - mov_rdata(vm::_ref(addr), to_write); - vm::reservation_acquire(addr, 128) += 64; - } - - if (render) render->unpause(); - static_cast(test_stopped()); - } - else - { - auto& data = vm::_ref(addr); - auto [res, time0] = vm::reservation_lock(addr, 128); - - *reinterpret_cast*>(&data) += 0; - - const auto render = get_rsx_if_needs_res_pause(addr); - - if (render) render->pause(); - - auto& super_data = *vm::get_super_ptr(addr); - { - // Full lock (heavyweight) - // TODO: vm::check_addr - vm::writer_lock lock(addr); - mov_rdata(super_data, to_write); - res.release(time0 + 128); - } - - if (render) render->unpause(); - } - + do_cell_atomic_128_store(addr, _ptr(args.lsa & 0x3ff80)); vm::reservation_notifier(addr, 128).notify_all(); } diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 8e0b1c1117..917b8ad7b4 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -44,6 +44,7 @@ struct cfg_root : cfg::node cfg::_enum spu_block_size{ this, "SPU Block Size", spu_block_size_type::safe }; cfg::_bool spu_accurate_getllar{ this, "Accurate GETLLAR", false, true }; cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false }; + cfg::_bool accurate_cache_line_stores{ this, "Accurate Cache Line Stores", false }; cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true}; cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled cfg::_bool spu_cache{ this, "SPU Cache", true }; diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index ac28aa286b..ea6ff93c2a 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -22,6 +22,7 @@ enum class emu_settings_type EnableTSX, AccurateGETLLAR, AccurateSpuDMA, + AccurateClineStores, AccurateLLVMdfma, AccurateVectorNaN, AccurateRSXAccess, @@ -163,6 +164,7 @@ static const QMap settings_location = { emu_settings_type::EnableTSX, { "Core", "Enable TSX"}}, { emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}}, { emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}}, + { emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}}, { emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}}, { emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}}, { emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index b86e49dbaa..468f7f7142 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -1735,6 +1735,9 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceCheckBox(ui->accurateSpuDMA, emu_settings_type::AccurateSpuDMA); SubscribeTooltip(ui->accurateSpuDMA, tooltips.settings.accurate_spu_dma); + m_emu_settings->EnhanceCheckBox(ui->accurateClineStores, emu_settings_type::AccurateClineStores); + SubscribeTooltip(ui->accurateClineStores, tooltips.settings.accurate_cache_line_stores); + m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess); SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 2fd8554818..a8605316a1 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -3506,6 +3506,13 @@ + + + + Accurate Cache Line Stores + + + diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index 054e9bbb72..bb8eda9f78 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -78,6 +78,7 @@ public: const QString set_daz_and_ftz = tr("Sets special MXCSR flags to debug errors in SSE operations.\nOnly used in PPU thread when it's not precise.\nOnly useful to developers.\nNever use this."); const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation."); const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations."); + const QString accurate_cache_line_stores = tr("Accurately processes PPU DCBZ instruction.\nIn addition, when combined with Accurate SPU DMA, SPU PUT cache line accesses will be processed atomically."); const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA."); const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)"); const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");