From f4ca6f02a19cbd6addb4fd9529366f8c3092a5ae Mon Sep 17 00:00:00 2001
From: Eladash
Date: Tue, 7 Apr 2020 20:29:11 +0300
Subject: [PATCH] PPU: Implement support for 128-byte reservations coherency

---
 rpcs3/Emu/Cell/PPUThread.cpp     | 522 +++++++++++++++++++++++++++----
 rpcs3/Emu/Cell/PPUThread.h       |   2 +
 rpcs3/Emu/Cell/PPUTranslator.cpp |  12 +
 rpcs3/Emu/Cell/SPUThread.cpp     | 121 +++----
 rpcs3/Emu/Cell/SPUThread.h       |   2 +-
 rpcs3/Emu/Memory/vm.cpp          |   2 +-
 rpcs3/Emu/RSX/RSXThread.h        |  21 ++
 rpcs3/Emu/system_config.h        |   1 +
 8 files changed, 539 insertions(+), 144 deletions(-)

diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 63f23b4294..f38e2a3a08 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -4,6 +4,7 @@
 #include "Utilities/JIT.h"
 #include "Crypto/sha1.h"
 #include "Emu/Memory/vm_reservation.h"
+#include "Emu/RSX/RSXThread.h"
 #include "Emu/VFS.h"
 #include "PPUThread.h"
 #include "PPUInterpreter.h"
@@ -72,6 +73,15 @@ extern atomic_t<const char*> g_progr;
 extern atomic_t<u32> g_progr_ptotal;
 extern atomic_t<u32> g_progr_pdone;
 
+// Should be of the same type as spu_thread::rdata
+using spu_rdata_t = decltype(ppu_thread::full_rdata);
+
+extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src);
+extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs);
+
+// Verify AVX availability for TSX transactions
+static const bool s_tsx_avx = utils::has_avx();
+
 template <>
 void fmt_class_string<ppu_join_status>::format(std::string& out, u64 arg)
 {
@@ -1095,6 +1105,58 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	ppu.raddr = addr;
 	const u64 mask_res = g_use_rtm ? (-128 | vm::dma_lockb) : -1;
 
+	if (const s32 max = g_cfg.core.ppu_128_reservations_loop_max_length)
+	{
+		// If we use it in HLE it means we want the accurate version
+		ppu.use_full_rdata = max < 0 || ppu.current_function || [&]()
+		{
+			const u32 cia = ppu.cia;
+
+			if ((cia & 0xffff) >= 0x10000u - max * 4)
+			{
+				// Do not cross the 64k boundary
+				return false;
+			}
+
+			const auto inst = vm::_ptr<const nse_t<u32>>(cia);
+
+			// Search for STWCX or STDCX nearby (LDARX-STWCX and LWARX-STDCX loops will use accurate 128-byte reservations)
+			constexpr u32 store_cond = se_storage<u32>::swap(sizeof(T) == 8 ? 0x7C00012D : 0x7C0001AD);
+			constexpr u32 mask = se_storage<u32>::swap(0xFC0007FF);
+
+			const auto store_vec = v128::from32p(store_cond);
+			const auto mask_vec = v128::from32p(mask);
+
+			s32 i = 2;
+
+			for (const s32 _max = max - 3; i < _max; i += 4)
+			{
+				const auto _inst = v128::loadu(inst + i) & mask_vec;
+
+				if (_mm_movemask_epi8(v128::eq32(_inst, store_vec).vi))
+				{
+					return true;
+				}
+			}
+
+			for (; i < max; i++)
+			{
+				const u32 val = inst[i] & mask;
+
+				if (val == store_cond)
+				{
+					return true;
+				}
+			}
+
+			return false;
+		}();
+	}
+	else
+	{
+		ppu.use_full_rdata = false;
+	}
+
 	for (u64 count = 0;; [&]()
 	{
 		if (ppu.state)
@@ -1120,7 +1182,13 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 			continue;
 		}
 
-		ppu.rdata = data;
+		const u64 rdata = data;
+
+		if (ppu.use_full_rdata)
+		{
+			mov_rdata(ppu.full_rdata, vm::_ref<spu_rdata_t>(addr & -128));
+			reinterpret_cast<be_t<u64>&>(ppu.full_rdata[addr & 0x78]) = rdata; // Must match rdata, which was obtained by an atomic 64-bit load
+		}
 
 		if ((vm::reservation_acquire(addr, sizeof(T)) & mask_res) == ppu.rtime) [[likely]]
 		{
@@ -1129,6 +1197,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 				ppu_log.warning("%s took too long: %u", sizeof(T) == 4 ? 
"LWARX" : "LDARX", count); } + ppu.rdata = rdata; return static_cast(ppu.rdata << data_off >> size_off); } } @@ -1190,6 +1259,251 @@ const auto ppu_stcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + using namespace asmjit; + + Label fall = c.newLabel(); + Label fail = c.newLabel(); + Label _ret = c.newLabel(); + Label skip = c.newLabel(); + Label next = c.newLabel(); + + //if (utils::has_avx() && !s_tsx_avx) + //{ + // c.vzeroupper(); + //} + + // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers) + c.push(x86::rbp); + c.push(x86::r13); + c.push(x86::r12); + c.push(x86::rbx); + c.sub(x86::rsp, 40); +#ifdef _WIN32 + if (!s_tsx_avx) + { + c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); + c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); + } +#endif + + // Prepare registers + c.mov(x86::rbx, imm_ptr(+vm::g_reservations)); + c.mov(x86::rax, imm_ptr(&vm::g_base_addr)); + c.mov(x86::rbp, x86::qword_ptr(x86::rax)); + c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); + c.and_(x86::rbp, -128); + c.movzx(args[0].r32(), args[0].r16()); + c.shr(args[0].r32(), 1); + c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0])); + c.and_(x86::rbx, -128 / 2); + c.xor_(x86::r12d, x86::r12d); + c.mov(x86::r13, args[1]); + c.bswap(args[3]); + + // Prepare data + if (s_tsx_avx) + { + c.vmovups(x86::ymm0, x86::yword_ptr(args[2], 0)); + c.vmovups(x86::ymm1, x86::yword_ptr(args[2], 32)); + c.vmovups(x86::ymm2, x86::yword_ptr(args[2], 64)); + c.vmovups(x86::ymm3, x86::yword_ptr(args[2], 96)); + } + else + { + c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0)); + c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16)); + c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32)); + c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48)); + c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64)); + c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80)); + c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96)); + c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112)); + } + + // Begin transaction + build_transaction_enter(c, fall, x86::r12, 4); + c.mov(x86::rax, x86::qword_ptr(x86::rbx)); + c.and_(x86::rax, -128); + c.cmp(x86::rax, x86::r13); + c.jne(fail); + c.test(x86::qword_ptr(x86::rbx), 127); + c.jnz(skip); + + if (s_tsx_avx) + { + c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0)); + c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32)); + c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64)); + c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96)); + c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); + c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); + c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); + c.vptest(x86::ymm0, x86::ymm0); + } + else + { + c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); + c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); + c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); + c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); + c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); + c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); + c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); + c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); + c.orps(x86::xmm0, x86::xmm1); + c.orps(x86::xmm2, x86::xmm3); + c.orps(x86::xmm4, x86::xmm5); + c.orps(x86::xmm6, x86::xmm7); + c.orps(x86::xmm0, x86::xmm2); + c.orps(x86::xmm4, x86::xmm6); + c.orps(x86::xmm0, x86::xmm4); + c.ptest(x86::xmm0, x86::xmm0); + } + + c.jnz(fail); + + c.mov(x86::rax, x86::rbp); + c.shl(args[0], 1); + c.or_(x86::rax, args[0]); + c.mov(x86::qword_ptr(x86::rax), args[3]); + c.shr(args[0], 1); + + 
c.sub(x86::qword_ptr(x86::rbx), -128);
+	c.xend();
+	c.mov(x86::eax, 1);
+	c.jmp(_ret);
+
+	c.bind(skip);
+	c.xor_(x86::eax, x86::eax);
+	c.xor_(x86::r12d, x86::r12d);
+	build_transaction_abort(c, 0);
+	//c.jmp(fall);
+
+	c.bind(fall);
+	c.sar(x86::eax, 24);
+	c.js(fail);
+	c.lock().bts(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::full_rdata)), static_cast<u32>(cpu_flag::wait));
+
+	// Touch memory if transaction failed without RETRY flag on the first attempt
+	c.cmp(x86::r12, 1);
+	c.jne(next);
+	c.xor_(x86::rbp, 0xf80);
+	c.lock().add(x86::dword_ptr(x86::rbp), 0);
+	c.xor_(x86::rbp, 0xf80);
+
+	Label fall2 = c.newLabel();
+	Label fail2 = c.newLabel();
+
+	// Lightened transaction: only compare and swap data
+	c.bind(next);
+
+	// Try to "lock" reservation
+	c.mov(x86::rax, x86::r13);
+	c.add(x86::r13, 1);
+	c.lock().cmpxchg(x86::qword_ptr(x86::rbx), x86::r13);
+	c.jne(fail);
+
+	build_transaction_enter(c, fall2, x86::r12, 666);
+
+	if (s_tsx_avx)
+	{
+		c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
+		c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
+		c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
+		c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
+		c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
+		c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
+		c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
+		c.vptest(x86::ymm0, x86::ymm0);
+	}
+	else
+	{
+		c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
+		c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
+		c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
+		c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
+		c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
+		c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
+		c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
+		c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
+		c.orps(x86::xmm0, x86::xmm1);
+		c.orps(x86::xmm2, x86::xmm3);
+		c.orps(x86::xmm4, x86::xmm5);
+		c.orps(x86::xmm6, x86::xmm7);
+		c.orps(x86::xmm0, x86::xmm2);
+		c.orps(x86::xmm4, x86::xmm6);
+		c.orps(x86::xmm0, x86::xmm4);
+		c.ptest(x86::xmm0, x86::xmm0);
+	}
+
+	c.jnz(fail2);
+
+	c.mov(x86::rax, x86::rbp);
+	c.shl(args[0], 1);
+	c.or_(x86::rax, args[0]);
+	c.mov(x86::qword_ptr(x86::rax), args[3]);
+	c.shr(args[0], 1);
+
+	c.xend();
+	c.lock().add(x86::qword_ptr(x86::rbx), 127);
+	c.mov(x86::eax, 1);
+	c.jmp(_ret);
+
+	c.bind(fall2);
+	c.sar(x86::eax, 24);
+	c.js(fail2);
+	c.mov(x86::eax, 2);
+	c.jmp(_ret);
+
+	c.bind(fail);
+	build_transaction_abort(c, 0xff);
+	c.xor_(x86::eax, x86::eax);
+	c.jmp(_ret);
+
+	c.bind(fail2);
+	build_transaction_abort(c, 0xff);
+	c.lock().sub(x86::qword_ptr(x86::rbx), 1);
+	c.xor_(x86::eax, x86::eax);
+	//c.jmp(_ret);
+
+	c.bind(_ret);
+
+#ifdef _WIN32
+	if (!s_tsx_avx)
+	{
+		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
+		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
+	}
+#endif
+
+	if (s_tsx_avx)
+	{
+		c.vzeroupper();
+	}
+
+	c.add(x86::rsp, 40);
+	c.pop(x86::rbx);
+	c.pop(x86::r12);
+	
c.pop(x86::r13);
+	c.pop(x86::rbp);
+	c.ret();
+});
+
 template <typename T>
 static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 {
@@ -1199,11 +1513,9 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 	}
 
 	auto& data = vm::_ref<atomic_be_t<u64>>(addr & -8);
-	constexpr u64 size_off = (sizeof(T) * 8) & 63;
-	const u64 old_data = ppu.rdata;
-
-	const T old_data = static_cast<T>(ppu.rdata << ((addr & 7) * 8) >> size_off);
 	auto& res = vm::reservation_acquire(addr, sizeof(T));
+	const u64 old_data = ppu.rdata;
+	const u64 rtime = ppu.rtime;
 
 	if constexpr (sizeof(T) == sizeof(u32))
 	{
@@ -1223,80 +1535,151 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
 		reg_value = bf.all;
 	}
 
-	if ((std::exchange(ppu.raddr, 0) ^ addr) & -8 || old_data != data || ppu.rtime != (res & -128))
+	// Test if the store address is on the same aligned 8-byte block as the load
+	if (const u32 raddr = std::exchange(ppu.raddr, 0); raddr / 8 != addr / 8)
 	{
-		// Even when the reservation address does not match the target address must be valid
-		if (!vm::check_addr(addr, 1, vm::page_writable))
+		// If not, but it is within the same aligned 128-byte block, proceed only if 128-byte reservations are enabled
+		// On real hardware the store address can be anywhere within the 128-byte cache line
+		if (raddr / 128 != addr / 128 || !ppu.use_full_rdata)
 		{
-			// Access violate
-			data += 0;
-		}
-
-		return false;
-	}
-
-	if (reg_value == old_data)
-	{
-		if (res.compare_and_swap_test(ppu.rtime, ppu.rtime + 128))
-		{
-			res.notify_all();
-			return true;
-		}
-
-		return false;
-	}
-
-	addr &= -8;
-
-	if (g_use_rtm) [[likely]]
-	{
-		switch (ppu_stcx_tx(addr, ppu.rtime, old_data, reg_value))
-		{
-		case 0:
-		{
-			// Reservation lost
-			return false;
-		}
-		case 1:
-		{
-			res.notify_all();
-			return true;
-		}
-		}
-
-		if (res == ppu.rtime && vm::reservation_trylock(res, ppu.rtime))
-		{
-			if (data.compare_and_swap_test(old_data, reg_value))
+			// Even when the reservation address does not match, the target address must be valid
+			if (!vm::check_addr(addr, 1, vm::page_writable))
 			{
-				res += 127;
-				res.notify_all();
-				return true;
+				// Access violation
+				data += 0;
 			}
 
-			res -= 1;
+			return false;
+		}
+	}
+
+	if (old_data != data || rtime != (res & -128))
+	{
+		return false;
+	}
+
+	if ([&]()
+	{
+		if (ppu.use_full_rdata) [[unlikely]]
+		{
+			if (g_use_rtm) [[likely]]
+			{
+				switch (ppu_stcx_accurate_tx(addr & -8, rtime, ppu.full_rdata, reg_value))
+				{
+				case 0:
+				{
+					// Reservation lost
+					return false;
+				}
+				case 1:
+				{
+					return true;
+				}
+				default: break;
+				}
+
+				cpu_thread::suspend_all cpu_lock(&ppu);
+
+				// Give up if PUTLLUC happened
+				if (res == (rtime | 1) && cmp_rdata(ppu.full_rdata, vm::_ref<spu_rdata_t>(addr & -128)))
+				{
+					data.release(reg_value);
+					res.release(rtime + 128);
+					return true;
+				}
+
+				res.release(rtime);
+				return false;
+			}
+
+			if (!vm::reservation_trylock(res, rtime))
+			{
+				return false;
+			}
+
+			// Align address: we do not need the lower 7 bits anymore
+			addr &= -128;
+
+			// Cache line data
+			auto& cline_data = vm::_ref<spu_rdata_t>(addr);
+
+			data += 0;
+
+			const auto render = rsx::get_rsx_if_needs_res_pause(addr);
+
+			if (render) render->pause();
+
+			auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
+			const bool success = [&]()
+			{
+				// Full lock (heavyweight)
+				// TODO: vm::check_addr
+				vm::writer_lock lock(addr);
+
+				if (cmp_rdata(ppu.full_rdata, super_data))
+				{
+					data.release(reg_value);
+					res.release(rtime + 128);
+					return true;
+				}
+
+				res.release(rtime);
+				return false;
+			}();
+
+			if (render) render->unpause();
+			return success;
 		}
 
-		
return false;
-	}
+		if (reg_value == old_data)
+		{
+			return res.compare_and_swap_test(rtime, rtime + 128);
+		}
-	if (!vm::reservation_trylock(res, ppu.rtime))
+		// Aligned 8-byte reservations will be used here
+		addr &= -8;
+
+		if (g_use_rtm) [[likely]]
+		{
+			switch (ppu_stcx_tx(addr, rtime, old_data, reg_value))
+			{
+			case 0:
+			{
+				// Reservation lost
+				return false;
+			}
+			case 1:
+			{
+				return true;
+			}
+			default: break;
+			}
+
+			if (res == rtime && vm::reservation_trylock(res, rtime))
+			{
+				const bool ret = data.compare_and_swap_test(old_data, reg_value);
+				res.release(rtime + 128);
+				return ret;
+			}
+
+			return false;
+		}
+
+		if (!vm::reservation_trylock(res, ppu.rtime))
+		{
+			return false;
+		}
+
+		const bool ret = data.compare_and_swap_test(old_data, reg_value);
+		res.release(rtime + 128);
+		return ret;
+	}())
 	{
-		return false;
-	}
-
-	const bool result = data.compare_and_swap_test(old_data, reg_value);
-
-	if (result)
-	{
-		res.release(ppu.rtime + 128);
 		res.notify_all();
-	}
-	else
-	{
-		res.release(ppu.rtime);
+		return true;
 	}
 
-	return result;
+	return false;
 }
 
 extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
@@ -1645,6 +2028,7 @@ extern void ppu_initialize(const ppu_module& info)
 		accurate_ppu_vector_nan,
 		java_mode_handling,
 		accurate_cache_line_stores,
+		reservations_128_byte,
 
 		__bitset_enum_max
 	};
@@ -1670,6 +2054,10 @@ extern void ppu_initialize(const ppu_module& info)
 	{
 		settings += ppu_settings::accurate_cache_line_stores;
 	}
+	if (g_cfg.core.ppu_128_reservations_loop_max_length > 0)
+	{
+		settings += ppu_settings::reservations_128_byte;
+	}
 
 	// Write version, hash, CPU, settings
 	fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h
index c36ee89746..6e5d3ca16b 100644
--- a/rpcs3/Emu/Cell/PPUThread.h
+++ b/rpcs3/Emu/Cell/PPUThread.h
@@ -194,6 +194,8 @@ public:
 	u32 raddr{0}; // Reservation addr
 	u64 rtime{0};
 	u64 rdata{0}; // Reservation data
+	alignas(64) std::byte full_rdata[128]{}; // Full reservation data
+	bool use_full_rdata{};
 
 	atomic_t<u32> prio{0}; // Thread priority (0..3071)
 	const u32 stack_size; // Stack size
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 97920f1034..af189c15ee 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -2527,6 +2527,12 @@ void PPUTranslator::MFOCRF(ppu_opcode_t op)
 
 void PPUTranslator::LWARX(ppu_opcode_t op)
 {
+	if (g_cfg.core.ppu_128_reservations_loop_max_length > 0)
+	{
+		// CIA will be used in the lwarx handler
+		m_ir->CreateStore(Trunc(GetAddr(), GetType<u32>()), m_ir->CreateStructGEP(nullptr, m_thread, static_cast<uint>(&m_cia - m_locals)), true);
+	}
+
 	SetGpr(op.rd, Call(GetType<u32>(), "__lwarx", m_thread, op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb)));
 }
@@ -2663,6 +2669,12 @@ void PPUTranslator::MULHW(ppu_opcode_t op)
 
 void PPUTranslator::LDARX(ppu_opcode_t op)
 {
+	if (g_cfg.core.ppu_128_reservations_loop_max_length > 0)
+	{
+		// CIA will be used in the ldarx handler
+		m_ir->CreateStore(Trunc(GetAddr(), GetType<u32>()), m_ir->CreateStructGEP(nullptr, m_thread, static_cast<uint>(&m_cia - m_locals)), true);
+	}
+
 	SetGpr(op.rd, Call(GetType<u64>(), "__ldarx", m_thread, op.ra ? 
m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb)));
 }
 
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 2b33b7d135..94727d2cb8 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -26,6 +26,8 @@
 #include <cmath>
 #include <cfenv>
 
+using spu_rdata_t = decltype(spu_thread::rdata);
+
 template <>
 void fmt_class_string<mfc_atomic_status>::format(std::string& out, u64 arg)
 {
@@ -121,15 +123,22 @@ static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
 #endif
 }
 
-static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs)
+#ifdef _MSC_VER
+__forceinline
+#else
+__attribute__((always_inline))
+#endif
+extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs)
 {
 #ifndef __AVX__
 	if (s_tsx_avx) [[likely]]
 #endif
 	{
-		return cmp_rdata_avx(reinterpret_cast<const __m256i*>(&lhs), reinterpret_cast<const __m256i*>(&rhs));
+		return cmp_rdata_avx(reinterpret_cast<const __m256i*>(_lhs), reinterpret_cast<const __m256i*>(_rhs));
 	}
 
+	const auto lhs = reinterpret_cast<const v128*>(_lhs);
+	const auto rhs = reinterpret_cast<const v128*>(_rhs);
 	const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);
 	const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]);
 	const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
@@ -170,60 +179,23 @@ static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
 #endif
 }
 
-static FORCE_INLINE void mov_rdata(decltype(spu_thread::rdata)& dst, const decltype(spu_thread::rdata)& src)
+#ifdef _MSC_VER
+__forceinline
+#else
+__attribute__((always_inline))
+#endif
+extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
 {
 #ifndef __AVX__
 	if (s_tsx_avx) [[likely]]
 #endif
 	{
-		mov_rdata_avx(reinterpret_cast<__m256i*>(&dst), reinterpret_cast<const __m256i*>(&src));
+		mov_rdata_avx(reinterpret_cast<__m256i*>(_dst), reinterpret_cast<const __m256i*>(_src));
 		return;
 	}
 
-	{
-		const v128 data0 = src[0];
-		const v128 data1 = src[1];
-		const v128 data2 = src[2];
-		dst[0] = data0;
-		dst[1] = data1;
-		dst[2] = data2;
-	}
-
-	{
-		const v128 data0 = src[3];
-		const v128 data1 = src[4];
-		const v128 data2 = src[5];
-		dst[3] = data0;
-		dst[4] = data1;
-		dst[5] = data2;
-	}
-
-	{
-		const v128 data0 = src[6];
-		const v128 data1 = src[7];
-		dst[6] = data0;
-		dst[7] = data1;
-	}
-}
-
-// Returns nullptr if rsx does not need pausing on reservations op, rsx ptr otherwise
-static FORCE_INLINE rsx::thread* get_rsx_if_needs_res_pause(u32 addr)
-{
-	if (!g_cfg.core.rsx_accurate_res_access) [[likely]]
-	{
-		return {};
-	}
-
-	const auto render = rsx::get_current_renderer();
-
-	ASSUME(render);
-
-	if (render->iomap_table.io[addr >> 20].load() == umax) [[likely]]
-	{
-		return {};
-	}
-
-	return render;
+	// TODO: use std::assume_aligned
+	std::memcpy(reinterpret_cast<void*>(_dst), reinterpret_cast<const void*>(_src), 128);
 }
 
 extern u64 get_timebased_time();
@@ -1402,7 +1374,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		}
 		case 128:
 		{
-			mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
+			mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 			break;
 		}
 		default:
@@ -1424,7 +1396,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			}
 		}
 
-		if (time0 != vm::reservation_acquire(eal, size0) || (size0 == 128 && !cmp_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src))))
+		if (time0 != vm::reservation_acquire(eal, size0) || (size0 == 128 && !cmp_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src))))
 		{
 			continue;
 		}
@@ -1496,7 +1468,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		{
 		case 128:
 		{
-			mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
+			
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 			break;
 		}
 		default:
@@ -1572,7 +1544,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 			while (size0 >= 128)
 			{
-				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
+				mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 
 				dst += 128;
 				src += 128;
@@ -1606,7 +1578,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 			while (size >= 128)
 			{
-				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
+				mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 
 				dst += 128;
 				src += 128;
@@ -1671,7 +1643,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 		while (size >= 128)
 		{
-			mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
+			mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
 
 			dst += 128;
 			src += 128;
@@ -1849,7 +1821,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 		return false;
 	}
 
-	const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);
+	const auto& to_write = _ref<spu_rdata_t>(args.lsa & 0x3ff80);
 	auto& res = vm::reservation_acquire(addr, 128);
 
 	if (!g_use_rtm && rtime != res)
@@ -1860,16 +1832,16 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 	if (!g_use_rtm && cmp_rdata(to_write, rdata))
 	{
 		// Writeback of unchanged data. Only check memory change
-		return cmp_rdata(rdata, vm::_ref<decltype(rdata)>(addr)) && res.compare_and_swap_test(rtime, rtime + 128);
+		return cmp_rdata(rdata, vm::_ref<spu_rdata_t>(addr)) && res.compare_and_swap_test(rtime, rtime + 128);
 	}
 
 	if (g_use_rtm) [[likely]]
 	{
-		switch (spu_putllc_tx(addr, rtime, rdata.data(), to_write.data()))
+		switch (spu_putllc_tx(addr, rtime, rdata, to_write))
 		{
 		case 2:
 		{
-			const auto render = get_rsx_if_needs_res_pause(addr);
+			const auto render = rsx::get_rsx_if_needs_res_pause(addr);
 
 			if (render) render->pause();
 
@@ -1878,7 +1850,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 			// Give up if PUTLLUC happened
 			if (res == (rtime | 1))
 			{
-				auto& data = vm::_ref<decltype(rdata)>(addr);
+				auto& data = vm::_ref<spu_rdata_t>(addr);
 
 				if (cmp_rdata(rdata, data))
 				{
@@ -1906,11 +1878,11 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 
 		vm::_ref<atomic_t<u64>>(addr) += 0;
 
-		const auto render = get_rsx_if_needs_res_pause(addr);
+		const auto render = rsx::get_rsx_if_needs_res_pause(addr);
 
 		if (render) render->pause();
 
-		auto& super_data = *vm::get_super_ptr<decltype(rdata)>(addr);
+		auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
 		const bool success = [&]()
 		{
 			// Full lock (heavyweight)
@@ -1941,7 +1913,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 	if (raddr)
 	{
 		// Last check for event before we clear the reservation
-		if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
+		if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr)))
 		{
 			set_events(SPU_EVENT_LR);
 		}
@@ -1954,14 +1926,13 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 
 void do_cell_atomic_128_store(u32 addr, const void* to_write)
 {
-	using rdata_t = decltype(spu_thread::rdata);
 	const auto cpu = get_current_cpu_thread();
 
 	if (g_use_rtm) [[likely]]
 	{
 		const u32 result = spu_putlluc_tx(addr, to_write, cpu);
 
-		const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr;
+		const auto render = result != 1 ? 
rsx::get_rsx_if_needs_res_pause(addr) : nullptr;
 
 		if (render) render->pause();
 
@@ -1977,7 +1948,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 				busy_wait(100);
 			}
 
-			mov_rdata(vm::_ref<rdata_t>(addr), *static_cast<const rdata_t*>(to_write));
+			mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
 			vm::reservation_acquire(addr, 128) += 64;
 		}
 	}
@@ -1995,7 +1966,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 				busy_wait(100);
 			}
 
-			mov_rdata(vm::_ref<rdata_t>(addr), *static_cast<const rdata_t*>(to_write));
+			mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
 			vm::reservation_acquire(addr, 128) += 64;
 		}
 
@@ -2004,21 +1975,21 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 	}
 	else
 	{
-		auto& data = vm::_ref<rdata_t>(addr);
+		auto& data = vm::_ref<spu_rdata_t>(addr);
 		auto [res, time0] = vm::reservation_lock(addr, 128);
 
 		*reinterpret_cast<atomic_t<u64>*>(&data) += 0;
 
-		const auto render = get_rsx_if_needs_res_pause(addr);
+		const auto render = rsx::get_rsx_if_needs_res_pause(addr);
 
 		if (render) render->pause();
 
-		auto& super_data = *vm::get_super_ptr<rdata_t>(addr);
+		auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
 		{
 			// Full lock (heavyweight)
 			// TODO: vm::check_addr
 			vm::writer_lock lock(addr);
 
-			mov_rdata(super_data, *static_cast<const rdata_t*>(to_write));
+			mov_rdata(super_data, *static_cast<const spu_rdata_t*>(to_write));
 			res.release(time0 + 128);
 		}
 
@@ -2044,7 +2015,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
 		// Failure, fallback to the main implementation
 	}
 
-	do_cell_atomic_128_store(addr, _ptr<decltype(rdata)>(args.lsa & 0x3ff80));
+	do_cell_atomic_128_store(addr, _ptr<spu_rdata_t>(args.lsa & 0x3ff80));
 	vm::reservation_notifier(addr, 128).notify_all();
 }
 
@@ -2202,7 +2173,7 @@ bool spu_thread::process_mfc_cmd()
 	case MFC_GETLLAR_CMD:
 	{
 		const u32 addr = ch_mfc_cmd.eal & -128;
-		const auto& data = vm::_ref<decltype(rdata)>(addr);
+		const auto& data = vm::_ref<spu_rdata_t>(addr);
 
 		if (addr == raddr && !g_use_rtm && g_cfg.core.spu_getllar_polling_detection && rtime == vm::reservation_acquire(addr, 128) && cmp_rdata(rdata, data))
 		{
@@ -2210,7 +2181,7 @@ bool spu_thread::process_mfc_cmd()
 			std::this_thread::yield();
 		}
 
-		auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
+		auto& dst = _ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80);
 		u64 ntime;
 
 		for (u64 i = 0;; [&]()
@@ -2269,7 +2240,7 @@ bool spu_thread::process_mfc_cmd()
 		if (raddr && raddr != addr)
 		{
 			// Last check for event before we replace the reservation with a new one
-			if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
+			if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr)))
 			{
 				set_events(SPU_EVENT_LR);
 			}
@@ -2443,7 +2414,7 @@ spu_thread::ch_events_t spu_thread::get_events(u32 mask_hint, bool waiting, bool
 	u32 collect = 0;
 
 	// Check reservation status and set SPU_EVENT_LR if lost
-	if (mask_hint & SPU_EVENT_LR && raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr))))
+	if (mask_hint & SPU_EVENT_LR && raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr))))
 	{
 		collect |= SPU_EVENT_LR;
 		raddr = 0;
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 79d52ca2d1..b37c792c17 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -669,7 +669,7 @@ public:
 
 	// Reservation Data
 	u64 rtime = 0;
-	alignas(64) std::array<v128, 8> rdata{};
+	alignas(64) std::byte rdata[128]{};
 	u32 raddr = 0;
 
 	u32 srr0;
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 0641fe08e7..d8ba1ef808 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -273,7 +273,7 @@ namespace vm
 	{
 		if (auto& ptr = g_tls_locked)
 		{
-			*ptr = nullptr;
+			ptr->release(nullptr);
 			ptr = nullptr;
 
 			if (cpu.state & cpu_flag::memory)
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 999f8f086c..02b0cf279c 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -24,6 +24,7 @@
 #include "Emu/Cell/lv2/sys_rsx.h"
 #include "Emu/IdManager.h"
+#include "Emu/system_config.h"
 
 extern u64 get_guest_system_time();
 extern u64 get_system_time();
@@ -964,4 +965,24 @@ namespace rsx
 	{
 		return g_fxo->get<thread>();
 	}
+
+	// Returns nullptr if rsx does not need pausing on reservations op, rsx ptr otherwise
+	inline thread* get_rsx_if_needs_res_pause(u32 addr)
+	{
+		if (!g_cfg.core.rsx_accurate_res_access) [[likely]]
+		{
+			return {};
+		}
+
+		const auto render = get_current_renderer();
+
+		ASSUME(render);
+
+		if (render->iomap_table.io[addr >> 20].load() == umax) [[likely]]
+		{
+			return {};
+		}
+
+		return render;
+	}
 }
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 917b8ad7b4..f1a5fd84fe 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -54,6 +54,7 @@ struct cfg_root : cfg::node
 		cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
 		cfg::_bool llvm_accurate_dfma{ this, "LLVM Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
 		cfg::_bool llvm_ppu_jm_handling{ this, "PPU LLVM Java Mode Handling", false }; // Respect current Java Mode for alti-vec ops by PPU LLVM
+		cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
 		cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false };
 		cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
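
Note on the 128-byte comparison this patch leans on: both cmp_rdata and the transaction bodies emitted above reduce a whole cache line to a single test by XORing the two 128-byte images chunk by chunk, ORing all the differences into one accumulator, and testing it once (ptest/vptest), rather than branching on eight separate compares. The following is a minimal standalone sketch of that idea, assuming SSE4.1 is available; the names cache_line and lines_equal are illustrative only and are not RPCS3 identifiers.

// Minimal sketch (not RPCS3 code): branchless comparison of two 128-byte
// cache-line images, in the same shape as cmp_rdata and the TSX paths above.
// Build with SSE4.1 enabled (e.g. -msse4.1); PTEST is the only SSE4.1 op used.
#include <smmintrin.h> // _mm_testz_si128
#include <cstddef>     // std::byte

using cache_line = std::byte[128];

inline bool lines_equal(const cache_line& a, const cache_line& b)
{
	const __m128i* x = reinterpret_cast<const __m128i*>(+a);
	const __m128i* y = reinterpret_cast<const __m128i*>(+b);

	// XOR each 16-byte chunk; OR every difference into a single accumulator
	__m128i acc = _mm_setzero_si128();

	for (int i = 0; i < 8; i++)
	{
		acc = _mm_or_si128(acc, _mm_xor_si128(_mm_loadu_si128(x + i), _mm_loadu_si128(y + i)));
	}

	// PTEST sets ZF iff the accumulator is all zeroes, i.e. the lines match
	return _mm_testz_si128(acc, acc) != 0;
}

Inside an RTM transaction this shape matters: one predictable branch at the end keeps the transactional region short and uniform, which is why the generated assembly above folds everything down to a single ptest/jnz instead of exiting early on the first mismatching 16 bytes.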