diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 8973aa88dc..d143b86491 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1823,8 +1823,6 @@ const auto ppu_stcx_accurate_tx = build_function_asm(&g_rtm_tx_limit1))); + build_get_tsc(c); + c.sub(x86::rax, stamp0); + c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); c.jae(fall); }); + c.prefetchw(x86::byte_ptr(x86::rbp, 0)); + c.prefetchw(x86::byte_ptr(x86::rbp, 64)); + + // Check pause flag c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast(cpu_flag::pause)); c.jc(fall); - c.xbegin(tx0); - c.mov(x86::rax, x86::qword_ptr(x86::rbx)); - c.test(x86::eax, vm::rsrv_unique_lock); - c.jnz(skip); - c.and_(x86::rax, -128); - c.cmp(x86::rax, x86::r13); - c.jne(fail); + c.xbegin(tx1); if (s_tsx_avx) { @@ -1943,137 +1938,14 @@ const auto ppu_stcx_accurate_tx = build_function_asm(&g_rtm_tx_limit2))); - c.jae(fall2); - build_get_tsc(c, stamp1); - c.sub(stamp1, x86::rax); - - Label tx1 = build_transaction_enter(c, fall2, [&]() - { - build_get_tsc(c); - c.sub(x86::rax, stamp1); - c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); - c.jae(fall2); - c.test(x86::qword_ptr(x86::rbx), 127 - 1); - c.jnz(fall2); - }); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); - - // Check pause flag - c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast(cpu_flag::pause)); - c.jc(fall2); - c.mov(x86::rax, x86::qword_ptr(x86::rbx)); - c.and_(x86::rax, -128); - c.cmp(x86::rax, x86::r13); - c.jne(fail2); - c.xbegin(tx1); - - if (s_tsx_avx) - { - c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0)); - c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32)); - c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64)); - c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96)); - c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); - c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); - c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); - c.vptest(x86::ymm0, x86::ymm0); - } - else - { - c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); - c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); - c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); - c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); - c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); - c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); - c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); - c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); - c.orps(x86::xmm0, x86::xmm1); - c.orps(x86::xmm2, x86::xmm3); - c.orps(x86::xmm4, x86::xmm5); - c.orps(x86::xmm6, x86::xmm7); - c.orps(x86::xmm0, x86::xmm2); - c.orps(x86::xmm4, x86::xmm6); - c.orps(x86::xmm0, x86::xmm4); - c.ptest(x86::xmm0, x86::xmm0); - } - - c.jnz(fail3); - - // Store 8 bytes - c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]); - - c.xend(); - c.lock().add(x86::qword_ptr(x86::rbx), 127); + c.lock().add(x86::qword_ptr(x86::rbx), 64); build_get_tsc(c); c.sub(x86::rax, stamp0); c.jmp(_ret); // XABORT is expensive so try to finish with xend instead - c.bind(fail3); + c.bind(fail); // Load old data to store back in rdata if (s_tsx_avx) @@ -2098,7 +1970,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm(new_data))) @@ -2226,12 +2115,12 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) if ((res & -128) == rtime && cmp_rdata(ppu.rdata, all_data)) { sdata.release(new_data); - res += 127; + res += 64; return true; } mov_rdata_nt(ppu.rdata, all_data); - res -= 1; + res -= 64; return false; }); @@ -2283,23 +2172,6 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) return true; } - auto [_oldd, _ok] = res.fetch_op([&](u64& r) - { - if ((r & -128) != rtime || (r & 127)) - { - return false; - } - - r += vm::rsrv_unique_lock; - return true; - }); - - if (!_ok) - { - // Already locked or updated: give up - return false; - } - // Align address: we do not need the lower 7 bits anymore addr &= -128; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index f8369d4de0..7815ad3e81 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -412,8 +412,6 @@ const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit1))); + build_get_tsc(c); + c.sub(x86::rax, stamp0); + c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); c.jae(fall); }); + c.prefetchw(x86::byte_ptr(x86::rbp, 0)); + c.prefetchw(x86::byte_ptr(x86::rbp, 64)); + + // Check pause flag c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast(cpu_flag::pause)); c.jc(fall); - c.xbegin(tx0); - c.mov(x86::rax, x86::qword_ptr(x86::rbx)); - c.test(x86::eax, vm::rsrv_unique_lock); - c.jnz(skip); - c.and_(x86::rax, -128); - c.cmp(x86::rax, x86::r13); - c.jne(fail); + c.xbegin(tx1); if (s_tsx_avx) { @@ -569,157 +564,15 @@ const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit2))); - c.jae(fall2); - - Label tx1 = build_transaction_enter(c, fall2, [&]() - { - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp1); - c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); - c.jae(fall2); - c.test(x86::qword_ptr(x86::rbx), 127 - 1); - c.jnz(fall2); - }); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); - - // Check pause flag - c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast(cpu_flag::pause)); - c.jc(fall2); - c.mov(x86::rax, x86::qword_ptr(x86::rbx)); - c.and_(x86::rax, -128); - c.cmp(x86::rax, x86::r13); - c.jne(fail2); - c.xbegin(tx1); - - if (s_tsx_avx) - { - c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0)); - c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32)); - c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64)); - c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96)); - c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); - c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); - c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); - c.vptest(x86::ymm0, x86::ymm0); - } - else - { - c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); - c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); - c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); - c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); - c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); - c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); - c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); - c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); - c.orps(x86::xmm0, x86::xmm1); - c.orps(x86::xmm2, x86::xmm3); - c.orps(x86::xmm4, x86::xmm5); - c.orps(x86::xmm6, x86::xmm7); - c.orps(x86::xmm0, x86::xmm2); - c.orps(x86::xmm4, x86::xmm6); - c.orps(x86::xmm0, x86::xmm4); - c.ptest(x86::xmm0, x86::xmm0); - } - - c.jnz(fail3); - - if (s_tsx_avx) - { - c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4); - c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5); - c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6); - c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7); - } - else - { - c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8); - c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9); - c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10); - c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11); - c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12); - c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13); - c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14); - c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15); - } - - c.xend(); - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1); - c.lock().add(x86::qword_ptr(x86::rbx), 127); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.jmp(_ret); - // XABORT is expensive so try to finish with xend instead - c.bind(fail3); + c.bind(fail); // Load previous data to store back to rdata if (s_tsx_avx) @@ -745,12 +598,12 @@ const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) +const auto spu_putlluc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; Label fall = c.newLabel(); Label _ret = c.newLabel(); - Label skip = c.newLabel(); - Label next = c.newLabel(); //if (utils::has_avx() && !s_tsx_avx) //{ @@ -876,94 +727,24 @@ const auto spu_putlluc_tx = build_function_asm(&g_rtm_tx_limit1))); - c.jae(fall); - }); - c.xbegin(tx0); - c.test(x86::qword_ptr(x86::rbx), vm::rsrv_unique_lock); - c.jnz(skip); - - if (s_tsx_avx) - { - c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0); - c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1); - c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2); - c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3); - } - else - { - c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0); - c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1); - c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2); - c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3); - c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4); - c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5); - c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6); - c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7); - } - - c.sub(x86::qword_ptr(x86::rbx), -128); - c.xend(); - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.jmp(_ret); - - c.bind(skip); - c.xend(); - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1); - build_get_tsc(c, stamp1); - //c.jmp(fall); - - c.bind(fall); - - Label fall2 = c.newLabel(); - - // Lightened transaction - c.bind(next); - - // Lock reservation - c.mov(x86::eax, 1); - c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax); - c.test(x86::eax, 127 - 1); - c.jnz(fall2); - - // Exclude some time spent on touching memory: stamp1 contains last success or failure - c.mov(x86::rax, stamp1); - c.sub(x86::rax, stamp0); - c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); - c.jae(fall2); - build_get_tsc(c, stamp1); - c.sub(stamp1, x86::rax); - - Label tx1 = build_transaction_enter(c, fall2, [&]() - { - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1); + // ftx++; + c.add(x86::qword_ptr(args[3]), 1); build_get_tsc(c); - c.sub(x86::rax, stamp1); + c.sub(x86::rax, stamp0); c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); - c.jae(fall2); + c.jae(fall); }); c.prefetchw(x86::byte_ptr(x86::rbp, 0)); c.prefetchw(x86::byte_ptr(x86::rbp, 64)); - // Check pause flag - c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast(cpu_flag::pause)); - c.jc(fall2); - // Check contention - c.test(x86::qword_ptr(x86::rbx), 127 - 1); - c.jc(fall2); + // // Check pause flag + // c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast(cpu_flag::pause)); + // c.jc(fall); c.xbegin(tx1); if (s_tsx_avx) @@ -986,13 +767,14 @@ const auto spu_putlluc_tx = build_function_asm(cpu_flag::pause)); c.jc(fall); c.mov(x86::rax, x86::qword_ptr(x86::rbx)); - c.and_(x86::rax, ~vm::rsrv_shared_mask); + c.and_(x86::rax, -128); c.cmp(x86::rax, args[3]); c.jne(fall); c.xbegin(tx0); @@ -2784,6 +2566,23 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) return cmp_rdata(rdata, vm::_ref(addr)) && res.compare_and_swap_test(rtime, rtime + 128); } + auto [_oldd, _ok] = res.fetch_op([&](u64& r) + { + if ((r & -128) != rtime || (r & 127)) + { + return false; + } + + r += vm::rsrv_unique_lock; + return true; + }); + + if (!_ok) + { + // Already locked or updated: give up + return false; + } + if (g_use_rtm) [[likely]] { switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write)) @@ -2799,14 +2598,14 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) if (cmp_rdata(rdata, data)) { mov_rdata(data, to_write); - res += 127; + res += 64; return true; } } // Save previous data mov_rdata_nt(rdata, data); - res -= 1; + res -= 64; return false; }); @@ -2865,23 +2664,6 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) return true; } - auto [_oldd, _ok] = res.fetch_op([&](u64& r) - { - if ((r & -128) != rtime || (r & 127)) - { - return false; - } - - r += vm::rsrv_unique_lock; - return true; - }); - - if (!_ok) - { - // Already locked or updated: give up - return false; - } - vm::_ref>(addr) += 0; auto& super_data = *vm::get_super_ptr(addr); @@ -2943,24 +2725,104 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) const auto cpu = get_current_cpu_thread(); rsx::reservation_lock rsx_lock(addr, 128); + u64 shared_mem = vm::g_shmem[addr >> 16]; + + if (!shared_mem) + { + shared_mem = addr; + } + if (g_use_rtm) [[likely]] { - u64 result = spu_putlluc_tx(addr, to_write, cpu); + auto& sdata = *vm::get_super_ptr(addr); + auto& res = *utils::bless>(vm::g_reservations + (addr & 0xff80) / 2); + + for (u64 j = 0;; j++) + { + auto [_oldd, _ok] = res.fetch_op([&](u128& r) + { + if (r & 127) + { + return false; + } + + r &= static_cast(r); + r |= u128{shared_mem} << 64; + r |= u128{vm::rsrv_unique_lock | vm::rsrv_putunc_flag}; + return true; + }); + + if (_ok) + { + break; + } + + if (static_cast(_oldd) & vm::rsrv_putunc_flag && static_cast(_oldd >> 64) == shared_mem) + { + // Abandon store + for (u64 k = 0;; k++) + { + if (res ^ _oldd) + { + break; + } + + if (auto cpu = get_current_cpu_thread(); cpu && cpu->state) + { + cpu->check_state(); + } + else if (k < 15) + { + busy_wait(500); + } + else + { + std::this_thread::yield(); + } + } + + return static_cast(cpu->test_stopped()); + } + + if (auto cpu = get_current_cpu_thread(); cpu && cpu->state) + { + cpu->check_state(); + } + else if (j < 15) + { + busy_wait(500); + } + else + { + std::this_thread::yield(); + } + } + + u64 result = 0; + + if (cpu->state & cpu_flag::pause) + { + result = 0; + } + else if (cpu->id_type() != 2) + { + u64 stx, ftx; + result = spu_putlluc_tx(addr, to_write, &stx, &ftx); + } + else + { + auto _spu = static_cast(cpu); + result = spu_putlluc_tx(addr, to_write, &_spu->stx, &_spu->ftx); + } if (result == 0) { - auto& sdata = *vm::get_super_ptr(addr); - auto& res = vm::reservation_acquire(addr); - - cpu_thread::suspend_all<+2>(cpu, {&res}, [&] + cpu_thread::suspend_all<+2>(cpu, {}, [&] { - mov_rdata_nt(sdata, *static_cast(to_write)); - res += 127; + mov_rdata(sdata, *static_cast(to_write)); }); - } - if (!result) - { + vm::reservation_acquire(addr) += 32; result = __rdtsc() - perf0.get(); } @@ -3359,11 +3221,6 @@ bool spu_thread::process_mfc_cmd() // See previous ntime check. continue; } - else - { - // If succeeded, only need to check unique lock bit - test_mask = ~vm::rsrv_shared_mask; - } } else { diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h index e27d083618..aeedda7409 100644 --- a/rpcs3/Emu/Memory/vm_reservation.h +++ b/rpcs3/Emu/Memory/vm_reservation.h @@ -32,7 +32,7 @@ namespace vm { rsrv_lock_mask = 127, rsrv_unique_lock = 64, - rsrv_shared_mask = 63, + rsrv_putunc_flag = 32, }; // Get reservation status for further atomic update: last update timestamp