From d0747076c65017778b659153dc7279c660efaca4 Mon Sep 17 00:00:00 2001
From: Elad <18193363+elad335@users.noreply.github.com>
Date: Wed, 25 Dec 2024 11:35:03 +0200
Subject: [PATCH] SPU: Operating system LR memory signals

---
 rpcs3/Emu/Cell/SPUThread.cpp          | 224 +++++++++++++++++++++++++-
 rpcs3/Emu/Cell/SPUThread.h            |   5 +
 rpcs3/Emu/Cell/lv2/lv2.cpp            | 132 +++++++++++++++
 rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp |  10 +-
 rpcs3/Emu/Cell/lv2/sys_sync.h         |  31 +---
 5 files changed, 366 insertions(+), 36 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 456c9894be..8804d93138 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -341,6 +341,21 @@ extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src)
 #endif
 }
 
+#ifdef _MSC_VER
+__forceinline
+#endif
+extern u32 compute_rdata_hash32(const spu_rdata_t& _src)
+{
+	const auto rhs = reinterpret_cast<const v128*>(_src);
+	const v128 a = gv_add32(rhs[0], rhs[1]);
+	const v128 c = gv_add32(rhs[4], rhs[5]);
+	const v128 b = gv_add32(rhs[2], rhs[3]);
+	const v128 d = gv_add32(rhs[6], rhs[7]);
+	const v128 r = gv_add32(gv_add32(a, b), gv_add32(c, d));
+	const v128 r1 = gv_add32(r, gv_shuffle32<1, 0, 3, 2>(r));
+	return r1._u32[0] + r1._u32[2];
+}
+
 #if defined(ARCH_X64)
 static FORCE_INLINE void mov_rdata_nt_avx(__m256i* dst, const __m256i* src)
 {
@@ -4718,6 +4733,12 @@ bool spu_thread::process_mfc_cmd()
 			busy_wait(300);
 		}
 
+		if (getllar_spin_count == 3)
+		{
+			// Check other reservations in other threads
+			lv2_obj::notify_all();
+		}
+
 		// Reset perf
 		perf0.restart();
 	}
@@ -4729,12 +4750,17 @@ bool spu_thread::process_mfc_cmd()
 			// Spinning, might as well yield cpu resources
 			state += cpu_flag::wait;
 
+			usz cache_line_waiter_index = umax;
+
 			if (auto wait_var = vm::reservation_notifier_begin_wait(addr, rtime))
 			{
+				cache_line_waiter_index = register_cache_line_waiter(addr);
 				utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{100'000});
 				vm::reservation_notifier_end_wait(*wait_var);
 			}
 
+			deregister_cache_line_waiter(cache_line_waiter_index);
+
 			static_cast<void>(test_stopped());
 
 			// Quick check if there were reservation changes
@@ -5372,6 +5398,140 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
 	return !res;
 }
 
+bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64>* range_lock)
+{
+	if ((addr >> 28) < 2 || (addr >> 28) == 0xd)
+	{
+		// Always-allocated memory does not need strict checking (vm::main or vm::stack)
+		// Return true if the line's hash no longer matches (reservation data changed)
+		return compute_rdata_hash32(*vm::get_super_ptr<spu_rdata_t>(addr)) != hash;
+	}
+
+	// Ensure data is allocated (HACK: would raise LR event if not)
+	// Set range_lock first optimistically
+	range_lock->store(u64{128} << 32 | addr);
+
+	u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
+	u64 old_lock = 0;
+
+	while (lock_val != old_lock)
+	{
+		// Since we want to read data, let's check readability first
+		if (!(lock_val & vm::range_readable))
+		{
+			// Only one abnormal operation is "unreadable"
+			if ((lock_val >> vm::range_pos) == (vm::range_locked >> vm::range_pos))
+			{
+				// All page flags are untouched and can be read safely
+				if (!vm::check_addr(addr))
+				{
+					// Assume our memory is being (de)allocated
+					range_lock->release(0);
+					break;
+				}
+
+				// g_shmem values are unchanged too
+				const u64 is_shmem = vm::g_shmem[addr >> 16];
+
+				const u64 test_addr = is_shmem ? (is_shmem | static_cast<u16>(addr)) / 128 : u64{addr} / 128;
+				const u64 lock_addr = lock_val / 128;
+
+				if (test_addr == lock_addr)
+				{
+					// Our reservation is locked
+					range_lock->release(0);
+					break;
+				}
+
+				break;
+			}
+		}
+
+		// Fallback to normal range check
+		const u64 lock_addr = static_cast<u32>(lock_val);
+		const u32 lock_size = static_cast<u32>(lock_val << 3 >> 35);
+
+		if (lock_addr + lock_size <= addr || lock_addr >= addr + 128)
+		{
+			// We are outside locked range, so page flags are unaffected
+			if (!vm::check_addr(addr))
+			{
+				range_lock->release(0);
+				break;
+			}
+		}
+		else if (!(lock_val & vm::range_readable))
+		{
+			range_lock->release(0);
+			break;
+		}
+
+		old_lock = std::exchange(lock_val, *std::prev(std::end(vm::g_range_lock_set)));
+	}
+
+	if (!range_lock->load()) [[unlikely]]
+	{
+		return true;
+	}
+
+	const bool res = compute_rdata_hash32(*vm::get_super_ptr<spu_rdata_t>(addr)) == hash;
+
+	range_lock->release(0);
+	return !res;
+}
+
+usz spu_thread::register_cache_line_waiter(u32 addr)
+{
+	const u64 value = u64{compute_rdata_hash32(rdata)} << 32 | raddr;
+
+	for (usz i = 0; i < std::size(g_spu_waiters_by_value); i++)
+	{
+		auto [old, ok] = g_spu_waiters_by_value[i].fetch_op([value](u64& x)
+		{
+			if (x == 0)
+			{
+				// Claim an empty slot (the low 7 bits hold the waiter count)
+				x = value + 1;
+				return true;
+			}
+
+			if ((x & -128) == value)
+			{
+				// Same line and hash: join the existing waiters
+				x++;
+				return true;
+			}
+
+			return false;
+		});
+
+		if (ok)
+		{
+			return i;
+		}
+	}
+
+	return umax;
+}
+
+void spu_thread::deregister_cache_line_waiter(usz index)
+{
+	if (index == umax)
+	{
+		return;
+	}
+
+	g_spu_waiters_by_value[index].fetch_op([](u64& x)
+	{
+		if (x == 0)
+		{
+			// Slot was already cleared wholesale (e.g. on thread stop), nothing to store
+			return false;
+		}
+
+		x--;
+
+		if ((x & 127) == 0)
+		{
+			// Last waiter left: free the slot entirely
+			x = 0;
+		}
+
+		// Commit the decrement (fetch_op only stores when the op returns true)
+		return true;
+	});
+}
+
 std::pair<u32, u32> spu_thread::read_dec() const
 {
 	const u64 res = ch_dec_value - (is_dec_frozen ? 0 : (get_timebased_time() - ch_dec_start_timestamp));
@@ -5739,6 +5899,24 @@ s64 spu_thread::get_ch_value(u32 ch)
 #else
 			const bool reservation_busy_waiting = (seed + ((raddr == spurs_addr) ? 50u : 0u)) < g_cfg.core.spu_reservation_busy_waiting_percentage;
 #endif
+			usz cache_line_waiter_index = umax;
+
+			auto check_cache_line_waiter = [&]()
+			{
+				if (cache_line_waiter_index == umax)
+				{
+					return true;
+				}
+
+				if ((g_spu_waiters_by_value[cache_line_waiter_index] & -128) == 0)
+				{
+					deregister_cache_line_waiter(cache_line_waiter_index);
+					cache_line_waiter_index = umax;
+					return false;
+				}
+
+				return true;
+			};
 
 			for (; !events.count; events = get_events(mask1 & ~SPU_EVENT_LR, true, true))
 			{
@@ -5746,12 +5924,22 @@ s64 spu_thread::get_ch_value(u32 ch)
 
 				if (is_stopped(old))
 				{
+					if (cache_line_waiter_index != umax)
+					{
+						g_spu_waiters_by_value[cache_line_waiter_index].release(0);
+					}
+
 					return -1;
 				}
 
 				// Optimized check
-				if (raddr)
+				if (raddr && mask1 & SPU_EVENT_LR)
 				{
+					if (cache_line_waiter_index == umax)
+					{
+						cache_line_waiter_index = register_cache_line_waiter(raddr);
+					}
+
 					bool set_lr = false;
 
 					if (!vm::check_addr(raddr) || rtime != vm::reservation_acquire(raddr))
@@ -5819,13 +6007,20 @@ s64 spu_thread::get_ch_value(u32 ch)
 				}
 			}
 
+			// Check other reservations in other threads
+			lv2_obj::notify_all();
+
 			if (raddr - spurs_addr <= 0x80 && !g_cfg.core.spu_accurate_reservations && mask1 == SPU_EVENT_LR)
 			{
 				// Wait with extended timeout, in this situation we have notifications for nearly all writes making it possible
 				// Abort notifications are handled specially for performance reasons
 				if (auto wait_var = vm::reservation_notifier_begin_wait(raddr, rtime))
 				{
-					utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{300'000});
+					if (check_cache_line_waiter())
+					{
+						utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{300'000});
+					}
+
 					vm::reservation_notifier_end_wait(*wait_var);
 				}
 
@@ -5834,9 +6029,14 @@ s64 spu_thread::get_ch_value(u32 ch)
 			const u32 _raddr = this->raddr;
 #ifdef __linux__
+
 			if (auto wait_var = vm::reservation_notifier_begin_wait(_raddr, rtime))
 			{
-				utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{50'000});
+				if (check_cache_line_waiter())
+				{
+					utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{50'000});
+				}
+
 				vm::reservation_notifier_end_wait(*wait_var);
 			}
 #else
@@ -5891,13 +6091,20 @@ s64 spu_thread::get_ch_value(u32 ch)
 					return false;
 				}
 
+				// Check other reservations in other threads
+				lv2_obj::notify_all();
+
 				return true;
 			};
 
 			if (auto wait_var = vm::reservation_notifier_begin_wait(_raddr, rtime))
 			{
-				atomic_wait_engine::set_one_time_use_wait_callback(wait_cb);
-				utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{80'000});
+				if (check_cache_line_waiter())
+				{
+					atomic_wait_engine::set_one_time_use_wait_callback(wait_cb);
+					utils::bless<atomic_t<u32>>(&wait_var->raw().wait_flag)->wait(1, atomic_wait_timeout{80'000});
+				}
+
 				vm::reservation_notifier_end_wait(*wait_var);
 			}
 
@@ -5918,6 +6125,8 @@ s64 spu_thread::get_ch_value(u32 ch)
 				thread_ctrl::wait_on(state, old, 100);
 			}
 
+			deregister_cache_line_waiter(cache_line_waiter_index);
+
 			wakeup_delay();
 
 			if (is_paused(state - cpu_flag::suspend))
@@ -6617,6 +6826,8 @@ bool spu_thread::stop_and_signal(u32 code)
 		}
 	}
 
+	lv2_obj::notify_all();
+
 	while (auto old = +state)
 	{
 		if (old & cpu_flag::signal && state.test_and_reset(cpu_flag::signal))
@@ -7185,6 +7396,8 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop)
 		}
 	}
 
+	lv2_obj::notify_all();
+
 	const u32 wait_on_val = static_cast<u32>(((pop ? bit_occupy : 0) | bit_wait) >> 32);
 
 	while (true)
@@ -7470,3 +7683,4 @@ void fmt_class_string<spu_block_size_type>::format(std::string& out, u64 arg)
 DECLARE(spu_thread::g_raw_spu_ctr){};
 DECLARE(spu_thread::g_raw_spu_id){};
 DECLARE(spu_thread::g_spu_work_count){};
+DECLARE(spu_thread::g_spu_waiters_by_value){};
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index c895e09211..9c45e9efe5 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -888,6 +888,9 @@ public:
 	// Returns true if reservation existed but was just discovered to be lost
 	// It is safe to use on any address, even if not directly accessed by SPU (so it's slower)
 	bool reservation_check(u32 addr, const decltype(rdata)& data) const;
+	static bool reservation_check(u32 addr, u32 hash, atomic_t<u64>* range_lock);
+	usz register_cache_line_waiter(u32 addr);
+	void deregister_cache_line_waiter(usz index);
 
 	bool read_reg(const u32 addr, u32& value);
 	bool write_reg(const u32 addr, const u32 value);
@@ -897,6 +900,8 @@ public:
 	static atomic_t<u32> g_raw_spu_id[5];
 	static atomic_t<u32> g_spu_work_count;
 
+	static atomic_t<u64> g_spu_waiters_by_value[6];
+
 	static u32 find_raw_spu(u32 id)
 	{
 		if (id < std::size(g_raw_spu_id)) [[likely]]
diff --git a/rpcs3/Emu/Cell/lv2/lv2.cpp b/rpcs3/Emu/Cell/lv2/lv2.cpp
index 2c29505a54..6349bd60c6 100644
--- a/rpcs3/Emu/Cell/lv2/lv2.cpp
+++ b/rpcs3/Emu/Cell/lv2/lv2.cpp
@@ -6,6 +6,8 @@
 #include "Emu/Memory/vm_locking.h"
 #include "Emu/Cell/PPUFunction.h"
+#include "Emu/Cell/PPUThread.h"
+#include "Emu/Cell/SPUThread.h"
 #include "Emu/Cell/ErrorCodes.h"
 #include "Emu/Cell/MFC.h"
 #include "sys_sync.h"
@@ -54,6 +56,7 @@
 #include
 #include
 #include
+#include <optional>
 #include "util/tsc.hpp"
 #include "util/sysinfo.hpp"
 #include "util/init_mutex.hpp"
@@ -75,6 +78,9 @@
 namespace rsx
 {
 	void set_rsx_yield_flag() noexcept;
 }
 
+using spu_rdata_t = decltype(spu_thread::rdata);
+extern u32 compute_rdata_hash32(const spu_rdata_t& _src);
+
 template <>
 void fmt_class_string<ppu_syscall_code>::format(std::string& out, u64 arg)
{
@@ -2215,3 +2221,129 @@ void lv2_obj::prepare_for_sleep(cpu_thread& cpu)
 	vm::temporary_unlock(cpu);
 	cpu_counter::remove(&cpu);
 }
+
+void lv2_obj::notify_all() noexcept
+{
+	for (auto cpu : g_to_notify)
+	{
+		if (!cpu)
+		{
+			break;
+		}
+
+		if (cpu != &g_to_notify)
+		{
+			const auto res_start = vm::reservation_notifier(0).second;
+			const auto res_end = vm::reservation_notifier(umax).second;
+
+			if (cpu >= res_start && cpu <= res_end)
+			{
+				atomic_wait_engine::notify_all(cpu);
+			}
+			else
+			{
+				// Note: by the time of notification the thread could have been deallocated which is why the direct function is used
+				atomic_wait_engine::notify_one(cpu);
+			}
+		}
+	}
+
+	g_to_notify[0] = nullptr;
+	g_postpone_notify_barrier = false;
+
+	const auto cpu = cpu_thread::get_current();
+
+	if (!cpu)
+	{
+		return;
+	}
+
+	if (cpu->get_class() != thread_class::spu && cpu->state.none_of(cpu_flag::suspend))
+	{
+		return;
+	}
+
+	std::optional lock;
+
+	constexpr usz total_waiters = std::size(spu_thread::g_spu_waiters_by_value);
+
+	u32 notifies[total_waiters]{};
+
+	// There may be 6 waiters, but checking them all may be performance expensive
+	// Instead, check 3 at most, and use the CPU ID to pick the starting index so the work is distributed across all threads
+
+	atomic_t<u64>* range_lock = nullptr;
+
+	for (usz i = 0, checked = 0; checked < 3 && i < total_waiters; i++)
+	{
+		auto& waiter = spu_thread::g_spu_waiters_by_value[(i + cpu->id) % total_waiters];
+		const u64 value = waiter.load();
+		u32 raddr = static_cast<u32>(value) & -128;
+
+		if (vm::check_addr(raddr))
+		{
+			if ((raddr >> 28) < 2 || (raddr >> 28) == 0xd)
+			{
+				checked++;
+
+				if (compute_rdata_hash32(*vm::get_super_ptr<spu_rdata_t>(raddr)) != static_cast<u32>(value >> 32))
+				{
+					// Clear address to avoid a race, keep waiter counter
+					if (waiter.fetch_op([&](u64& x)
+					{
+						if ((x & -128) == (value & -128))
+						{
+							x &= 127;
+							return true;
+						}
+
+						return false;
+					}).second)
+					{
+						notifies[i] = raddr;
+					}
+				}
+
+				continue;
+			}
+
+			if (!range_lock)
+			{
+				range_lock = vm::alloc_range_lock();
+			}
+
+			checked++;
+
+			if (spu_thread::reservation_check(raddr, static_cast<u32>(value >> 32), range_lock))
+			{
+				// Clear address to avoid a race, keep waiter counter
+				if (waiter.fetch_op([&](u64& x)
+				{
+					if ((x & -128) == (value & -128))
+					{
+						x &= 127;
+						return true;
+					}
+
+					return false;
+				}).second)
+				{
+					notifies[i] = raddr;
+				}
+			}
+		}
+	}
+
+	if (range_lock)
+	{
+		vm::free_range_lock(range_lock);
+	}
+
+	for (u32 addr : notifies)
+	{
+		if (addr)
+		{
+			vm::reservation_notifier_notify(addr);
+		}
+	}
+}
diff --git a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
index 94153404fe..80e91bbd0d 100644
--- a/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_ppu_thread.cpp
@@ -166,8 +166,16 @@ s32 sys_ppu_thread_yield(ppu_thread& ppu)
 
 	sys_ppu_thread.trace("sys_ppu_thread_yield()");
 
+	const s32 success = lv2_obj::yield(ppu) ? CELL_OK : CELL_CANCEL;
+
+	if (success == CELL_CANCEL)
+	{
+		// Do other work in the meantime
+		lv2_obj::notify_all();
+	}
+
 	// Return 0 on successful context switch, 1 otherwise
-	return +!lv2_obj::yield(ppu);
+	return success;
 }
 
 error_code sys_ppu_thread_join(ppu_thread& ppu, u32 thread_id, vm::ptr<u64> vptr)
diff --git a/rpcs3/Emu/Cell/lv2/sys_sync.h b/rpcs3/Emu/Cell/lv2/sys_sync.h
index 83a1b22c94..644e0c90f2 100644
--- a/rpcs3/Emu/Cell/lv2/sys_sync.h
+++ b/rpcs3/Emu/Cell/lv2/sys_sync.h
@@ -454,36 +454,7 @@ public:
 
 	static bool wait_timeout(u64 usec, ppu_thread* cpu = {}, bool scale = true, bool is_usleep = false);
 
-	static inline void notify_all()
-	{
-		for (auto cpu : g_to_notify)
-		{
-			if (!cpu)
-			{
-				break;
-			}
-
-			if (cpu != &g_to_notify)
-			{
-				const auto res_start = vm::reservation_notifier(0).second;
-				const auto res_end = vm::reservation_notifier(umax).second;
-
-				if (cpu >= res_start && cpu <= res_end)
-				{
-					// Notify SPU reservation
-					atomic_wait_engine::notify_all(cpu);
-				}
-				else
-				{
-					// Note: by the time of notification the thread could have been deallocated which is why the direct function is used
-					atomic_wait_engine::notify_one(cpu);
-				}
-			}
-		}
-
-		g_to_notify[0] = nullptr;
-		g_postpone_notify_barrier = false;
-	}
+	static void notify_all() noexcept;
 
 	// Can be called before the actual sleep call in order to move it out of mutex scope
 	static void prepare_for_sleep(cpu_thread& cpu);
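
Reviewer note, not part of the patch: compute_rdata_hash32 folds the 128-byte reservation line into a single u32. Assuming gv_add32 is a lane-wise 32-bit add and gv_shuffle32<1, 0, 3, 2> swaps adjacent lanes, the whole SIMD tree reduces to a wrapping sum of the line's 32 u32 words, because 32-bit addition is commutative and associative modulo 2^32. A minimal standalone scalar model (hypothetical names, not rpcs3 code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar equivalent of the vector reduction: sum all 32 u32 words of the line
uint32_t line_hash32(const unsigned char (&line)[128])
{
	uint32_t sum = 0;

	for (int i = 0; i < 128; i += 4)
	{
		uint32_t w;
		std::memcpy(&w, line + i, sizeof(w)); // native endianness, like the vector loads
		sum += w; // wraps modulo 2^32, matching the gv_add32 lane adds
	}

	return sum;
}

int main()
{
	unsigned char line[128]{};
	std::printf("%08x\n", line_hash32(line)); // 00000000 for an all-zero line
	line[0] = 1;
	std::printf("%08x\n", line_hash32(line)); // any changed word changes the sum
}

A single-word change always changes the sum unless another word changes by the exact complement, which is the usual collision trade-off for a checksum this cheap.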
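Similarly, here is a standalone sketch of the g_spu_waiters_by_value slot protocol as I read the patch (hypothetical names, std::atomic instead of rpcs3's atomic_t): each 64-bit slot packs the line hash in bits 32..63, the 128-byte-aligned reservation address in bits 7..31, and a waiter count in the low 7 bits, so up to 6 distinct lines can be tracked and waiters on the same line share a slot.

#include <atomic>
#include <cstdint>
#include <cstdio>

static std::atomic<uint64_t> waiters_by_value[6];

// Returns slot index, or SIZE_MAX if all slots are taken by other lines
size_t register_waiter(uint32_t addr, uint32_t hash)
{
	const uint64_t value = uint64_t{hash} << 32 | (addr & ~127u);

	for (size_t i = 0; i < 6; i++)
	{
		uint64_t old = waiters_by_value[i].load();

		while (true)
		{
			uint64_t next;

			if (old == 0)
				next = value + 1; // claim an empty slot, count = 1
			else if ((old & ~uint64_t{127}) == value)
				next = old + 1; // same line and hash: join, bump count
			else
				break; // slot busy with another line, try the next one

			if (waiters_by_value[i].compare_exchange_weak(old, next))
				return i;
		}
	}

	return SIZE_MAX;
}

void deregister_waiter(size_t index)
{
	if (index == SIZE_MAX)
		return;

	uint64_t old = waiters_by_value[index].load();

	while (old != 0) // slot may have been cleared wholesale already
	{
		uint64_t next = old - 1;

		if ((next & 127) == 0)
			next = 0; // last waiter left: free the slot entirely

		if (waiters_by_value[index].compare_exchange_weak(old, next))
			return;
	}
}

int main()
{
	const size_t a = register_waiter(0x10000080, 0xdeadbeef);
	const size_t b = register_waiter(0x10000080, 0xdeadbeef); // shares slot a
	std::printf("%zu %zu count=%llu\n", a, b,
		static_cast<unsigned long long>(waiters_by_value[a].load() & 127));
	deregister_waiter(b);
	deregister_waiter(a); // slot returns to 0
}

deregister_waiter mirrors the fetch_op fix above: the decrement must actually be stored, and a slot whose count returns to zero is freed wholesale so the address can be reused.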
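Finally, the scan in lv2_obj::notify_all can be read as a budgeted round-robin: start at an index derived from the calling thread's id and stop after three slots that actually cost work to check, so concurrent callers tend to cover disjoint parts of the 6-slot table instead of all hammering slot 0. A toy model of just that policy (standalone, hypothetical names):

#include <cstdint>
#include <cstdio>

constexpr size_t total_waiters = 6;

// needs_check[slot] models "slot holds a mapped reservation address"
void scan(uint32_t cpu_id, const bool (&needs_check)[total_waiters])
{
	for (size_t i = 0, checked = 0; checked < 3 && i < total_waiters; i++)
	{
		const size_t slot = (i + cpu_id) % total_waiters;

		if (!needs_check[slot])
			continue; // empty or unmapped: does not count against the budget

		checked++;
		std::printf("cpu %u checks slot %zu\n", cpu_id, slot);
	}
}

int main()
{
	const bool occupied[total_waiters] = {true, true, true, true, true, true};
	scan(0, occupied); // checks slots 0, 1, 2
	scan(3, occupied); // checks slots 3, 4, 5
}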