diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp
index d61d061e1d..24d0885042 100644
--- a/rpcs3/util/atomic.cpp
+++ b/rpcs3/util/atomic.cpp
@@ -25,6 +25,9 @@
 // Total number of entries, should be a power of 2.
 static constexpr std::size_t s_hashtable_size = 1u << 18;
 
+// Reference counter combined with shifted pointer (which is assumed to be 47 bit)
+static constexpr std::uintptr_t s_ref_mask = (1u << 17) - 1;
+
 // Callback for wait() function, returns false if wait should return
 static thread_local bool(*s_tls_wait_cb)(const void* data) = [](const void*){ return true; };
 
@@ -341,17 +344,16 @@ namespace atomic_wait
 	{
 		constexpr cond_handle() noexcept = default;
 
-		atomic_t<const void*> ptr{};
+		// Combined pointer (most significant 47 bits) and ref counter (17 least significant bits)
+		atomic_t<u64> ptr_ref{};
 		u64 tid{};
 		__m128i mask{};
 		__m128i oldv{};
 
-		// Temporarily reduced unique tsc stamp to 48 bits to make space for refs (TODO)
-		u64 tsc0 : 48 = 0;
-		u64 link : 16 = 0;
+		u64 tsc0{};
+		u16 link{};
 		u8 size{};
 		u8 flag{};
-		atomic_t<u16> refs{};
 		atomic_t<u32> sync{};
 
 #ifdef USE_STD
@@ -360,9 +362,8 @@ namespace atomic_wait
 		un_t<std::mutex> mtx{};
 #endif
 
-		void init(const void* data)
+		void init(std::uintptr_t iptr)
 		{
-			ptr.release(data);
 #ifdef _WIN32
 			tid = GetCurrentThreadId();
 #else
@@ -374,13 +375,11 @@
 			mtx.init(mtx);
 #endif
 
-			verify(HERE), !refs;
-			refs.release(1);
+			verify(HERE), !ptr_ref.exchange((iptr << 17) | 1);
 		}
 
 		void destroy()
 		{
-			ptr.release(nullptr);
 			tid = 0;
 			tsc0 = 0;
 			link = 0;
@@ -533,7 +532,7 @@ static u32
 #ifdef _WIN32
 __vectorcall
 #endif
-cond_alloc(const void* data, __m128i mask)
+cond_alloc(std::uintptr_t iptr, __m128i mask)
 {
 	// Determine whether there is a free slot or not
 	if (!s_cond_sema.try_inc(UINT16_MAX + 1))
@@ -581,7 +580,7 @@
 	// Initialize new "semaphore"
 	s_cond_list[id].mask = mask;
-	s_cond_list[id].init(data);
+	s_cond_list[id].init(iptr);
 
 	return id;
 }
@@ -603,7 +602,22 @@ static void cond_free(u32 cond_id)
 	const auto cond = s_cond_list + cond_id;
 
 	// Dereference, destroy on last ref
-	if (--cond->refs)
+	const bool last = cond->ptr_ref.atomic_op([](u64& val)
+	{
+		verify(HERE), val & s_ref_mask;
+
+		val--;
+
+		if ((val & s_ref_mask) == 0)
+		{
+			val = 0;
+			return true;
+		}
+
+		return false;
+	});
+
+	if (!last)
 	{
 		return;
 	}
@@ -622,20 +636,26 @@ static atomic_wait::cond_handle*
 #ifdef _WIN32
 __vectorcall
 #endif
-cond_id_lock(u32 cond_id, __m128i mask, u64 thread_id = 0, const void* data = nullptr)
+cond_id_lock(u32 cond_id, __m128i mask, u64 thread_id = 0, std::uintptr_t iptr = 0)
 {
 	if (cond_id - 1 < u32{UINT16_MAX})
 	{
 		const auto cond = s_cond_list + cond_id;
 
-		const auto [old, ok] = cond->refs.fetch_op([&](u16& ref)
+		const auto [old, ok] = cond->ptr_ref.fetch_op([&](u64& val)
 		{
-			if (!ref || ref == UINT16_MAX)
+			if (!val || (val & s_ref_mask) == s_ref_mask)
 			{
 				// Don't reference already deallocated semaphore
 				return false;
 			}
 
+			if (iptr && (val >> 17) != iptr)
+			{
+				// Pointer mismatch
+				return false;
+			}
+
 			const u32 sync_val = cond->sync;
 
 			if (sync_val == 0 || sync_val == 3)
@@ -653,20 +673,12 @@
 				}
 			}
 
-			if (data)
-			{
-				if (cond->ptr != data)
-				{
-					return false;
-				}
-			}
-
 			if (_mm_cvtsi128_si64(_mm_packs_epi16(mask12, mask12)) == 0)
 			{
 				return false;
 			}
 
-			ref++;
+			val++;
 			return true;
 		});
 
@@ -675,9 +687,9 @@
 			return cond;
 		}
 
-		if (old == UINT16_MAX)
+		if ((old & s_ref_mask) == s_ref_mask)
 		{
-			fmt::raw_error("Reference count limit " STRINGIZE(UINT16_MAX) " reached in an atomic notifier.");
+			fmt::raw_error("Reference count limit (131071) reached in an atomic notifier.");
 		}
 	}
 
@@ -716,7 +728,7 @@ namespace atomic_wait
 		root_info* slot_free(atomic_t<u16>* slot) noexcept;
 
 		template <typename F>
-		auto slot_search(const void* data, u64 thread_id, __m128i mask, F func) noexcept;
+		auto slot_search(std::uintptr_t iptr, u64 thread_id, __m128i mask, F func) noexcept;
 	};
 
 	static_assert(sizeof(root_info) == 128);
@@ -913,7 +925,7 @@ atomic_wait::root_info* atomic_wait::root_info::slot_free(atomic_t<u16>* slot) noexcept
 
 template <typename F>
-FORCE_INLINE auto atomic_wait::root_info::slot_search(const void* data, u64 thread_id, __m128i mask, F func) noexcept
+FORCE_INLINE auto atomic_wait::root_info::slot_search(std::uintptr_t iptr, u64 thread_id, __m128i mask, F func) noexcept
 {
 	const u64 bits_val = this->bits.load();
 	const u64 max_order = bits_val >> max_threads;
@@ -940,7 +952,7 @@ FORCE_INLINE auto atomic_wait::root_info::slot_search(const void* data, u64 thread_id, __m128i mask, F func) noexcept
 
 	for (u32 i = 0; i < cond_max; i++)
 	{
-		if (cond_id_lock(cond_ids[i], mask, thread_id, data))
+		if (cond_id_lock(cond_ids[i], mask, thread_id, iptr))
 		{
 			if (func(cond_ids[i]))
 			{
@@ -974,7 +986,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 timeout, __m128i mask, atomic_wait::info* ext)
 {
 	const auto stamp0 = atomic_wait::get_unique_tsc();
 
-	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
+	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data) & (~s_ref_mask >> 17);
 
 	const auto root = &s_hashtable[iptr % s_hashtable_size];
 
@@ -1001,19 +1013,19 @@
 				}
 			}
 
-			iptr_ext[ext_size] = reinterpret_cast<std::uintptr_t>(e->data);
+			iptr_ext[ext_size] = reinterpret_cast<std::uintptr_t>(e->data) & (~s_ref_mask >> 17);
 			root_ext[ext_size] = &s_hashtable[iptr_ext[ext_size] % s_hashtable_size];
 			ext_size++;
 		}
 	}
 
-	const u32 cond_id = cond_alloc(data, mask);
+	const u32 cond_id = cond_alloc(iptr, mask);
 
 	u32 cond_id_ext[atomic_wait::max_list - 1]{};
 
 	for (u32 i = 0; i < ext_size; i++)
 	{
-		cond_id_ext[i] = cond_alloc(ext[i].data, ext[i].mask);
+		cond_id_ext[i] = cond_alloc(iptr_ext[i], ext[i].mask);
 	}
 
 	const auto slot = root->slot_alloc(iptr);
@@ -1230,7 +1242,7 @@ alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, __m128i mask, __m128i new_value)
 
 	u32 ok = 0;
 
-	if (!size ? (!tid || cond->tid == tid) : (cond->ptr == data && cmp_mask(size, mask, new_value, cond->size | (cond->flag << 8), cond->mask, cond->oldv)))
+	if (!size ? (!tid || cond->tid == tid) : cmp_mask(size, mask, new_value, cond->size | (cond->flag << 8), cond->mask, cond->oldv))
 	{
 		// Redirect if necessary
 		const auto _old = cond;
@@ -1313,17 +1325,17 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
 		{
 			const auto cond = s_cond_list + i;
 
-			const auto [old, ok] = cond->refs.fetch_op([&](u16& ref)
+			const auto [old, ok] = cond->ptr_ref.fetch_op([&](u64& val)
 			{
-				if (!ref)
+				if (!val)
 				{
 					// Skip dead semaphores
 					return false;
 				}
 
-				u32 val = cond->sync.load();
+				u32 sync_val = cond->sync.load();
 
-				if (val == 0 || val >= 3)
+				if (sync_val == 0 || sync_val >= 3)
 				{
 					// Skip forced signaled or secondary semaphores
 					return false;
 				}
@@ -1338,9 +1350,9 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
 					}
 				}
 
-				if (ref < UINT16_MAX)
+				if ((val & s_ref_mask) < s_ref_mask)
 				{
-					ref++;
+					val++;
 					return true;
 				}
@@ -1360,22 +1372,27 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
 
 					if (thread_id)
 					{
 						// Only if thread_id is specified, stop only one and return true.
-						if (old < UINT16_MAX)
+						if ((old & s_ref_mask) < s_ref_mask)
+						{
 							cond_free(i);
+						}
+
 						return true;
 					}
 				}
 			}
 
-			if (old < UINT16_MAX)
+			if ((old & s_ref_mask) < s_ref_mask)
+			{
 				cond_free(i);
+			}
 		}
 	}
 
 		return false;
 	}
 
-	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
+	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data) & (~s_ref_mask >> 17);
 
 	auto* const root = &s_hashtable[iptr % s_hashtable_size];
 
@@ -1383,7 +1400,7 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
 
 	u64 progress = 0;
 
-	root->slot_search(data, thread_id, _mm_set1_epi64x(-1), [&](u32 cond_id)
+	root->slot_search(iptr, thread_id, _mm_set1_epi64x(-1), [&](u32 cond_id)
 	{
 		// Forced notification
 		if (alert_sema(cond_id, data, thread_id, 0, _mm_setzero_si128(), _mm_setzero_si128()))
@@ -1413,7 +1430,7 @@ __vectorcall
 #endif
 atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i new_value)
 {
-	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
+	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data) & (~s_ref_mask >> 17);
 
 	auto* const root = &s_hashtable[iptr % s_hashtable_size];
 
@@ -1421,7 +1438,7 @@ atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i new_value)
 
 	u64 progress = 0;
 
-	root->slot_search(data, 0, mask, [&](u32 cond_id)
+	root->slot_search(iptr, 0, mask, [&](u32 cond_id)
 	{
 		if (alert_sema(cond_id, data, -1, size, mask, new_value))
 		{
@@ -1441,7 +1458,7 @@ __vectorcall
 #endif
 atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask, __m128i new_value)
 {
-	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
+	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data) & (~s_ref_mask >> 17);
 
 	auto* const root = &s_hashtable[iptr % s_hashtable_size];
 
@@ -1455,7 +1472,7 @@ atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask, __m128i new_value)
 
 	// Array itself.
 	u16 cond_ids[UINT16_MAX + 1];
 
-	root->slot_search(data, 0, mask, [&](u32 cond_id)
+	root->slot_search(iptr, 0, mask, [&](u32 cond_id)
 	{
 		u32 res = alert_sema(cond_id, data, -1, size, mask, new_value);
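For reference, a minimal standalone sketch of the packing scheme this patch introduces: the low 17 bits of `ptr_ref` carry the reference count and the upper 47 bits carry the (shifted) address, so a single 64-bit atomic operation can validate the pointer and adjust the count at once. Plain `std::atomic` stands in for rpcs3's `atomic_t`, and `pack_ptr`, `try_ref`, and `deref` are illustrative names, not functions from the codebase.

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

// Low 17 bits: reference count; upper 47 bits: pointer (as in the patch).
constexpr std::uintptr_t s_ref_mask = (1u << 17) - 1;

// Pack a 47-bit pointer value with an initial refcount of 1, as init() does.
std::uint64_t pack_ptr(const void* data)
{
	// Keep the low 47 bits of the address (canonical x86-64 user pointers),
	// shift them above the counter field, and start with one reference.
	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data) & (~s_ref_mask >> 17);
	return (static_cast<std::uint64_t>(iptr) << 17) | 1;
}

// Acquire a reference, mirroring the checks in cond_id_lock().
bool try_ref(std::atomic<std::uint64_t>& ptr_ref, std::uintptr_t iptr)
{
	std::uint64_t val = ptr_ref.load();

	while (true)
	{
		if (!val || (val & s_ref_mask) == s_ref_mask || (val >> 17) != iptr)
		{
			// Dead slot, saturated counter, or pointer mismatch
			return false;
		}

		if (ptr_ref.compare_exchange_weak(val, val + 1))
		{
			return true;
		}
	}
}

// Drop a reference; clears the whole word on the last one, as cond_free() does.
bool deref(std::atomic<std::uint64_t>& ptr_ref)
{
	std::uint64_t val = ptr_ref.load();

	while (true)
	{
		assert(val & s_ref_mask);

		const std::uint64_t next = (val & s_ref_mask) == 1 ? 0 : val - 1;

		if (ptr_ref.compare_exchange_weak(val, next))
		{
			return next == 0; // True on the last dereference
		}
	}
}

int main()
{
	int dummy = 0;
	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(&dummy) & (~s_ref_mask >> 17);

	std::atomic<std::uint64_t> ptr_ref{pack_ptr(&dummy)};

	assert(try_ref(ptr_ref, iptr));  // refs: 1 -> 2
	assert(!deref(ptr_ref));         // refs: 2 -> 1
	assert(deref(ptr_ref));          // refs: 1 -> 0, slot cleared
	assert(!try_ref(ptr_ref, iptr)); // dead slot is rejected
}
```

This is also why the separate `refs` field and the `cond->ptr != data` checks in `cond_id_lock()` and `alert_sema()` can be dropped: the pointer comparison rides along with the reference acquisition in one atomic operation.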