diff --git a/Utilities/VirtualMemory.h b/Utilities/VirtualMemory.h
index 5dd6be3c9f..e830713a7a 100644
--- a/Utilities/VirtualMemory.h
+++ b/Utilities/VirtualMemory.h
@@ -81,5 +81,8 @@ namespace utils
         {
             return m_flags;
         }
+
+        // Another userdata
+        u64 info = 0;
     };
 }
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 5a19e7d812..a66439d1ee 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1368,6 +1368,9 @@ spu_thread::~spu_thread()
         g_raw_spu_id[index] = 0;
         g_raw_spu_ctr--;
     }
+
+    // Free range lock
+    vm::free_range_lock(range_lock);
 }
 
 spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated, u32 option)
@@ -1418,6 +1421,8 @@ spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::str
     {
         cpu_init();
     }
+
+    range_lock = vm::alloc_range_lock();
 }
 
 void spu_thread::push_snr(u32 number, u32 value)
@@ -1704,101 +1709,121 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
         }
     }
 
-    switch (u32 size = args.size)
+    if (g_cfg.core.spu_accurate_dma) [[unlikely]]
     {
-    case 1:
-    {
-        auto [res, time0] = vm::reservation_lock(eal);
-        *reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
-        res += 64;
-        break;
-    }
-    case 2:
-    {
-        auto [res, time0] = vm::reservation_lock(eal);
-        *reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
-        res += 64;
-        break;
-    }
-    case 4:
-    {
-        auto [res, time0] = vm::reservation_lock(eal);
-        *reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
-        res += 64;
-        break;
-    }
-    case 8:
-    {
-        auto [res, time0] = vm::reservation_lock(eal);
-        *reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
-        res += 64;
-        break;
-    }
-    default:
-    {
-        if (g_cfg.core.spu_accurate_dma)
+        for (u32 size0, size = args.size;; size -= size0, dst += size0, src += size0, eal += size0)
         {
-            for (u32 size0;; size -= size0, dst += size0, src += size0, eal += size0)
+            size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
+
+            if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
             {
-                size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
-
-                if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
-                {
-                    // As atomic as PUTLLUC
-                    do_cell_atomic_128_store(eal, src);
-
-                    if (size == size0)
-                    {
-                        break;
-                    }
-
-                    continue;
-                }
-
-                // Lock each cache line execlusively
-                auto [res, time0] = vm::reservation_lock(eal);
-
-                switch (size0)
-                {
-                case 128:
-                {
-                    mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
-                    break;
-                }
-                default:
-                {
-                    auto _dst = dst;
-                    auto _src = src;
-                    auto _size = size0;
-
-                    while (_size)
-                    {
-                        *reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
-
-                        _dst += 16;
-                        _src += 16;
-                        _size -= 16;
-                    }
-
-                    break;
-                }
-                }
-
-                res += 64;
+                // As atomic as PUTLLUC
+                do_cell_atomic_128_store(eal, src);
 
                 if (size == size0)
                 {
                     break;
                 }
+
+                continue;
             }
 
-            break;
+            // Lock each cache line execlusively
+            auto [res, time0] = vm::reservation_lock(eal);
+
+            switch (size0)
+            {
+            case 1:
+            {
+                *reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
+                break;
+            }
+            case 2:
+            {
+                *reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
+                break;
+            }
+            case 4:
+            {
+                *reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
+                break;
+            }
+            case 8:
+            {
+                *reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
+                break;
+            }
+            case 128:
+            {
+                mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
+                break;
+            }
+            default:
+            {
+                auto _dst = dst;
+                auto _src = src;
+                auto _size = size0;
+
+                while (_size)
+                {
+                    *reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
+
+                    _dst += 16;
+                    _src += 16;
+                    _size -= 16;
+                }
+
+                break;
+            }
+            }
+
+            res += 64;
+
+            if (size == size0)
+            {
+                break;
+            }
         }
 
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+        return;
+    }
+
+    switch (u32 size = args.size)
+    {
+    case 1:
+    {
+        vm::range_lock(range_lock, eal, 1);
+        *reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
+        range_lock->release(0);
+        break;
+    }
+    case 2:
+    {
+        vm::range_lock(range_lock, eal, 2);
+        *reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
+        range_lock->release(0);
+        break;
+    }
+    case 4:
+    {
+        vm::range_lock(range_lock, eal, 4);
+        *reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
+        range_lock->release(0);
+        break;
+    }
+    case 8:
+    {
+        vm::range_lock(range_lock, eal, 8);
+        *reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
+        range_lock->release(0);
+        break;
+    }
+    default:
+    {
         if (((eal & 127) + size) <= 128)
         {
-            // Lock one cache line
-            auto [res, time0] = vm::reservation_lock(eal);
+            vm::range_lock(range_lock, eal, size);
 
             while (size)
             {
@@ -1809,14 +1834,14 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
                 size -= 16;
             }
 
-            res += 64;
+            range_lock->release(0);
             break;
         }
 
         u32 range_addr = eal & -128;
         u32 range_end = ::align(eal + size, 128);
 
-        // Handle the case of crossing 64K page borders
+        // Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
         if (range_addr >> 16 != (range_end - 1) >> 16)
         {
             u32 nexta = range_end & -65536;
@@ -1824,7 +1849,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
             size -= size0;
 
             // Split locking + transfer in two parts (before 64K border, and after it)
-            const auto lock = vm::range_lock(range_addr, nexta);
+            vm::range_lock(range_lock, range_addr, size0);
 
             // Avoid unaligned stores in mov_rdata_avx
             if (reinterpret_cast<u64>(dst) & 0x10)
@@ -1854,11 +1879,11 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
                 size0 -= 16;
             }
 
-            lock->release(0);
+            range_lock->release(0);
             range_addr = nexta;
         }
 
-        const auto lock = vm::range_lock(range_addr, range_end);
+        vm::range_lock(range_lock, range_addr, range_end - range_addr);
 
         // Avoid unaligned stores in mov_rdata_avx
         if (reinterpret_cast<u64>(dst) & 0x10)
@@ -1888,16 +1913,11 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
             size -= 16;
         }
 
-        lock->release(0);
+        range_lock->release(0);
         break;
     }
     }
 
-    if (g_cfg.core.spu_accurate_dma)
-    {
-        std::atomic_thread_fence(std::memory_order_seq_cst);
-    }
-
     return;
 }
 
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 28893826b3..e05de2cef9 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -678,6 +678,9 @@ public:
     alignas(64) std::byte rdata[128]{};
     u32 raddr = 0;
 
+    // Range Lock pointer
+    atomic_t<u64>* range_lock{};
+
     u32 srr0;
     u32 ch_tag_upd;
     u32 ch_tag_mask;
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 62c56d4a50..a8f3c93e7e 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -55,7 +55,7 @@ namespace vm
     alignas(4096) atomic_t<u8> g_shareable[65536]{0};
 
     // Memory locations
-    std::vector<std::shared_ptr<block_t>> g_locations;
+    alignas(64) std::vector<std::shared_ptr<block_t>> g_locations;
 
     // Memory mutex core
     shared_mutex g_mutex;
@@ -68,7 +68,12 @@ namespace vm
 
     // Memory mutex: passive locks
     std::array<atomic_t<cpu_thread*>, g_cfg.core.ppu_threads.max> g_locks{};
-    std::array<atomic_t<u64>, 6> g_range_locks{};
+
+    // Range lock slot allocation bits
+    atomic_t<u64> g_range_lock_bits{};
+
+    // Memory range lock slots (sparse atomics)
+    atomic_t<u64> g_range_lock_set[64]{};
 
     // Page information
     struct memory_page
@@ -126,45 +131,131 @@ namespace vm
         }
     }
 
-    static atomic_t<u64>* _register_range_lock(const u64 lock_info)
+    atomic_t<u64>* alloc_range_lock()
     {
+        const auto [bits, ok] = g_range_lock_bits.fetch_op([](u64& bits)
+        {
+            if (~bits) [[likely]]
+            {
+                bits |= bits + 1;
+                return true;
+            }
+
+            return false;
+        });
+
+        if (!ok) [[unlikely]]
+        {
+            fmt::throw_exception("Out of range lock bits");
+        }
+
+        g_mutex.lock_unlock();
+
+        return &g_range_lock_set[std::countr_one(bits)];
+    }
+
+    void range_lock_internal(atomic_t<u64>* range_lock, u32 begin, u32 size)
+    {
+        perf_meter<"RHW_LOCK"_u64> perf0;
+
         while (true)
         {
-            for (auto& lock : g_range_locks)
+            std::shared_lock lock(g_mutex);
+
+            u32 test = 0;
+
+            for (u32 i = begin / 4096, max = (begin + size - 1) / 4096; i <= max; i++)
             {
-                if (!lock && lock.compare_and_swap_test(0, lock_info))
+                if (!(g_pages[i].flags & (vm::page_readable)))
                 {
-                    return &lock;
+                    test = i * 4096;
+                    break;
                 }
             }
+
+            if (test)
+            {
+                lock.unlock();
+
+                // Try tiggering a page fault (write)
+                // TODO: Read memory if needed
+                vm::_ref<atomic_t<u8>>(test) += 0;
+                continue;
+            }
+
+            range_lock->release(begin | u64{size} << 32);
+            return;
         }
     }
 
-    static void _lock_shareable_cache(u8 /*value*/, u32 addr /*mutable*/, u32 end /*mutable*/)
+    void free_range_lock(atomic_t<u64>* range_lock) noexcept
     {
-        // Special value to block new range locks
-        g_addr_lock = addr | u64{end - addr} << 32;
+        if (range_lock < g_range_lock_set || range_lock >= std::end(g_range_lock_set))
+        {
+            fmt::throw_exception("Invalid range lock" HERE);
+        }
 
-        // Convert to 64K-page numbers
-        addr >>= 16;
-        end >>= 16;
+        range_lock->release(0);
+
+        std::shared_lock lock(g_mutex);
+
+        // Use ptr difference to determine location
+        const auto diff = range_lock - g_range_lock_set;
+        g_range_lock_bits &= ~(1ull << diff);
+    }
+
+    template <typename F>
+    FORCE_INLINE static u64 for_all_range_locks(F func)
+    {
+        u64 result = 0;
+
+        for (u64 bits = g_range_lock_bits.load(); bits; bits &= bits - 1)
+        {
+            const u32 id = std::countr_zero(bits);
+
+            const u64 lock_val = g_range_lock_set[id].load();
+
+            if (const u32 size = static_cast<u32>(lock_val >> 32)) [[unlikely]]
+            {
+                const u32 addr = static_cast<u32>(lock_val);
+
+                result += func(addr, size);
+            }
+        }
+
+        return result;
+    }
+
+    static void _lock_shareable_cache(u8 value, u32 addr, u32 size)
+    {
+        // Block new range locks
+        g_addr_lock = addr | u64{size} << 32;
+
+        ASSUME(size);
+
+        const auto range = utils::address_range::start_length(addr, size);
 
         // Wait for range locks to clear
-        for (auto& lock : g_range_locks)
+        while (value)
         {
-            while (const u64 _lock = lock.load())
+            const u64 bads = for_all_range_locks([&](u32 addr2, u32 size2)
             {
-                if (const u32 lock_page = static_cast<u32>(_lock) >> 16)
+                ASSUME(size2);
+
+                if (range.overlaps(utils::address_range::start_length(addr2, size2))) [[unlikely]]
                 {
-                    if (lock_page < addr || lock_page >= end)
-                    {
-                        // Ignoreable range lock
-                        break;
-                    }
+                    return 1;
                 }
 
-                _mm_pause();
+                return 0;
+            });
+
+            if (!bads)
+            {
+                return;
             }
+
+            _mm_pause();
         }
     }
 
@@ -204,82 +295,6 @@ namespace vm
         }
     }
 
-    atomic_t<u64>* range_lock(u32 addr, u32 end)
-    {
-        static const auto test_addr = [](u64 target, u32 addr, u32 end) -> u64
-        {
-            if (const u32 target_size = static_cast<u32>(target >> 32))
-            {
-                // Shareable info is being modified
-                const u32 target_addr = static_cast<u32>(target);
-
-                if (addr >= target_addr + target_size || end <= target_addr)
-                {
-                    // Outside of the locked range: proceed normally
-                    if (g_shareable[addr >> 16])
-                    {
-                        addr &= 0xffff;
-                        end = ((end - 1) & 0xffff) + 1;
-                    }
-
-                    return u64{end} << 32 | addr;
-                }
-
-                return 0;
-            }
-
-            if (g_shareable[target >> 16])
-            {
-                // Target within shareable memory range
-                target &= 0xffff;
-            }
-
-            if (g_shareable[addr >> 16])
-            {
-                // Track shareable memory locks in 0x0..0xffff address range
-                addr &= 0xffff;
-                end = ((end - 1) & 0xffff) + 1;
-            }
-
-            if (addr > target || end <= target)
-            {
-                return u64{end} << 32 | addr;
-            }
-
-            return 0;
-        };
-
-        if (u64 _a1 = test_addr(g_addr_lock.load(), addr, end)) [[likely]]
-        {
-            // Optimistic path (hope that address range is not locked)
-            const auto _ret = _register_range_lock(_a1);
-
-            if (_a1 == test_addr(g_addr_lock.load(), addr, end) && !!(g_pages[addr / 4096].flags & page_readable)) [[likely]]
-            {
-                return _ret;
-            }
-
-            *_ret = 0;
-        }
-
-        while (true)
-        {
-            std::shared_lock lock(g_mutex);
-
-            if (!(g_pages[addr / 4096].flags & page_readable))
-            {
-                lock.unlock();
-
-                // Try tiggering a page fault (write)
-                // TODO: Read memory if needed
-                vm::_ref<atomic_t<u8>>(addr) += 0;
-                continue;
-            }
-
-            return _register_range_lock(test_addr(UINT32_MAX, addr, end));
-        }
-    }
-
     void passive_unlock(cpu_thread& cpu)
     {
         if (auto& ptr = g_tls_locked)
@@ -401,7 +416,7 @@ namespace vm
             }
         }
 
-        g_addr_lock = addr;
+        g_addr_lock = addr | (u64{128} << 32);
 
         if (g_shareable[addr >> 16])
         {
@@ -409,26 +424,34 @@ namespace vm
             addr = addr & 0xffff;
         }
 
-        for (auto& lock : g_range_locks)
+        const auto range = utils::address_range::start_length(addr, 128);
+
+        while (true)
         {
-            while (true)
+            const u64 bads = for_all_range_locks([&](u32 addr2, u32 size2)
             {
-                const u64 value = lock;
-
-                // Test beginning address
-                if (static_cast<u32>(value) > addr)
+                // TODO (currently not possible): handle 2 64K pages (inverse range), or more pages
+                if (g_shareable[addr2 >> 16])
                 {
-                    break;
+                    addr2 &= 0xffff;
                 }
 
-                // Test end address
-                if (static_cast<u32>(value >> 32) <= addr)
+                ASSUME(size2);
+
+                if (range.overlaps(utils::address_range::start_length(addr2, size2))) [[unlikely]]
                 {
-                    break;
+                    return 1;
                 }
 
-                _mm_pause();
+                return 0;
+            });
+
+            if (!bads) [[likely]]
+            {
+                break;
             }
+
+            _mm_pause();
         }
 
         for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++)
@@ -538,7 +561,7 @@ namespace vm
         }
     }
 
-    static void _page_map(u32 addr, u8 flags, u32 size, utils::shm* shm)
+    static void _page_map(u32 addr, u8 flags, u32 size, utils::shm* shm, std::pair<const u32, std::pair<u32, std::shared_ptr<utils::shm>>>* (*search_shm)(vm::block_t* block, utils::shm* shm))
     {
         if (!size || (size | addr) % 4096 || flags & page_allocated)
         {
@@ -553,13 +576,38 @@ namespace vm
             }
         }
 
-        if (shm && shm->flags() != 0)
+        if (shm && shm->flags() != 0 && shm->info++)
         {
-            _lock_shareable_cache(1, addr, addr + size);
+            // Memory mirror found, map its range as shareable
+            _lock_shareable_cache(1, addr, size);
 
             for (u32 i = addr / 65536; i < addr / 65536 + size / 65536; i++)
             {
-                g_shareable[i] = 1;
+                g_shareable[i].release(1);
+            }
+
+            // Check ref counter (using unused member info for it)
+            if (shm->info == 2)
+            {
+                // Find another mirror and map it as shareable too
+                for (auto& ploc : g_locations)
+                {
+                    if (auto loc = ploc.get())
+                    {
+                        if (auto pp = search_shm(loc, shm))
+                        {
+                            auto& [size2, ptr] = pp->second;
+
+                            // Relock cache
+                            _lock_shareable_cache(1, pp->first, size2);
+
+                            for (u32 i = pp->first / 65536; i < pp->first / 65536 + size2 / 65536; i++)
+                            {
+                                g_shareable[i].release(1);
+                            }
+                        }
+                    }
+                }
             }
 
             // Unlock
@@ -702,13 +750,14 @@ namespace vm
             }
         }
 
-        if (g_shareable[addr >> 16])
+        if (shm && shm->flags() != 0 && (--shm->info || g_shareable[addr >> 16]))
         {
-            _lock_shareable_cache(0, addr, addr + size);
+            // Remove mirror from shareable cache
+            _lock_shareable_cache(0, addr, size);
 
             for (u32 i = addr / 65536; i < addr / 65536 + size / 65536; i++)
             {
-                g_shareable[i] = 0;
+                g_shareable[i].release(0);
             }
 
             // Unlock
@@ -844,8 +893,28 @@ namespace vm
         verify(HERE), !g_pages[addr / 4096 + size / 4096 - 1].flags.exchange(page_allocated);
     }
 
-    // Map "real" memory pages
-    _page_map(page_addr, flags, page_size, shm.get());
+    // Map "real" memory pages; provide a function to search for mirrors with private member access
+    _page_map(page_addr, flags, page_size, shm.get(), [](vm::block_t* _this, utils::shm* shm)
+    {
+        decltype(m_map)::value_type* result = nullptr;
+
+        // Check eligibility
+        if (!_this || !(SYS_MEMORY_PAGE_SIZE_MASK & _this->flags) || _this->addr < 0x20000000 || _this->addr >= 0xC0000000)
+        {
+            return result;
+        }
+
+        for (auto& pp : _this->m_map)
+        {
+            if (pp.second.second.get() == shm)
+            {
+                // Found match
+                return &pp;
+            }
+        }
+
+        return result;
+    });
 
     // Add entry
     m_map[addr] = std::make_pair(size, std::move(shm));
@@ -1368,7 +1437,8 @@ namespace vm
         std::memset(g_reservations, 0, sizeof(g_reservations));
         std::memset(g_shareable, 0, sizeof(g_shareable));
-        std::memset(g_range_locks.data(), 0, sizeof(g_range_locks));
+        std::memset(g_range_lock_set, 0, sizeof(g_range_lock_set));
+        g_range_lock_bits = 0;
     }
 }
diff --git a/rpcs3/Emu/Memory/vm_locking.h b/rpcs3/Emu/Memory/vm_locking.h
index 9716bab16a..721783dad2 100644
--- a/rpcs3/Emu/Memory/vm_locking.h
+++ b/rpcs3/Emu/Memory/vm_locking.h
@@ -11,9 +11,44 @@ namespace vm
 
     extern thread_local atomic_t<cpu_thread*>* g_tls_locked;
 
+    extern atomic_t<u64> g_addr_lock;
+
     // Register reader
     void passive_lock(cpu_thread& cpu);
-    atomic_t<u64>* range_lock(u32 begin, u32 end);
+
+    // Register range lock for further use
+    atomic_t<u64>* alloc_range_lock();
+
+    void range_lock_internal(atomic_t<u64>* range_lock, u32 begin, u32 size);
+
+    // Lock memory range
+    FORCE_INLINE void range_lock(atomic_t<u64>* range_lock, u32 begin, u32 size)
+    {
+        const u64 lock_val = g_addr_lock.load();
+        const u64 lock_addr = static_cast<u32>(lock_val); // -> u64
+        const u32 lock_size = static_cast<u32>(lock_val >> 32);
+
+        if (u64{begin} + size <= lock_addr || begin >= lock_addr + lock_size) [[likely]]
+        {
+            // Optimistic locking
+            range_lock->release(begin | (u64{size} << 32));
+
+            const u64 new_lock_val = g_addr_lock.load();
+
+            if (!new_lock_val || new_lock_val == lock_val) [[likely]]
+            {
+                return;
+            }
+
+            range_lock->release(0);
+        }
+
+        // Fallback to slow path
+        range_lock_internal(range_lock, begin, size);
+    }
+
+    // Release it
+    void free_range_lock(atomic_t<u64>*) noexcept;
 
     // Unregister reader
     void passive_unlock(cpu_thread& cpu);
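
Usage sketch (not part of the patch): the new API replaces the fixed pool of six global range locks with 64 allocatable slots, following the pattern spu_thread uses above — allocate a slot once per writer thread, publish begin | size << 32 around each store, clear the slot with release(0), and free it on destruction. A minimal sketch, assuming it is built inside the rpcs3 tree (vm.h / vm_locking.h provide atomic_t, u8/u32/u64 and vm::_ref); the dma_writer type and store_u8 helper are hypothetical names used only for illustration:

    // Hypothetical example built against the API added by this patch
    #include "Emu/Memory/vm.h"
    #include "Emu/Memory/vm_locking.h"

    struct dma_writer
    {
        // One slot per writer thread, taken from g_range_lock_set via the allocation bits
        atomic_t<u64>* m_range_lock = vm::alloc_range_lock();

        ~dma_writer()
        {
            // Returns the slot by clearing its bit in g_range_lock_bits
            vm::free_range_lock(m_range_lock);
        }

        void store_u8(u32 eal, u8 value)
        {
            // Publish [eal, eal + 1); falls back to range_lock_internal if the
            // range overlaps g_addr_lock (a shareable-cache update in progress)
            vm::range_lock(m_range_lock, eal, 1);

            vm::_ref<u8>(eal) = value;

            // Clearing the slot ends the locked section
            m_range_lock->release(0);
        }
    };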