diff --git a/CMakeLists.txt b/CMakeLists.txt index 40f48a6d5d..ea1a194aec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.28) project(rpcs3 LANGUAGES C CXX) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11) diff --git a/Utilities/JITASM.cpp b/Utilities/JITASM.cpp index 63628f448b..acb5f40b04 100644 --- a/Utilities/JITASM.cpp +++ b/Utilities/JITASM.cpp @@ -344,15 +344,7 @@ jit_runtime_base& asmjit::get_global_runtime() { custom_runtime() noexcept { - // Search starting in first 2 GiB of memory - for (u64 addr = size;; addr += size) - { - if (auto ptr = utils::memory_reserve(size, reinterpret_cast(addr))) - { - m_pos.raw() = static_cast(ptr); - break; - } - } + ensure(m_pos.raw() = static_cast(utils::memory_reserve(size))); // Initialize "end" pointer m_max = m_pos + size; diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index af4d9128d2..c67e720b6f 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -2490,7 +2490,7 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */) if (alert) { list.set<0>(_this->m_sync, 0); - list.set<1>(utils::bless>(&_this->m_taskq)[1], 0); + list.template set<1>(_this->m_taskq); } else { diff --git a/Utilities/lockless.h b/Utilities/lockless.h index 48142de96a..63c2774bc8 100644 --- a/Utilities/lockless.h +++ b/Utilities/lockless.h @@ -49,7 +49,7 @@ public: if (!next) { - // Do not allow access beyond many element more at a time + // Do not allow access beyond many element more at a time ensure(!installed && index - i < N * 2); installed = true; @@ -384,17 +384,26 @@ public: template class lf_queue final { - atomic_t m_head{0}; - - lf_queue_item* load(u64 value) const noexcept +public: + struct fat_ptr { - return reinterpret_cast*>(value >> 16); + u64 ptr{}; + u32 is_non_null{}; + u32 reserved{}; + }; + +private: + atomic_t m_head{fat_ptr{}}; + + lf_queue_item* load(fat_ptr value) const noexcept + { + return reinterpret_cast*>(value.ptr); } // Extract all elements and reverse element order (FILO to FIFO) lf_queue_item* reverse() noexcept { - if (auto* head = load(m_head) ? load(m_head.exchange(0)) : nullptr) + if (auto* head = load(m_head) ? 
load(m_head.exchange(fat_ptr{})) : nullptr) { if (auto* prev = head->m_link) { @@ -420,7 +429,7 @@ public: lf_queue(lf_queue&& other) noexcept { - m_head.release(other.m_head.exchange(0)); + m_head.release(other.m_head.exchange(fat_ptr{})); } lf_queue& operator=(lf_queue&& other) noexcept @@ -431,7 +440,7 @@ public: } delete load(m_head); - m_head.release(other.m_head.exchange(0)); + m_head.release(other.m_head.exchange(fat_ptr{})); return *this; } @@ -442,9 +451,9 @@ public: void wait(std::nullptr_t /*null*/ = nullptr) noexcept { - if (m_head == 0) + if (!operator bool()) { - utils::bless>(&m_head)[1].wait(0); + utils::bless>(&m_head.raw().is_non_null)->wait(0); } } @@ -455,7 +464,7 @@ public: explicit operator bool() const noexcept { - return m_head != 0; + return observe() != nullptr; } template @@ -464,25 +473,25 @@ public: auto oldv = m_head.load(); auto item = new lf_queue_item(load(oldv), std::forward(args)...); - while (!m_head.compare_exchange(oldv, reinterpret_cast(item) << 16)) + while (!m_head.compare_exchange(oldv, fat_ptr{reinterpret_cast(item), item != nullptr, 0})) { item->m_link = load(oldv); } - if (!oldv && Notify) + if (!oldv.ptr && Notify) { // Notify only if queue was empty notify(true); } - return !oldv; + return !oldv.ptr; } void notify(bool force = false) { if (force || operator bool()) { - utils::bless>(&m_head)[1].notify_one(); + utils::bless>(&m_head.raw().is_non_null)->notify_one(); } } @@ -498,7 +507,7 @@ public: lf_queue_slice pop_all_reversed() { lf_queue_slice result; - result.m_head = load(m_head.exchange(0)); + result.m_head = load(m_head.exchange(fat_ptr{})); return result; } diff --git a/buildfiles/cmake/ConfigureCompiler.cmake b/buildfiles/cmake/ConfigureCompiler.cmake index 172ba9545e..638c5ab084 100644 --- a/buildfiles/cmake/ConfigureCompiler.cmake +++ b/buildfiles/cmake/ConfigureCompiler.cmake @@ -5,13 +5,12 @@ if(MSVC) add_compile_definitions( _CRT_SECURE_NO_DEPRECATE=1 _CRT_NON_CONFORMING_SWPRINTFS=1 _SCL_SECURE_NO_WARNINGS=1 NOMINMAX _ENABLE_EXTENDED_ALIGNED_STORAGE=1 _HAS_EXCEPTIONS=0) - add_link_options(/DYNAMICBASE:NO /BASE:0x10000 /FIXED) + add_link_options(/DYNAMICBASE:YES) #TODO: Some of these could be cleaned up add_compile_options(/wd4805) # Comparing boolean and int add_compile_options(/wd4804) # Using integer operators with booleans add_compile_options(/wd4200) # Zero-sized array in struct/union - add_link_options(/ignore:4281) # Undesirable base address 0x10000 # MSVC 2017 uses iterator as base class internally, causing a lot of warning spam add_compile_definitions(_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING=1) @@ -19,8 +18,6 @@ if(MSVC) # Increase stack limit to 8 MB add_link_options(/STACK:8388608,1048576) else() - # Some distros have the compilers set to use PIE by default, but RPCS3 doesn't work with PIE, so we need to disable it. - check_cxx_compiler_flag("-no-pie" HAS_NO_PIE) check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE) check_cxx_compiler_flag("-msse -msse2 -mcx16" COMPILER_X86) if (APPLE) @@ -96,15 +93,6 @@ else() if(NOT APPLE AND NOT WIN32) # This hides our LLVM from mesa's LLVM, otherwise we get some unresolvable conflicts. 
add_link_options(-Wl,--exclude-libs,ALL) - - if(HAS_NO_PIE) - add_link_options(-no-pie) - endif() - elseif(APPLE) - if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") - add_link_options(-Wl,-image_base,0x10000 -Wl,-pagezero_size,0x10000) - add_link_options(-Wl,-no_pie) - endif() elseif(WIN32) add_compile_definitions(__STDC_FORMAT_MACROS=1) @@ -113,11 +101,6 @@ else() # Increase stack limit to 8 MB add_link_options(-Wl,--stack -Wl,8388608) - - # For arm64 windows, the image base cannot be below 4GB or the OS rejects the binary without much explanation. - if(COMPILER_X86) - add_link_options(-Wl,--image-base,0x10000) - endif() endif() # Specify C++ library to use as standard C++ when using clang (not required on linux due to GNU) diff --git a/rpcs3/Emu/Cell/PPUFunction.cpp b/rpcs3/Emu/Cell/PPUFunction.cpp index 5ac681d6af..1cc15440c7 100644 --- a/rpcs3/Emu/Cell/PPUFunction.cpp +++ b/rpcs3/Emu/Cell/PPUFunction.cpp @@ -1902,8 +1902,9 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target) // Take second ghc arg c.mov(args[0], x86::rbp); c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia))); - c.add(args[2], x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); - c.jmp(fn_target); + c.movabs(args[1], reinterpret_cast(&vm::g_base_addr)); + c.add(args[2], x86::qword_ptr(args[1])); + c.jmp(Imm(fn_target)); }; } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 04268f3ca1..b385829f96 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -220,19 +220,21 @@ const auto ppu_gateway = build_function_asm("ppu_gateway", c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp); // Initialize args - c.mov(x86::r13, x86::qword_ptr(reinterpret_cast(&vm::g_exec_addr))); + c.movabs(x86::r13, reinterpret_cast(&vm::g_exec_addr)); + c.mov(x86::r13, x86::qword_ptr(x86::r13)); c.mov(x86::rbp, args[0]); c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC - c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target - c.mov(x86::rdx, x86::rax); - c.shl(x86::rax, 16); - c.shr(x86::rax, 16); - c.shr(x86::rdx, 48); + c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::rdx, 1, 0)); // Load call target + c.movabs(x86::r12, vm::g_exec_addr_seg_offset); + c.add(x86::r12, x86::r13); + c.shr(x86::edx, 1); + c.mov(x86::edx, x86::word_ptr(x86::r12, x86::edx)); // Load relocation base c.shl(x86::edx, 13); - c.mov(x86::r12d, x86::edx); // Load relocation base + c.mov(x86::r12d, x86::edx); // Set relocation base - c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); + c.movabs(x86::rbx, reinterpret_cast(&vm::g_base_addr)); + c.mov(x86::rbx, x86::qword_ptr(x86::rbx)); c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1))); c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2))); @@ -346,14 +348,11 @@ const auto ppu_gateway = build_function_asm("ppu_gateway", c.ldr(call_target, arm::Mem(a64::x19, pc)); // Compute REG_Hp const arm::GpX reg_hp = a64::x21; - c.mov(reg_hp, call_target); - c.lsr(reg_hp, reg_hp, 48); + c.mov(reg_hp, Imm(vm::g_exec_addr_seg_offset)); + c.add(reg_hp, reg_hp, pc, arm::Shift(arm::ShiftOp::kLSR, 2)); + c.ldrh(reg_hp.w(), arm::Mem(a64::x19, reg_hp)); c.lsl(reg_hp.w(), reg_hp.w(), 13); - // Zero top 16 bits of call target - c.lsl(call_target, call_target, Imm(16)); - c.lsr(call_target, 
call_target, Imm(16)); - // Load registers c.mov(a64::x22, Imm(reinterpret_cast(&vm::g_base_addr))); c.ldr(a64::x22, arm::Mem(a64::x22)); @@ -473,6 +472,11 @@ static inline u8* ppu_ptr(u32 addr) return vm::g_exec_addr + u64{addr} * 2; } +static inline u8* ppu_seg_ptr(u32 addr) +{ + return vm::g_exec_addr + vm::g_exec_addr_seg_offset + (addr >> 1); +} + static inline ppu_intrp_func_t ppu_read(u32 addr) { return read_from_ptr(ppu_ptr(addr)); @@ -518,7 +522,7 @@ void ppu_recompiler_fallback(ppu_thread& ppu) while (true) { - if (uptr func = uptr(ppu_read(ppu.cia)); (func << 16 >> 16) != reinterpret_cast(ppu_recompiler_fallback_ghc)) + if (uptr func = uptr(ppu_read(ppu.cia)); func != reinterpret_cast(ppu_recompiler_fallback_ghc)) { // We found a recompiler function at cia, return break; @@ -773,6 +777,9 @@ extern void ppu_register_range(u32 addr, u32 size) utils::memory_commit(ppu_ptr(addr), u64{size} * 2, utils::protection::rw); ensure(vm::page_protect(addr, size, 0, vm::page_executable)); + // Segment data + utils::memory_commit(ppu_seg_ptr(addr), size >> 1, utils::protection::rw); + if (g_cfg.core.ppu_debug) { utils::memory_commit(vm::g_stat_addr + addr, size); @@ -785,12 +792,13 @@ extern void ppu_register_range(u32 addr, u32 size) if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) { // Assume addr is the start of first segment of PRX - const uptr entry_value = reinterpret_cast(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3)); - write_to_ptr(ppu_ptr(addr), entry_value); + write_to_ptr(ppu_ptr(addr), std::bit_cast(ppu_recompiler_fallback_ghc)); + write_to_ptr(ppu_seg_ptr(addr), static_cast(seg_base >> 13)); } else { write_to_ptr(ppu_ptr(addr), ppu_fallback); + write_to_ptr(ppu_seg_ptr(addr), 0); } addr += 4; @@ -805,7 +813,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = // Initialize specific function if (ptr) { - write_to_ptr(ppu_ptr(addr), (reinterpret_cast(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_read(addr)) & ~0xffff'ffff'ffffu)); + write_to_ptr(ppu_ptr(addr), std::bit_cast(ptr)); return; } @@ -3164,8 +3172,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm(&vm::g_sudo_addr))); + c.movabs(x86::rbp, reinterpret_cast(&vm::g_sudo_addr)); + c.mov(x86::rbp, x86::qword_ptr(x86::rbp)); c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); c.and_(x86::rbp, -128); c.prefetchw(x86::byte_ptr(x86::rbp, 0)); c.prefetchw(x86::byte_ptr(x86::rbp, 64)); c.movzx(args[0].r32(), args[0].r16()); c.shr(args[0].r32(), 1); - c.lea(x86::r11, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); + c.movabs(x86::r11, reinterpret_cast(+vm::g_reservations)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); c.and_(x86::r11, -128 / 2); c.and_(args[0].r32(), 63); @@ -3217,7 +3228,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm(&g_rtm_tx_limit2))); + c.movabs(x86::r13, reinterpret_cast(&g_rtm_tx_limit2)); + c.cmp(x86::rax, x86::qword_ptr(x86::r13)); c.jae(fall); }); @@ -3342,8 +3354,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm& dir_queue, std::vector 14GB // The growth in memory requirements of LLVM is not linear with file size of course // But these estimates should hopefully protect RPCS3 in the coming years - // Especially when thread count is on the rise with each CPU generation + // Especially when thread count is on the rise with each CPU generation atomic_t file_size_limit = static_cast(std::clamp(utils::aligned_div(utils::get_total_memory(), 2000), 65536, u32{umax})); const u32 software_thread_limit = 
std::min(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue)); @@ -4301,8 +4314,8 @@ extern void ppu_precompile(std::vector& dir_queue, std::vector(&Emu.klic[0])); - - if (src) + + if (src) { ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path); @@ -4333,7 +4346,7 @@ extern void ppu_precompile(std::vector& dir_queue, std::vector(utils::sub_saturate(value, file_size)); restore_mem = value - new_val; value = new_val; @@ -4506,8 +4519,8 @@ extern void ppu_precompile(std::vector& dir_queue, std::vector(&Emu.klic[0])); - - if (src) + + if (src) { ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path); } @@ -5079,17 +5092,18 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s code_size_until_jump = buf_end - buf_start; c.add(x86::edx, seg0); - c.mov(x86::rax, x86::qword_ptr(reinterpret_cast(&vm::g_exec_addr))); + c.movabs(x86::rax, reinterpret_cast(&vm::g_exec_addr)); + c.mov(x86::rax, x86::qword_ptr(x86::rax)); c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx); - c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target - c.mov(x86::rdx, x86::rax); - c.shl(x86::rax, 16); - c.shr(x86::rax, 16); - c.shr(x86::rdx, 48); + c.mov(x86::rcx, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target + c.movabs(x86::r12, vm::g_exec_addr_seg_offset); + c.add(x86::rax, x86::r12); + c.shr(x86::edx, 1); + c.mov(x86::edx, x86::word_ptr(x86::rax, x86::edx)); // Load relocation base c.shl(x86::edx, 13); - c.mov(x86::r12d, x86::edx); // Load relocation base - c.jmp(x86::rax); + c.mov(x86::r12d, x86::edx); // Set relocation base + c.jmp(x86::rcx); #else // Load REG_Base - use absolute jump target to bypass rel jmp range limits // X19 contains vm::g_exec_addr @@ -5125,14 +5139,11 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s // Compute REG_Hp const arm::GpX reg_hp = a64::x21; - c.mov(reg_hp, call_target); - c.lsr(reg_hp, reg_hp, 48); + c.mov(reg_hp, Imm(vm::g_exec_addr_seg_offset)); + c.add(reg_hp, reg_hp, pc, arm::Shift(arm::ShiftOp::kLSR, 2)); + c.ldrh(reg_hp.w(), arm::Mem(exec_addr, reg_hp)); c.lsl(reg_hp.w(), reg_hp.w(), 13); - // Zero top 16 bits of call target - c.lsl(call_target, call_target, 16); - c.lsr(call_target, call_target, 16); - // Execute LLE call c.br(call_target); #endif @@ -5340,7 +5351,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s sha1_update(&ctx, reinterpret_cast(addrs.data()), addrs.size() * sizeof(be_t)); } - part.jit_bounds = std::move(local_jit_bounds); + part.jit_bounds = std::move(local_jit_bounds); local_jit_bounds = std::make_shared>(u32{umax}, 0); } @@ -5400,7 +5411,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose // Write version, hash, CPU, settings - fmt::append(obj_name, "v6-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); + fmt::append(obj_name, "v7-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); } if (cpu ? 
cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()) @@ -5712,7 +5723,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s for (u32 addr = info.segs[0].addr; addr < info.segs[0].addr + info.segs[0].size; addr += 4, inst_ptr++) { - if (*inst_ptr == ppu_instructions::BLR() && (reinterpret_cast(ppu_read(addr)) << 16 >> 16) == reinterpret_cast(ppu_recompiler_fallback_ghc)) + if (*inst_ptr == ppu_instructions::BLR() && reinterpret_cast(ppu_read(addr)) == reinterpret_cast(ppu_recompiler_fallback_ghc)) { write_to_ptr(ppu_ptr(addr), BLR_func); } diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index e485eeecfb..4d0cb528ca 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -411,12 +411,19 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) const auto faddr = m_ir->CreateLoad(ptr_inst->getResultElementType(), ptr_inst); const auto faddr_int = m_ir->CreatePtrToInt(faddr, get_type()); - const auto fval = m_ir->CreateOr(m_ir->CreateShl(m_seg0, 32 + 3), faddr_int); - const auto pos = m_ir->CreateShl(m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc, 1); + const auto pos_32 = m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc; + const auto pos = m_ir->CreateShl(pos_32, 1); const auto ptr = dyn_cast(m_ir->CreateGEP(get_type(), m_exec, pos)); + const auto seg_base_ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd( + m_ir->CreatePtrToInt(m_exec, get_type()), m_ir->getInt64(vm::g_exec_addr_seg_offset)), m_exec->getType()); + const auto seg_pos = m_ir->CreateLShr(pos_32, 1); + const auto seg_ptr = dyn_cast(m_ir->CreateGEP(get_type(), seg_base_ptr, seg_pos)); + const auto seg_val = m_ir->CreateTrunc(m_ir->CreateLShr(m_seg0, 13), get_type()); + // Store to jumptable - m_ir->CreateStore(fval, ptr); + m_ir->CreateStore(faddr_int, ptr); + m_ir->CreateStore(seg_val, seg_ptr); // Increment index and branch back to loop const auto post_add = m_ir->CreateAdd(index_value, m_ir->getInt64(1)); @@ -605,10 +612,15 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect) const auto pos = m_ir->CreateShl(indirect, 1); const auto ptr = dyn_cast(m_ir->CreateGEP(get_type(), m_exec, pos)); const auto val = m_ir->CreateLoad(get_type(), ptr); - callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0xffff'ffff'ffff), type->getPointerTo())); + callee = FunctionCallee(type, m_ir->CreateIntToPtr(val, type->getPointerTo())); // Load new segment address - seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 48), 13); + const auto seg_base_ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd( + m_ir->CreatePtrToInt(m_exec, get_type()), m_ir->getInt64(vm::g_exec_addr_seg_offset)), m_exec->getType()); + const auto seg_pos = m_ir->CreateLShr(indirect, 1); + const auto seg_ptr = dyn_cast(m_ir->CreateGEP(get_type(), seg_base_ptr, seg_pos)); + const auto seg_val = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), seg_ptr), get_type()); + seg0 = m_ir->CreateShl(seg_val, 13); } m_ir->SetInsertPoint(block); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index 3eb75082a2..6b4b456f3d 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -2770,14 +2770,17 @@ void spu_recompiler::FREST(spu_opcode_t op) const u64 fraction_lut_addr = reinterpret_cast(spu_frest_fraction_lut); const u64 exponent_lut_addr = reinterpret_cast(spu_frest_exponent_lut); + c->movabs(*arg0, fraction_lut_addr); + c->movabs(*arg1, exponent_lut_addr); + for (u32 index = 0; index < 
4; index++) { c->pextrd(*qw0, v_fraction, index); - c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2)); + c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2)); c->pinsrd(v_fraction, *qw1, index); c->pextrd(*qw0, v_exponent, index); - c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2)); + c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2)); c->pinsrd(v_exponent, *qw1, index); } @@ -2810,14 +2813,17 @@ void spu_recompiler::FRSQEST(spu_opcode_t op) const u64 fraction_lut_addr = reinterpret_cast(spu_frsqest_fraction_lut); const u64 exponent_lut_addr = reinterpret_cast(spu_frsqest_exponent_lut); + c->movabs(*arg0, fraction_lut_addr); + c->movabs(*arg1, exponent_lut_addr); + for (u32 index = 0; index < 4; index++) { c->pextrd(*qw0, v_fraction, index); - c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2)); + c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2)); c->pinsrd(v_fraction, *qw1, index); c->pextrd(*qw0, v_exponent, index); - c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2)); + c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2)); c->pinsrd(v_exponent, *qw1, index); } diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index e11cc7a817..ae1d5bd889 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -842,6 +842,7 @@ void spu_cache::initialize(bool build_existing_cache) // Initialize compiler instances for parallel compilation std::unique_ptr compiler; +#if defined(ARCH_X64) if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { compiler = spu_recompiler_base::make_asmjit_recompiler(); @@ -850,6 +851,22 @@ void spu_cache::initialize(bool build_existing_cache) { compiler = spu_recompiler_base::make_llvm_recompiler(); } + else + { + fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder); + } +#elif defined(ARCH_ARM64) + if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) + { + compiler = spu_recompiler_base::make_llvm_recompiler(); + } + else + { + fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder); + } +#else +#error "Unimplemented" +#endif compiler->init(); @@ -2545,7 +2562,7 @@ bool reg_state_t::is_const() const bool reg_state_t::compare_tags(const reg_state_t& rhs) const { - // Compare by tag, address of instruction origin + // Compare by tag, address of instruction origin return tag == rhs.tag && origin == rhs.origin && is_instruction == rhs.is_instruction; } @@ -6066,7 +6083,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s else if (atomic16->ls_offs.compare_with_mask_indifference(atomic16->lsa, SPU_LS_MASK_128) && atomic16->ls.is_less_than(128 - (atomic16->ls_offs.value & 127))) { // Relative memory access with offset less than 128 bytes - // Common around SPU utilities which have less strict restrictions about memory alignment + // Common around SPU utilities which have less strict restrictions about memory alignment ok = true; } } @@ -6340,7 +6357,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s { atomic16->mem_count++; - // Do not clear lower 16 bytes addressing because the program can move on 4-byte basis + // Do not clear lower 16 bytes addressing because the program can move on 4-byte basis const u32 offs = spu_branch_target(pos - result.lower_bound, op.si16); if (atomic16->lsa.is_const() && [&]() @@ -8142,7 +8159,7 @@ std::array& block_reg_info::evaluate_start_state(const s // Check if the node is resolved if 
(!node->has_true_state) { - // Assume this block cannot be resolved at the moment + // Assume this block cannot be resolved at the moment is_all_resolved = false; break; } diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 3ab6a10a61..26d504f2c6 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -628,6 +628,8 @@ const auto spu_putllc_tx = build_function_asm(&vm::g_sudo_addr))); + c.movabs(args[1], reinterpret_cast(&vm::g_sudo_addr)); + c.mov(args[1], x86::qword_ptr(args[1])); c.lea(args[1], x86::qword_ptr(args[1], args[0])); c.prefetchw(x86::byte_ptr(args[1], 0)); c.prefetchw(x86::byte_ptr(args[1], 64)); c.and_(args[0].r32(), 0xff80); c.shr(args[0].r32(), 1); - c.lea(x86::r11, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); + c.movabs(x86::r11, reinterpret_cast(+vm::g_reservations)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); // Prepare data if (s_tsx_avx) @@ -703,7 +709,8 @@ const auto spu_putllc_tx = build_function_asm(&g_rtm_tx_limit2))); + c.movabs(x86::rbx, reinterpret_cast(&g_rtm_tx_limit2)); + c.cmp(x86::rax, x86::qword_ptr(x86::rbx)); c.jae(fall); }); @@ -853,8 +860,13 @@ const auto spu_putllc_tx = build_function_asm(&vm::g_sudo_addr))); + c.movabs(x86::r11, reinterpret_cast(&vm::g_sudo_addr)); + c.mov(x86::r11, x86::qword_ptr(x86::r11)); c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); c.prefetchw(x86::byte_ptr(x86::r11, 0)); c.prefetchw(x86::byte_ptr(x86::r11, 64)); @@ -921,7 +936,8 @@ const auto spu_putlluc_tx = build_function_asm(+vm::g_reservations), args[0])); + c.movabs(args[1], reinterpret_cast(+vm::g_reservations)); + c.lea(args[1], x86::qword_ptr(args[1], args[0])); // Alloc args[0] to stamp0 const auto stamp0 = args[0]; @@ -933,7 +949,8 @@ const auto spu_putlluc_tx = build_function_asm(&g_rtm_tx_limit2))); + c.movabs(x86::rbx, reinterpret_cast(&g_rtm_tx_limit2)); + c.cmp(x86::rax, x86::qword_ptr(x86::rbx)); c.jae(fall); }); @@ -986,6 +1003,10 @@ const auto spu_putlluc_tx = build_function_asm(&vm::g_sudo_addr))); + c.movabs(x86::rbp, reinterpret_cast(&vm::g_sudo_addr)); + c.mov(x86::rbp, x86::qword_ptr(x86::rbp)); c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); c.and_(args[0].r32(), 0xff80); c.shr(args[0].r32(), 1); - c.lea(x86::r11, x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); + c.movabs(x86::r11, reinterpret_cast(+vm::g_reservations)); + c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); // Alloc args[0] to stamp0 const auto stamp0 = args[0]; @@ -1039,7 +1062,8 @@ const auto spu_getllar_tx = build_function_asm(&g_rtm_tx_limit1))); + c.movabs(x86::rbx, reinterpret_cast(&g_rtm_tx_limit1)); + c.cmp(x86::rax, x86::qword_ptr(x86::rbx)); c.jae(fall); }); @@ -2118,20 +2142,31 @@ spu_thread::spu_thread(lv2_spu_group* group, u32 index, std::string_view name, u , lv2_id(lv2_id) , spu_tname(make_single(name)) { +#if defined(ARCH_X64) if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { jit = spu_recompiler_base::make_asmjit_recompiler(); } else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) { -#if defined(ARCH_X64) jit = spu_recompiler_base::make_fast_llvm_recompiler(); + } + else + { + fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder); + } #elif defined(ARCH_ARM64) + if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) + { jit = spu_recompiler_base::make_llvm_recompiler(); + } + else + { + fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder); + } #else #error "Unimplemented" #endif - } if 
(g_cfg.core.mfc_debug) { @@ -2193,20 +2228,31 @@ spu_thread::spu_thread(utils::serial& ar, lv2_spu_group* group) , lv2_id(ar) , spu_tname(make_single(ar.operator std::string())) { +#if defined(ARCH_X64) if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { jit = spu_recompiler_base::make_asmjit_recompiler(); } else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) { -#if defined(ARCH_X64) jit = spu_recompiler_base::make_fast_llvm_recompiler(); + } + else + { + fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder); + } #elif defined(ARCH_ARM64) + if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) + { jit = spu_recompiler_base::make_llvm_recompiler(); + } + else + { + fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder); + } #else #error "Unimplemented" #endif - } if (g_cfg.core.mfc_debug) { @@ -4445,7 +4491,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span ls_ptr, u32 base_add // Detect "invalid" relative branches // Branch offsets that, although are the only way to get X code address using relative address // Rely on overflow/underflow of SPU memory bounds - // Thus they would behave differently if SPU LS memory size was to increase (evolving the CELL architecture was the original plan) + // Thus they would behave differently if SPU LS memory size was to increase (evolving the CELL architecture was the original plan) // Making them highly unlikely to be valid code if (rel < 0) @@ -4666,7 +4712,7 @@ bool spu_thread::process_mfc_cmd() // Add to chance if previous wait was long enough const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40 - : zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40 + : zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40 : zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40 : zero_count == 0 && total_wait >= 6 ? 
(total_wait - 5) * 40 : 0; @@ -5004,7 +5050,7 @@ bool spu_thread::process_mfc_cmd() if (group->spurs_running == max_run - 1) { - // Try to let another thread slip in and take over execution + // Try to let another thread slip in and take over execution thread_ctrl::wait_for(300); // Update value @@ -5029,7 +5075,7 @@ bool spu_thread::process_mfc_cmd() if (spurs_last_task_timestamp) { const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate; - spurs_average_task_duration -= avg_entry; + spurs_average_task_duration -= avg_entry; spurs_average_task_duration += std::min(45'000, current - spurs_last_task_timestamp); spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate); spurs_last_task_timestamp = 0; @@ -5050,7 +5096,7 @@ bool spu_thread::process_mfc_cmd() } max_run = group->max_run; - + prev_running = group->spurs_running.fetch_op([max_run](u32& x) { if (x < max_run) @@ -5115,7 +5161,7 @@ bool spu_thread::process_mfc_cmd() if (spurs_last_task_timestamp) { const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate; - spurs_average_task_duration -= avg_entry; + spurs_average_task_duration -= avg_entry; spurs_average_task_duration += std::min(45'000, current - spurs_last_task_timestamp); spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate); spurs_last_task_timestamp = 0; diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index 7ad9d54bf4..891f0910c1 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -47,7 +47,7 @@ namespace vm u8* const g_sudo_addr = g_base_addr + 0x1'0000'0000; // Auxiliary virtual memory for executable areas - u8* const g_exec_addr = memory_reserve_4GiB(g_sudo_addr, 0x200000000); + u8* const g_exec_addr = memory_reserve_4GiB(g_sudo_addr, 0x300000000); // Hooks for memory R/W interception (default: zero offset to some function with only ret instructions) u8* const g_hook_addr = memory_reserve_4GiB(g_exec_addr, 0x800000000); diff --git a/rpcs3/Emu/Memory/vm.h b/rpcs3/Emu/Memory/vm.h index 25ff1cd33e..2b2eccc24d 100644 --- a/rpcs3/Emu/Memory/vm.h +++ b/rpcs3/Emu/Memory/vm.h @@ -34,6 +34,8 @@ namespace vm extern u8* const g_free_addr; extern u8 g_reservations[65536 / 128 * 64]; + static constexpr u64 g_exec_addr_seg_offset = 0x2'0000'0000ULL; + struct writer_lock; enum memory_location_t : uint diff --git a/rpcs3/rpcs3.vcxproj b/rpcs3/rpcs3.vcxproj index b8f0a02209..318142ed10 100644 --- a/rpcs3/rpcs3.vcxproj +++ b/rpcs3/rpcs3.vcxproj @@ -97,10 +97,9 @@ true false $(OutDir)\rpcs3.exe - false + true Windows true - 0x10000 mainCRTStartup @@ -148,10 +147,11 @@ Debug true $(OutDir)\rpcs3d.exe - false + true Windows true - 0x10000 + + mainCRTStartup @@ -2123,4 +2123,4 @@ - \ No newline at end of file + diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index 77790ac0a0..10fa33c31d 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -398,6 +398,10 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std spu_bg->addButton(ui->spu_asmjit, static_cast(spu_decoder_type::asmjit)); spu_bg->addButton(ui->spu_llvm, static_cast(spu_decoder_type::llvm)); +#ifndef ARCH_X64 + ui->spu_asmjit->setEnabled(false); +#endif + connect(spu_bg, &QButtonGroup::idToggled, [this](int id, bool checked) { if (!checked) return; diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp index 
cd9a6e37a3..c8e76cbf17 100644 --- a/rpcs3/util/atomic.cpp +++ b/rpcs3/util/atomic.cpp @@ -57,8 +57,8 @@ static bool has_waitv() // Total number of entries. static constexpr usz s_hashtable_size = 1u << 17; -// Reference counter combined with shifted pointer (which is assumed to be 48 bit) -static constexpr uptr s_ref_mask = 0xffff; +// Reference counter mask +static constexpr uptr s_ref_mask = 0xffff'ffff; // Fix for silly on-first-use initializer static bool s_null_wait_cb(const void*, u64, u64){ return true; }; @@ -153,8 +153,16 @@ namespace // Essentially a fat semaphore struct alignas(64) cond_handle { - // Combined pointer (most significant 48 bits) and ref counter (16 least significant bits) - atomic_t ptr_ref; + struct fat_ptr + { + u64 ptr{}; + u32 reserved{}; + u32 ref_ctr{}; + + auto operator<=>(const fat_ptr& other) const = default; + }; + + atomic_t ptr_ref; u64 tid; u32 oldv; @@ -183,7 +191,7 @@ namespace mtx.init(mtx); #endif - ensure(!ptr_ref.exchange((iptr << 16) | 1)); + ensure(ptr_ref.exchange(fat_ptr{iptr, 0, 1}) == fat_ptr{}); } void destroy() @@ -370,7 +378,7 @@ namespace if (cond_id) { // Set fake refctr - s_cond_list[cond_id].ptr_ref.release(1); + s_cond_list[cond_id].ptr_ref.release(cond_handle::fat_ptr{0, 0, 1}); cond_free(cond_id, -1); } } @@ -390,7 +398,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1) { // Fast reinitialize const u32 id = std::exchange(*ptls, 0); - s_cond_list[id].ptr_ref.release((iptr << 16) | 1); + s_cond_list[id].ptr_ref.release(cond_handle::fat_ptr{iptr, 0, 1}); return id; } @@ -461,15 +469,15 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1) const auto cond = s_cond_list + cond_id; // Dereference, destroy on last ref - const bool last = cond->ptr_ref.atomic_op([](u64& val) + const bool last = cond->ptr_ref.atomic_op([](cond_handle::fat_ptr& val) { - ensure(val & s_ref_mask); + ensure(val.ref_ctr); - val--; + val.ref_ctr--; - if ((val & s_ref_mask) == 0) + if (val.ref_ctr == 0) { - val = 0; + val = cond_handle::fat_ptr{}; return true; } @@ -525,15 +533,15 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0) while (true) { - const auto [old, ok] = cond->ptr_ref.fetch_op([&](u64& val) + const auto [old, ok] = cond->ptr_ref.fetch_op([&](cond_handle::fat_ptr& val) { - if (!val || (val & s_ref_mask) == s_ref_mask) + if (val == cond_handle::fat_ptr{} || val.ref_ctr == s_ref_mask) { // Don't reference already deallocated semaphore return false; } - if (iptr && (val >> 16) != iptr) + if (iptr && val.ptr != iptr) { // Pointer mismatch return false; @@ -548,7 +556,7 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0) if (!did_ref) { - val++; + val.ref_ctr++; } return true; @@ -566,7 +574,7 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0) return cond; } - if ((old & s_ref_mask) == s_ref_mask) + if (old.ref_ctr == s_ref_mask) { fmt::throw_exception("Reference count limit (%u) reached in an atomic notifier.", s_ref_mask); } @@ -589,12 +597,14 @@ namespace u64 maxc: 5; // Collision counter u64 maxd: 11; // Distance counter u64 bits: 24; // Allocated bits - u64 prio: 24; // Reserved + u64 prio: 8; // Reserved u64 ref : 16; // Ref counter - u64 iptr: 48; // First pointer to use slot (to count used slots) + u64 iptr: 64; // First pointer to use slot (to count used slots) }; + static_assert(sizeof(slot_allocator) == 16); + // Need to spare 16 bits for ref counter static constexpr u64 max_threads = 24; @@ -935,7 +945,7 @@ atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wa const 
auto stamp0 = utils::get_unique_tsc(); - const uptr iptr = reinterpret_cast(data) & (~s_ref_mask >> 16); + const uptr iptr = reinterpret_cast(data); uptr iptr_ext[atomic_wait::max_list - 1]{}; @@ -956,7 +966,7 @@ atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wa } } - iptr_ext[ext_size] = reinterpret_cast(e->data) & (~s_ref_mask >> 16); + iptr_ext[ext_size] = reinterpret_cast(e->data); ext_size++; } } @@ -1266,7 +1276,7 @@ void atomic_wait_engine::notify_one(const void* data) return; } #endif - const uptr iptr = reinterpret_cast(data) & (~s_ref_mask >> 16); + const uptr iptr = reinterpret_cast(data); root_info::slot_search(iptr, [&](u32 cond_id) { @@ -1289,7 +1299,7 @@ atomic_wait_engine::notify_all(const void* data) return; } #endif - const uptr iptr = reinterpret_cast(data) & (~s_ref_mask >> 16); + const uptr iptr = reinterpret_cast(data); // Array count for batch notification u32 count = 0; diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp index 593b7a51f1..85c8b10482 100644 --- a/rpcs3/util/atomic.hpp +++ b/rpcs3/util/atomic.hpp @@ -205,9 +205,9 @@ namespace atomic_wait constexpr void set(lf_queue& var, std::nullptr_t = nullptr) { static_assert(Index < Max); - static_assert(sizeof(var) == sizeof(uptr)); + static_assert(sizeof(var) == sizeof(uptr) * 2); - m_info[Index].data = reinterpret_cast(&var) + sizeof(u32); + m_info[Index].data = reinterpret_cast(&var) + offsetof(typename lf_queue::fat_ptr, is_non_null); m_info[Index].old = 0; } @@ -215,9 +215,9 @@ namespace atomic_wait constexpr void set(stx::atomic_ptr& var, std::nullptr_t = nullptr) { static_assert(Index < Max); - static_assert(sizeof(var) == sizeof(uptr)); + static_assert(sizeof(var) == sizeof(uptr) * 2); - m_info[Index].data = reinterpret_cast(&var) + sizeof(u32); + m_info[Index].data = reinterpret_cast(&var) + offsetof(typename stx::atomic_ptr::fat_ptr, is_non_null); m_info[Index].old = 0; } diff --git a/rpcs3/util/shared_ptr.hpp b/rpcs3/util/shared_ptr.hpp index 29e4150c21..375784e7dd 100644 --- a/rpcs3/util/shared_ptr.hpp +++ b/rpcs3/util/shared_ptr.hpp @@ -19,14 +19,8 @@ namespace stx template class atomic_ptr; - // Basic assumption of userspace pointer size - constexpr uint c_ptr_size = 48; - - // Use lower 16 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted) - constexpr uint c_ref_mask = 0xffff, c_ref_size = 16; - - // Remaining pointer bits - constexpr uptr c_ptr_mask = static_cast(-1) << c_ref_size; + // Use 16 bits as atomic_ptr internal counter of borrowed refs + constexpr uint c_ref_mask = 0xffff; struct shared_counter { @@ -574,7 +568,6 @@ namespace stx } // Random checks which may fail on invalid pointer - ensure((reinterpret_cast(r.d()->destroy.load()) - 0x10000) >> 47 == 0); ensure((r.d()->refs++ - 1) >> 58 == 0); return r; } @@ -583,11 +576,21 @@ namespace stx template class atomic_ptr { - mutable atomic_t m_val{0}; - - static shared_counter* d(uptr val) noexcept + public: + struct fat_ptr { - return std::launder(reinterpret_cast((val >> c_ref_size) - sizeof(shared_counter))); + uptr ptr{}; + u32 is_non_null{}; + u32 ref_ctr{}; + }; + + private: + + mutable atomic_t m_val{fat_ptr{}}; + + static shared_counter* d(fat_ptr val) noexcept + { + return std::launder(reinterpret_cast(val.ptr - sizeof(shared_counter))); } shared_counter* d() const noexcept @@ -595,14 +598,19 @@ namespace stx return d(m_val); } - static uptr to_val(const volatile std::remove_extent_t* ptr) noexcept + static fat_ptr to_val(const volatile std::remove_extent_t* ptr) 
noexcept { - return (reinterpret_cast(ptr) << c_ref_size); + return fat_ptr{reinterpret_cast(ptr), ptr != nullptr, 0}; } - static std::remove_extent_t* ptr_to(uptr val) noexcept + static fat_ptr to_val(uptr ptr) noexcept { - return reinterpret_cast*>(val >> c_ref_size); + return fat_ptr{ptr, ptr != 0, 0}; + } + + static std::remove_extent_t* ptr_to(fat_ptr val) noexcept + { + return reinterpret_cast*>(val.ptr); } template @@ -645,7 +653,7 @@ namespace stx atomic_ptr(const shared_ptr& r) noexcept { // Obtain a ref + as many refs as an atomic_ptr can additionally reference - if (uptr rval = to_val(r.m_ptr)) + if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0) { m_val.raw() = rval; d(rval)->refs += c_ref_mask + 1; @@ -655,7 +663,7 @@ namespace stx template requires same_ptr_implicit_v atomic_ptr(shared_ptr&& r) noexcept { - if (uptr rval = to_val(r.m_ptr)) + if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0) { m_val.raw() = rval; d(rval)->refs += c_ref_mask; @@ -667,7 +675,7 @@ namespace stx template requires same_ptr_implicit_v atomic_ptr(single_ptr&& r) noexcept { - if (uptr rval = to_val(r.m_ptr)) + if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0) { m_val.raw() = rval; d(rval)->refs += c_ref_mask; @@ -678,13 +686,13 @@ namespace stx ~atomic_ptr() noexcept { - const uptr v = m_val.raw(); + const fat_ptr v = m_val.raw(); - if (v >> c_ref_size) + if (v.ptr) { const auto o = d(v); - if (!o->refs.sub_fetch(c_ref_mask + 1 - (v & c_ref_mask))) + if (!o->refs.sub_fetch(c_ref_mask + 1 - (v.ref_ctr & c_ref_mask))) { o->destroy.load()(o); } @@ -733,11 +741,11 @@ namespace stx shared_type r; // Add reference - const auto [prev, did_ref] = m_val.fetch_op([](uptr& val) + const auto [prev, did_ref] = m_val.fetch_op([](fat_ptr& val) { - if (val >> c_ref_size) + if (val.ptr) { - val++; + val.ref_ctr++; return true; } @@ -755,11 +763,11 @@ namespace stx r.d()->refs++; // Dereference if still the same pointer - const auto [_, did_deref] = m_val.fetch_op([prev = prev](uptr& val) + const auto [_, did_deref] = m_val.fetch_op([prev = prev](fat_ptr& val) { - if (val >> c_ref_size == prev >> c_ref_size) + if (val.ptr == prev.ptr) { - val--; + val.ref_ctr--; return true; } @@ -782,11 +790,11 @@ namespace stx shared_type r; // Add reference - const auto [prev, did_ref] = m_val.fetch_op([](uptr& val) + const auto [prev, did_ref] = m_val.fetch_op([](fat_ptr& val) { - if (val >> c_ref_size) + if (val.ptr) { - val++; + val.ref_ctr++; return true; } @@ -823,11 +831,11 @@ namespace stx } // Dereference if still the same pointer - const auto [_, did_deref] = m_val.fetch_op([prev = prev](uptr& val) + const auto [_, did_deref] = m_val.fetch_op([prev = prev](fat_ptr& val) { - if (val >> c_ref_size == prev >> c_ref_size) + if (val.ptr == prev.ptr) { - val--; + val.ref_ctr--; return true; } @@ -888,7 +896,7 @@ namespace stx atomic_ptr old; old.m_val.raw() = m_val.exchange(to_val(r.m_ptr)); - old.m_val.raw() += 1; + old.m_val.raw().ref_ctr += 1; r.m_ptr = std::launder(ptr_to(old.m_val)); return r; @@ -904,7 +912,7 @@ namespace stx atomic_ptr old; old.m_val.raw() = m_val.exchange(to_val(value.m_ptr)); - old.m_val.raw() += 1; + old.m_val.raw().ref_ctr += 1; value.m_ptr = std::launder(ptr_to(old.m_val)); return value; @@ -923,21 +931,21 @@ namespace stx atomic_ptr old; - const uptr _val = m_val.fetch_op([&](uptr& val) + const fat_ptr _val = m_val.fetch_op([&](fat_ptr& val) { - if (val >> c_ref_size == _old) + if (val.ptr == _old) { // Set new value - val = _new << c_ref_size; + val = to_val(_new); } - else if (val) + else if 
(val.ptr != 0) { // Reference previous value - val++; + val.ref_ctr++; } }); - if (_val >> c_ref_size == _old) + if (_val.ptr == _old) { // Success (exch is consumed, cmp_and_old is unchanged) if (exch.m_ptr) @@ -954,9 +962,10 @@ namespace stx old_exch.m_val.raw() = to_val(std::exchange(exch.m_ptr, nullptr)); // Set to reset old cmp_and_old value - old.m_val.raw() = to_val(cmp_and_old.m_ptr) | c_ref_mask; + old.m_val.raw() = to_val(cmp_and_old.m_ptr); + old.m_val.raw().ref_ctr |= c_ref_mask; - if (!_val) + if (!_val.ptr) { return false; } @@ -966,11 +975,11 @@ namespace stx cmp_and_old.d()->refs++; // Dereference if still the same pointer - const auto [_, did_deref] = m_val.fetch_op([_val](uptr& val) + const auto [_, did_deref] = m_val.fetch_op([_val](fat_ptr& val) { - if (val >> c_ref_size == _val >> c_ref_size) + if (val.ptr == _val.ptr) { - val--; + val.ref_ctr--; return true; } @@ -1009,12 +1018,12 @@ namespace stx atomic_ptr old; - const auto [_val, ok] = m_val.fetch_op([&](uptr& val) + const auto [_val, ok] = m_val.fetch_op([&](fat_ptr& val) { - if (val >> c_ref_size == _old) + if (val.ptr == _old) { // Set new value - val = _new << c_ref_size; + val = to_val(_new); return true; } @@ -1081,7 +1090,7 @@ namespace stx if (next.m_ptr) { // Compensation for `next` assignment - old.m_val.raw() += 1; + old.m_val.raw().ref_ctr += 1; } } @@ -1093,7 +1102,7 @@ namespace stx explicit constexpr operator bool() const noexcept { - return m_val != 0; + return observe() != nullptr; } template requires same_ptr_implicit_v @@ -1110,17 +1119,17 @@ namespace stx void wait(std::nullptr_t, atomic_wait_timeout timeout = atomic_wait_timeout::inf) { - utils::bless>(&m_val)[1].wait(0, timeout); + utils::bless>(&m_val.raw().is_non_null)->wait(0, timeout); } void notify_one() { - utils::bless>(&m_val)[1].notify_one(); + utils::bless>(&m_val.raw().is_non_null)->notify_one(); } void notify_all() { - utils::bless>(&m_val)[1].notify_all(); + utils::bless>(&m_val.raw().is_non_null)->notify_all(); } };
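The recurring pattern in the lockless.h, atomic.cpp and shared_ptr.hpp hunks above is the move away from packing a tag or borrowed-ref counter into the spare bits of a 48-bit pointer (which forced a fixed low image base and non-PIE builds) toward a 16-byte fat_ptr held in a 128-bit atomic, with a dedicated 32-bit is_non_null word that waiters can watch directly. The following is an illustrative sketch only, not the RPCS3 implementation: it uses plain std::atomic in place of atomic_t and utils::bless, and the local names (node, head) are made up; only the fat_ptr field layout mirrors the diff.

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

// 16-byte fat pointer: keep the whole 64-bit pointer, no bit packing.
struct fat_ptr
{
    std::uint64_t ptr{};          // full pointer value
    std::uint32_t is_non_null{};  // 1 when ptr != 0; futex-sized word that waiters watch
    std::uint32_t ref_ctr{};      // borrowed-reference counter (atomic_ptr) or reserved
};
static_assert(sizeof(fat_ptr) == 16);

int main()
{
    std::atomic<fat_ptr> head{}; // lock-free where a 16-byte CAS is available (e.g. cmpxchg16b)

    int node = 42;
    fat_ptr expected = head.load();
    const fat_ptr desired{reinterpret_cast<std::uint64_t>(&node), 1u, 0u};

    // Publish the pointer and the flag together in one 128-bit compare-exchange.
    while (!head.compare_exchange_weak(expected, desired)) {}

    // The emptiness check / wake-up condition only needs the 32-bit flag word;
    // in the real code this is where wait()/notify_one() operate on is_non_null.
    const fat_ptr now = head.load();
    assert(now.is_non_null == 1);
    assert(reinterpret_cast<int*>(now.ptr) == &node);
    return 0;
}
```

Keeping is_non_null as its own 32-bit field is what allows wait/notify to target a futex-sized slot without masking bits out of the pointer, which is why the old `& (~s_ref_mask >> 16)` and `>> c_ref_size` shifting disappears throughout the diff.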
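The PPU jumptable changes follow the same idea: entries no longer carry the relocation base in their top 16 bits. Each entry is a full 64-bit pointer, and the 13-bit-shifted segment base is stored as a u16 in a parallel table placed vm::g_exec_addr_seg_offset bytes after vm::g_exec_addr, indexed by addr >> 1. Below is a minimal stand-alone sketch of that addressing under stated assumptions: a tiny local buffer and a small seg_offset stand in for vm::g_exec_addr and the real 0x2'0000'0000 offset, and the target/seg_base values are invented for illustration.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main()
{
    constexpr std::uint64_t seg_offset = 0x1000;            // stand-in for vm::g_exec_addr_seg_offset
    std::vector<std::uint8_t> exec(seg_offset + 0x1000, 0); // jumptable region + segment table region

    const std::uint32_t addr = 0x20;                 // guest PPU address (4-byte aligned)
    const std::uint64_t target = 0x00007f0012345678; // host call target, all 64 bits usable
    const std::uint32_t seg_base = 0x123u << 13;     // segment base, 8 KiB aligned
    const std::uint16_t seg_val = static_cast<std::uint16_t>(seg_base >> 13);

    // ppu_ptr(addr): one 8-byte jumptable entry per 4-byte guest address
    std::memcpy(exec.data() + std::uint64_t{addr} * 2, &target, sizeof(target));
    // ppu_seg_ptr(addr): one 2-byte segment entry per 4-byte guest address
    std::memcpy(exec.data() + seg_offset + (addr >> 1), &seg_val, sizeof(seg_val));

    // What the gateway does on dispatch: load both and rebuild the relocation base.
    std::uint64_t fn;
    std::uint16_t s;
    std::memcpy(&fn, exec.data() + std::uint64_t{addr} * 2, sizeof(fn));
    std::memcpy(&s, exec.data() + seg_offset + (addr >> 1), sizeof(s));
    std::printf("call %#llx, relocation base %#x\n",
                static_cast<unsigned long long>(fn), static_cast<unsigned>(s) << 13);
    return 0;
}
```

This split is what lets the recompiler write plain function pointers (std::bit_cast<u64>(ptr)) into the jumptable and drop the shl/shr masking in the x86 and AArch64 gateways, at the cost of the extra 2 GiB segment region reflected in the larger g_exec_addr reservation.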