Commit ddf9c672ed by Vestral, 2025-04-19 15:52:03 +00:00 (committed by GitHub)
18 changed files with 317 additions and 214 deletions

View file

@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.28)
project(rpcs3 LANGUAGES C CXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11)

View file

@ -344,15 +344,7 @@ jit_runtime_base& asmjit::get_global_runtime()
{
custom_runtime() noexcept
{
// Search starting in first 2 GiB of memory
for (u64 addr = size;; addr += size)
{
if (auto ptr = utils::memory_reserve(size, reinterpret_cast<void*>(addr)))
{
m_pos.raw() = static_cast<uchar*>(ptr);
break;
}
}
ensure(m_pos.raw() = static_cast<uchar*>(utils::memory_reserve(size)));
// Initialize "end" pointer
m_max = m_pos + size;

View file

@ -2490,7 +2490,7 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
if (alert)
{
list.set<0>(_this->m_sync, 0);
list.set<1>(utils::bless<atomic_t<u32>>(&_this->m_taskq)[1], 0);
list.template set<1>(_this->m_taskq);
}
else
{

View file

@ -49,7 +49,7 @@ public:
if (!next)
{
// Do not allow access beyond many elements more at a time
ensure(!installed && index - i < N * 2);
installed = true;
@ -384,17 +384,26 @@ public:
template <typename T>
class lf_queue final
{
atomic_t<u64> m_head{0};
lf_queue_item<T>* load(u64 value) const noexcept
public:
struct fat_ptr
{
return reinterpret_cast<lf_queue_item<T>*>(value >> 16);
u64 ptr{};
u32 is_non_null{};
u32 reserved{};
};
private:
atomic_t<fat_ptr> m_head{fat_ptr{}};
lf_queue_item<T>* load(fat_ptr value) const noexcept
{
return reinterpret_cast<lf_queue_item<T>*>(value.ptr);
}
// Extract all elements and reverse element order (FILO to FIFO)
lf_queue_item<T>* reverse() noexcept
{
if (auto* head = load(m_head) ? load(m_head.exchange(0)) : nullptr)
if (auto* head = load(m_head) ? load(m_head.exchange(fat_ptr{})) : nullptr)
{
if (auto* prev = head->m_link)
{
@ -420,7 +429,7 @@ public:
lf_queue(lf_queue&& other) noexcept
{
m_head.release(other.m_head.exchange(0));
m_head.release(other.m_head.exchange(fat_ptr{}));
}
lf_queue& operator=(lf_queue&& other) noexcept
@ -431,7 +440,7 @@ public:
}
delete load(m_head);
m_head.release(other.m_head.exchange(0));
m_head.release(other.m_head.exchange(fat_ptr{}));
return *this;
}
@ -442,9 +451,9 @@ public:
void wait(std::nullptr_t /*null*/ = nullptr) noexcept
{
if (m_head == 0)
if (!operator bool())
{
utils::bless<atomic_t<u32>>(&m_head)[1].wait(0);
utils::bless<atomic_t<u32>>(&m_head.raw().is_non_null)->wait(0);
}
}
@ -455,7 +464,7 @@ public:
explicit operator bool() const noexcept
{
return m_head != 0;
return observe() != nullptr;
}
template <bool Notify = true, typename... Args>
@ -464,25 +473,25 @@ public:
auto oldv = m_head.load();
auto item = new lf_queue_item<T>(load(oldv), std::forward<Args>(args)...);
while (!m_head.compare_exchange(oldv, reinterpret_cast<u64>(item) << 16))
while (!m_head.compare_exchange(oldv, fat_ptr{reinterpret_cast<u64>(item), item != nullptr, 0}))
{
item->m_link = load(oldv);
}
if (!oldv && Notify)
if (!oldv.ptr && Notify)
{
// Notify only if queue was empty
notify(true);
}
return !oldv;
return !oldv.ptr;
}
void notify(bool force = false)
{
if (force || operator bool())
{
utils::bless<atomic_t<u32>>(&m_head)[1].notify_one();
utils::bless<atomic_t<u32>>(&m_head.raw().is_non_null)->notify_one();
}
}
@ -498,7 +507,7 @@ public:
lf_queue_slice<T> pop_all_reversed()
{
lf_queue_slice<T> result;
result.m_head = load(m_head.exchange(0));
result.m_head = load(m_head.exchange(fat_ptr{}));
return result;
}
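
The head of lf_queue changes from a single 64-bit word holding the pointer shifted left by 16 bits to a 16-byte value, so the pointer is stored verbatim and a dedicated 32-bit flag becomes the word that wait() and notify() operate on. A minimal standalone sketch of that layout, with field names taken from the hunk above (an illustration, not a copy of the repository header):

#include <cstdint>

struct fat_ptr
{
    std::uint64_t ptr{};         // full pointer value, no longer shifted by 16 bits
    std::uint32_t is_non_null{}; // set to 1 when ptr != 0; waiters sleep on this 32-bit word
    std::uint32_t reserved{};
};

static_assert(sizeof(fat_ptr) == 16, "pointer word plus packed 32-bit flag and padding");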

View file

@ -5,13 +5,12 @@ if(MSVC)
add_compile_definitions(
_CRT_SECURE_NO_DEPRECATE=1 _CRT_NON_CONFORMING_SWPRINTFS=1 _SCL_SECURE_NO_WARNINGS=1
NOMINMAX _ENABLE_EXTENDED_ALIGNED_STORAGE=1 _HAS_EXCEPTIONS=0)
add_link_options(/DYNAMICBASE:NO /BASE:0x10000 /FIXED)
add_link_options(/DYNAMICBASE:YES)
#TODO: Some of these could be cleaned up
add_compile_options(/wd4805) # Comparing boolean and int
add_compile_options(/wd4804) # Using integer operators with booleans
add_compile_options(/wd4200) # Zero-sized array in struct/union
add_link_options(/ignore:4281) # Undesirable base address 0x10000
# MSVC 2017 uses iterator as base class internally, causing a lot of warning spam
add_compile_definitions(_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING=1)
@ -19,8 +18,6 @@ if(MSVC)
# Increase stack limit to 8 MB
add_link_options(/STACK:8388608,1048576)
else()
# Some distros have the compilers set to use PIE by default, but RPCS3 doesn't work with PIE, so we need to disable it.
check_cxx_compiler_flag("-no-pie" HAS_NO_PIE)
check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
check_cxx_compiler_flag("-msse -msse2 -mcx16" COMPILER_X86)
if (APPLE)
@ -96,15 +93,6 @@ else()
if(NOT APPLE AND NOT WIN32)
# This hides our LLVM from mesa's LLVM, otherwise we get some unresolvable conflicts.
add_link_options(-Wl,--exclude-libs,ALL)
if(HAS_NO_PIE)
add_link_options(-no-pie)
endif()
elseif(APPLE)
if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
add_link_options(-Wl,-image_base,0x10000 -Wl,-pagezero_size,0x10000)
add_link_options(-Wl,-no_pie)
endif()
elseif(WIN32)
add_compile_definitions(__STDC_FORMAT_MACROS=1)
@ -113,11 +101,6 @@ else()
# Increase stack limit to 8 MB
add_link_options(-Wl,--stack -Wl,8388608)
# For arm64 windows, the image base cannot be below 4GB or the OS rejects the binary without much explanation.
if(COMPILER_X86)
add_link_options(-Wl,--image-base,0x10000)
endif()
endif()
# Specify C++ library to use as standard C++ when using clang (not required on linux due to GNU)

View file

@ -1902,8 +1902,9 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target)
// Take second ghc arg
c.mov(args[0], x86::rbp);
c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)));
c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.jmp(fn_target);
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_base_addr));
c.add(args[2], x86::qword_ptr(args[1]));
c.jmp(Imm(fn_target));
};
}
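
The movabs pattern above recurs throughout this commit: with the fixed 0x10000 image base gone (see the CMake and linker changes), globals such as vm::g_base_addr can end up anywhere in the 64-bit address space, while an x86 memory operand only carries a 32-bit displacement. A hedged asmjit sketch of the replacement idiom as it would appear inside a build_function_asm body like the one above:

// Old form, valid only when the global is reachable through a 32-bit displacement:
//   c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
// ASLR-safe form: materialize the 64-bit address first, then dereference it.
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_base_addr)); // 64-bit immediate load
c.add(args[2], x86::qword_ptr(args[1]));                    // read through the register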

View file

@ -220,19 +220,21 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);
// Initialize args
c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.movabs(x86::r13, reinterpret_cast<u64>(&vm::g_exec_addr));
c.mov(x86::r13, x86::qword_ptr(x86::r13));
c.mov(x86::rbp, args[0]);
c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC
c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::rdx, 1, 0)); // Load call target
c.movabs(x86::r12, vm::g_exec_addr_seg_offset);
c.add(x86::r12, x86::r13);
c.shr(x86::edx, 1);
c.mov(x86::edx, x86::word_ptr(x86::r12, x86::edx)); // Load relocation base
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.mov(x86::r12d, x86::edx); // Set relocation base
c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&vm::g_base_addr));
c.mov(x86::rbx, x86::qword_ptr(x86::rbx));
c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers
c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1)));
c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2)));
@ -346,14 +348,11 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
c.ldr(call_target, arm::Mem(a64::x19, pc));
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.mov(reg_hp, Imm(vm::g_exec_addr_seg_offset));
c.add(reg_hp, reg_hp, pc, arm::Shift(arm::ShiftOp::kLSR, 2));
c.ldrh(reg_hp.w(), arm::Mem(a64::x19, reg_hp));
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, Imm(16));
c.lsr(call_target, call_target, Imm(16));
// Load registers
c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
c.ldr(a64::x22, arm::Mem(a64::x22));
@ -473,6 +472,11 @@ static inline u8* ppu_ptr(u32 addr)
return vm::g_exec_addr + u64{addr} * 2;
}
static inline u8* ppu_seg_ptr(u32 addr)
{
return vm::g_exec_addr + vm::g_exec_addr_seg_offset + (addr >> 1);
}
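
The relocation base is no longer packed into the top 16 bits of each jump-table entry; it now lives g_exec_addr_seg_offset bytes into g_exec_addr, one u16 per 4-byte PPU instruction, and is shifted left by 13 when consumed (see the gateway code earlier). A small illustrative helper, assuming read_from_ptr<u16> behaves like the read_from_ptr<ppu_intrp_func_t> call just below (the helper itself is hypothetical, not part of the commit):

static inline u32 ppu_seg_base(u32 addr)
{
    // Mirror what the generated gateway code does: movzx/ldrh the u16 entry, then shift left by 13
    return u32{read_from_ptr<u16>(ppu_seg_ptr(addr))} << 13;
}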
static inline ppu_intrp_func_t ppu_read(u32 addr)
{
return read_from_ptr<ppu_intrp_func_t>(ppu_ptr(addr));
@ -518,7 +522,7 @@ void ppu_recompiler_fallback(ppu_thread& ppu)
while (true)
{
if (uptr func = uptr(ppu_read(ppu.cia)); (func << 16 >> 16) != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
if (uptr func = uptr(ppu_read(ppu.cia)); func != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
{
// We found a recompiler function at cia, return
break;
@ -773,6 +777,9 @@ extern void ppu_register_range(u32 addr, u32 size)
utils::memory_commit(ppu_ptr(addr), u64{size} * 2, utils::protection::rw);
ensure(vm::page_protect(addr, size, 0, vm::page_executable));
// Segment data
utils::memory_commit(ppu_seg_ptr(addr), size >> 1, utils::protection::rw);
if (g_cfg.core.ppu_debug)
{
utils::memory_commit(vm::g_stat_addr + addr, size);
@ -785,12 +792,13 @@ extern void ppu_register_range(u32 addr, u32 size)
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
{
// Assume addr is the start of first segment of PRX
const uptr entry_value = reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3));
write_to_ptr<uptr>(ppu_ptr(addr), entry_value);
write_to_ptr<uptr>(ppu_ptr(addr), std::bit_cast<uptr>(ppu_recompiler_fallback_ghc));
write_to_ptr<u16>(ppu_seg_ptr(addr), static_cast<u16>(seg_base >> 13));
}
else
{
write_to_ptr<ppu_intrp_func_t>(ppu_ptr(addr), ppu_fallback);
write_to_ptr<u16>(ppu_seg_ptr(addr), 0);
}
addr += 4;
@ -805,7 +813,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr =
// Initialize specific function
if (ptr)
{
write_to_ptr<uptr>(ppu_ptr(addr), (reinterpret_cast<uptr>(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_read(addr)) & ~0xffff'ffff'ffffu));
write_to_ptr<uptr>(ppu_ptr(addr), std::bit_cast<uptr>(ptr));
return;
}
@ -3164,8 +3172,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r14);
c.sub(x86::rsp, 40);
c.sub(x86::rsp, 48);
#ifdef _WIN32
if (!s_tsx_avx)
{
@ -3176,14 +3185,16 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(x86::rbp, -128);
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
c.movzx(args[0].r32(), args[0].r16());
c.shr(args[0].r32(), 1);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.and_(x86::r11, -128 / 2);
c.and_(args[0].r32(), 63);
@ -3217,7 +3228,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
{
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.movabs(x86::r13, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::r13));
c.jae(fall);
});
@ -3342,8 +3354,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.add(x86::rsp, 48);
c.pop(x86::r14);
c.pop(x86::r13);
c.pop(x86::rbp);
maybe_flush_lbr(c);
@ -4179,7 +4192,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
// 2 7MB overlay files -> 14GB
// The growth in memory requirements of LLVM is not linear with file size of course
// But these estimates should hopefully protect RPCS3 in the coming years
// Especially when thread count is on the rise with each CPU generation
atomic_t<u32> file_size_limit = static_cast<u32>(std::clamp<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536, u32{umax}));
const u32 software_thread_limit = std::min<u32>(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue));
@ -4301,8 +4314,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
if (!src && !Emu.klic.empty() && src.open(path))
{
src = decrypt_self(src, reinterpret_cast<u8*>(&Emu.klic[0]));
if (src)
{
ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path);
@ -4333,7 +4346,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
{
if (value)
{
// Allow at least one file, make 0 the "memory unavailable" sign value for atomic waiting efficiency
const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, file_size));
restore_mem = value - new_val;
value = new_val;
@ -4506,8 +4519,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
if (!src && !Emu.klic.empty() && src.open(path))
{
src = decrypt_self(src, reinterpret_cast<u8*>(&Emu.klic[0]));
if (src)
{
ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path);
}
@ -5079,17 +5092,18 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
code_size_until_jump = buf_end - buf_start;
c.add(x86::edx, seg0);
c.mov(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
c.movabs(x86::rax, reinterpret_cast<u64>(&vm::g_exec_addr));
c.mov(x86::rax, x86::qword_ptr(x86::rax));
c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx);
c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.mov(x86::rcx, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target
c.movabs(x86::r12, vm::g_exec_addr_seg_offset);
c.add(x86::rax, x86::r12);
c.shr(x86::edx, 1);
c.mov(x86::edx, x86::word_ptr(x86::rax, x86::edx)); // Load relocation base
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.jmp(x86::rax);
c.mov(x86::r12d, x86::edx); // Set relocation base
c.jmp(x86::rcx);
#else
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
// X19 contains vm::g_exec_addr
@ -5125,14 +5139,11 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.mov(reg_hp, Imm(vm::g_exec_addr_seg_offset));
c.add(reg_hp, reg_hp, pc, arm::Shift(arm::ShiftOp::kLSR, 2));
c.ldrh(reg_hp.w(), arm::Mem(exec_addr, reg_hp));
c.lsl(reg_hp.w(), reg_hp.w(), 13);
// Zero top 16 bits of call target
c.lsl(call_target, call_target, 16);
c.lsr(call_target, call_target, 16);
// Execute LLE call
c.br(call_target);
#endif
@ -5340,7 +5351,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
}
part.jit_bounds = std::move(local_jit_bounds);
local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
}
@ -5400,7 +5411,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose
// Write version, hash, CPU, settings
fmt::append(obj_name, "v6-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
fmt::append(obj_name, "v7-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
}
if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())
@ -5712,7 +5723,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
for (u32 addr = info.segs[0].addr; addr < info.segs[0].addr + info.segs[0].size; addr += 4, inst_ptr++)
{
if (*inst_ptr == ppu_instructions::BLR() && (reinterpret_cast<uptr>(ppu_read(addr)) << 16 >> 16) == reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
if (*inst_ptr == ppu_instructions::BLR() && reinterpret_cast<uptr>(ppu_read(addr)) == reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
{
write_to_ptr<ppu_intrp_func_t>(ppu_ptr(addr), BLR_func);
}

View file

@ -411,12 +411,19 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module<lv2_obj>& info)
const auto faddr = m_ir->CreateLoad(ptr_inst->getResultElementType(), ptr_inst);
const auto faddr_int = m_ir->CreatePtrToInt(faddr, get_type<uptr>());
const auto fval = m_ir->CreateOr(m_ir->CreateShl(m_seg0, 32 + 3), faddr_int);
const auto pos = m_ir->CreateShl(m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc, 1);
const auto pos_32 = m_reloc ? m_ir->CreateAdd(func_pc, m_seg0) : func_pc;
const auto pos = m_ir->CreateShl(pos_32, 1);
const auto ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), m_exec, pos));
const auto seg_base_ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(
m_ir->CreatePtrToInt(m_exec, get_type<u64>()), m_ir->getInt64(vm::g_exec_addr_seg_offset)), m_exec->getType());
const auto seg_pos = m_ir->CreateLShr(pos_32, 1);
const auto seg_ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), seg_base_ptr, seg_pos));
const auto seg_val = m_ir->CreateTrunc(m_ir->CreateLShr(m_seg0, 13), get_type<u16>());
// Store to jumptable
m_ir->CreateStore(fval, ptr);
m_ir->CreateStore(faddr_int, ptr);
m_ir->CreateStore(seg_val, seg_ptr);
// Increment index and branch back to loop
const auto post_add = m_ir->CreateAdd(index_value, m_ir->getInt64(1));
@ -605,10 +612,15 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
const auto pos = m_ir->CreateShl(indirect, 1);
const auto ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), m_exec, pos));
const auto val = m_ir->CreateLoad(get_type<u64>(), ptr);
callee = FunctionCallee(type, m_ir->CreateIntToPtr(m_ir->CreateAnd(val, 0xffff'ffff'ffff), type->getPointerTo()));
callee = FunctionCallee(type, m_ir->CreateIntToPtr(val, type->getPointerTo()));
// Load new segment address
seg0 = m_ir->CreateShl(m_ir->CreateLShr(val, 48), 13);
const auto seg_base_ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(
m_ir->CreatePtrToInt(m_exec, get_type<u64>()), m_ir->getInt64(vm::g_exec_addr_seg_offset)), m_exec->getType());
const auto seg_pos = m_ir->CreateLShr(indirect, 1);
const auto seg_ptr = dyn_cast<GetElementPtrInst>(m_ir->CreateGEP(get_type<u8>(), seg_base_ptr, seg_pos));
const auto seg_val = m_ir->CreateZExt(m_ir->CreateLoad(get_type<u16>(), seg_ptr), get_type<u64>());
seg0 = m_ir->CreateShl(seg_val, 13);
}
m_ir->SetInsertPoint(block);
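
In plain C++ terms, the IR built above stores and reloads the relocation base roughly as follows; this is a sketch of the semantics with hypothetical helper names, assuming write_to_ptr/read_from_ptr behave as in the PPU thread code earlier in this commit:

// Store side (GetSymbolResolver): one u16 per instruction, holding seg0 >> 13
static inline void store_seg_base(u8* exec, u64 pos_32, u64 seg0)
{
    write_to_ptr<u16>(exec + vm::g_exec_addr_seg_offset + (pos_32 >> 1), static_cast<u16>(seg0 >> 13));
}

// Load side (CallFunction): recover the base for an indirect call target
static inline u64 load_seg_base(const u8* exec, u64 indirect)
{
    return u64{read_from_ptr<u16>(exec + vm::g_exec_addr_seg_offset + (indirect >> 1))} << 13;
}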

View file

@ -2770,14 +2770,17 @@ void spu_recompiler::FREST(spu_opcode_t op)
const u64 fraction_lut_addr = reinterpret_cast<u64>(spu_frest_fraction_lut);
const u64 exponent_lut_addr = reinterpret_cast<u64>(spu_frest_exponent_lut);
c->movabs(*arg0, fraction_lut_addr);
c->movabs(*arg1, exponent_lut_addr);
for (u32 index = 0; index < 4; index++)
{
c->pextrd(*qw0, v_fraction, index);
c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2));
c->pinsrd(v_fraction, *qw1, index);
c->pextrd(*qw0, v_exponent, index);
c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2));
c->pinsrd(v_exponent, *qw1, index);
}
@ -2810,14 +2813,17 @@ void spu_recompiler::FRSQEST(spu_opcode_t op)
const u64 fraction_lut_addr = reinterpret_cast<u64>(spu_frsqest_fraction_lut);
const u64 exponent_lut_addr = reinterpret_cast<u64>(spu_frsqest_exponent_lut);
c->movabs(*arg0, fraction_lut_addr);
c->movabs(*arg1, exponent_lut_addr);
for (u32 index = 0; index < 4; index++)
{
c->pextrd(*qw0, v_fraction, index);
c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2));
c->pinsrd(v_fraction, *qw1, index);
c->pextrd(*qw0, v_exponent, index);
c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2));
c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2));
c->pinsrd(v_exponent, *qw1, index);
}

View file

@ -842,6 +842,7 @@ void spu_cache::initialize(bool build_existing_cache)
// Initialize compiler instances for parallel compilation
std::unique_ptr<spu_recompiler_base> compiler;
#if defined(ARCH_X64)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
compiler = spu_recompiler_base::make_asmjit_recompiler();
@ -850,6 +851,22 @@ void spu_cache::initialize(bool build_existing_cache)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#elif defined(ARCH_ARM64)
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#else
#error "Unimplemented"
#endif
compiler->init();
@ -2545,7 +2562,7 @@ bool reg_state_t::is_const() const
bool reg_state_t::compare_tags(const reg_state_t& rhs) const
{
// Compare by tag, address of instruction origin
return tag == rhs.tag && origin == rhs.origin && is_instruction == rhs.is_instruction;
}
@ -6066,7 +6083,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
else if (atomic16->ls_offs.compare_with_mask_indifference(atomic16->lsa, SPU_LS_MASK_128) && atomic16->ls.is_less_than(128 - (atomic16->ls_offs.value & 127)))
{
// Relative memory access with offset less than 128 bytes
// Common around SPU utilities which have less strict restrictions about memory alignment
ok = true;
}
}
@ -6340,7 +6357,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
{
atomic16->mem_count++;
// Do not clear lower 16 bytes addressing because the program can move on 4-byte basis
const u32 offs = spu_branch_target(pos - result.lower_bound, op.si16);
if (atomic16->lsa.is_const() && [&]()
@ -8142,7 +8159,7 @@ std::array<reg_state_t, s_reg_max>& block_reg_info::evaluate_start_state(const s
// Check if the node is resolved
if (!node->has_true_state)
{
// Assume this block cannot be resolved at the moment
is_all_resolved = false;
break;
}

View file

@ -628,6 +628,8 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::rbx);
#ifdef _WIN32
c.sub(x86::rsp, 168);
if (s_tsx_avx)
@ -648,17 +650,21 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14);
c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15);
}
#else
c.sub(x86::rsp, 40);
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(args[1], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(args[1], x86::qword_ptr(args[1]));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
c.prefetchw(x86::byte_ptr(args[1], 0));
c.prefetchw(x86::byte_ptr(args[1], 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
// Prepare data
if (s_tsx_avx)
@ -703,7 +709,8 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
@ -853,8 +860,13 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
}
c.add(x86::rsp, 168);
#else
c.add(x86::rsp, 40);
#endif
c.pop(x86::rbx);
c.pop(x86::rbp);
if (s_tsx_avx)
{
c.vzeroupper();
@ -884,8 +896,10 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
#ifdef _WIN32
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
@ -894,7 +908,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(x86::r11, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::r11, x86::qword_ptr(x86::r11));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.prefetchw(x86::byte_ptr(x86::r11, 0));
c.prefetchw(x86::byte_ptr(x86::r11, 64));
@ -921,7 +936,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(args[1], x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(args[1], reinterpret_cast<u64>(+vm::g_reservations));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
@ -933,7 +949,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.add(x86::qword_ptr(args[3]), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
@ -986,6 +1003,10 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::rbp);
maybe_flush_lbr(c);
c.ret();
#else
@ -1023,11 +1044,13 @@ const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cp
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
@ -1039,7 +1062,8 @@ const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cp
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit1));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
@ -2118,20 +2142,31 @@ spu_thread::spu_thread(lv2_spu_group* group, u32 index, std::string_view name, u
, lv2_id(lv2_id)
, spu_tname(make_single<std::string>(name))
{
#if defined(ARCH_X64)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
jit = spu_recompiler_base::make_asmjit_recompiler();
}
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
#if defined(ARCH_X64)
jit = spu_recompiler_base::make_fast_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#elif defined(ARCH_ARM64)
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
jit = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#else
#error "Unimplemented"
#endif
}
if (g_cfg.core.mfc_debug)
{
@ -2193,20 +2228,31 @@ spu_thread::spu_thread(utils::serial& ar, lv2_spu_group* group)
, lv2_id(ar)
, spu_tname(make_single<std::string>(ar.operator std::string()))
{
#if defined(ARCH_X64)
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
jit = spu_recompiler_base::make_asmjit_recompiler();
}
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
#if defined(ARCH_X64)
jit = spu_recompiler_base::make_fast_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#elif defined(ARCH_ARM64)
if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
jit = spu_recompiler_base::make_llvm_recompiler();
}
else
{
fmt::throw_exception("Unsupported spu decoder '%s'", g_cfg.core.spu_decoder);
}
#else
#error "Unimplemented"
#endif
}
if (g_cfg.core.mfc_debug)
{
@ -4445,7 +4491,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
// Detect "invalid" relative branches
// Branch offsets that, although are the only way to get X code address using relative address
// Rely on overflow/underflow of SPU memory bounds
// Thus they would behave differently if SPU LS memory size was to increase (evolving the CELL architecture was the original plan)
// Making them highly unlikely to be valid code
if (rel < 0)
@ -4666,7 +4712,7 @@ bool spu_thread::process_mfc_cmd()
// Add to chance if previous wait was long enough
const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
: 0;
@ -5004,7 +5050,7 @@ bool spu_thread::process_mfc_cmd()
if (group->spurs_running == max_run - 1)
{
// Try to let another thread slip in and take over execution
thread_ctrl::wait_for(300);
// Update value
@ -5029,7 +5075,7 @@ bool spu_thread::process_mfc_cmd()
if (spurs_last_task_timestamp)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
spurs_last_task_timestamp = 0;
@ -5050,7 +5096,7 @@ bool spu_thread::process_mfc_cmd()
}
max_run = group->max_run;
prev_running = group->spurs_running.fetch_op([max_run](u32& x)
{
if (x < max_run)
@ -5115,7 +5161,7 @@ bool spu_thread::process_mfc_cmd()
if (spurs_last_task_timestamp)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
spurs_last_task_timestamp = 0;

View file

@ -47,7 +47,7 @@ namespace vm
u8* const g_sudo_addr = g_base_addr + 0x1'0000'0000;
// Auxiliary virtual memory for executable areas
u8* const g_exec_addr = memory_reserve_4GiB(g_sudo_addr, 0x200000000);
u8* const g_exec_addr = memory_reserve_4GiB(g_sudo_addr, 0x300000000);
// Hooks for memory R/W interception (default: zero offset to some function with only ret instructions)
u8* const g_hook_addr = memory_reserve_4GiB(g_exec_addr, 0x800000000);

View file

@ -34,6 +34,8 @@ namespace vm
extern u8* const g_free_addr;
extern u8 g_reservations[65536 / 128 * 64];
static constexpr u64 g_exec_addr_seg_offset = 0x2'0000'0000ULL;
struct writer_lock;
enum memory_location_t : uint
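
The 0x2'0000'0000 offset and the enlarged g_exec_addr reservation a few hunks above (0x200000000 growing to 0x300000000) fit together as follows; the per-entry sizes are inferred from ppu_ptr and ppu_seg_ptr earlier in the commit, and the sketch is only a consistency check, not repository code:

// Jump table: 8 bytes per 4-byte PPU instruction over a 4 GiB guest space = 8 GiB
constexpr unsigned long long jump_table_bytes = 0x1'0000'0000ULL * 2;
// Segment table: 2 bytes per instruction, placed right after the jump table
constexpr unsigned long long seg_table_bytes = 0x1'0000'0000ULL / 2;
static_assert(jump_table_bytes == 0x2'0000'0000ULL);                   // matches g_exec_addr_seg_offset
static_assert(0x2'0000'0000ULL + seg_table_bytes <= 0x3'0000'0000ULL); // fits the new 12 GiB reservation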

View file

@ -97,10 +97,9 @@
<IgnoreImportLibrary>true</IgnoreImportLibrary>
<LinkIncremental>false</LinkIncremental>
<OutputFile>$(OutDir)\rpcs3.exe</OutputFile>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<RandomizedBaseAddress>true</RandomizedBaseAddress>
<SubSystem>Windows</SubSystem>
<SuppressStartupBanner>true</SuppressStartupBanner>
<BaseAddress>0x10000</BaseAddress>
<EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
</Link>
<Midl>
@ -148,10 +147,11 @@
<GenerateDebugInformation>Debug</GenerateDebugInformation>
<IgnoreImportLibrary>true</IgnoreImportLibrary>
<OutputFile>$(OutDir)\rpcs3d.exe</OutputFile>
<RandomizedBaseAddress>false</RandomizedBaseAddress>
<RandomizedBaseAddress>true</RandomizedBaseAddress>
<SubSystem>Windows</SubSystem>
<SuppressStartupBanner>true</SuppressStartupBanner>
<BaseAddress>0x10000</BaseAddress>
<BaseAddress>
</BaseAddress>
<EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
</Link>
<Midl>
@ -2123,4 +2123,4 @@
<UserProperties MocDir=".\QTGeneratedFiles\$(ConfigurationName)" Qt5Version_x0020_x64="$(DefaultQtVersion)" RccDir=".\QTGeneratedFiles" UicDir=".\QTGeneratedFiles" />
</VisualStudio>
</ProjectExtensions>
</Project>

View file

@ -398,6 +398,10 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
spu_bg->addButton(ui->spu_asmjit, static_cast<int>(spu_decoder_type::asmjit));
spu_bg->addButton(ui->spu_llvm, static_cast<int>(spu_decoder_type::llvm));
#ifndef ARCH_X64
ui->spu_asmjit->setEnabled(false);
#endif
connect(spu_bg, &QButtonGroup::idToggled, [this](int id, bool checked)
{
if (!checked) return;

View file

@ -57,8 +57,8 @@ static bool has_waitv()
// Total number of entries.
static constexpr usz s_hashtable_size = 1u << 17;
// Reference counter combined with shifted pointer (which is assumed to be 48 bit)
static constexpr uptr s_ref_mask = 0xffff;
// Reference counter mask
static constexpr uptr s_ref_mask = 0xffff'ffff;
// Fix for silly on-first-use initializer
static bool s_null_wait_cb(const void*, u64, u64){ return true; };
@ -153,8 +153,16 @@ namespace
// Essentially a fat semaphore
struct alignas(64) cond_handle
{
// Combined pointer (most significant 48 bits) and ref counter (16 least significant bits)
atomic_t<u64> ptr_ref;
struct fat_ptr
{
u64 ptr{};
u32 reserved{};
u32 ref_ctr{};
auto operator<=>(const fat_ptr& other) const = default;
};
atomic_t<fat_ptr> ptr_ref;
u64 tid;
u32 oldv;
@ -183,7 +191,7 @@ namespace
mtx.init(mtx);
#endif
ensure(!ptr_ref.exchange((iptr << 16) | 1));
ensure(ptr_ref.exchange(fat_ptr{iptr, 0, 1}) == fat_ptr{});
}
void destroy()
@ -370,7 +378,7 @@ namespace
if (cond_id)
{
// Set fake refctr
s_cond_list[cond_id].ptr_ref.release(1);
s_cond_list[cond_id].ptr_ref.release(cond_handle::fat_ptr{0, 0, 1});
cond_free(cond_id, -1);
}
}
@ -390,7 +398,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
{
// Fast reinitialize
const u32 id = std::exchange(*ptls, 0);
s_cond_list[id].ptr_ref.release((iptr << 16) | 1);
s_cond_list[id].ptr_ref.release(cond_handle::fat_ptr{iptr, 0, 1});
return id;
}
@ -461,15 +469,15 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
const auto cond = s_cond_list + cond_id;
// Dereference, destroy on last ref
const bool last = cond->ptr_ref.atomic_op([](u64& val)
const bool last = cond->ptr_ref.atomic_op([](cond_handle::fat_ptr& val)
{
ensure(val & s_ref_mask);
ensure(val.ref_ctr);
val--;
val.ref_ctr--;
if ((val & s_ref_mask) == 0)
if (val.ref_ctr == 0)
{
val = 0;
val = cond_handle::fat_ptr{};
return true;
}
@ -525,15 +533,15 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
while (true)
{
const auto [old, ok] = cond->ptr_ref.fetch_op([&](u64& val)
const auto [old, ok] = cond->ptr_ref.fetch_op([&](cond_handle::fat_ptr& val)
{
if (!val || (val & s_ref_mask) == s_ref_mask)
if (val == cond_handle::fat_ptr{} || val.ref_ctr == s_ref_mask)
{
// Don't reference already deallocated semaphore
return false;
}
if (iptr && (val >> 16) != iptr)
if (iptr && val.ptr != iptr)
{
// Pointer mismatch
return false;
@ -548,7 +556,7 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
if (!did_ref)
{
val++;
val.ref_ctr++;
}
return true;
@ -566,7 +574,7 @@ static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
return cond;
}
if ((old & s_ref_mask) == s_ref_mask)
if (old.ref_ctr == s_ref_mask)
{
fmt::throw_exception("Reference count limit (%u) reached in an atomic notifier.", s_ref_mask);
}
@ -589,12 +597,14 @@ namespace
u64 maxc: 5; // Collision counter
u64 maxd: 11; // Distance counter
u64 bits: 24; // Allocated bits
u64 prio: 24; // Reserved
u64 prio: 8; // Reserved
u64 ref : 16; // Ref counter
u64 iptr: 48; // First pointer to use slot (to count used slots)
u64 iptr: 64; // First pointer to use slot (to count used slots)
};
static_assert(sizeof(slot_allocator) == 16);
// Need to spare 16 bits for ref counter
static constexpr u64 max_threads = 24;
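
A quick bit-budget check of the reshuffled fields above; this is only arithmetic on the diff, not code from the repository:

// New first word: 5 (maxc) + 11 (maxd) + 24 (bits) + 8 (prio) + 16 (ref) = 64 bits
// Second word:    iptr widened from 48 to 64 bits, so the full pointer is kept untruncated
static_assert(5 + 11 + 24 + 8 + 16 == 64);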
@ -935,7 +945,7 @@ atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wa
const auto stamp0 = utils::get_unique_tsc();
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
const uptr iptr = reinterpret_cast<uptr>(data);
uptr iptr_ext[atomic_wait::max_list - 1]{};
@ -956,7 +966,7 @@ atomic_wait_engine::wait(const void* data, u32 old_value, u64 timeout, atomic_wa
}
}
iptr_ext[ext_size] = reinterpret_cast<uptr>(e->data) & (~s_ref_mask >> 16);
iptr_ext[ext_size] = reinterpret_cast<uptr>(e->data);
ext_size++;
}
}
@ -1266,7 +1276,7 @@ void atomic_wait_engine::notify_one(const void* data)
return;
}
#endif
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
const uptr iptr = reinterpret_cast<uptr>(data);
root_info::slot_search(iptr, [&](u32 cond_id)
{
@ -1289,7 +1299,7 @@ atomic_wait_engine::notify_all(const void* data)
return;
}
#endif
const uptr iptr = reinterpret_cast<uptr>(data) & (~s_ref_mask >> 16);
const uptr iptr = reinterpret_cast<uptr>(data);
// Array count for batch notification
u32 count = 0;

View file

@ -205,9 +205,9 @@ namespace atomic_wait
constexpr void set(lf_queue<T2>& var, std::nullptr_t = nullptr)
{
static_assert(Index < Max);
static_assert(sizeof(var) == sizeof(uptr));
static_assert(sizeof(var) == sizeof(uptr) * 2);
m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
m_info[Index].data = reinterpret_cast<char*>(&var) + offsetof(typename lf_queue<T2>::fat_ptr, is_non_null);
m_info[Index].old = 0;
}
@ -215,9 +215,9 @@ namespace atomic_wait
constexpr void set(stx::atomic_ptr<T2>& var, std::nullptr_t = nullptr)
{
static_assert(Index < Max);
static_assert(sizeof(var) == sizeof(uptr));
static_assert(sizeof(var) == sizeof(uptr) * 2);
m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
m_info[Index].data = reinterpret_cast<char*>(&var) + offsetof(typename stx::atomic_ptr<T2>::fat_ptr, is_non_null);
m_info[Index].old = 0;
}
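
Both set() overloads now derive the waitable address from the struct layout instead of a hard-coded +sizeof(u32): with the old packed 8-byte word the interesting 32 bits were its upper half, whereas with the 16-byte fat_ptr they are a named member. A self-contained sketch of that offset, using an illustrative struct that mirrors the fat_ptr definitions elsewhere in this commit:

#include <cstddef>
#include <cstdint>

struct fat_ptr // illustrative copy, not the repository type
{
    std::uint64_t ptr;
    std::uint32_t is_non_null;
    std::uint32_t reserved;
};

// The wait entry now points 8 bytes into the object, at the 32-bit flag:
static_assert(offsetof(fat_ptr, is_non_null) == sizeof(std::uint64_t));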

View file

@ -19,14 +19,8 @@ namespace stx
template <typename T>
class atomic_ptr;
// Basic assumption of userspace pointer size
constexpr uint c_ptr_size = 48;
// Use lower 16 bits as atomic_ptr internal counter of borrowed refs (pointer itself is shifted)
constexpr uint c_ref_mask = 0xffff, c_ref_size = 16;
// Remaining pointer bits
constexpr uptr c_ptr_mask = static_cast<uptr>(-1) << c_ref_size;
// Use 16 bits as atomic_ptr internal counter of borrowed refs
constexpr uint c_ref_mask = 0xffff;
struct shared_counter
{
@ -574,7 +568,6 @@ namespace stx
}
// Random checks which may fail on invalid pointer
ensure((reinterpret_cast<u64>(r.d()->destroy.load()) - 0x10000) >> 47 == 0);
ensure((r.d()->refs++ - 1) >> 58 == 0);
return r;
}
@ -583,11 +576,21 @@ namespace stx
template <typename T>
class atomic_ptr
{
mutable atomic_t<uptr> m_val{0};
static shared_counter* d(uptr val) noexcept
public:
struct fat_ptr
{
return std::launder(reinterpret_cast<shared_counter*>((val >> c_ref_size) - sizeof(shared_counter)));
uptr ptr{};
u32 is_non_null{};
u32 ref_ctr{};
};
private:
mutable atomic_t<fat_ptr> m_val{fat_ptr{}};
static shared_counter* d(fat_ptr val) noexcept
{
return std::launder(reinterpret_cast<shared_counter*>(val.ptr - sizeof(shared_counter)));
}
shared_counter* d() const noexcept
@ -595,14 +598,19 @@ namespace stx
return d(m_val);
}
static uptr to_val(const volatile std::remove_extent_t<T>* ptr) noexcept
static fat_ptr to_val(const volatile std::remove_extent_t<T>* ptr) noexcept
{
return (reinterpret_cast<uptr>(ptr) << c_ref_size);
return fat_ptr{reinterpret_cast<uptr>(ptr), ptr != nullptr, 0};
}
static std::remove_extent_t<T>* ptr_to(uptr val) noexcept
static fat_ptr to_val(uptr ptr) noexcept
{
return reinterpret_cast<std::remove_extent_t<T>*>(val >> c_ref_size);
return fat_ptr{ptr, ptr != 0, 0};
}
static std::remove_extent_t<T>* ptr_to(fat_ptr val) noexcept
{
return reinterpret_cast<std::remove_extent_t<T>*>(val.ptr);
}
template <typename U>
@ -645,7 +653,7 @@ namespace stx
atomic_ptr(const shared_ptr<U>& r) noexcept
{
// Obtain a ref + as many refs as an atomic_ptr can additionally reference
if (uptr rval = to_val(r.m_ptr))
if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0)
{
m_val.raw() = rval;
d(rval)->refs += c_ref_mask + 1;
@ -655,7 +663,7 @@ namespace stx
template <typename U> requires same_ptr_implicit_v<T, U>
atomic_ptr(shared_ptr<U>&& r) noexcept
{
if (uptr rval = to_val(r.m_ptr))
if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0)
{
m_val.raw() = rval;
d(rval)->refs += c_ref_mask;
@ -667,7 +675,7 @@ namespace stx
template <typename U> requires same_ptr_implicit_v<T, U>
atomic_ptr(single_ptr<U>&& r) noexcept
{
if (uptr rval = to_val(r.m_ptr))
if (fat_ptr rval = to_val(r.m_ptr); rval.ptr != 0)
{
m_val.raw() = rval;
d(rval)->refs += c_ref_mask;
@ -678,13 +686,13 @@ namespace stx
~atomic_ptr() noexcept
{
const uptr v = m_val.raw();
const fat_ptr v = m_val.raw();
if (v >> c_ref_size)
if (v.ptr)
{
const auto o = d(v);
if (!o->refs.sub_fetch(c_ref_mask + 1 - (v & c_ref_mask)))
if (!o->refs.sub_fetch(c_ref_mask + 1 - (v.ref_ctr & c_ref_mask)))
{
o->destroy.load()(o);
}
@ -733,11 +741,11 @@ namespace stx
shared_type r;
// Add reference
const auto [prev, did_ref] = m_val.fetch_op([](uptr& val)
const auto [prev, did_ref] = m_val.fetch_op([](fat_ptr& val)
{
if (val >> c_ref_size)
if (val.ptr)
{
val++;
val.ref_ctr++;
return true;
}
@ -755,11 +763,11 @@ namespace stx
r.d()->refs++;
// Dereference if still the same pointer
const auto [_, did_deref] = m_val.fetch_op([prev = prev](uptr& val)
const auto [_, did_deref] = m_val.fetch_op([prev = prev](fat_ptr& val)
{
if (val >> c_ref_size == prev >> c_ref_size)
if (val.ptr == prev.ptr)
{
val--;
val.ref_ctr--;
return true;
}
@ -782,11 +790,11 @@ namespace stx
shared_type r;
// Add reference
const auto [prev, did_ref] = m_val.fetch_op([](uptr& val)
const auto [prev, did_ref] = m_val.fetch_op([](fat_ptr& val)
{
if (val >> c_ref_size)
if (val.ptr)
{
val++;
val.ref_ctr++;
return true;
}
@ -823,11 +831,11 @@ namespace stx
}
// Dereference if still the same pointer
const auto [_, did_deref] = m_val.fetch_op([prev = prev](uptr& val)
const auto [_, did_deref] = m_val.fetch_op([prev = prev](fat_ptr& val)
{
if (val >> c_ref_size == prev >> c_ref_size)
if (val.ptr == prev.ptr)
{
val--;
val.ref_ctr--;
return true;
}
@ -888,7 +896,7 @@ namespace stx
atomic_ptr old;
old.m_val.raw() = m_val.exchange(to_val(r.m_ptr));
old.m_val.raw() += 1;
old.m_val.raw().ref_ctr += 1;
r.m_ptr = std::launder(ptr_to(old.m_val));
return r;
@ -904,7 +912,7 @@ namespace stx
atomic_ptr old;
old.m_val.raw() = m_val.exchange(to_val(value.m_ptr));
old.m_val.raw() += 1;
old.m_val.raw().ref_ctr += 1;
value.m_ptr = std::launder(ptr_to(old.m_val));
return value;
@ -923,21 +931,21 @@ namespace stx
atomic_ptr old;
const uptr _val = m_val.fetch_op([&](uptr& val)
const fat_ptr _val = m_val.fetch_op([&](fat_ptr& val)
{
if (val >> c_ref_size == _old)
if (val.ptr == _old)
{
// Set new value
val = _new << c_ref_size;
val = to_val(_new);
}
else if (val)
else if (val.ptr != 0)
{
// Reference previous value
val++;
val.ref_ctr++;
}
});
if (_val >> c_ref_size == _old)
if (_val.ptr == _old)
{
// Success (exch is consumed, cmp_and_old is unchanged)
if (exch.m_ptr)
@ -954,9 +962,10 @@ namespace stx
old_exch.m_val.raw() = to_val(std::exchange(exch.m_ptr, nullptr));
// Set to reset old cmp_and_old value
old.m_val.raw() = to_val(cmp_and_old.m_ptr) | c_ref_mask;
old.m_val.raw() = to_val(cmp_and_old.m_ptr);
old.m_val.raw().ref_ctr |= c_ref_mask;
if (!_val)
if (!_val.ptr)
{
return false;
}
@ -966,11 +975,11 @@ namespace stx
cmp_and_old.d()->refs++;
// Dereference if still the same pointer
const auto [_, did_deref] = m_val.fetch_op([_val](uptr& val)
const auto [_, did_deref] = m_val.fetch_op([_val](fat_ptr& val)
{
if (val >> c_ref_size == _val >> c_ref_size)
if (val.ptr == _val.ptr)
{
val--;
val.ref_ctr--;
return true;
}
@ -1009,12 +1018,12 @@ namespace stx
atomic_ptr old;
const auto [_val, ok] = m_val.fetch_op([&](uptr& val)
const auto [_val, ok] = m_val.fetch_op([&](fat_ptr& val)
{
if (val >> c_ref_size == _old)
if (val.ptr == _old)
{
// Set new value
val = _new << c_ref_size;
val = to_val(_new);
return true;
}
@ -1081,7 +1090,7 @@ namespace stx
if (next.m_ptr)
{
// Compensation for `next` assignment
old.m_val.raw() += 1;
old.m_val.raw().ref_ctr += 1;
}
}
@ -1093,7 +1102,7 @@ namespace stx
explicit constexpr operator bool() const noexcept
{
return m_val != 0;
return observe() != nullptr;
}
template <typename U> requires same_ptr_implicit_v<T, U>
@ -1110,17 +1119,17 @@ namespace stx
void wait(std::nullptr_t, atomic_wait_timeout timeout = atomic_wait_timeout::inf)
{
utils::bless<atomic_t<u32>>(&m_val)[1].wait(0, timeout);
utils::bless<atomic_t<u32>>(&m_val.raw().is_non_null)->wait(0, timeout);
}
void notify_one()
{
utils::bless<atomic_t<u32>>(&m_val)[1].notify_one();
utils::bless<atomic_t<u32>>(&m_val.raw().is_non_null)->notify_one();
}
void notify_all()
{
utils::bless<atomic_t<u32>>(&m_val)[1].notify_all();
utils::bless<atomic_t<u32>>(&m_val.raw().is_non_null)->notify_all();
}
};