mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2025-04-20 03:24:49 +00:00
cpu_patches: Remove CPU patches for macOS and bump minimum OS version to 15.4 (#2743)
* cpu_patches: Remove CPU patches for macOS and bump minimum OS version to 15.4 * cpu_patches: Remove BMI1 patches These are now only good for very old Intel CPUs that: * Still do not currently function due to other CPU instruction issues. * Will probably be too slow to run shadPS4 well.
This commit is contained in:
parent
afd0251dd2
commit
806b2ddc89
4 changed files with 8 additions and 587 deletions
|
@ -9,7 +9,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
|
|||
|
||||
if(APPLE)
|
||||
list(APPEND ADDITIONAL_LANGUAGES OBJC)
|
||||
set(CMAKE_OSX_DEPLOYMENT_TARGET 14)
|
||||
# Starting with 15.4, Rosetta 2 has support for all the necessary instruction sets.
|
||||
set(CMAKE_OSX_DEPLOYMENT_TARGET 15.4)
|
||||
endif()
|
||||
|
||||
if (NOT CMAKE_BUILD_TYPE)
|
||||
|
|
|
@ -22,10 +22,6 @@
|
|||
#include <windows.h>
|
||||
#else
|
||||
#include <pthread.h>
|
||||
#ifdef __APPLE__
|
||||
#include <half.hpp>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
using namespace Xbyak::util;
|
||||
|
@ -81,538 +77,6 @@ static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& opera
|
|||
return ptr[expression];
|
||||
}
|
||||
|
||||
static u64 ZydisToXbyakImmediateOperand(const ZydisDecodedOperand& operand) {
|
||||
ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_IMMEDIATE,
|
||||
"Expected immediate operand, got type: {}", static_cast<u32>(operand.type));
|
||||
return operand.imm.value.u;
|
||||
}
|
||||
|
||||
static std::unique_ptr<Xbyak::Operand> ZydisToXbyakOperand(const ZydisDecodedOperand& operand) {
|
||||
switch (operand.type) {
|
||||
case ZYDIS_OPERAND_TYPE_REGISTER: {
|
||||
return std::make_unique<Xbyak::Reg>(ZydisToXbyakRegisterOperand(operand));
|
||||
}
|
||||
case ZYDIS_OPERAND_TYPE_MEMORY: {
|
||||
return std::make_unique<Xbyak::Address>(ZydisToXbyakMemoryOperand(operand));
|
||||
}
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported operand type: {}", static_cast<u32>(operand.type));
|
||||
}
|
||||
}
|
||||
|
||||
static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) {
|
||||
if (operand->isREG()) {
|
||||
return operand->getIdx() == index;
|
||||
}
|
||||
if (operand->isMEM()) {
|
||||
const Xbyak::RegExp& reg_exp = operand->getAddress().getRegExp();
|
||||
return reg_exp.getBase().getIdx() == index || reg_exp.getIndex().getIdx() == index;
|
||||
}
|
||||
UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast<u32>(operand->getKind()));
|
||||
}
|
||||
|
||||
static bool IsRegisterAllocated(
|
||||
const std::initializer_list<const Xbyak::Operand*>& allocated_registers, const int index) {
|
||||
return std::ranges::find_if(allocated_registers.begin(), allocated_registers.end(),
|
||||
[index](const Xbyak::Operand* operand) {
|
||||
return OperandUsesRegister(operand, index);
|
||||
}) != allocated_registers.end();
|
||||
}
|
||||
|
||||
static Xbyak::Reg AllocateScratchRegister(
|
||||
const std::initializer_list<const Xbyak::Operand*> allocated_registers, const u32 bits) {
|
||||
for (int index = Xbyak::Operand::R8; index <= Xbyak::Operand::R15; index++) {
|
||||
if (!IsRegisterAllocated(allocated_registers, index)) {
|
||||
return Xbyak::Reg32e(index, static_cast<int>(bits));
|
||||
}
|
||||
}
|
||||
UNREACHABLE_MSG("Out of scratch registers!");
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
static pthread_key_t stack_pointer_slot;
|
||||
static pthread_key_t patch_stack_slot;
|
||||
static std::once_flag patch_context_slots_init_flag;
|
||||
static constexpr u32 patch_stack_size = 0x1000;
|
||||
|
||||
static_assert(sizeof(void*) == sizeof(u64),
|
||||
"Cannot fit a register inside a thread local storage slot.");
|
||||
|
||||
static void FreePatchStack(void* patch_stack) {
|
||||
// Subtract back to the bottom of the stack for free.
|
||||
std::free(static_cast<u8*>(patch_stack) - patch_stack_size);
|
||||
}
|
||||
|
||||
static void InitializePatchContextSlots() {
|
||||
ASSERT_MSG(pthread_key_create(&stack_pointer_slot, nullptr) == 0,
|
||||
"Unable to allocate thread-local register for stack pointer.");
|
||||
ASSERT_MSG(pthread_key_create(&patch_stack_slot, FreePatchStack) == 0,
|
||||
"Unable to allocate thread-local register for patch stack.");
|
||||
}
|
||||
|
||||
void InitializeThreadPatchStack() {
|
||||
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
|
||||
|
||||
pthread_setspecific(patch_stack_slot,
|
||||
static_cast<u8*>(std::malloc(patch_stack_size)) + patch_stack_size);
|
||||
}
|
||||
|
||||
/// Saves the stack pointer to thread local storage and loads the patch stack.
|
||||
static void SaveStack(Xbyak::CodeGenerator& c) {
|
||||
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
|
||||
|
||||
// Save original stack pointer and load patch stack.
|
||||
c.putSeg(gs);
|
||||
c.mov(qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))], rsp);
|
||||
c.putSeg(gs);
|
||||
c.mov(rsp, qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))]);
|
||||
}
|
||||
|
||||
/// Restores the stack pointer from thread local storage.
|
||||
static void RestoreStack(Xbyak::CodeGenerator& c) {
|
||||
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
|
||||
|
||||
// Save patch stack pointer and load original stack.
|
||||
c.putSeg(gs);
|
||||
c.mov(qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))], rsp);
|
||||
c.putSeg(gs);
|
||||
c.mov(rsp, qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))]);
|
||||
}
|
||||
|
||||
/// Validates that the dst register is supported given the SaveStack/RestoreStack implementation.
|
||||
static void ValidateDst(const Xbyak::Reg& dst) {
|
||||
// No restrictions.
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void InitializeThreadPatchStack() {
|
||||
// No-op
|
||||
}
|
||||
|
||||
// NOTE: Since stack pointer here is subtracted through safe zone and not saved anywhere,
|
||||
// it must not be modified during the instruction. Otherwise, we will not be able to find
|
||||
// and load registers back from where they were saved. Thus, a limitation is placed on
|
||||
// instructions, that they must not use the stack pointer register as a destination.
|
||||
|
||||
/// Saves the stack pointer to thread local storage and loads the patch stack.
|
||||
static void SaveStack(Xbyak::CodeGenerator& c) {
|
||||
c.lea(rsp, ptr[rsp - 128]); // red zone
|
||||
}
|
||||
|
||||
/// Restores the stack pointer from thread local storage.
|
||||
static void RestoreStack(Xbyak::CodeGenerator& c) {
|
||||
c.lea(rsp, ptr[rsp + 128]); // red zone
|
||||
}
|
||||
|
||||
/// Validates that the dst register is supported given the SaveStack/RestoreStack implementation.
|
||||
static void ValidateDst(const Xbyak::Reg& dst) {
|
||||
// Stack pointer is not preserved, so it can't be used as a dst.
|
||||
ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "Stack pointer not supported as destination.");
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/// Switches to the patch stack, saves registers, and restores the original stack.
|
||||
static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
|
||||
// Uses a more robust solution for saving registers on MacOS to avoid potential stack corruption
|
||||
// if games decide to not follow the ABI and use the red zone.
|
||||
SaveStack(c);
|
||||
for (const auto& reg : regs) {
|
||||
c.push(reg.cvt64());
|
||||
}
|
||||
RestoreStack(c);
|
||||
}
|
||||
|
||||
/// Switches to the patch stack, restores registers, and restores the original stack.
|
||||
static void RestoreRegisters(Xbyak::CodeGenerator& c,
|
||||
const std::initializer_list<Xbyak::Reg> regs) {
|
||||
SaveStack(c);
|
||||
for (const auto& reg : regs) {
|
||||
c.pop(reg.cvt64());
|
||||
}
|
||||
RestoreStack(c);
|
||||
}
|
||||
|
||||
/// Switches to the patch stack and stores all registers.
|
||||
static void SaveContext(Xbyak::CodeGenerator& c, bool save_flags = false) {
|
||||
SaveStack(c);
|
||||
for (int reg = Xbyak::Operand::RAX; reg <= Xbyak::Operand::R15; reg++) {
|
||||
c.push(Xbyak::Reg64(reg));
|
||||
}
|
||||
c.lea(rsp, ptr[rsp - 32 * 16]);
|
||||
for (int reg = 0; reg <= 15; reg++) {
|
||||
c.vmovdqu(ptr[rsp + 32 * reg], Xbyak::Ymm(reg));
|
||||
}
|
||||
if (save_flags) {
|
||||
c.pushfq();
|
||||
}
|
||||
}
|
||||
|
||||
/// Restores all registers and restores the original stack.
|
||||
/// If the destination is a register, it is not restored to preserve the output.
|
||||
static void RestoreContext(Xbyak::CodeGenerator& c, const Xbyak::Operand& dst,
|
||||
bool restore_flags = false) {
|
||||
if (restore_flags) {
|
||||
c.popfq();
|
||||
}
|
||||
for (int reg = 15; reg >= 0; reg--) {
|
||||
if ((!dst.isXMM() && !dst.isYMM()) || dst.getIdx() != reg) {
|
||||
c.vmovdqu(Xbyak::Ymm(reg), ptr[rsp + 32 * reg]);
|
||||
}
|
||||
}
|
||||
c.lea(rsp, ptr[rsp + 32 * 16]);
|
||||
for (int reg = Xbyak::Operand::R15; reg >= Xbyak::Operand::RAX; reg--) {
|
||||
if (!dst.isREG() || dst.getIdx() != reg) {
|
||||
c.pop(Xbyak::Reg64(reg));
|
||||
} else {
|
||||
c.lea(rsp, ptr[rsp + 8]);
|
||||
}
|
||||
}
|
||||
RestoreStack(c);
|
||||
}
|
||||
|
||||
static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src1 = ZydisToXbyakRegisterOperand(operands[1]);
|
||||
const auto src2 = ZydisToXbyakOperand(operands[2]);
|
||||
ValidateDst(dst);
|
||||
|
||||
// Check if src2 is a memory operand or a register different to dst.
|
||||
// In those cases, we don't need to use a temporary register and are free to modify dst.
|
||||
// In cases where dst and src2 are the same register, a temporary needs to be used to avoid
|
||||
// modifying src2.
|
||||
bool src2_uses_dst = false;
|
||||
if (src2->isMEM()) {
|
||||
const auto base = src2->getAddress().getRegExp().getBase().getIdx();
|
||||
const auto index = src2->getAddress().getRegExp().getIndex().getIdx();
|
||||
src2_uses_dst = base == dst.getIdx() || index == dst.getIdx();
|
||||
} else {
|
||||
ASSERT(src2->isREG());
|
||||
src2_uses_dst = src2->getReg() == dst;
|
||||
}
|
||||
|
||||
if (!src2_uses_dst) {
|
||||
if (dst != src1)
|
||||
c.mov(dst, src1);
|
||||
c.not_(dst);
|
||||
c.and_(dst, *src2);
|
||||
} else {
|
||||
const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit());
|
||||
|
||||
SaveRegisters(c, {scratch});
|
||||
|
||||
c.mov(scratch, src1);
|
||||
c.not_(scratch);
|
||||
c.and_(scratch, *src2);
|
||||
c.mov(dst, scratch);
|
||||
|
||||
RestoreRegisters(c, {scratch});
|
||||
}
|
||||
}
|
||||
|
||||
static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||
const auto start_len = ZydisToXbyakRegisterOperand(operands[2]);
|
||||
ValidateDst(dst);
|
||||
|
||||
const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast<int>(start_len.getBit()));
|
||||
const auto scratch1 =
|
||||
AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit());
|
||||
const auto scratch2 =
|
||||
AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit());
|
||||
|
||||
if (dst.getIdx() == shift.getIdx()) {
|
||||
SaveRegisters(c, {scratch1, scratch2});
|
||||
} else {
|
||||
SaveRegisters(c, {scratch1, scratch2, shift});
|
||||
}
|
||||
|
||||
c.mov(scratch1, *src);
|
||||
if (shift.getIdx() != start_len.getIdx()) {
|
||||
c.mov(shift, start_len);
|
||||
}
|
||||
|
||||
c.shr(scratch1, shift.cvt8());
|
||||
c.shr(shift, 8);
|
||||
c.mov(scratch2, 1);
|
||||
c.shl(scratch2, shift.cvt8());
|
||||
c.dec(scratch2);
|
||||
|
||||
c.mov(dst, scratch1);
|
||||
c.and_(dst, scratch2);
|
||||
|
||||
if (dst.getIdx() == shift.getIdx()) {
|
||||
RestoreRegisters(c, {scratch2, scratch1});
|
||||
} else {
|
||||
RestoreRegisters(c, {shift, scratch2, scratch1});
|
||||
}
|
||||
}
|
||||
|
||||
static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||
ValidateDst(dst);
|
||||
|
||||
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
||||
|
||||
SaveRegisters(c, {scratch});
|
||||
|
||||
// BLSI sets CF to zero if source is zero, otherwise it sets CF to one.
|
||||
Xbyak::Label clear_carry, end;
|
||||
|
||||
c.mov(scratch, *src);
|
||||
c.neg(scratch); // NEG, like BLSI, clears CF if the source is zero and sets it otherwise
|
||||
c.jnc(clear_carry);
|
||||
|
||||
c.and_(scratch, *src);
|
||||
c.stc(); // setting/clearing carry needs to happen after the AND because that clears CF
|
||||
c.jmp(end);
|
||||
|
||||
c.L(clear_carry);
|
||||
c.and_(scratch, *src);
|
||||
// We don't need to clear carry here since AND does that for us
|
||||
|
||||
c.L(end);
|
||||
c.mov(dst, scratch);
|
||||
|
||||
RestoreRegisters(c, {scratch});
|
||||
}
|
||||
|
||||
static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||
ValidateDst(dst);
|
||||
|
||||
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
||||
|
||||
SaveRegisters(c, {scratch});
|
||||
|
||||
Xbyak::Label clear_carry, end;
|
||||
|
||||
// BLSMSK sets CF to zero if source is NOT zero, otherwise it sets CF to one.
|
||||
c.mov(scratch, *src);
|
||||
c.test(scratch, scratch);
|
||||
c.jnz(clear_carry);
|
||||
|
||||
c.dec(scratch);
|
||||
c.xor_(scratch, *src);
|
||||
c.stc();
|
||||
c.jmp(end);
|
||||
|
||||
c.L(clear_carry);
|
||||
c.dec(scratch);
|
||||
c.xor_(scratch, *src);
|
||||
// We don't need to clear carry here since XOR does that for us
|
||||
|
||||
c.L(end);
|
||||
c.mov(dst, scratch);
|
||||
|
||||
RestoreRegisters(c, {scratch});
|
||||
}
|
||||
|
||||
static void GenerateTZCNT(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||
ValidateDst(dst);
|
||||
|
||||
Xbyak::Label src_zero, end;
|
||||
|
||||
c.cmp(*src, 0);
|
||||
c.je(src_zero);
|
||||
|
||||
// If src is not zero, functions like a BSF, but also clears the CF
|
||||
c.bsf(dst, *src);
|
||||
c.clc();
|
||||
c.jmp(end);
|
||||
|
||||
c.L(src_zero);
|
||||
c.mov(dst, operands[0].size);
|
||||
// Since dst is not zero, also set ZF to zero. Testing dst with itself when we know
|
||||
// it isn't zero is a good way to do this.
|
||||
// Use cvt32 to avoid REX/Operand size prefixes.
|
||||
c.test(dst.cvt32(), dst.cvt32());
|
||||
// When source is zero, TZCNT also sets CF.
|
||||
c.stc();
|
||||
|
||||
c.L(end);
|
||||
}
|
||||
|
||||
static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||
ValidateDst(dst);
|
||||
|
||||
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
||||
|
||||
SaveRegisters(c, {scratch});
|
||||
|
||||
Xbyak::Label clear_carry, end;
|
||||
|
||||
// BLSR sets CF to zero if source is NOT zero, otherwise it sets CF to one.
|
||||
c.mov(scratch, *src);
|
||||
c.test(scratch, scratch);
|
||||
c.jnz(clear_carry);
|
||||
|
||||
c.dec(scratch);
|
||||
c.and_(scratch, *src);
|
||||
c.stc();
|
||||
c.jmp(end);
|
||||
|
||||
c.L(clear_carry);
|
||||
c.dec(scratch);
|
||||
c.and_(scratch, *src);
|
||||
// We don't need to clear carry here since AND does that for us
|
||||
|
||||
c.L(end);
|
||||
c.mov(dst, scratch);
|
||||
|
||||
RestoreRegisters(c, {scratch});
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in,
|
||||
const u32 count) {
|
||||
for (u32 i = 0; i < count; i++) {
|
||||
out[i] = half_float::half_cast<float>(in[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void GenerateVCVTPH2PS(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||
|
||||
const auto float_count = dst.getBit() / 32;
|
||||
const auto byte_count = float_count * 4;
|
||||
|
||||
SaveContext(c, true);
|
||||
|
||||
// Allocate stack space for outputs and load into first parameter.
|
||||
c.sub(rsp, byte_count);
|
||||
c.mov(rdi, rsp);
|
||||
|
||||
if (src->isXMM()) {
|
||||
// Allocate stack space for inputs and load into second parameter.
|
||||
c.sub(rsp, byte_count);
|
||||
c.mov(rsi, rsp);
|
||||
|
||||
// Move input to the allocated space.
|
||||
c.movdqu(ptr[rsp], *reinterpret_cast<Xbyak::Xmm*>(src.get()));
|
||||
} else {
|
||||
c.lea(rsi, src->getAddress());
|
||||
}
|
||||
|
||||
// Load float count into third parameter.
|
||||
c.mov(rdx, float_count);
|
||||
|
||||
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPH2PS));
|
||||
c.call(rax);
|
||||
|
||||
if (src->isXMM()) {
|
||||
// Clean up after inputs space.
|
||||
c.add(rsp, byte_count);
|
||||
}
|
||||
|
||||
// Load outputs into destination register and clean up space.
|
||||
if (dst.isYMM()) {
|
||||
c.vmovdqu(*reinterpret_cast<const Xbyak::Ymm*>(&dst), ptr[rsp]);
|
||||
} else {
|
||||
c.movdqu(*reinterpret_cast<const Xbyak::Xmm*>(&dst), ptr[rsp]);
|
||||
}
|
||||
c.add(rsp, byte_count);
|
||||
|
||||
RestoreContext(c, dst, true);
|
||||
}
|
||||
|
||||
using SingleToHalfFloatConverter = half_float::half (*)(float);
|
||||
static const SingleToHalfFloatConverter SingleToHalfFloatConverters[4] = {
|
||||
half_float::half_cast<half_float::half, std::round_to_nearest, float>,
|
||||
half_float::half_cast<half_float::half, std::round_toward_neg_infinity, float>,
|
||||
half_float::half_cast<half_float::half, std::round_toward_infinity, float>,
|
||||
half_float::half_cast<half_float::half, std::round_toward_zero, float>,
|
||||
};
|
||||
|
||||
static __attribute__((sysv_abi)) void PerformVCVTPS2PH(half_float::half* out, const float* in,
|
||||
const u32 count, const u8 rounding_mode) {
|
||||
const auto conversion_func = SingleToHalfFloatConverters[rounding_mode];
|
||||
|
||||
for (u32 i = 0; i < count; i++) {
|
||||
out[i] = conversion_func(in[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void GenerateVCVTPS2PH(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
const auto dst = ZydisToXbyakOperand(operands[0]);
|
||||
const auto src = ZydisToXbyakRegisterOperand(operands[1]);
|
||||
const auto ctrl = ZydisToXbyakImmediateOperand(operands[2]);
|
||||
|
||||
const auto float_count = src.getBit() / 32;
|
||||
const auto byte_count = float_count * 4;
|
||||
|
||||
SaveContext(c, true);
|
||||
|
||||
if (dst->isXMM()) {
|
||||
// Allocate stack space for outputs and load into first parameter.
|
||||
c.sub(rsp, byte_count);
|
||||
c.mov(rdi, rsp);
|
||||
} else {
|
||||
c.lea(rdi, dst->getAddress());
|
||||
}
|
||||
|
||||
// Allocate stack space for inputs and load into second parameter.
|
||||
c.sub(rsp, byte_count);
|
||||
c.mov(rsi, rsp);
|
||||
|
||||
// Move input to the allocated space.
|
||||
if (src.isYMM()) {
|
||||
c.vmovdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Ymm*>(&src));
|
||||
} else {
|
||||
c.movdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Xmm*>(&src));
|
||||
}
|
||||
|
||||
// Load float count into third parameter.
|
||||
c.mov(rdx, float_count);
|
||||
|
||||
// Load rounding mode into fourth parameter.
|
||||
if (ctrl & 4) {
|
||||
// Load from MXCSR.RC.
|
||||
c.stmxcsr(ptr[rsp - 4]);
|
||||
c.mov(rcx, ptr[rsp - 4]);
|
||||
c.shr(rcx, 13);
|
||||
c.and_(rcx, 3);
|
||||
} else {
|
||||
c.mov(rcx, ctrl & 3);
|
||||
}
|
||||
|
||||
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPS2PH));
|
||||
c.call(rax);
|
||||
|
||||
// Clean up after inputs space.
|
||||
c.add(rsp, byte_count);
|
||||
|
||||
if (dst->isXMM()) {
|
||||
// Load outputs into destination register and clean up space.
|
||||
c.movdqu(*reinterpret_cast<Xbyak::Xmm*>(dst.get()), ptr[rsp]);
|
||||
c.add(rsp, byte_count);
|
||||
}
|
||||
|
||||
RestoreContext(c, *dst, true);
|
||||
}
|
||||
|
||||
static bool FilterRosetta2Only(const ZydisDecodedOperand*) {
|
||||
int ret = 0;
|
||||
size_t size = sizeof(ret);
|
||||
if (sysctlbyname("sysctl.proc_translated", &ret, &size, nullptr, 0) != 0) {
|
||||
return false;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else // __APPLE__
|
||||
|
||||
static bool FilterTcbAccess(const ZydisDecodedOperand* operands) {
|
||||
const auto& dst_op = operands[0];
|
||||
const auto& src_op = operands[1];
|
||||
|
@ -657,18 +121,11 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe
|
|||
#endif
|
||||
}
|
||||
|
||||
#endif // __APPLE__
|
||||
|
||||
static bool FilterNoSSE4a(const ZydisDecodedOperand*) {
|
||||
Cpu cpu;
|
||||
return !cpu.has(Cpu::tSSE4a);
|
||||
}
|
||||
|
||||
static bool FilterNoBMI1(const ZydisDecodedOperand*) {
|
||||
Cpu cpu;
|
||||
return !cpu.has(Cpu::tBMI1);
|
||||
}
|
||||
|
||||
static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||
bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE &&
|
||||
operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE;
|
||||
|
@ -940,30 +397,16 @@ struct PatchInfo {
|
|||
};
|
||||
|
||||
static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
|
||||
// SSE4a
|
||||
{ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}},
|
||||
{ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}},
|
||||
|
||||
#if defined(_WIN32)
|
||||
// Windows needs a trampoline.
|
||||
{ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}},
|
||||
#elif !defined(__APPLE__)
|
||||
{ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}},
|
||||
#endif
|
||||
|
||||
{ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}},
|
||||
{ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}},
|
||||
|
||||
// BMI1
|
||||
{ZYDIS_MNEMONIC_ANDN, {FilterNoBMI1, GenerateANDN, true}},
|
||||
{ZYDIS_MNEMONIC_BEXTR, {FilterNoBMI1, GenerateBEXTR, true}},
|
||||
{ZYDIS_MNEMONIC_BLSI, {FilterNoBMI1, GenerateBLSI, true}},
|
||||
{ZYDIS_MNEMONIC_BLSMSK, {FilterNoBMI1, GenerateBLSMSK, true}},
|
||||
{ZYDIS_MNEMONIC_BLSR, {FilterNoBMI1, GenerateBLSR, true}},
|
||||
{ZYDIS_MNEMONIC_TZCNT, {FilterNoBMI1, GenerateTZCNT, true}},
|
||||
|
||||
#ifdef __APPLE__
|
||||
// Patches for instruction sets not supported by Rosetta 2.
|
||||
// F16C
|
||||
{ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}},
|
||||
{ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}},
|
||||
#endif
|
||||
};
|
||||
|
||||
static std::once_flag init_flag;
|
||||
|
@ -1280,18 +723,7 @@ void RegisterPatchModule(void* module_ptr, u64 module_size, void* trampoline_are
|
|||
}
|
||||
|
||||
void PrePatchInstructions(u64 segment_addr, u64 segment_size) {
|
||||
#if defined(__APPLE__)
|
||||
// HACK: For some reason patching in the signal handler at the start of a page does not work
|
||||
// under Rosetta 2. Patch any instructions at the start of a page ahead of time.
|
||||
if (!Patches.empty()) {
|
||||
auto* code_page = reinterpret_cast<u8*>(Common::AlignUp(segment_addr, 0x1000));
|
||||
const auto* end_page = code_page + Common::AlignUp(segment_size, 0x1000);
|
||||
while (code_page < end_page) {
|
||||
TryPatchJit(code_page);
|
||||
code_page += 0x1000;
|
||||
}
|
||||
}
|
||||
#elif !defined(_WIN32)
|
||||
#if !defined(_WIN32) && !defined(__APPLE__)
|
||||
// Linux and others have an FS segment pointing to valid memory, so continue to do full
|
||||
// ahead-of-time patching for now until a better solution is worked out.
|
||||
if (!Patches.empty()) {
|
||||
|
|
|
@ -7,12 +7,6 @@
|
|||
|
||||
namespace Core {
|
||||
|
||||
/// Initializes a stack for the current thread for use by patch implementations.
|
||||
void InitializeThreadPatchStack();
|
||||
|
||||
/// Cleans up the patch stack for the current thread.
|
||||
void CleanupThreadPatchStack();
|
||||
|
||||
/// Registers a module for patching, providing an area to generate trampoline code.
|
||||
void RegisterPatchModule(void* module_ptr, u64 module_size, void* trampoline_area_ptr,
|
||||
u64 trampoline_area_size);
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
#include "common/arch.h"
|
||||
#include "common/assert.h"
|
||||
#include "common/types.h"
|
||||
#include "core/cpu_patches.h"
|
||||
#include "core/libraries/kernel/threads/pthread.h"
|
||||
#include "core/tls.h"
|
||||
|
||||
|
@ -197,12 +196,7 @@ Tcb* GetTcbBase() {
|
|||
thread_local std::once_flag init_tls_flag;
|
||||
|
||||
void EnsureThreadInitialized() {
|
||||
std::call_once(init_tls_flag, [] {
|
||||
#ifdef ARCH_X86_64
|
||||
InitializeThreadPatchStack();
|
||||
#endif
|
||||
SetTcbBase(Libraries::Kernel::g_curthread->tcb);
|
||||
});
|
||||
std::call_once(init_tls_flag, [] { SetTcbBase(Libraries::Kernel::g_curthread->tcb); });
|
||||
}
|
||||
|
||||
} // namespace Core
|
||||
|
|
Loading…
Add table
Reference in a new issue