From cc335105772a20c9b90ea1a68ab6243da53e31b2 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 28 Dec 2024 19:34:31 +0100
Subject: [PATCH] JitArm64: Allocate scratch registers inside EmitBackpatchRoutine

This cuts down on how much callers have to think about what registers
EmitBackpatchRoutine is using. Also, by allocating registers dynamically
instead of using a fixed set of registers, we improve codegen in cases
where the fixed registers are taken but other registers are free. (These
improvements don't apply to the emitting_routine == true case, where
everything still works like before by necessity.)
---
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |  37 ++--
 .../PowerPC/JitArm64/JitArm64_BackPatch.cpp   | 164 +++++++++++++++---
 2 files changed, 165 insertions(+), 36 deletions(-)
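
Illustrative note (not part of the change itself): under the new interface, a call
site only names registers it is happy to have clobbered and leaves the rest to the
routine. A minimal hypothetical call for a 32-bit integer load, assuming the guest
address is already in W1 as the table in Jit.h requires for mode == Auto; dest_reg
and the choice of W11 as a spare register are placeholders, not code from this patch:

    BitSet32 scratch_gprs;
    scratch_gprs[DecodeReg(ARM64Reg::W11)] = true;  // some register this caller no longer needs
    // dest_reg: placeholder for the host register holding the guest destination
    EmitBackpatchRoutine(BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32,
                         MemAccessMode::Auto, dest_reg, ARM64Reg::W1, scratch_gprs);

Any further temporaries the routine needs are now taken from the register cache
instead of being hard-coded.
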
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index a89e8aa01f..f1eb73e160 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -252,29 +252,40 @@ protected:
   //
   // Registers used:
   //
-  //                 addr     scratch
-  // Store:          X2       X1
+  //                 addr
+  // Store:          X2
   // Load:           X1
-  // Zero 256:       X1       X30
-  // Store float:    X2       Q0
+  // Zero 256:       X1
+  // Store float:    X2
   // Load float:     X1
   //
   // If mode == AlwaysFastAccess, the addr argument can be any register.
   // Otherwise it must be the register listed in the table above.
   //
-  // Additional scratch registers are used in the following situations:
+  // This routine allocates most scratch registers dynamically, but in the following
+  // situations, specific scratch registers have to be allocated in advance:
   //
-  // emitting_routine && mode == Auto: X0
-  // emitting_routine && mode == Auto && !(flags & BackPatchInfo::FLAG_STORE): X3
-  // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X3
-  // mode != AlwaysSlowAccess && !jo.fastmem: X0
-  // !emitting_routine && mode != AlwaysFastAccess && jo.memcheck &&
-  //     (flags & BackPatchInfo::FLAG_LOAD): X0
-  // !emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X30
-  // !emitting_routine && mode == Auto && jo.fastmem: X30
+  // emitting_routine && mode == Auto: X0
+  // emitting_routine && mode == Auto && (flags & BackPatchInfo::FLAG_STORE): X1
+  // emitting_routine && mode == Auto && !(flags & BackPatchInfo::FLAG_STORE): X3
+  // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X3
+  // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X0
+  // emitting_routine && mode != AlwaysSlowAccess &&
+  //     (flags & BackPatchInfo::FLAG_STORE) && !(flags & BackPatchInfo::FLAG_FLOAT): X1
+  // emitting_routine && mode != AlwaysSlowAccess &&
+  //     (flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT): Q0
+  // emitting_routine && mode != AlwaysSlowAccess &&
+  //     (flags & BackPatchInfo::FLAG_ZERO_256): X30
+  // !emitting_routine && mode == Auto && jo.fastmem: X30
   //
   // If there are any other registers that the caller doesn't mind being overwritten,
   // these can be indicated in scratch_gprs and scratch_fprs.
+  //
+  // In the following situations, certain host registers must not contain guest registers:
+  //
+  // !emitting_routine && mode != AlwaysFastAccess && jo.memcheck: X30
+  // !emitting_routine && mode != AlwaysFastAccess && jo.memcheck &&
+  //     (flags & BackPatchInfo::FLAG_LOAD): X0
   void EmitBackpatchRoutine(u32 flags, MemAccessMode mode, Arm64Gen::ARM64Reg RS,
                             Arm64Gen::ARM64Reg addr, BitSet32 scratch_gprs = BitSet32(0),
                             BitSet32 scratch_fprs = BitSet32(0), bool emitting_routine = false);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
index 059be57048..ad11385edd 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -65,11 +65,140 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
   const bool emit_fast_access = mode != MemAccessMode::AlwaysSlowAccess;
   const bool emit_slow_access = mode != MemAccessMode::AlwaysFastAccess;
 
-  const BitSet32 gprs_to_push =
+  const bool memcheck = jo.memcheck && !emitting_routine;
+
+  BitSet32 temp_gpr_candidates = scratch_gprs;
+  BitSet32 temp_fpr_candidates = scratch_fprs;
+  temp_gpr_candidates[DecodeReg(addr)] = false;
+  if (flags & BackPatchInfo::FLAG_FLOAT)
+    temp_fpr_candidates[DecodeReg(RS)] = false;
+  else if (!(flags & BackPatchInfo::FLAG_ZERO_256))
+    temp_gpr_candidates[DecodeReg(RS)] = false;
+  if (!emitting_routine && mode == MemAccessMode::Auto && jo.fastmem)
+    temp_gpr_candidates[30] = true;
+
+  const auto allocate_temp_reg = [this](Arm64RegCache& reg_cache,
+                                        BitSet32& candidates) -> Arm64RegCache::ScopedARM64Reg {
+    for (int i : candidates)
+    {
+      candidates[i] = false;
+      ARM64Reg reg = ARM64Reg(i);
+      if (&reg_cache == &fpr)
+        reg = EncodeRegToQuad(reg);
+      return reg;
+    }
+    return reg_cache.GetScopedReg();
+  };
+
+  const auto can_allocate_temp_reg_for_free = [](Arm64RegCache& reg_cache, BitSet32& candidates) {
+    return candidates != BitSet32{} || reg_cache.GetUnlockedRegisterCount() > 0;
+  };
+
+  Arm64RegCache::ScopedARM64Reg temp_gpr_1;
+  Arm64RegCache::ScopedARM64Reg temp_gpr_2;
+  Arm64RegCache::ScopedARM64Reg temp_gpr_3;
+  Arm64RegCache::ScopedARM64Reg temp_fpr_1;
+
+  if (emit_fast_access)
+  {
+    if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
+    {
+      temp_fpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::Q0) :
+                                      allocate_temp_reg(fpr, temp_fpr_candidates);
+      scratch_fprs[DecodeReg(temp_fpr_1)] = true;
+    }
+    else if (flags & BackPatchInfo::FLAG_STORE)
+    {
+      temp_gpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1) :
+                                      allocate_temp_reg(gpr, temp_gpr_candidates);
+      scratch_gprs[DecodeReg(temp_gpr_1)] = true;
+    }
+    else if (flags & BackPatchInfo::FLAG_ZERO_256)
+    {
+      temp_gpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W30) :
+                                      allocate_temp_reg(gpr, temp_gpr_candidates);
+      scratch_gprs[DecodeReg(temp_gpr_1)] = true;
+    }
+
+    if (!jo.fastmem)
+    {
+      temp_gpr_2 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W0) :
+                                      allocate_temp_reg(gpr, temp_gpr_candidates);
+      temp_gpr_3 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W3) :
+                                      allocate_temp_reg(gpr, temp_gpr_candidates);
+      scratch_gprs[DecodeReg(temp_gpr_2)] = true;
+      scratch_gprs[DecodeReg(temp_gpr_3)] = true;
+    }
+    else if (emit_slow_access && emitting_routine)
+    {
+      temp_gpr_2 = ARM64Reg::W0;
+      temp_gpr_3 = flags & BackPatchInfo::FLAG_STORE ? ARM64Reg::W1 : ARM64Reg::W3;
+      scratch_gprs[DecodeReg(temp_gpr_2)] = true;
+      scratch_gprs[DecodeReg(temp_gpr_3)] = true;
+    }
+  }
+
+  // Setting memcheck_temp_gpr to W30 works, but because W30 is a register that needs to be pushed
+  // and popped, using W30 may require us to emit an extra push and pop instruction, depending on
+  // what other registers need pushing and popping. If we can find another register to use without
+  // having to evict anything from the register cache, let's do that instead of using W30.
+  ARM64Reg memcheck_temp_gpr = ARM64Reg::W30;
+  if (emit_slow_access && memcheck)
+  {
+    const auto is_suitable_as_memcheck_temp_gpr = [flags](ARM64Reg reg) {
+      return reg != ARM64Reg::INVALID_REG && reg != ARM64Reg::W30 &&
+             (reg != ARM64Reg::W0 || !(flags & BackPatchInfo::FLAG_LOAD));
+    };
+
+    const auto get_unset_temp_gpr = [&]() -> Arm64RegCache::ScopedARM64Reg& {
+      if (temp_gpr_1 == ARM64Reg::INVALID_REG)
+        return temp_gpr_1;
+      if (temp_gpr_2 == ARM64Reg::INVALID_REG)
+        return temp_gpr_2;
+      ASSERT(temp_gpr_3 == ARM64Reg::INVALID_REG);
+      return temp_gpr_3;
+    };
+
+    if (is_suitable_as_memcheck_temp_gpr(temp_gpr_1))
+    {
+      memcheck_temp_gpr = temp_gpr_1;
+    }
+    else if (is_suitable_as_memcheck_temp_gpr(temp_gpr_2))
+    {
+      memcheck_temp_gpr = temp_gpr_2;
+    }
+    else if (is_suitable_as_memcheck_temp_gpr(temp_gpr_3))
+    {
+      memcheck_temp_gpr = temp_gpr_3;
+    }
+    else
+    {
+      while (can_allocate_temp_reg_for_free(gpr, temp_gpr_candidates))
+      {
+        Arm64RegCache::ScopedARM64Reg& temp_gpr_x = get_unset_temp_gpr();
+        temp_gpr_x = allocate_temp_reg(gpr, temp_gpr_candidates);
+        scratch_gprs[DecodeReg(temp_gpr_x)] = true;
+        if (is_suitable_as_memcheck_temp_gpr(temp_gpr_x))
+          break;
+      }
+    }
+
+    if (temp_fpr_1 == ARM64Reg::INVALID_REG &&
+        can_allocate_temp_reg_for_free(fpr, temp_fpr_candidates))
+    {
+      temp_fpr_1 = allocate_temp_reg(fpr, temp_fpr_candidates);
+      scratch_fprs[DecodeReg(temp_fpr_1)] = true;
+    }
+  }
+
+  BitSet32 gprs_to_push =
       (emitting_routine ? CALLER_SAVED_GPRS : gpr.GetCallerSavedUsed()) & ~scratch_gprs;
-  const BitSet32 fprs_to_push =
+  BitSet32 fprs_to_push =
       (emitting_routine ? BitSet32(0xFFFFFFFF) : fpr.GetCallerSavedUsed()) & ~scratch_fprs;
 
+  if (!emitting_routine && mode == MemAccessMode::Auto && jo.fastmem)
+    gprs_to_push[30] = true;
+
   bool in_far_code = false;
   const u8* fast_access_start = GetCodePtr();
   std::optional<FixupBranch> slow_access_fixup;
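
Aside, illustrative only and not part of the patch: the hunk above prefers registers
the caller offered as scratch and only falls back to the register cache (which may
evict a guest register) when no candidate is left. A minimal standalone sketch of
that first-free-candidate idea, using a plain bitmask instead of Dolphin's BitSet32
and Arm64RegCache:

    #include <bit>
    #include <cstdint>
    #include <optional>

    // Bit i set in 'candidates' means host register i may be clobbered at no cost.
    std::optional<int> take_free_candidate(std::uint32_t& candidates)
    {
      if (candidates == 0)
        return std::nullopt;                         // nothing free; caller falls back to the cache
      const int reg = std::countr_zero(candidates);  // lowest set bit = first usable register
      candidates &= candidates - 1;                  // consume it so it is not handed out twice
      return reg;
    }

allocate_temp_reg does the same thing over a BitSet32 and, when the candidate set is
empty, calls reg_cache.GetScopedReg(), the only path that can evict something; that is
where the codegen improvement described in the commit message comes from.
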
@@ -81,13 +210,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
 
   if (!jo.fastmem)
   {
-    const ARM64Reg temp = emitting_routine ? ARM64Reg::W3 : ARM64Reg::W30;
+    memory_base = EncodeRegTo64(temp_gpr_3);
+    memory_offset = temp_gpr_2;
 
-    memory_base = EncodeRegTo64(temp);
-    memory_offset = ARM64Reg::W0;
-
-    LSR(temp, addr, PowerPC::BAT_INDEX_SHIFT);
-    LDR(memory_base, MEM_REG, ArithOption(temp, true));
+    LSR(temp_gpr_3, addr, PowerPC::BAT_INDEX_SHIFT);
+    LDR(memory_base, MEM_REG, ArithOption(temp_gpr_3, true));
 
     if (emit_slow_access)
     {
@@ -100,15 +227,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
   }
   else if (emit_slow_access && emitting_routine)
   {
-    const ARM64Reg temp1 = flags & BackPatchInfo::FLAG_STORE ? ARM64Reg::W1 : ARM64Reg::W3;
-    const ARM64Reg temp2 = ARM64Reg::W0;
-
-    slow_access_fixup = CheckIfSafeAddress(addr, temp1, temp2);
+    slow_access_fixup = CheckIfSafeAddress(addr, temp_gpr_3, temp_gpr_2);
   }
 
   if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
   {
-    ARM64Reg temp = ARM64Reg::D0;
+    ARM64Reg temp = EncodeRegToDouble(temp_fpr_1);
     temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true);
 
     m_float_emit.STR(access_size, temp, memory_base, memory_offset);
@@ -122,7 +246,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
   }
   else if (flags & BackPatchInfo::FLAG_STORE)
   {
-    ARM64Reg temp = ARM64Reg::W1;
+    ARM64Reg temp = temp_gpr_1;
     temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true);
 
     if (flags & BackPatchInfo::FLAG_SIZE_32)
@@ -135,7 +259,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
   else if (flags & BackPatchInfo::FLAG_ZERO_256)
   {
     // This literally only stores 32bytes of zeros to the target address
-    ARM64Reg temp = ARM64Reg::X30;
+    ARM64Reg temp = EncodeRegTo64(temp_gpr_1);
     ADD(temp, memory_base, memory_offset);
     STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 0);
     STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 16);
@@ -156,8 +280,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
 
   if (emit_slow_access)
   {
-    const bool memcheck = jo.memcheck && !emitting_routine;
-
     if (emit_fast_access)
     {
       in_far_code = true;
@@ -174,12 +296,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
     if (slow_access_fixup)
       SetJumpTarget(*slow_access_fixup);
 
-    const ARM64Reg temp_gpr = ARM64Reg::W1;
-    const int temp_gpr_index = DecodeReg(temp_gpr);
-
     BitSet32 gprs_to_push_early = {};
     if (memcheck)
-      gprs_to_push_early[temp_gpr_index] = true;
+      gprs_to_push_early[DecodeReg(memcheck_temp_gpr)] = true;
     if (flags & BackPatchInfo::FLAG_LOAD)
       gprs_to_push_early[0] = true;
 
@@ -270,11 +389,10 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
 
     if (memcheck)
     {
-      const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0;
       const u64 early_push_count = (gprs_to_push & gprs_to_push_early).Count();
       const u64 early_push_size = Common::AlignUp(early_push_count, 2) * 8;
 
-      WriteConditionalExceptionExit(EXCEPTION_DSI, temp_gpr, temp_fpr, early_push_size);
+      WriteConditionalExceptionExit(EXCEPTION_DSI, memcheck_temp_gpr, temp_fpr_1, early_push_size);
     }
 
     if (flags & BackPatchInfo::FLAG_LOAD)
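
Explanatory note, not part of the patch: in the final hunk, early_push_size is
Common::AlignUp(early_push_count, 2) * 8 bytes. The early-pushed GPRs are stored as
8-byte registers in pairs (STP), so an odd count still consumes an even number of
8-byte slots; with three early-pushed registers, for example, AlignUp(3, 2) * 8 =
4 * 8 = 32 bytes. Passing that size to WriteConditionalExceptionExit lets the
exception exit path account for the registers that were already pushed before the
check.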