From 958b75b707e78e3b5bcc7fa9b54ea5a1d87926de Mon Sep 17 00:00:00 2001
From: degasus
Date: Sat, 12 Aug 2017 22:18:22 +0200
Subject: [PATCH] JitCommon: Restructure the profiler calls.

---
 .../CachedInterpreter/CachedInterpreter.cpp   |  1 -
 Source/Core/Core/PowerPC/Jit64/Jit.cpp        | 28 +++++-----
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp     | 53 +++++++++----------
 Source/Core/Core/PowerPC/JitCommon/JitCache.h | 15 +++---
 Source/Core/Core/PowerPC/JitInterface.cpp     | 10 ++--
 5 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp
index 0067a7f3f4..ff7739905c 100644
--- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp
+++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp
@@ -200,7 +200,6 @@ void CachedInterpreter::Jit(u32 address)
 
   b->checkedEntry = GetCodePtr();
   b->normalEntry = GetCodePtr();
-  b->runCount = 0;
 
   for (u32 i = 0; i < code_block.m_num_instructions; i++)
   {
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 25b391aac7..875a3a298b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -628,7 +628,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
   const u8* start =
       AlignCode4();  // TODO: Test if this or AlignCode16 make a difference from GetCodePtr
   b->checkedEntry = start;
-  b->runCount = 0;
 
   // Downcount flag check. The last block decremented downcounter, and the flag should still be
   // available.
@@ -649,16 +648,13 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
   }
 
   // Conditionally add profiling code.
-  b->ticCounter = 0;
-  b->ticStart = 0;
-  b->ticStop = 0;
   if (Profiler::g_ProfileBlocks)
   {
-    MOV(64, R(RSCRATCH), ImmPtr(&b->runCount));
-    ADD(32, MatR(RSCRATCH), Imm8(1));
-    // get start tic
-    MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&b->ticStart)));
+    MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&b->profile_data.ticStart)));
+    int offset = static_cast<int>(offsetof(JitBlock::ProfileData, runCount)) -
+                 static_cast<int>(offsetof(JitBlock::ProfileData, ticStart));
+    ADD(64, MDisp(ABI_PARAM1, offset), Imm8(1));
     ABI_CallFunction(QueryPerformanceCounter);
   }
 #if defined(_DEBUG) || defined(DEBUGFAST) || defined(NAN_CHECK)
@@ -736,18 +732,20 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
     {
       if (Profiler::g_ProfileBlocks)
       {
-        // WARNING - cmp->branch merging will screw this up.
+        // TODO: Move this to WriteExit() calls.
         BitSet32 registersInUse = CallerSavedRegistersInUse();
         ABI_PushRegistersAndAdjustStack(registersInUse, 0);
         // get end tic
-        MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&b->ticStop)));
+        MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&b->profile_data.ticStop)));
         ABI_CallFunction(QueryPerformanceCounter);
         // tic counter += (end tic - start tic)
-        MOV(64, R(RSCRATCH2), Imm64((u64)b));
-        MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStop)));
-        SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticStart)));
-        ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter)));
-        MOV(64, MDisp(RSCRATCH2, offsetof(struct JitBlock, ticCounter)), R(RSCRATCH));
+        MOV(64, R(RSCRATCH2), Imm64(reinterpret_cast<u64>(&b->profile_data)));
+        MOV(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStop)));
+        SUB(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticStart)));
+        ADD(64, R(RSCRATCH), MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)));
+        ADD(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, downcountCounter)),
+            Imm32(js.downcountAmount));
+        MOV(64, MDisp(RSCRATCH2, offsetof(JitBlock::ProfileData, ticCounter)), R(RSCRATCH));
         ABI_PopRegistersAndAdjustStack(registersInUse, 0);
       }
       js.isLastInstruction = true;
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index a507485043..b7dbf5a161 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -520,10 +520,10 @@ void JitArm64::EmitResetCycleCounters()
   const u32 PMCR_EL0_P = 2;
   const u32 PMCR_EL0_C = 4;
   const u32 PMCR_EL0_LC = 0x40;
-  _MSR(FIELD_PMCR_EL0, X0);
-  MOVI2R(X1, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC);
-  ORR(X0, X0, X1);
-  MRS(X0, FIELD_PMCR_EL0);
+  _MSR(FIELD_PMCR_EL0, X10);
+  MOVI2R(X11, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC);
+  ORR(X10, X10, X11);
+  MRS(X10, FIELD_PMCR_EL0);
 }
 
 void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg)
@@ -533,47 +533,54 @@ void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg)
 
 void JitArm64::BeginTimeProfile(JitBlock* b)
 {
-  b->ticCounter = 0;
-  b->ticStart = 0;
-  b->ticStop = 0;
+  MOVP2R(X0, &b->profile_data);
+  LDR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount));
+  ADD(X1, X1, 1);
 
   if (m_supports_cycle_counter)
   {
     EmitResetCycleCounters();
-    EmitGetCycles(X1);
-    MOVP2R(X0, &b->ticStart);
-    STR(INDEX_UNSIGNED, X1, X0, 0);
+    EmitGetCycles(X2);
+
+    // stores runCount and ticStart
+    STP(INDEX_UNSIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount));
   }
   else
   {
+    STR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount));
+
     MOVP2R(X1, &QueryPerformanceCounter);
-    MOVP2R(X0, &b->ticStart);
+    ADD(X0, X0, offsetof(JitBlock::ProfileData, ticStart));
     BLR(X1);
   }
 }
 
 void JitArm64::EndTimeProfile(JitBlock* b)
 {
+  MOVP2R(X20, &b->profile_data);
   if (m_supports_cycle_counter)
   {
     EmitGetCycles(X2);
-    MOVP2R(X0, &b->ticStart);
   }
   else
   {
     MOVP2R(X1, &QueryPerformanceCounter);
-    MOVP2R(X0, &b->ticStop);
+    ADD(X0, X20, offsetof(JitBlock::ProfileData, ticStop));
     BLR(X1);
 
-    MOVP2R(X0, &b->ticStart);
-    LDR(INDEX_UNSIGNED, X2, X0, 8);  // Stop
+    LDR(INDEX_UNSIGNED, X2, X20, offsetof(JitBlock::ProfileData, ticStop));
   }
 
-  LDR(INDEX_UNSIGNED, X1, X0, 0);   // Start
-  LDR(INDEX_UNSIGNED, X3, X0, 16);  // Counter
+  LDR(INDEX_UNSIGNED, X1, X20, offsetof(JitBlock::ProfileData, ticStart));
+
+  // loads ticCounter and downcountCounter
+  LDP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter));
   SUB(X2, X2, X1);
   ADD(X3, X3, X2);
-  STR(INDEX_UNSIGNED, X3, X0, 16);
+  ADDI2R(X4, X4, js.downcountAmount);
+
+  // stores ticCounter and downcountCounter
+  STP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter));
 }
 
 void JitArm64::Run()
@@ -657,7 +664,6 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock*
 
   const u8* start = GetCodePtr();
   b->checkedEntry = start;
-  b->runCount = 0;
 
   // Downcount flag check, Only valid for linked blocks
   {
@@ -673,15 +679,6 @@ void JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock*
   // Conditionally add profiling code.
   if (Profiler::g_ProfileBlocks)
   {
-    ARM64Reg WA = gpr.GetReg();
-    ARM64Reg WB = gpr.GetReg();
-    ARM64Reg XA = EncodeRegTo64(WA);
-    ARM64Reg XB = EncodeRegTo64(WB);
-    MOVP2R(XA, &b->runCount);
-    LDR(INDEX_UNSIGNED, XB, XA, 0);
-    ADD(XB, XB, 1);
-    STR(INDEX_UNSIGNED, XB, XA, 0);
-    gpr.Unlock(WA, WB);
     // get start tic
     BeginTimeProfile(b);
   }
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
index 76e2d4dbf9..7b8e44b1b2 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h
@@ -49,7 +49,6 @@ struct JitBlock
   // The number of PPC instructions represented by this block. Mostly
   // useful for logging.
   u32 originalSize;
-  int runCount;  // for profiling.
 
   // Information about exits to a known address from this block.
   // This is used to implement block linking.
@@ -65,11 +64,15 @@ struct JitBlock
   // This set stores all physical addresses of all occupied instructions.
   std::set<u32> physical_addresses;
 
-  // we don't really need to save start and stop
-  // TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;)
-  u64 ticStart;    // for profiling - time.
-  u64 ticStop;     // for profiling - time.
-  u64 ticCounter;  // for profiling - time.
+  // Block profiling data, structure is inlined in Jit.cpp
+  struct ProfileData
+  {
+    u64 ticCounter;
+    u64 downcountCounter;
+    u64 runCount;
+    u64 ticStart;
+    u64 ticStop;
+  } profile_data = {};
 
   // This tracks the position if this block within the fast block cache.
   // We allow each block to have only one map entry.
diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp b/Source/Core/Core/PowerPC/JitInterface.cpp
index 2369eb75d6..284b7ea366 100644
--- a/Source/Core/Core/PowerPC/JitInterface.cpp
+++ b/Source/Core/Core/PowerPC/JitInterface.cpp
@@ -119,12 +119,12 @@ void GetProfileResults(ProfileStats* prof_stats)
   QueryPerformanceFrequency((LARGE_INTEGER*)&prof_stats->countsPerSec);
 
   g_jit->GetBlockCache()->RunOnBlocks([&prof_stats](const JitBlock& block) {
-    // Rough heuristic. Mem instructions should cost more.
-    u64 cost = block.originalSize * (block.runCount / 4);
-    u64 timecost = block.ticCounter;
+    const auto& data = block.profile_data;
+    u64 cost = data.downcountCounter;
+    u64 timecost = data.ticCounter;
     // Todo: tweak.
-    if (block.runCount >= 1)
-      prof_stats->block_stats.emplace_back(block.effectiveAddress, cost, timecost, block.runCount,
+    if (data.runCount >= 1)
+      prof_stats->block_stats.emplace_back(block.effectiveAddress, cost, timecost, data.runCount,
                                            block.codeSize);
     prof_stats->cost_sum += cost;
     prof_stats->timecost_sum += timecost;
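
Note (not part of the patch itself): grouping the five counters into JitBlock::ProfileData is what lets the emitted profiler prologue/epilogue address every field from a single base pointer (&b->profile_data) plus offsetof() displacements, and keeping ticCounter and downcountCounter adjacent is what enables the AArch64 LDP/STP pairing above. Below is a minimal host-side C++ sketch of what the emitted sequence accounts for on each block run; the helper names ReadTimeBase and SimulateBlockRun are illustrative only and are not Dolphin APIs.

#include <chrono>
#include <cstdint>

// Mirrors the ProfileData struct added to JitCache.h in this patch.
struct ProfileData
{
  uint64_t ticCounter = 0;        // accumulated time spent inside the block
  uint64_t downcountCounter = 0;  // accumulated estimated PPC cycles (cost)
  uint64_t runCount = 0;          // number of times the block was entered
  uint64_t ticStart = 0;          // scratch: timestamp taken at block entry
  uint64_t ticStop = 0;           // scratch: timestamp taken at block exit
};

// Stand-in for QueryPerformanceCounter / the ARM cycle counter (illustrative).
static uint64_t ReadTimeBase()
{
  return static_cast<uint64_t>(
      std::chrono::steady_clock::now().time_since_epoch().count());
}

// What one block execution contributes, per the emitted prologue/epilogue
// (illustrative; the real code is generated by the JIT, not called like this).
static void SimulateBlockRun(ProfileData& data, uint64_t downcount_amount)
{
  data.runCount += 1;              // prologue: runCount++
  data.ticStart = ReadTimeBase();  // prologue: get start tic

  // ... the translated PPC block would execute here ...

  data.ticStop = ReadTimeBase();                    // epilogue: get end tic
  data.ticCounter += data.ticStop - data.ticStart;  // accumulate time
  data.downcountCounter += downcount_amount;        // accumulate cost
}

With this layout, GetProfileResults() can report downcountCounter directly as the block cost, replacing the old originalSize * (runCount / 4) heuristic.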