Add optimizations related to caller/callee saved registers, thread synchronization and disable tier 0

This commit is contained in:
gdkchan 2019-02-25 01:08:38 -03:00
parent 5001f78b1d
commit b73a8ef01d
13 changed files with 204 additions and 124 deletions

View file

@ -60,6 +60,8 @@ namespace ChocolArm64.Instructions
{
OpCodeBReg64 op = (OpCodeBReg64)context.CurrOp;
context.HasIndirectJump = true;
context.EmitStoreState();
context.EmitLdintzr(op.Rn);

View file

@ -86,7 +86,11 @@ namespace ChocolArm64.Instructions
context.EmitLdarg(TranslatedSub.StateArgIdx);
context.EmitLdtmp();
context.EmitPrivateCall(typeof(Translator), nameof(Translator.GetOrTranslateVirtualSubroutine));
string name = isJump
? nameof(Translator.GetOrTranslateVirtualSubroutineForJump)
: nameof(Translator.GetOrTranslateVirtualSubroutine);
context.EmitPrivateCall(typeof(Translator), name);
context.EmitLdarg(TranslatedSub.StateArgIdx);
context.EmitLdarg(TranslatedSub.MemoryArgIdx);

View file

@ -2,21 +2,23 @@ using System.Runtime.Intrinsics.X86;
public static class Optimizations
{
internal static bool FastFP = true;
public static bool AssumeStrictAbiCompliance { get; set; } = true;
private static bool _useAllSseIfAvailable = true;
public static bool FastFP { get; set; } = true;
private static bool _useSseIfAvailable = true;
private static bool _useSse2IfAvailable = true;
private static bool _useSse3IfAvailable = true;
private static bool _useSsse3IfAvailable = true;
private static bool _useSse41IfAvailable = true;
private static bool _useSse42IfAvailable = true;
private const bool UseAllSseIfAvailable = true;
internal static bool UseSse = (_useAllSseIfAvailable && _useSseIfAvailable) && Sse.IsSupported;
internal static bool UseSse2 = (_useAllSseIfAvailable && _useSse2IfAvailable) && Sse2.IsSupported;
internal static bool UseSse3 = (_useAllSseIfAvailable && _useSse3IfAvailable) && Sse3.IsSupported;
internal static bool UseSsse3 = (_useAllSseIfAvailable && _useSsse3IfAvailable) && Ssse3.IsSupported;
internal static bool UseSse41 = (_useAllSseIfAvailable && _useSse41IfAvailable) && Sse41.IsSupported;
internal static bool UseSse42 = (_useAllSseIfAvailable && _useSse42IfAvailable) && Sse42.IsSupported;
}
public static bool UseSseIfAvailable { get; set; } = UseAllSseIfAvailable;
public static bool UseSse2IfAvailable { get; set; } = UseAllSseIfAvailable;
public static bool UseSse3IfAvailable { get; set; } = UseAllSseIfAvailable;
public static bool UseSsse3IfAvailable { get; set; } = UseAllSseIfAvailable;
public static bool UseSse41IfAvailable { get; set; } = UseAllSseIfAvailable;
public static bool UseSse42IfAvailable { get; set; } = UseAllSseIfAvailable;
internal static bool UseSse => UseSseIfAvailable && Sse.IsSupported;
internal static bool UseSse2 => UseSse2IfAvailable && Sse2.IsSupported;
internal static bool UseSse3 => UseSse3IfAvailable && Sse3.IsSupported;
internal static bool UseSsse3 => UseSsse3IfAvailable && Ssse3.IsSupported;
internal static bool UseSse41 => UseSse41IfAvailable && Sse41.IsSupported;
internal static bool UseSse42 => UseSse42IfAvailable && Sse42.IsSupported;
}

View file

@ -4,13 +4,13 @@ namespace ChocolArm64.Translation
{
class ILBlock : IILEmit
{
public long IntInputs { get; private set; }
public long IntOutputs { get; private set; }
public long IntAwOutputs { get; private set; }
public long IntInputs { get; private set; }
public long IntOutputs { get; private set; }
private long _intAwOutputs;
public long VecInputs { get; private set; }
public long VecOutputs { get; private set; }
public long VecAwOutputs { get; private set; }
public long VecInputs { get; private set; }
public long VecOutputs { get; private set; }
private long _vecAwOutputs;
public bool HasStateStore { get; private set; }
@ -34,16 +34,16 @@ namespace ChocolArm64.Translation
//opcodes emitted by each ARM instruction.
//We can only consider the new outputs for doing input elimination
//after all the CIL opcodes used by the instruction being emitted.
IntAwOutputs = IntOutputs;
VecAwOutputs = VecOutputs;
_intAwOutputs = IntOutputs;
_vecAwOutputs = VecOutputs;
}
else if (emitter is ILOpCodeLoad ld && ILMethodBuilder.IsRegIndex(ld.Index))
{
switch (ld.IoType)
{
case IoType.Flag: IntInputs |= ((1L << ld.Index) << 32) & ~IntAwOutputs; break;
case IoType.Int: IntInputs |= (1L << ld.Index) & ~IntAwOutputs; break;
case IoType.Vector: VecInputs |= (1L << ld.Index) & ~VecAwOutputs; break;
case IoType.Flag: IntInputs |= ((1L << ld.Index) << 32) & ~_intAwOutputs; break;
case IoType.Int: IntInputs |= (1L << ld.Index) & ~_intAwOutputs; break;
case IoType.Vector: VecInputs |= (1L << ld.Index) & ~_vecAwOutputs; break;
}
}
else if (emitter is ILOpCodeStore st && ILMethodBuilder.IsRegIndex(st.Index))

View file

@ -31,6 +31,8 @@ namespace ChocolArm64.Translation
public Aarch32Mode Mode { get; } = Aarch32Mode.User; //TODO
public bool HasIndirectJump { get; set; }
private Dictionary<Block, ILBlock> _visitedBlocks;
private Queue<Block> _branchTargets;
@ -91,7 +93,12 @@ namespace ChocolArm64.Translation
ResetBlockState();
AdvanceOpCode();
if (AdvanceOpCode())
{
EmitSynchronization();
_ilBlock.Add(new ILOpCodeLoadState(_ilBlock, isSubEntry: true));
}
}
public static int GetIntTempIndex()
@ -127,10 +134,18 @@ namespace ChocolArm64.Translation
return;
}
if (_opcIndex == 0)
int opcIndex = _opcIndex;
if (opcIndex == 0)
{
MarkLabel(GetLabel(_currBlock.Position));
}
bool isLastOp = opcIndex == CurrBlock.OpCodes.Count - 1;
if (isLastOp && CurrBlock.Branch != null &&
(ulong)CurrBlock.Branch.Position <= (ulong)CurrBlock.Position)
{
EmitSynchronization();
}
@ -161,7 +176,7 @@ namespace ChocolArm64.Translation
//of the next instruction to be executed (in the case that the condition
//is false, and the branch was not taken, as all basic blocks should end with
//some kind of branch).
if (CurrOp == CurrBlock.GetLastOp() && CurrBlock.Next == null)
if (isLastOp && CurrBlock.Next == null)
{
EmitStoreState();
EmitLdc_I8(CurrOp.Position + CurrOp.OpCodeSizeInBytes);
@ -285,7 +300,7 @@ namespace ChocolArm64.Translation
return;
}
_queue.Enqueue(new TranslatorQueueItem(position, mode, TranslationTier.Tier1));
_queue.Enqueue(new TranslatorQueueItem(position, mode, TranslationTier.Tier1, isComplete: true));
}
public bool TryOptEmitSubroutineCall()

View file

@ -6,7 +6,7 @@ namespace ChocolArm64.Translation
{
private bool _hasLabel;
private Label _lbl;
private Label _label;
public void Emit(ILMethodBuilder context)
{
@ -17,12 +17,12 @@ namespace ChocolArm64.Translation
{
if (!_hasLabel)
{
_lbl = context.Generator.DefineLabel();
_label = context.Generator.DefineLabel();
_hasLabel = true;
}
return _lbl;
return _label;
}
}
}

View file

@ -18,17 +18,29 @@ namespace ChocolArm64.Translation
private string _subName;
public bool IsAarch64 { get; }
public bool IsSubComplete { get; }
private int _localsCount;
public ILMethodBuilder(ILBlock[] ilBlocks, string subName)
public ILMethodBuilder(
ILBlock[] ilBlocks,
string subName,
bool isAarch64,
bool isSubComplete = false)
{
_ilBlocks = ilBlocks;
_subName = subName;
_ilBlocks = ilBlocks;
_subName = subName;
IsAarch64 = isAarch64;
IsSubComplete = isSubComplete;
}
public TranslatedSub GetSubroutine(TranslationTier tier)
{
LocalAlloc = new LocalAlloc(_ilBlocks, _ilBlocks[0]);
LocalAlloc = new LocalAlloc();
LocalAlloc.BuildUses(_ilBlocks[0]);
DynamicMethod method = new DynamicMethod(_subName, typeof(long), TranslatedSub.FixedArgTypes);
@ -40,8 +52,6 @@ namespace ChocolArm64.Translation
_localsCount = 0;
new ILOpCodeLoadState(_ilBlocks[0]).Emit(this);
foreach (ILBlock ilBlock in _ilBlocks)
{
ilBlock.Emit(this);

View file

@ -4,7 +4,7 @@ namespace ChocolArm64.Translation
{
struct ILOpCodeBranch : IILEmit
{
private OpCode _ilOp;
private OpCode _ilOp;
private ILLabel _label;
public ILOpCodeBranch(OpCode ilOp, ILLabel label)

View file

@ -7,9 +7,12 @@ namespace ChocolArm64.Translation
{
private ILBlock _block;
public ILOpCodeLoadState(ILBlock block)
private bool _isSubEntry;
public ILOpCodeLoadState(ILBlock block, bool isSubEntry = false)
{
_block = block;
_block = block;
_isSubEntry = isSubEntry;
}
public void Emit(ILMethodBuilder context)
@ -17,6 +20,12 @@ namespace ChocolArm64.Translation
long intInputs = context.LocalAlloc.GetIntInputs(_block);
long vecInputs = context.LocalAlloc.GetVecInputs(_block);
if (Optimizations.AssumeStrictAbiCompliance && context.IsSubComplete)
{
intInputs = LocalAlloc.ClearCallerSavedIntRegs(intInputs, context.IsAarch64);
vecInputs = LocalAlloc.ClearCallerSavedVecRegs(vecInputs, context.IsAarch64);
}
LoadLocals(context, intInputs, RegisterType.Int);
LoadLocals(context, vecInputs, RegisterType.Vector);
}

View file

@ -17,6 +17,12 @@ namespace ChocolArm64.Translation
long intOutputs = context.LocalAlloc.GetIntOutputs(_block);
long vecOutputs = context.LocalAlloc.GetVecOutputs(_block);
if (Optimizations.AssumeStrictAbiCompliance && context.IsSubComplete)
{
intOutputs = LocalAlloc.ClearCallerSavedIntRegs(intOutputs, context.IsAarch64);
vecOutputs = LocalAlloc.ClearCallerSavedVecRegs(vecOutputs, context.IsAarch64);
}
StoreLocals(context, intOutputs, RegisterType.Int);
StoreLocals(context, vecOutputs, RegisterType.Vector);
}

View file

@ -5,6 +5,11 @@ namespace ChocolArm64.Translation
{
class LocalAlloc
{
public const long CallerSavedIntRegistersMask = 0x7fL << 9;
public const long PStateNzcvFlagsMask = 0xfL << 60;
public const long CallerSavedVecRegistersMask = 0xffffL << 16;
private class PathIo
{
private Dictionary<ILBlock, long> _allInputs;
@ -57,15 +62,40 @@ namespace ChocolArm64.Translation
private Dictionary<ILBlock, PathIo> _intPaths;
private Dictionary<ILBlock, PathIo> _vecPaths;
private HashSet<ILBlock> _entryBlocks;
private struct BlockIo
{
public ILBlock Block;
public ILBlock Entry;
public ILBlock Block { get; }
public ILBlock Entry { get; }
public long IntInputs;
public long VecInputs;
public long IntOutputs;
public long VecOutputs;
public long IntInputs { get; set; }
public long VecInputs { get; set; }
public long IntOutputs { get; set; }
public long VecOutputs { get; set; }
//Creates a BlockIo that pairs a block with the entry block it was
//reached from, with all four input/output masks empty.
public BlockIo(ILBlock block, ILBlock entry)
{
    Entry = entry;
    Block = block;

    //Every mask starts out as zero (no bits set).
    IntInputs  = 0;
    IntOutputs = 0;
    VecInputs  = 0;
    VecOutputs = 0;
}
//Creates a BlockIo for the given block and entry, seeded with the
//supplied integer and vector input/output masks.
public BlockIo(
    ILBlock block,
    ILBlock entry,
    long    intInputs,
    long    vecInputs,
    long    intOutputs,
    long    vecOutputs) : this(block, entry)
{
    //Block and Entry were already set by the chained constructor,
    //only the masks it zeroed are overwritten here.
    IntInputs  = intInputs;
    IntOutputs = intOutputs;

    VecInputs  = vecInputs;
    VecOutputs = vecOutputs;
}
public override bool Equals(object obj)
{
@ -98,25 +128,15 @@ namespace ChocolArm64.Translation
}
}
private const int MaxOptGraphLength = 40;
public LocalAlloc(ILBlock[] graph, ILBlock entry)
public LocalAlloc()
{
_intPaths = new Dictionary<ILBlock, PathIo>();
_vecPaths = new Dictionary<ILBlock, PathIo>();
if (graph.Length > 1 &&
graph.Length < MaxOptGraphLength)
{
InitializeOptimal(graph, entry);
}
else
{
InitializeFast(graph);
}
_entryBlocks = new HashSet<ILBlock>();
}
private void InitializeOptimal(ILBlock[] graph, ILBlock entry)
public void BuildUses(ILBlock entry)
{
//This will go through all possible paths on the graph,
//and store all inputs/outputs for each block. A register
@ -133,19 +153,15 @@ namespace ChocolArm64.Translation
void Enqueue(BlockIo block)
{
if (!visited.Contains(block))
if (visited.Add(block))
{
unvisited.Enqueue(block);
visited.Add(block);
}
}
Enqueue(new BlockIo()
{
Block = entry,
Entry = entry
});
_entryBlocks.Add(entry);
Enqueue(new BlockIo(entry, entry));
while (unvisited.Count > 0)
{
@ -177,19 +193,23 @@ namespace ChocolArm64.Translation
void EnqueueFromCurrent(ILBlock block, bool retTarget)
{
BlockIo blockIo = new BlockIo() { Block = block };
BlockIo blockIo;
if (retTarget)
{
blockIo.Entry = block;
blockIo = new BlockIo(block, block);
_entryBlocks.Add(block);
}
else
{
blockIo.Entry = current.Entry;
blockIo.IntInputs = current.IntInputs;
blockIo.VecInputs = current.VecInputs;
blockIo.IntOutputs = current.IntOutputs;
blockIo.VecOutputs = current.VecOutputs;
blockIo = new BlockIo(
block,
current.Entry,
current.IntInputs,
current.VecInputs,
current.IntOutputs,
current.VecOutputs);
}
Enqueue(blockIo);
@ -207,38 +227,6 @@ namespace ChocolArm64.Translation
}
}
//Registers the same combined input/output masks for every block of the graph.
//This is much faster than InitializeOptimal, but it produces unneeded
//loads and stores, so the resulting code will be slower.
private void InitializeFast(ILBlock[] graph)
{
    long allIntInputs  = 0;
    long allIntOutputs = 0;
    long allVecInputs  = 0;
    long allVecOutputs = 0;

    //Accumulate the union of the masks of all blocks.
    for (int index = 0; index < graph.Length; index++)
    {
        ILBlock current = graph[index];

        allIntInputs  |= current.IntInputs;
        allIntOutputs |= current.IntOutputs;
        allVecInputs  |= current.VecInputs;
        allVecOutputs |= current.VecOutputs;
    }

    //Not every code path necessarily writes to all of the output registers.
    //Storing an output register that was never written would store zero
    //and destroy the old register value, so with more than one block
    //all outputs are also treated as inputs to ensure they are loaded.
    bool hasMultipleBlocks = graph.Length > 1;

    if (hasMultipleBlocks)
    {
        allIntInputs |= allIntOutputs;
        allVecInputs |= allVecOutputs;
    }

    //Every block gets the same combined masks.
    for (int index = 0; index < graph.Length; index++)
    {
        ILBlock current = graph[index];

        _intPaths.Add(current, new PathIo(current, allIntInputs, allIntOutputs));
        _vecPaths.Add(current, new PathIo(current, allVecInputs, allVecOutputs));
    }
}
public long GetIntInputs(ILBlock root) => GetInputsImpl(root, _intPaths.Values);
public long GetVecInputs(ILBlock root) => GetInputsImpl(root, _vecPaths.Values);
@ -256,5 +244,29 @@ namespace ChocolArm64.Translation
public long GetIntOutputs(ILBlock block) => _intPaths[block].GetOutputs();
public long GetVecOutputs(ILBlock block) => _vecPaths[block].GetOutputs();
//Returns the mask with the bits selected by CallerSavedIntRegistersMask
//(bits 9 to 15) and PStateNzcvFlagsMask (bits 60 to 63) cleared when
//isAarch64 is true. Otherwise the mask is returned unchanged.
//NOTE(review): bits 9 to 15 presumably map to X9-X15 - confirm against the ABI.
public static long ClearCallerSavedIntRegs(long mask, bool isAarch64)
{
    //TODO: ARM32 support.
    if (!isAarch64)
    {
        return mask;
    }

    //Clearing both groups at once is the same as clearing them one after the other.
    return mask & ~(CallerSavedIntRegistersMask | PStateNzcvFlagsMask);
}
//Returns the mask with the bits selected by CallerSavedVecRegistersMask
//(bits 16 to 31) cleared when isAarch64 is true.
//Otherwise the mask is returned unchanged.
//NOTE(review): bits 16 to 31 presumably map to V16-V31 - confirm against the ABI.
public static long ClearCallerSavedVecRegs(long mask, bool isAarch64)
{
    //TODO: ARM32 support.
    return isAarch64
        ? mask & ~CallerSavedVecRegistersMask
        : mask;
}
}
}

View file

@ -80,16 +80,21 @@ namespace ChocolArm64.Translation
}
}
//Jump variant of the virtual subroutine lookup, forwards to the shared
//implementation with isJump set to true.
//The instruction emitter refers to this method by name (nameof),
//so the name and the parameter list need to stay as they are.
internal ArmSubroutine GetOrTranslateVirtualSubroutineForJump(CpuThreadState state, long position)
    => GetOrTranslateVirtualSubroutineImpl(state, position, isJump: true);
//Non-jump variant of the virtual subroutine lookup, forwards to the shared
//implementation with isJump set to false.
//The instruction emitter refers to this method by name (nameof),
//so the name and the parameter list need to stay as they are.
internal ArmSubroutine GetOrTranslateVirtualSubroutine(CpuThreadState state, long position)
    => GetOrTranslateVirtualSubroutineImpl(state, position, isJump: false);
private ArmSubroutine GetOrTranslateVirtualSubroutineImpl(CpuThreadState state, long position, bool isJump)
{
if (!_cache.TryGetSubroutine(position, out TranslatedSub sub))
{
sub = TranslateLowCq(position, state.GetExecutionMode());
}
if (sub.Tier == TranslationTier.Tier0)
{
_queue.Enqueue(new TranslatorQueueItem(position, state.GetExecutionMode(), TranslationTier.Tier1));
sub = TranslateHighCq(position, state.GetExecutionMode(), !isJump);
}
return sub.Delegate;
@ -99,7 +104,7 @@ namespace ChocolArm64.Translation
{
if (!_cache.TryGetSubroutine(position, out TranslatedSub subroutine))
{
subroutine = TranslateLowCq(position, state.GetExecutionMode());
subroutine = TranslateHighCq(position, state.GetExecutionMode(), true);
}
return subroutine;
@ -124,7 +129,7 @@ namespace ChocolArm64.Translation
}
else
{
TranslateHighCq(item.Position, item.Mode);
TranslateHighCq(item.Position, item.Mode, item.IsComplete);
}
}
else
@ -142,14 +147,16 @@ namespace ChocolArm64.Translation
string subName = GetSubroutineName(position);
ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(context.GetILBlocks(), subName);
bool isAarch64 = mode == ExecutionMode.Aarch64;
ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(context.GetILBlocks(), subName, isAarch64);
TranslatedSub subroutine = ilMthdBuilder.GetSubroutine(TranslationTier.Tier0);
return _cache.GetOrAdd(position, subroutine, block.OpCodes.Count);
}
private void TranslateHighCq(long position, ExecutionMode mode)
private TranslatedSub TranslateHighCq(long position, ExecutionMode mode, bool isComplete)
{
Block graph = Decoder.DecodeSubroutine(_memory, position, mode);
@ -159,7 +166,11 @@ namespace ChocolArm64.Translation
string subName = GetSubroutineName(position);
ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(ilBlocks, subName);
bool isAarch64 = mode == ExecutionMode.Aarch64;
isComplete &= !context.HasIndirectJump;
ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(ilBlocks, subName, isAarch64, isComplete);
TranslatedSub subroutine = ilMthdBuilder.GetSubroutine(TranslationTier.Tier1);
@ -173,6 +184,8 @@ namespace ChocolArm64.Translation
_cache.AddOrUpdate(position, subroutine, ilOpCount);
ForceAheadOfTimeCompilation(subroutine);
return subroutine;
}
private string GetSubroutineName(long position)

View file

@ -10,11 +10,18 @@ namespace ChocolArm64.Translation
public TranslationTier Tier { get; }
public TranslatorQueueItem(long position, ExecutionMode mode, TranslationTier tier)
public bool IsComplete { get; }
public TranslatorQueueItem(
long position,
ExecutionMode mode,
TranslationTier tier,
bool isComplete = false)
{
Position = position;
Mode = mode;
Tier = tier;
Position = position;
Mode = mode;
Tier = tier;
IsComplete = isComplete;
}
}
}