Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator

This commit is contained in:
gdkchan 2019-07-24 23:55:00 -03:00
parent 3a0676c596
commit 581b7c8bbf
2 changed files with 576 additions and 329 deletions

View file

@ -625,6 +625,11 @@ namespace ARMeilleure.CodeGen.X86
Operand dest = operation.Dest;
Operand source = operation.GetSource(0);
if (dest.Type != source.Type)
{
System.Console.WriteLine(dest.Type + " " + source.Type);
}
EnsureSameType(dest, source);
Debug.Assert(dest.Type.IsInteger() || source.Kind != OperandKind.Constant);
@ -1073,15 +1078,61 @@ namespace ARMeilleure.CodeGen.X86
if (dest.Type == OperandType.I32)
{
context.Assembler.Pextrd(dest, src1, index);
Debug.Assert(index < 4);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pextrd(dest, src1, index);
}
else
{
if (index != 0)
{
int mask0 = 0b11_10_01_00;
int mask1 = 0b11_10_01_00;
mask0 = BitUtils.RotateRight(mask0, index * 2, 8);
mask1 = BitUtils.RotateRight(mask1, 8 - index * 2, 8);
context.Assembler.Pshufd(src1, src1, (byte)mask0);
context.Assembler.Movd (dest, src1);
context.Assembler.Pshufd(src1, src1, (byte)mask1);
}
else
{
context.Assembler.Movd(dest, src1);
}
}
}
else if (dest.Type == OperandType.I64)
{
context.Assembler.Pextrq(dest, src1, index);
Debug.Assert(index < 2);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pextrq(dest, src1, index);
}
else
{
if (index != 0)
{
const byte mask = 0b01_00_11_10;
context.Assembler.Pshufd(src1, src1, mask);
context.Assembler.Movq (dest, src1);
context.Assembler.Pshufd(src1, src1, mask);
}
else
{
context.Assembler.Movq(dest, src1);
}
}
}
else
{
//Floating-point type.
Debug.Assert(index < (dest.Type == OperandType.FP32 ? 4 : 2));
//Floating-point types.
if ((index >= 2 && dest.Type == OperandType.FP32) ||
(index == 1 && dest.Type == OperandType.FP64))
{
@ -1111,6 +1162,8 @@ namespace ARMeilleure.CodeGen.X86
byte index = src2.AsByte();
Debug.Assert(index < 8);
context.Assembler.Pextrw(dest, src1, index);
}
@ -1125,8 +1178,25 @@ namespace ARMeilleure.CodeGen.X86
byte index = src2.AsByte();
//TODO: SSE/SSE2 version.
context.Assembler.Pextrb(dest, src1, index);
Debug.Assert(index < 16);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pextrb(dest, src1, index);
}
else
{
context.Assembler.Pextrw(dest, src1, (byte)(index >> 1));
if ((index & 1) != 0)
{
context.Assembler.Shr(dest, new Operand(8), OperandType.I32);
}
else
{
context.Assembler.Movzx8(dest, dest, OperandType.I32);
}
}
}
private static void GenerateVectorInsert(CodeGenContext context, Operation operation)
@ -1136,27 +1206,97 @@ namespace ARMeilleure.CodeGen.X86
Operand src2 = operation.GetSource(1); //Value
Operand src3 = operation.GetSource(2); //Index
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src3.Kind == OperandKind.Constant);
byte index = src3.AsByte();
void InsertIntSse2(int words)
{
if (dest.GetRegister() != src1.GetRegister())
{
context.Assembler.Movdqu(dest, src1);
}
for (int word = 0; word < words; word++)
{
// Insert lower 16-bits.
context.Assembler.Pinsrw(dest, dest, src2, (byte)(index * words + word));
// Move next word down.
context.Assembler.Ror(src2, new Operand(16), src2.Type);
}
}
if (src2.Type == OperandType.I32)
{
//TODO: SSE/SSE2 version.
context.Assembler.Pinsrd(dest, src1, src2, index);
Debug.Assert(index < 4);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pinsrd(dest, src1, src2, index);
}
else
{
InsertIntSse2(2);
}
}
else if (src2.Type == OperandType.I64)
{
//TODO: SSE/SSE2 version.
context.Assembler.Pinsrq(dest, src1, src2, index);
Debug.Assert(index < 2);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pinsrq(dest, src1, src2, index);
}
else
{
InsertIntSse2(4);
}
}
else if (src2.Type == OperandType.FP32)
{
Debug.Assert(index < 4);
if (index != 0)
{
//TODO: SSE/SSE2 version.
context.Assembler.Insertps(dest, src1, src2, (byte)(index << 4));
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Insertps(dest, src1, src2, (byte)(index << 4));
}
else
{
if (src1.GetRegister() == src2.GetRegister())
{
int mask = 0b11_10_01_00;
mask &= ~(0b11 << index * 2);
context.Assembler.Pshufd(dest, src1, (byte)mask);
}
else
{
int mask0 = 0b11_10_01_00;
int mask1 = 0b11_10_01_00;
mask0 = BitUtils.RotateRight(mask0, index * 2, 8);
mask1 = BitUtils.RotateRight(mask1, 8 - index * 2, 8);
if (dest.GetRegister() != src1.GetRegister())
{
context.Assembler.Movdqu(dest, src1);
}
context.Assembler.Pshufd(dest, dest, (byte)mask0);
context.Assembler.Movss (dest, dest, src2);
context.Assembler.Pshufd(dest, dest, (byte)mask1);
}
}
}
else
{
@ -1165,6 +1305,8 @@ namespace ARMeilleure.CodeGen.X86
}
else /* if (src2.Type == OperandType.FP64) */
{
Debug.Assert(index < 2);
if (index != 0)
{
context.Assembler.Movlhps(dest, src1, src2);
@ -1183,6 +1325,11 @@ namespace ARMeilleure.CodeGen.X86
Operand src2 = operation.GetSource(1); //Value
Operand src3 = operation.GetSource(2); //Index
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src3.Kind == OperandKind.Constant);
@ -1198,12 +1345,22 @@ namespace ARMeilleure.CodeGen.X86
Operand src2 = operation.GetSource(1); //Value
Operand src3 = operation.GetSource(2); //Index
// It's not possible to emulate this instruction without
// SSE 4.1 support without the use of a temporary register,
// so we instead handle that case on the pre-allocator when
// SSE 4.1 is not supported on the CPU.
Debug.Assert(HardwareCapabilities.SupportsSse41);
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src3.Kind == OperandKind.Constant);
byte index = src3.AsByte();
//TODO: SSE/SSE2 version.
context.Assembler.Pinsrb(dest, src1, src2, index);
}

View file

@ -8,6 +8,8 @@ using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.CodeGen.X86
{
using LLNode = LinkedListNode<Node>;
static class PreAllocator
{
public static void RunPass(CompilerContext cctx, StackAllocator stackAlloc, out int maxCallArgs)
@ -18,9 +20,9 @@ namespace ARMeilleure.CodeGen.X86
foreach (BasicBlock block in cctx.Cfg.Blocks)
{
LinkedListNode<Node> nextNode;
LLNode nextNode;
for (LinkedListNode<Node> node = block.Operations.First; node != null; node = nextNode)
for (LLNode node = block.Operations.First; node != null; node = nextNode)
{
nextNode = node.Next;
@ -29,64 +31,67 @@ namespace ARMeilleure.CodeGen.X86
continue;
}
Instruction inst = operation.Inst;
HandleConstantCopy(node, operation);
HandleFixedRegisterCopy(node, operation);
HandleSameDestSrc1Copy(node, operation);
//Unsigned integer to FP conversions are not supported on X86.
//We need to turn them into signed integer to FP conversions, and
//adjust the final result.
if (inst == Instruction.ConvertToFPUI)
{
ReplaceConvertToFPUIWithSI(node, operation);
}
node = HandleFixedRegisterCopy(node, operation);
//There's no SSE FP negate instruction, so we need to transform that into
//a XOR of the value to be negated with a mask with the highest bit set.
//This also produces -0 for a negation of the value 0.
if (inst == Instruction.Negate && !operation.GetSource(0).Type.IsInteger())
switch (operation.Inst)
{
ReplaceNegateWithXor(node, operation);
}
case Instruction.Call:
// Get the maximum number of arguments used on a call.
// On windows, when a struct is returned from the call,
// we also need to pass the pointer where the struct
// should be written on the first argument.
int argsCount = operation.SourcesCount - 1;
//Get the maximum number of arguments used on a call. On windows,
//when a struct is returned from the call, we also need to pass
//the pointer where the struct should be written on the first argument.
if (inst == Instruction.Call)
{
int argsCount = operation.SourcesCount - 1;
if (operation.Dest != null && operation.Dest.Type == OperandType.V128)
{
argsCount++;
}
if (operation.Dest != null && operation.Dest.Type == OperandType.V128)
{
argsCount++;
}
if (maxCallArgs < argsCount)
{
maxCallArgs = argsCount;
}
if (maxCallArgs < argsCount)
{
maxCallArgs = argsCount;
}
// Copy values to registers expected by the function
// being called, as mandated by the ABI.
node = HandleCallWindowsAbi(stackAlloc, node, operation);
break;
//Copy values to registers expected by the function being called,
//as mandated by the ABI.
HandleCallWindowsAbi(stackAlloc, node, operation);
}
else if (inst == Instruction.Return)
{
HandleReturnWindowsAbi(cctx, node, preservedArgs, operation);
}
else if (inst == Instruction.LoadArgument)
{
HandleLoadArgumentWindowsAbi(cctx, node, preservedArgs, operation);
case Instruction.ConvertToFPUI:
HandleConvertToFPUI(node, operation);
break;
case Instruction.LoadArgument:
HandleLoadArgumentWindowsAbi(cctx, node, preservedArgs, operation);
break;
case Instruction.Negate:
if (!operation.GetSource(0).Type.IsInteger())
{
node = HandleNegate(node, operation);
}
break;
case Instruction.Return:
HandleReturnWindowsAbi(cctx, node, preservedArgs, operation);
break;
case Instruction.VectorInsert8:
if (!HardwareCapabilities.SupportsSse41)
{
node = HandleVectorInsert8(node, operation);
}
break;
}
}
}
}
private static void HandleConstantCopy(LinkedListNode<Node> node, Operation operation)
private static void HandleConstantCopy(LLNode node, Operation operation)
{
if (operation.SourcesCount == 0 || IsIntrinsic(operation.Inst))
{
@ -95,7 +100,6 @@ namespace ARMeilleure.CodeGen.X86
Instruction inst = operation.Inst;
Operand dest = operation.Dest;
Operand src1 = operation.GetSource(0);
Operand src2;
@ -103,25 +107,25 @@ namespace ARMeilleure.CodeGen.X86
{
if (!src1.Type.IsInteger())
{
//Handle non-integer types (FP32, FP64 and V128).
//For instructions without an immediate operand, we do the following:
//- Insert a copy with the constant value (as integer) to a GPR.
//- Insert a copy from the GPR to a XMM register.
//- Replace the constant use with the XMM register.
// Handle non-integer types (FP32, FP64 and V128).
// For instructions without an immediate operand, we do the following:
// - Insert a copy with the constant value (as integer) to a GPR.
// - Insert a copy from the GPR to a XMM register.
// - Replace the constant use with the XMM register.
src1 = AddXmmCopy(node, src1);
operation.SetSource(0, src1);
}
else if (!HasConstSrc1(inst))
{
//Handle integer types.
//Most ALU instructions accepts a 32-bits immediate on the second operand.
//We need to ensure the following:
//- If the constant is on operand 1, we need to move it.
//-- But first, we try to swap operand 1 and 2 if the instruction is commutative.
//-- Doing so may allow us to encode the constant as operand 2 and avoid a copy.
//- If the constant is on operand 2, we check if the instruction supports it,
//if not, we also add a copy. 64-bits constants are usually not supported.
// Handle integer types.
// Most ALU instructions accepts a 32-bits immediate on the second operand.
// We need to ensure the following:
// - If the constant is on operand 1, we need to move it.
// -- But first, we try to swap operand 1 and 2 if the instruction is commutative.
// -- Doing so may allow us to encode the constant as operand 2 and avoid a copy.
// - If the constant is on operand 2, we check if the instruction supports it,
// if not, we also add a copy. 64-bits constants are usually not supported.
if (IsCommutative(inst))
{
src2 = operation.GetSource(1);
@ -168,8 +172,207 @@ namespace ARMeilleure.CodeGen.X86
}
}
private static void ReplaceConvertToFPUIWithSI(LinkedListNode<Node> node, Operation operation)
private static LLNode HandleFixedRegisterCopy(LLNode node, Operation operation)
{
Operand dest = operation.Dest;
LinkedList<Node> nodes = node.List;
switch (operation.Inst)
{
case Instruction.CompareAndSwap128:
{
// Handle the many restrictions of the compare and exchange (16 bytes) instruction:
// - The expected value should be in RDX:RAX.
// - The new value to be written should be in RCX:RBX.
// - The value at the memory location is loaded to RDX:RAX.
void SplitOperand(Operand source, X86Register lowReg, X86Register highReg)
{
Operand lr = Gpr(lowReg, OperandType.I64);
Operand hr = Gpr(highReg, OperandType.I64);
nodes.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0)));
nodes.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1)));
}
SplitOperand(operation.GetSource(1), X86Register.Rax, X86Register.Rdx);
SplitOperand(operation.GetSource(2), X86Register.Rbx, X86Register.Rcx);
Operand rax = Gpr(X86Register.Rax, OperandType.I64);
Operand rdx = Gpr(X86Register.Rdx, OperandType.I64);
node = nodes.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax));
node = nodes.AddAfter(node, new Operation(Instruction.VectorInsert, dest, dest, rdx, Const(1)));
operation.SetSource(1, Undef());
operation.SetSource(2, Undef());
operation.Dest = null;
break;
}
case Instruction.CpuId:
{
// Handle the many restrictions of the CPU Id instruction:
// - EAX controls the information returned by this instruction.
// - When EAX is 1, feature information is returned.
// - The information is written to registers EAX, EBX, ECX and EDX.
Debug.Assert(dest.Type == OperandType.I64);
Operand eax = Gpr(X86Register.Rax, OperandType.I32);
Operand ebx = Gpr(X86Register.Rbx, OperandType.I32);
Operand rcx = Gpr(X86Register.Rcx, OperandType.I64);
Operand edx = Gpr(X86Register.Rdx, OperandType.I32);
// Value 0x01 = Version, family and feature information.
node = nodes.AddBefore(node, new Operation(Instruction.Copy, eax, Const(1)));
// We don't care about those two, but their values are overwritten,
// so we need to take that into account.
node = nodes.AddAfter(node, new Operation(Instruction.Clobber, eax));
node = nodes.AddAfter(node, new Operation(Instruction.Clobber, ebx));
// Copy results to the destination register.
// The values are split into 2 32-bits registers, we merge them
// into a single 64-bits register.
node = nodes.AddAfter(node, new Operation(Instruction.ZeroExtend32, dest, edx));
node = nodes.AddAfter(node, new Operation(Instruction.ShiftLeft, dest, dest, Const(32)));
node = nodes.AddAfter(node, new Operation(Instruction.BitwiseOr, dest, dest, rcx));
operation.Dest = null;
break;
}
case Instruction.Divide:
case Instruction.DivideUI:
{
// Handle the many restrictions of the division instructions:
// - The dividend is always in RDX:RAX.
// - The result is always in RAX.
// - Additionally it also writes the remainder in RDX.
Operand src1 = operation.GetSource(0);
Operand rax = Gpr(X86Register.Rax, src1.Type);
Operand rdx = Gpr(X86Register.Rdx, src1.Type);
nodes.AddBefore(node, new Operation(Instruction.Copy, rax, src1));
operation.SetSource(0, rax);
nodes.AddBefore(node, new Operation(Instruction.Clobber, rdx));
node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rax));
operation.Dest = rax;
break;
}
case Instruction.Extended:
{
IntrinsicOperation intrinOp = (IntrinsicOperation)operation;
// PBLENDVB last operand is always implied to be XMM0 when VEX is not supported.
if (intrinOp.Intrinsic == Intrinsic.X86Pblendvb && !HardwareCapabilities.SupportsVexEncoding)
{
Operand xmm0 = Xmm(X86Register.Xmm0, OperandType.V128);
nodes.AddBefore(node, new Operation(Instruction.Copy, xmm0, operation.GetSource(2)));
operation.SetSource(2, xmm0);
}
break;
}
case Instruction.Multiply64HighSI:
case Instruction.Multiply64HighUI:
{
// Handle the many restrictions of the i64 * i64 = i128 multiply instructions:
// - The multiplicand is always in RAX.
// - The lower 64-bits of the result is always in RAX.
// - The higher 64-bits of the result is always in RDX.
Operand src1 = operation.GetSource(0);
Operand rax = Gpr(X86Register.Rax, src1.Type);
Operand rdx = Gpr(X86Register.Rdx, src1.Type);
nodes.AddBefore(node, new Operation(Instruction.Copy, rax, src1));
operation.SetSource(0, rax);
node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rdx));
operation.Dest = rdx;
break;
}
case Instruction.RotateRight:
case Instruction.ShiftLeft:
case Instruction.ShiftRightSI:
case Instruction.ShiftRightUI:
{
// The shift register is always implied to be CL (low 8-bits of RCX or ECX).
if (operation.GetSource(1).Kind == OperandKind.LocalVariable)
{
Operand rcx = Gpr(X86Register.Rcx, OperandType.I32);
nodes.AddBefore(node, new Operation(Instruction.Copy, rcx, operation.GetSource(1)));
operation.SetSource(1, rcx);
}
break;
}
}
return node;
}
private static void HandleSameDestSrc1Copy(LLNode node, Operation operation)
{
if (operation.Dest == null || operation.SourcesCount == 0)
{
return;
}
Instruction inst = operation.Inst;
Operand dest = operation.Dest;
Operand src1 = operation.GetSource(0);
// The multiply instruction (that maps to IMUL) is somewhat special, it has
// a three operand form where the second source is a immediate value.
bool threeOperandForm = inst == Instruction.Multiply && operation.GetSource(1).Kind == OperandKind.Constant;
if (IsSameOperandDestSrc1(operation) && src1.Kind == OperandKind.LocalVariable && !threeOperandForm)
{
Operation copyOp = new Operation(Instruction.Copy, dest, src1);
node.List.AddBefore(node, copyOp);
operation.SetSource(0, dest);
}
else if (inst == Instruction.ConditionalSelect)
{
Operand src3 = operation.GetSource(2);
Operation copyOp = new Operation(Instruction.Copy, dest, src3);
node.List.AddBefore(node, copyOp);
operation.SetSource(2, dest);
}
}
private static LLNode HandleConvertToFPUI(LLNode node, Operation operation)
{
// Unsigned integer to FP conversions are not supported on X86.
// We need to turn them into signed integer to FP conversions, and
// adjust the final result.
Operand dest = operation.Dest;
Operand source = operation.GetSource(0);
@ -177,52 +380,57 @@ namespace ARMeilleure.CodeGen.X86
LinkedList<Node> nodes = node.List;
LinkedListNode<Node> temp = node;
LLNode currentNode = node;
if (source.Type == OperandType.I32)
{
//For 32-bits integers, we can just zero-extend to 64-bits,
//and then use the 64-bits signed conversion instructions.
// For 32-bits integers, we can just zero-extend to 64-bits,
// and then use the 64-bits signed conversion instructions.
Operand zex = Local(OperandType.I64);
temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, zex, source));
temp = nodes.AddAfter(temp, new Operation(Instruction.ConvertToFP, dest, zex));
node = nodes.AddAfter(node, new Operation(Instruction.ZeroExtend32, zex, source));
node = nodes.AddAfter(node, new Operation(Instruction.ConvertToFP, dest, zex));
}
else /* if (source.Type == OperandType.I64) */
{
//For 64-bits integers, we need to do the following:
//- Ensure that the integer has the most significant bit clear.
//-- This can be done by shifting the value right by 1, that is, dividing by 2.
//-- The least significant bit is lost in this case though.
//- We can then convert the shifted value with a signed integer instruction.
//- The result still needs to be corrected after that.
//-- First, we need to multiply the result by 2, as we divided it by 2 before.
//--- This can be done efficiently by adding the result to itself.
//-- Then, we need to add the least significant bit that was shifted out.
//--- We can convert the least significant bit to float, and add it to the result.
// For 64-bits integers, we need to do the following:
// - Ensure that the integer has the most significant bit clear.
// -- This can be done by shifting the value right by 1, that is, dividing by 2.
// -- The least significant bit is lost in this case though.
// - We can then convert the shifted value with a signed integer instruction.
// - The result still needs to be corrected after that.
// -- First, we need to multiply the result by 2, as we divided it by 2 before.
// --- This can be done efficiently by adding the result to itself.
// -- Then, we need to add the least significant bit that was shifted out.
// --- We can convert the least significant bit to float, and add it to the result.
Operand lsb = Local(OperandType.I64);
Operand half = Local(OperandType.I64);
Operand lsbF = Local(dest.Type);
temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, lsb, source));
temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, half, source));
node = nodes.AddAfter(node, new Operation(Instruction.Copy, lsb, source));
node = nodes.AddAfter(node, new Operation(Instruction.Copy, half, source));
temp = nodes.AddAfter(temp, new Operation(Instruction.BitwiseAnd, lsb, lsb, Const(1L)));
temp = nodes.AddAfter(temp, new Operation(Instruction.ShiftRightUI, half, half, Const(1)));
node = nodes.AddAfter(node, new Operation(Instruction.BitwiseAnd, lsb, lsb, Const(1L)));
node = nodes.AddAfter(node, new Operation(Instruction.ShiftRightUI, half, half, Const(1)));
temp = nodes.AddAfter(temp, new Operation(Instruction.ConvertToFP, lsbF, lsb));
temp = nodes.AddAfter(temp, new Operation(Instruction.ConvertToFP, dest, half));
node = nodes.AddAfter(node, new Operation(Instruction.ConvertToFP, lsbF, lsb));
node = nodes.AddAfter(node, new Operation(Instruction.ConvertToFP, dest, half));
temp = nodes.AddAfter(temp, new Operation(Instruction.Add, dest, dest, dest));
temp = nodes.AddAfter(temp, new Operation(Instruction.Add, dest, dest, lsbF));
node = nodes.AddAfter(node, new Operation(Instruction.Add, dest, dest, dest));
node = nodes.AddAfter(node, new Operation(Instruction.Add, dest, dest, lsbF));
}
Delete(node, operation);
Delete(currentNode, operation);
return node;
}
private static void ReplaceNegateWithXor(LinkedListNode<Node> node, Operation operation)
private static LLNode HandleNegate(LLNode node, Operation operation)
{
// There's no SSE FP negate instruction, so we need to transform that into
// a XOR of the value to be negated with a mask with the highest bit set.
// This also produces -0 for a negation of the value 0.
Operand dest = operation.Dest;
Operand source = operation.GetSource(0);
@ -231,187 +439,103 @@ namespace ARMeilleure.CodeGen.X86
LinkedList<Node> nodes = node.List;
LinkedListNode<Node> temp = node;
LLNode currentNode = node;
Operand res = Local(dest.Type);
temp = nodes.AddAfter(temp, new Operation(Instruction.VectorOne, res));
node = nodes.AddAfter(node, new Operation(Instruction.VectorOne, res));
if (dest.Type == OperandType.FP32)
{
temp = nodes.AddAfter(temp, new IntrinsicOperation(Intrinsic.X86Pslld, res, res, Const(31)));
node = nodes.AddAfter(node, new IntrinsicOperation(Intrinsic.X86Pslld, res, res, Const(31)));
}
else /* if (dest.Type == OperandType.FP64) */
{
temp = nodes.AddAfter(temp, new IntrinsicOperation(Intrinsic.X86Psllq, res, res, Const(63)));
node = nodes.AddAfter(node, new IntrinsicOperation(Intrinsic.X86Psllq, res, res, Const(63)));
}
temp = nodes.AddAfter(temp, new IntrinsicOperation(Intrinsic.X86Xorps, res, res, source));
node = nodes.AddAfter(node, new IntrinsicOperation(Intrinsic.X86Xorps, res, res, source));
temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, dest, res));
node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, res));
Delete(node, operation);
Delete(currentNode, operation);
return node;
}
private static void HandleFixedRegisterCopy(LinkedListNode<Node> node, Operation operation)
private static LLNode HandleVectorInsert8(LLNode node, Operation operation)
{
Instruction inst = operation.Inst;
// Handle vector insertion, when SSE 4.1 is not supported.
Operand dest = operation.Dest;
Operand src1 = operation.GetSource(0); // Vector
Operand src2 = operation.GetSource(1); // Value
Operand src3 = operation.GetSource(2); // Index
//Handle the many restrictions of the CPU Id instruction:
//- EAX controls the information returned by this instruction.
//- When EAX is 1, feature information is returned.
//- The information is written to registers EAX, EBX, ECX and EDX.
if (inst == Instruction.CpuId)
Debug.Assert(src3.Kind == OperandKind.Constant);
byte index = src3.AsByte();
Debug.Assert(index < 16);
LinkedList<Node> nodes = node.List;
LLNode currentNode = node;
Operand temp = Local(OperandType.I32);
Operation vextOp = new Operation(Instruction.VectorExtract16, temp, src1, Const(index >> 1));
node = nodes.AddAfter(node, vextOp);
if ((index & 1) != 0)
{
Debug.Assert(dest.Type == OperandType.I64);
Operand temp2 = Local(OperandType.I32);
Operand eax = Gpr(X86Register.Rax, OperandType.I32);
Operand ebx = Gpr(X86Register.Rbx, OperandType.I32);
Operand ecx = Gpr(X86Register.Rcx, OperandType.I32);
Operand edx = Gpr(X86Register.Rdx, OperandType.I32);
Operation copyOp = new Operation(Instruction.Copy, temp2, src2);
Operation andOp = new Operation(Instruction.ZeroExtend8, temp, temp);
Operation shlOp = new Operation(Instruction.ShiftLeft, temp2, temp2, Const(8));
Operation orOp = new Operation(Instruction.BitwiseOr, temp, temp, temp2);
// Value 0x01 = Version, family and feature information.
node.List.AddBefore(node, new Operation(Instruction.Copy, eax, Const(1)));
node = nodes.AddAfter(node, copyOp);
node = nodes.AddAfter(node, andOp);
node = nodes.AddAfter(node, shlOp);
node = nodes.AddAfter(node, orOp);
}
else
{
Operation andOp = new Operation(Instruction.BitwiseAnd, temp, temp, Const(0xff00));
Operation orOp = new Operation(Instruction.BitwiseOr, temp, temp, src2);
// Copy results to the destination register.
// The values are split into 2 32-bits registers, we merge them
// into a single 64-bits register.
Operand rcx = Gpr(X86Register.Rcx, OperandType.I64);
node.List.AddAfter(node, new Operation(Instruction.BitwiseOr, dest, dest, rcx));
node.List.AddAfter(node, new Operation(Instruction.ShiftLeft, dest, dest, Const(32)));
node.List.AddAfter(node, new Operation(Instruction.ZeroExtend32, dest, edx));
// We don't care about those two, but their values are overwritten,
// so we need to take that into account.
node.List.AddAfter(node, new Operation(Instruction.Clobber, ebx));
node.List.AddAfter(node, new Operation(Instruction.Clobber, eax));
operation.Dest = null;
node = nodes.AddAfter(node, andOp);
node = nodes.AddAfter(node, orOp);
}
if (operation.SourcesCount == 0)
{
return;
}
Operation vinsOp = new Operation(Instruction.VectorInsert16, dest, src1, temp, Const(index >> 1));
Operand src1 = operation.GetSource(0);
node = nodes.AddAfter(node, vinsOp);
//Handle the many restrictions of the division instructions:
//- The dividend is always in RDX:RAX.
//- The result is always in RAX.
//- Additionally it also writes the remainder in RDX.
if (inst == Instruction.Divide || inst == Instruction.DivideUI)
{
Operand rax = Gpr(X86Register.Rax, src1.Type);
Operand rdx = Gpr(X86Register.Rdx, src1.Type);
Delete(currentNode, operation);
node.List.AddBefore(node, new Operation(Instruction.Copy, rax, src1));
operation.SetSource(0, rax);
node.List.AddBefore(node, new Operation(Instruction.Clobber, rdx));
node.List.AddAfter(node, new Operation(Instruction.Copy, dest, rax));
operation.Dest = rax;
}
//Handle the many restrictions of the i64 * i64 = i128 multiply instructions:
//- The multiplicand is always in RAX.
//- The lower 64-bits of the result is always in RAX.
//- The higher 64-bits of the result is always in RDX.
if (inst == Instruction.Multiply64HighSI || inst == Instruction.Multiply64HighUI)
{
Operand rax = Gpr(X86Register.Rax, src1.Type);
Operand rdx = Gpr(X86Register.Rdx, src1.Type);
node.List.AddBefore(node, new Operation(Instruction.Copy, rax, src1));
operation.SetSource(0, rax);
node.List.AddAfter(node, new Operation(Instruction.Copy, dest, rdx));
operation.Dest = rdx;
}
//Handle the many restrictions of the compare and exchange (16 bytes) instruction:
//- The expected value should be in RDX:RAX.
//- The new value to be written should be in RCX:RBX.
//- The value at the memory location is loaded to RDX:RAX.
if (inst == Instruction.CompareAndSwap128)
{
void SplitOperand(Operand source, X86Register lowReg, X86Register highReg)
{
Operand lr = Gpr(lowReg, OperandType.I64);
Operand hr = Gpr(highReg, OperandType.I64);
node.List.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0)));
node.List.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1)));
}
SplitOperand(operation.GetSource(1), X86Register.Rax, X86Register.Rdx);
SplitOperand(operation.GetSource(2), X86Register.Rbx, X86Register.Rcx);
Operand rax = Gpr(X86Register.Rax, OperandType.I64);
Operand rdx = Gpr(X86Register.Rdx, OperandType.I64);
node.List.AddAfter(node, new Operation(Instruction.VectorInsert, dest, dest, rdx, Const(1)));
node.List.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax));
operation.SetSource(1, Undef());
operation.SetSource(2, Undef());
operation.Dest = null;
}
//The shift register is always implied to be CL (low 8-bits of RCX or ECX).
if (inst.IsShift() && operation.GetSource(1).Kind == OperandKind.LocalVariable)
{
Operand rcx = Gpr(X86Register.Rcx, OperandType.I32);
node.List.AddBefore(node, new Operation(Instruction.Copy, rcx, operation.GetSource(1)));
operation.SetSource(1, rcx);
}
//Handle intrinsics.
if (IsIntrinsic(inst))
{
IntrinsicOperation intrinOp = (IntrinsicOperation)operation;
//PBLENDVB last operand is always implied to be XMM0 when VEX is not supported.
if (intrinOp.Intrinsic == Intrinsic.X86Pblendvb && !HardwareCapabilities.SupportsVexEncoding)
{
Operand xmm0 = Xmm(X86Register.Xmm0, OperandType.V128);
node.List.AddBefore(node, new Operation(Instruction.Copy, xmm0, operation.GetSource(2)));
operation.SetSource(2, xmm0);
}
}
return node;
}
private static void HandleCallWindowsAbi(
StackAllocator stackAlloc,
LinkedListNode<Node> node,
Operation operation)
private static LLNode HandleCallWindowsAbi(StackAllocator stackAlloc, LLNode node, Operation operation)
{
Operand dest = operation.Dest;
//Handle struct arguments.
LinkedList<Node> nodes = node.List;
// Handle struct arguments.
int retArgs = 0;
int stackAllocOffset = 0;
int AllocateOnStack(int size)
{
//We assume that the stack allocator is initially empty (TotalSize = 0).
//Taking that into account, we can reuse the space allocated for other
//calls by keeping track of our own allocated size (stackAllocOffset).
//If the space allocated is not big enough, then we just expand it.
// We assume that the stack allocator is initially empty (TotalSize = 0).
// Taking that into account, we can reuse the space allocated for other
// calls by keeping track of our own allocated size (stackAllocOffset).
// If the space allocated is not big enough, then we just expand it.
int offset = stackAllocOffset;
if (stackAllocOffset + size > stackAlloc.TotalSize)
@ -432,7 +556,7 @@ namespace ARMeilleure.CodeGen.X86
Operation allocOp = new Operation(Instruction.StackAlloc, arg0Reg, Const(stackOffset));
node.List.AddBefore(node, allocOp);
nodes.AddBefore(node, allocOp);
retArgs = 1;
}
@ -449,17 +573,17 @@ namespace ARMeilleure.CodeGen.X86
Operation allocOp = new Operation(Instruction.StackAlloc, stackAddr, Const(stackOffset));
node.List.AddBefore(node, allocOp);
nodes.AddBefore(node, allocOp);
Operation storeOp = new Operation(Instruction.Store, null, stackAddr, source);
HandleConstantCopy(node.List.AddBefore(node, storeOp), storeOp);
HandleConstantCopy(nodes.AddBefore(node, storeOp), storeOp);
operation.SetSource(index, stackAddr);
}
}
//Handle arguments passed on registers.
// Handle arguments passed on registers.
int argsCount = operation.SourcesCount - 1;
int maxArgs = CallingConvention.GetArgumentsOnRegsCount() - retArgs;
@ -490,13 +614,13 @@ namespace ARMeilleure.CodeGen.X86
Operation srcCopyOp = new Operation(Instruction.Copy, argReg, source);
HandleConstantCopy(node.List.AddBefore(node, srcCopyOp), srcCopyOp);
HandleConstantCopy(nodes.AddBefore(node, srcCopyOp), srcCopyOp);
operation.SetSource(index + 1, argReg);
}
//The remaining arguments (those that are not passed on registers)
//should be passed on the stack, we write them to the stack with "SpillArg".
// The remaining arguments (those that are not passed on registers)
// should be passed on the stack, we write them to the stack with "SpillArg".
for (int index = argsCount; index < operation.SourcesCount - 1; index++)
{
Operand source = operation.GetSource(index + 1);
@ -505,7 +629,7 @@ namespace ARMeilleure.CodeGen.X86
Operation spillOp = new Operation(Instruction.SpillArg, null, offset, source);
HandleConstantCopy(node.List.AddBefore(node, spillOp), spillOp);
HandleConstantCopy(nodes.AddBefore(node, spillOp), spillOp);
operation.SetSource(index + 1, new Operand(OperandKind.Undefined));
}
@ -520,11 +644,11 @@ namespace ARMeilleure.CodeGen.X86
Operation copyOp = new Operation(Instruction.Copy, retValueAddr, arg0Reg);
node.List.AddBefore(node, copyOp);
nodes.AddBefore(node, copyOp);
Operation loadOp = new Operation(Instruction.Load, dest, retValueAddr);
node.List.AddAfter(node, loadOp);
node = nodes.AddAfter(node, loadOp);
operation.Dest = null;
}
@ -545,71 +669,18 @@ namespace ARMeilleure.CodeGen.X86
Operation destCopyOp = new Operation(Instruction.Copy, dest, retReg);
node.List.AddAfter(node, destCopyOp);
node = nodes.AddAfter(node, destCopyOp);
operation.Dest = retReg;
}
}
}
private static void HandleReturnWindowsAbi(
CompilerContext cctx,
LinkedListNode<Node> node,
Operand[] preservedArgs,
Operation operation)
{
if (operation.SourcesCount == 0)
{
return;
}
Operand source = operation.GetSource(0);
Operand retReg;
if (source.Type.IsInteger())
{
retReg = Gpr(CallingConvention.GetIntReturnRegister(), source.Type);
}
else if (source.Type == OperandType.V128)
{
if (preservedArgs[0] == null)
{
Operand preservedArg = Local(OperandType.I64);
Operand arg0 = Gpr(CallingConvention.GetIntArgumentRegister(0), OperandType.I64);
Operation copyOp = new Operation(Instruction.Copy, preservedArg, arg0);
cctx.Cfg.Entry.Operations.AddFirst(copyOp);
preservedArgs[0] = preservedArg;
}
retReg = preservedArgs[0];
}
else /* if (regType == RegisterType.Vector) */
{
retReg = Xmm(CallingConvention.GetVecReturnRegister(), source.Type);
}
if (source.Type == OperandType.V128)
{
Operation retStoreOp = new Operation(Instruction.Store, null, retReg, source);
node.List.AddBefore(node, retStoreOp);
}
else
{
Operation retCopyOp = new Operation(Instruction.Copy, retReg, source);
node.List.AddBefore(node, retCopyOp);
}
return node;
}
private static void HandleLoadArgumentWindowsAbi(
CompilerContext cctx,
LinkedListNode<Node> node,
LLNode node,
Operand[] preservedArgs,
Operation operation)
{
@ -667,43 +738,62 @@ namespace ARMeilleure.CodeGen.X86
}
}
private static void HandleSameDestSrc1Copy(LinkedListNode<Node> node, Operation operation)
private static void HandleReturnWindowsAbi(
CompilerContext cctx,
LLNode node,
Operand[] preservedArgs,
Operation operation)
{
if (operation.Dest == null || operation.SourcesCount == 0)
if (operation.SourcesCount == 0)
{
return;
}
Instruction inst = operation.Inst;
Operand source = operation.GetSource(0);
Operand dest = operation.Dest;
Operand src1 = operation.GetSource(0);
Operand retReg;
//The multiply instruction (that maps to IMUL) is somewhat special, it has
//a three operand form where the second source is a immediate value.
bool threeOperandForm = inst == Instruction.Multiply && operation.GetSource(1).Kind == OperandKind.Constant;
if (IsSameOperandDestSrc1(operation) && src1.Kind == OperandKind.LocalVariable && !threeOperandForm)
if (source.Type.IsInteger())
{
Operation copyOp = new Operation(Instruction.Copy, dest, src1);
node.List.AddBefore(node, copyOp);
operation.SetSource(0, dest);
retReg = Gpr(CallingConvention.GetIntReturnRegister(), source.Type);
}
else if (inst == Instruction.ConditionalSelect)
else if (source.Type == OperandType.V128)
{
Operand src3 = operation.GetSource(2);
if (preservedArgs[0] == null)
{
Operand preservedArg = Local(OperandType.I64);
Operation copyOp = new Operation(Instruction.Copy, dest, src3);
Operand arg0 = Gpr(CallingConvention.GetIntArgumentRegister(0), OperandType.I64);
node.List.AddBefore(node, copyOp);
Operation copyOp = new Operation(Instruction.Copy, preservedArg, arg0);
operation.SetSource(2, dest);
cctx.Cfg.Entry.Operations.AddFirst(copyOp);
preservedArgs[0] = preservedArg;
}
retReg = preservedArgs[0];
}
else /* if (regType == RegisterType.Vector) */
{
retReg = Xmm(CallingConvention.GetVecReturnRegister(), source.Type);
}
if (source.Type == OperandType.V128)
{
Operation retStoreOp = new Operation(Instruction.Store, null, retReg, source);
node.List.AddBefore(node, retStoreOp);
}
else
{
Operation retCopyOp = new Operation(Instruction.Copy, retReg, source);
node.List.AddBefore(node, retCopyOp);
}
}
private static Operand AddXmmCopy(LinkedListNode<Node> node, Operand source)
private static Operand AddXmmCopy(LLNode node, Operand source)
{
Operand temp = Local(source.Type);
@ -716,7 +806,7 @@ namespace ARMeilleure.CodeGen.X86
return temp;
}
private static Operand AddCopy(LinkedListNode<Node> node, Operand source)
private static Operand AddCopy(LLNode node, Operand source)
{
Operand temp = Local(source.Type);
@ -755,7 +845,7 @@ namespace ARMeilleure.CodeGen.X86
return value == (int)value;
}
private static void Delete(LinkedListNode<Node> node, Operation operation)
private static void Delete(LLNode node, Operation operation)
{
operation.Dest = null;