From 581b7c8bbfe9a4a7e4e5c42ca42e3b62384eafdf Mon Sep 17 00:00:00 2001 From: gdkchan Date: Wed, 24 Jul 2019 23:55:00 -0300 Subject: [PATCH] Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator --- ARMeilleure/CodeGen/X86/CodeGenerator.cs | 181 +++++- ARMeilleure/CodeGen/X86/PreAllocator.cs | 724 +++++++++++++---------- 2 files changed, 576 insertions(+), 329 deletions(-) diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs index 69678983ec..1cfe908913 100644 --- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs +++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs @@ -625,6 +625,11 @@ namespace ARMeilleure.CodeGen.X86 Operand dest = operation.Dest; Operand source = operation.GetSource(0); + if (dest.Type != source.Type) + { + System.Console.WriteLine(dest.Type + " " + source.Type); + } + EnsureSameType(dest, source); Debug.Assert(dest.Type.IsInteger() || source.Kind != OperandKind.Constant); @@ -1073,15 +1078,61 @@ namespace ARMeilleure.CodeGen.X86 if (dest.Type == OperandType.I32) { - context.Assembler.Pextrd(dest, src1, index); + Debug.Assert(index < 4); + + if (HardwareCapabilities.SupportsSse41) + { + context.Assembler.Pextrd(dest, src1, index); + } + else + { + if (index != 0) + { + int mask0 = 0b11_10_01_00; + int mask1 = 0b11_10_01_00; + + mask0 = BitUtils.RotateRight(mask0, index * 2, 8); + mask1 = BitUtils.RotateRight(mask1, 8 - index * 2, 8); + + context.Assembler.Pshufd(src1, src1, (byte)mask0); + context.Assembler.Movd (dest, src1); + context.Assembler.Pshufd(src1, src1, (byte)mask1); + } + else + { + context.Assembler.Movd(dest, src1); + } + } } else if (dest.Type == OperandType.I64) { - context.Assembler.Pextrq(dest, src1, index); + Debug.Assert(index < 2); + + if (HardwareCapabilities.SupportsSse41) + { + context.Assembler.Pextrq(dest, src1, index); + } + else + { + if (index != 0) + { + const byte mask = 0b01_00_11_10; + + context.Assembler.Pshufd(src1, src1, mask); + context.Assembler.Movq (dest, src1); + context.Assembler.Pshufd(src1, src1, mask); + } + else + { + context.Assembler.Movq(dest, src1); + } + } } else { - //Floating-point type. + Debug.Assert(index < (dest.Type == OperandType.FP32 ? 4 : 2)); + + //Floating-point types. if ((index >= 2 && dest.Type == OperandType.FP32) || (index == 1 && dest.Type == OperandType.FP64)) { @@ -1111,6 +1162,8 @@ namespace ARMeilleure.CodeGen.X86 byte index = src2.AsByte(); + Debug.Assert(index < 8); + context.Assembler.Pextrw(dest, src1, index); } @@ -1125,8 +1178,25 @@ namespace ARMeilleure.CodeGen.X86 byte index = src2.AsByte(); - //TODO: SSE/SSE2 version. 
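The two PSHUFD control bytes used by the SSE2-only 32-bit extract above are simply rotations of the identity shuffle. A minimal standalone sketch of that derivation, assuming RotateRight8 mirrors BitUtils.RotateRight(value, shift, 8); the class and method names here are illustrative, not part of the patch:

static class Sse2ExtractMasks
{
    // Identity shuffle: every 32-bit lane keeps its position.
    private const int Identity = 0b11_10_01_00;

    // The first control byte rotates the requested lane into lane 0 so MOVD can read it;
    // the second is the inverse rotation that restores the source register afterwards.
    public static (byte bringToLane0, byte restore) For(int index)
    {
        int mask0 = RotateRight8(Identity, index * 2);     // index 1 -> 0b00_11_10_01
        int mask1 = RotateRight8(Identity, 8 - index * 2); // index 1 -> 0b10_01_00_11

        return ((byte)mask0, (byte)mask1);
    }

    private static int RotateRight8(int value, int shift)
    {
        return ((value >> shift) | (value << (8 - shift))) & 0xff;
    }
}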
- context.Assembler.Pextrb(dest, src1, index); + Debug.Assert(index < 16); + + if (HardwareCapabilities.SupportsSse41) + { + context.Assembler.Pextrb(dest, src1, index); + } + else + { + context.Assembler.Pextrw(dest, src1, (byte)(index >> 1)); + + if ((index & 1) != 0) + { + context.Assembler.Shr(dest, new Operand(8), OperandType.I32); + } + else + { + context.Assembler.Movzx8(dest, dest, OperandType.I32); + } + } } private static void GenerateVectorInsert(CodeGenContext context, Operation operation) @@ -1136,27 +1206,97 @@ namespace ARMeilleure.CodeGen.X86 Operand src2 = operation.GetSource(1); //Value Operand src3 = operation.GetSource(2); //Index + if (!HardwareCapabilities.SupportsVexEncoding) + { + EnsureSameReg(dest, src1); + } + Debug.Assert(src1.Type == OperandType.V128); Debug.Assert(src3.Kind == OperandKind.Constant); byte index = src3.AsByte(); + void InsertIntSse2(int words) + { + if (dest.GetRegister() != src1.GetRegister()) + { + context.Assembler.Movdqu(dest, src1); + } + + for (int word = 0; word < words; word++) + { + // Insert lower 16-bits. + context.Assembler.Pinsrw(dest, dest, src2, (byte)(index * words + word)); + + // Move next word down. + context.Assembler.Ror(src2, new Operand(16), src2.Type); + } + } + if (src2.Type == OperandType.I32) { - //TODO: SSE/SSE2 version. - context.Assembler.Pinsrd(dest, src1, src2, index); + Debug.Assert(index < 4); + + if (HardwareCapabilities.SupportsSse41) + { + context.Assembler.Pinsrd(dest, src1, src2, index); + } + else + { + InsertIntSse2(2); + } } else if (src2.Type == OperandType.I64) { - //TODO: SSE/SSE2 version. - context.Assembler.Pinsrq(dest, src1, src2, index); + Debug.Assert(index < 2); + + if (HardwareCapabilities.SupportsSse41) + { + context.Assembler.Pinsrq(dest, src1, src2, index); + } + else + { + InsertIntSse2(4); + } } else if (src2.Type == OperandType.FP32) { + Debug.Assert(index < 4); + if (index != 0) { - //TODO: SSE/SSE2 version. 
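The PINSRW/ROR loop in InsertIntSse2 above can be easier to follow on a scalar model. The sketch below is purely illustrative (the helper name and the ushort[] stand-in for an XMM register are assumptions): it shows why rotating the value by 16 bits after each 16-bit insert writes the whole 32-bit operand and leaves the source value unchanged once the loop ends.

static void InsertInt32(ushort[] lanes, uint value, int index)
{
    const int words = 2;                               // an I32 spans two 16-bit lanes

    for (int word = 0; word < words; word++)
    {
        lanes[index * words + word] = (ushort)value;   // PINSRW: store the low word
        value = (value >> 16) | (value << 16);         // ROR 16: expose the next word
    }
}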
- context.Assembler.Insertps(dest, src1, src2, (byte)(index << 4)); + if (HardwareCapabilities.SupportsSse41) + { + context.Assembler.Insertps(dest, src1, src2, (byte)(index << 4)); + } + else + { + if (src1.GetRegister() == src2.GetRegister()) + { + int mask = 0b11_10_01_00; + + mask &= ~(0b11 << index * 2); + + context.Assembler.Pshufd(dest, src1, (byte)mask); + } + else + { + int mask0 = 0b11_10_01_00; + int mask1 = 0b11_10_01_00; + + mask0 = BitUtils.RotateRight(mask0, index * 2, 8); + mask1 = BitUtils.RotateRight(mask1, 8 - index * 2, 8); + + if (dest.GetRegister() != src1.GetRegister()) + { + context.Assembler.Movdqu(dest, src1); + } + + context.Assembler.Pshufd(dest, dest, (byte)mask0); + context.Assembler.Movss (dest, dest, src2); + context.Assembler.Pshufd(dest, dest, (byte)mask1); + } + } } else { @@ -1165,6 +1305,8 @@ namespace ARMeilleure.CodeGen.X86 } else /* if (src2.Type == OperandType.FP64) */ { + Debug.Assert(index < 2); + if (index != 0) { context.Assembler.Movlhps(dest, src1, src2); @@ -1183,6 +1325,11 @@ namespace ARMeilleure.CodeGen.X86 Operand src2 = operation.GetSource(1); //Value Operand src3 = operation.GetSource(2); //Index + if (!HardwareCapabilities.SupportsVexEncoding) + { + EnsureSameReg(dest, src1); + } + Debug.Assert(src1.Type == OperandType.V128); Debug.Assert(src3.Kind == OperandKind.Constant); @@ -1198,12 +1345,22 @@ namespace ARMeilleure.CodeGen.X86 Operand src2 = operation.GetSource(1); //Value Operand src3 = operation.GetSource(2); //Index + // It's not possible to emulate this instruction without + // SSE 4.1 support without the use of a temporary register, + // so we instead handle that case on the pre-allocator when + // SSE 4.1 is not supported on the CPU. + Debug.Assert(HardwareCapabilities.SupportsSse41); + + if (!HardwareCapabilities.SupportsVexEncoding) + { + EnsureSameReg(dest, src1); + } + Debug.Assert(src1.Type == OperandType.V128); Debug.Assert(src3.Kind == OperandKind.Constant); byte index = src3.AsByte(); - //TODO: SSE/SSE2 version. context.Assembler.Pinsrb(dest, src1, src2, index); } diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs index edc30fbfac..515cdd8b22 100644 --- a/ARMeilleure/CodeGen/X86/PreAllocator.cs +++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs @@ -8,6 +8,8 @@ using static ARMeilleure.IntermediateRepresentation.OperandHelper; namespace ARMeilleure.CodeGen.X86 { + using LLNode = LinkedListNode; + static class PreAllocator { public static void RunPass(CompilerContext cctx, StackAllocator stackAlloc, out int maxCallArgs) @@ -18,9 +20,9 @@ namespace ARMeilleure.CodeGen.X86 foreach (BasicBlock block in cctx.Cfg.Blocks) { - LinkedListNode nextNode; + LLNode nextNode; - for (LinkedListNode node = block.Operations.First; node != null; node = nextNode) + for (LLNode node = block.Operations.First; node != null; node = nextNode) { nextNode = node.Next; @@ -29,64 +31,67 @@ namespace ARMeilleure.CodeGen.X86 continue; } - Instruction inst = operation.Inst; - HandleConstantCopy(node, operation); - HandleFixedRegisterCopy(node, operation); - HandleSameDestSrc1Copy(node, operation); - //Unsigned integer to FP conversions are not supported on X86. - //We need to turn them into signed integer to FP conversions, and - //adjust the final result. 
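Concretely, the adjustment converts value >> 1 as a signed integer and then re-applies both the halving and the dropped bit. A standalone arithmetic model of the 64-bit case (the method name is illustrative; this is not the patch code):

static double ConvertUInt64(ulong value)
{
    long half = (long)(value >> 1);   // most significant bit now clear, safe for signed conversion
    long lsb  = (long)(value & 1);    // the bit lost by the shift

    double result = half;

    result += result;                 // undo the division by two
    result += lsb;                    // re-apply the dropped least significant bit

    return result;
}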
- if (inst == Instruction.ConvertToFPUI) - { - ReplaceConvertToFPUIWithSI(node, operation); - } + node = HandleFixedRegisterCopy(node, operation); - //There's no SSE FP negate instruction, so we need to transform that into - //a XOR of the value to be negated with a mask with the highest bit set. - //This also produces -0 for a negation of the value 0. - if (inst == Instruction.Negate && !operation.GetSource(0).Type.IsInteger()) + switch (operation.Inst) { - ReplaceNegateWithXor(node, operation); - } + case Instruction.Call: + // Get the maximum number of arguments used on a call. + // On windows, when a struct is returned from the call, + // we also need to pass the pointer where the struct + // should be written on the first argument. + int argsCount = operation.SourcesCount - 1; - //Get the maximum number of arguments used on a call. On windows, - //when a struct is returned from the call, we also need to pass - //the pointer where the struct should be written on the first argument. - if (inst == Instruction.Call) - { - int argsCount = operation.SourcesCount - 1; + if (operation.Dest != null && operation.Dest.Type == OperandType.V128) + { + argsCount++; + } - if (operation.Dest != null && operation.Dest.Type == OperandType.V128) - { - argsCount++; - } + if (maxCallArgs < argsCount) + { + maxCallArgs = argsCount; + } - if (maxCallArgs < argsCount) - { - maxCallArgs = argsCount; - } + // Copy values to registers expected by the function + // being called, as mandated by the ABI. + node = HandleCallWindowsAbi(stackAlloc, node, operation); + break; - //Copy values to registers expected by the function being called, - //as mandated by the ABI. - HandleCallWindowsAbi(stackAlloc, node, operation); - } - else if (inst == Instruction.Return) - { - HandleReturnWindowsAbi(cctx, node, preservedArgs, operation); - } - else if (inst == Instruction.LoadArgument) - { - HandleLoadArgumentWindowsAbi(cctx, node, preservedArgs, operation); + case Instruction.ConvertToFPUI: + HandleConvertToFPUI(node, operation); + break; + + case Instruction.LoadArgument: + HandleLoadArgumentWindowsAbi(cctx, node, preservedArgs, operation); + break; + + case Instruction.Negate: + if (!operation.GetSource(0).Type.IsInteger()) + { + node = HandleNegate(node, operation); + } + break; + + case Instruction.Return: + HandleReturnWindowsAbi(cctx, node, preservedArgs, operation); + break; + + case Instruction.VectorInsert8: + if (!HardwareCapabilities.SupportsSse41) + { + node = HandleVectorInsert8(node, operation); + } + break; } } } } - private static void HandleConstantCopy(LinkedListNode node, Operation operation) + private static void HandleConstantCopy(LLNode node, Operation operation) { if (operation.SourcesCount == 0 || IsIntrinsic(operation.Inst)) { @@ -95,7 +100,6 @@ namespace ARMeilleure.CodeGen.X86 Instruction inst = operation.Inst; - Operand dest = operation.Dest; Operand src1 = operation.GetSource(0); Operand src2; @@ -103,25 +107,25 @@ namespace ARMeilleure.CodeGen.X86 { if (!src1.Type.IsInteger()) { - //Handle non-integer types (FP32, FP64 and V128). - //For instructions without an immediate operand, we do the following: - //- Insert a copy with the constant value (as integer) to a GPR. - //- Insert a copy from the GPR to a XMM register. - //- Replace the constant use with the XMM register. + // Handle non-integer types (FP32, FP64 and V128). + // For instructions without an immediate operand, we do the following: + // - Insert a copy with the constant value (as integer) to a GPR. 
+ // - Insert a copy from the GPR to a XMM register. + // - Replace the constant use with the XMM register. src1 = AddXmmCopy(node, src1); operation.SetSource(0, src1); } else if (!HasConstSrc1(inst)) { - //Handle integer types. - //Most ALU instructions accepts a 32-bits immediate on the second operand. - //We need to ensure the following: - //- If the constant is on operand 1, we need to move it. - //-- But first, we try to swap operand 1 and 2 if the instruction is commutative. - //-- Doing so may allow us to encode the constant as operand 2 and avoid a copy. - //- If the constant is on operand 2, we check if the instruction supports it, - //if not, we also add a copy. 64-bits constants are usually not supported. + // Handle integer types. + // Most ALU instructions accepts a 32-bits immediate on the second operand. + // We need to ensure the following: + // - If the constant is on operand 1, we need to move it. + // -- But first, we try to swap operand 1 and 2 if the instruction is commutative. + // -- Doing so may allow us to encode the constant as operand 2 and avoid a copy. + // - If the constant is on operand 2, we check if the instruction supports it, + // if not, we also add a copy. 64-bits constants are usually not supported. if (IsCommutative(inst)) { src2 = operation.GetSource(1); @@ -168,8 +172,207 @@ namespace ARMeilleure.CodeGen.X86 } } - private static void ReplaceConvertToFPUIWithSI(LinkedListNode node, Operation operation) + private static LLNode HandleFixedRegisterCopy(LLNode node, Operation operation) { + Operand dest = operation.Dest; + + LinkedList nodes = node.List; + + switch (operation.Inst) + { + case Instruction.CompareAndSwap128: + { + // Handle the many restrictions of the compare and exchange (16 bytes) instruction: + // - The expected value should be in RDX:RAX. + // - The new value to be written should be in RCX:RBX. + // - The value at the memory location is loaded to RDX:RAX. + void SplitOperand(Operand source, X86Register lowReg, X86Register highReg) + { + Operand lr = Gpr(lowReg, OperandType.I64); + Operand hr = Gpr(highReg, OperandType.I64); + + nodes.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0))); + nodes.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1))); + } + + SplitOperand(operation.GetSource(1), X86Register.Rax, X86Register.Rdx); + SplitOperand(operation.GetSource(2), X86Register.Rbx, X86Register.Rcx); + + Operand rax = Gpr(X86Register.Rax, OperandType.I64); + Operand rdx = Gpr(X86Register.Rdx, OperandType.I64); + + node = nodes.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax)); + node = nodes.AddAfter(node, new Operation(Instruction.VectorInsert, dest, dest, rdx, Const(1))); + + operation.SetSource(1, Undef()); + operation.SetSource(2, Undef()); + + operation.Dest = null; + + break; + } + + case Instruction.CpuId: + { + // Handle the many restrictions of the CPU Id instruction: + // - EAX controls the information returned by this instruction. + // - When EAX is 1, feature information is returned. + // - The information is written to registers EAX, EBX, ECX and EDX. + Debug.Assert(dest.Type == OperandType.I64); + + Operand eax = Gpr(X86Register.Rax, OperandType.I32); + Operand ebx = Gpr(X86Register.Rbx, OperandType.I32); + Operand rcx = Gpr(X86Register.Rcx, OperandType.I64); + Operand edx = Gpr(X86Register.Rdx, OperandType.I32); + + // Value 0x01 = Version, family and feature information. 
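// With EAX = 1, CPUID reports the feature flags in ECX and EDX (for example, ECX bit 19
// indicates SSE4.1 support and EDX bit 26 indicates SSE2), which is why both registers are
// merged into the single 64-bit destination below, EDX in the upper half and ECX in the lower.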
+ node = nodes.AddBefore(node, new Operation(Instruction.Copy, eax, Const(1))); + + // We don't care about those two, but their values are overwritten, + // so we need to take that into account. + node = nodes.AddAfter(node, new Operation(Instruction.Clobber, eax)); + node = nodes.AddAfter(node, new Operation(Instruction.Clobber, ebx)); + + // Copy results to the destination register. + // The values are split into 2 32-bits registers, we merge them + // into a single 64-bits register. + node = nodes.AddAfter(node, new Operation(Instruction.ZeroExtend32, dest, edx)); + node = nodes.AddAfter(node, new Operation(Instruction.ShiftLeft, dest, dest, Const(32))); + node = nodes.AddAfter(node, new Operation(Instruction.BitwiseOr, dest, dest, rcx)); + + operation.Dest = null; + + break; + } + + case Instruction.Divide: + case Instruction.DivideUI: + { + // Handle the many restrictions of the division instructions: + // - The dividend is always in RDX:RAX. + // - The result is always in RAX. + // - Additionally it also writes the remainder in RDX. + Operand src1 = operation.GetSource(0); + + Operand rax = Gpr(X86Register.Rax, src1.Type); + Operand rdx = Gpr(X86Register.Rdx, src1.Type); + + nodes.AddBefore(node, new Operation(Instruction.Copy, rax, src1)); + + operation.SetSource(0, rax); + + nodes.AddBefore(node, new Operation(Instruction.Clobber, rdx)); + + node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rax)); + + operation.Dest = rax; + + break; + } + + case Instruction.Extended: + { + IntrinsicOperation intrinOp = (IntrinsicOperation)operation; + + // PBLENDVB last operand is always implied to be XMM0 when VEX is not supported. + if (intrinOp.Intrinsic == Intrinsic.X86Pblendvb && !HardwareCapabilities.SupportsVexEncoding) + { + Operand xmm0 = Xmm(X86Register.Xmm0, OperandType.V128); + + nodes.AddBefore(node, new Operation(Instruction.Copy, xmm0, operation.GetSource(2))); + + operation.SetSource(2, xmm0); + } + + break; + } + + case Instruction.Multiply64HighSI: + case Instruction.Multiply64HighUI: + { + // Handle the many restrictions of the i64 * i64 = i128 multiply instructions: + // - The multiplicand is always in RAX. + // - The lower 64-bits of the result is always in RAX. + // - The higher 64-bits of the result is always in RDX. + Operand src1 = operation.GetSource(0); + + Operand rax = Gpr(X86Register.Rax, src1.Type); + Operand rdx = Gpr(X86Register.Rdx, src1.Type); + + nodes.AddBefore(node, new Operation(Instruction.Copy, rax, src1)); + + operation.SetSource(0, rax); + + node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rdx)); + + operation.Dest = rdx; + + break; + } + + case Instruction.RotateRight: + case Instruction.ShiftLeft: + case Instruction.ShiftRightSI: + case Instruction.ShiftRightUI: + { + // The shift register is always implied to be CL (low 8-bits of RCX or ECX). 
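// For example, a hypothetical "d = ShiftLeft a, n" where n lives in a local variable is
// rewritten as "copy ecx, n" followed by "d = ShiftLeft a, ecx", forcing the count into
// ECX/CL; constant shift counts are left untouched here.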
+ if (operation.GetSource(1).Kind == OperandKind.LocalVariable) + { + Operand rcx = Gpr(X86Register.Rcx, OperandType.I32); + + nodes.AddBefore(node, new Operation(Instruction.Copy, rcx, operation.GetSource(1))); + + operation.SetSource(1, rcx); + } + + break; + } + } + + return node; + } + + private static void HandleSameDestSrc1Copy(LLNode node, Operation operation) + { + if (operation.Dest == null || operation.SourcesCount == 0) + { + return; + } + + Instruction inst = operation.Inst; + + Operand dest = operation.Dest; + Operand src1 = operation.GetSource(0); + + // The multiply instruction (that maps to IMUL) is somewhat special, it has + // a three operand form where the second source is a immediate value. + bool threeOperandForm = inst == Instruction.Multiply && operation.GetSource(1).Kind == OperandKind.Constant; + + if (IsSameOperandDestSrc1(operation) && src1.Kind == OperandKind.LocalVariable && !threeOperandForm) + { + Operation copyOp = new Operation(Instruction.Copy, dest, src1); + + node.List.AddBefore(node, copyOp); + + operation.SetSource(0, dest); + } + else if (inst == Instruction.ConditionalSelect) + { + Operand src3 = operation.GetSource(2); + + Operation copyOp = new Operation(Instruction.Copy, dest, src3); + + node.List.AddBefore(node, copyOp); + + operation.SetSource(2, dest); + } + } + + private static LLNode HandleConvertToFPUI(LLNode node, Operation operation) + { + // Unsigned integer to FP conversions are not supported on X86. + // We need to turn them into signed integer to FP conversions, and + // adjust the final result. Operand dest = operation.Dest; Operand source = operation.GetSource(0); @@ -177,52 +380,57 @@ namespace ARMeilleure.CodeGen.X86 LinkedList nodes = node.List; - LinkedListNode temp = node; + LLNode currentNode = node; if (source.Type == OperandType.I32) { - //For 32-bits integers, we can just zero-extend to 64-bits, - //and then use the 64-bits signed conversion instructions. + // For 32-bits integers, we can just zero-extend to 64-bits, + // and then use the 64-bits signed conversion instructions. Operand zex = Local(OperandType.I64); - temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, zex, source)); - temp = nodes.AddAfter(temp, new Operation(Instruction.ConvertToFP, dest, zex)); + node = nodes.AddAfter(node, new Operation(Instruction.ZeroExtend32, zex, source)); + node = nodes.AddAfter(node, new Operation(Instruction.ConvertToFP, dest, zex)); } else /* if (source.Type == OperandType.I64) */ { - //For 64-bits integers, we need to do the following: - //- Ensure that the integer has the most significant bit clear. - //-- This can be done by shifting the value right by 1, that is, dividing by 2. - //-- The least significant bit is lost in this case though. - //- We can then convert the shifted value with a signed integer instruction. - //- The result still needs to be corrected after that. - //-- First, we need to multiply the result by 2, as we divided it by 2 before. - //--- This can be done efficiently by adding the result to itself. - //-- Then, we need to add the least significant bit that was shifted out. - //--- We can convert the least significant bit to float, and add it to the result. + // For 64-bits integers, we need to do the following: + // - Ensure that the integer has the most significant bit clear. + // -- This can be done by shifting the value right by 1, that is, dividing by 2. + // -- The least significant bit is lost in this case though. + // - We can then convert the shifted value with a signed integer instruction. 
+ // - The result still needs to be corrected after that. + // -- First, we need to multiply the result by 2, as we divided it by 2 before. + // --- This can be done efficiently by adding the result to itself. + // -- Then, we need to add the least significant bit that was shifted out. + // --- We can convert the least significant bit to float, and add it to the result. Operand lsb = Local(OperandType.I64); Operand half = Local(OperandType.I64); Operand lsbF = Local(dest.Type); - temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, lsb, source)); - temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, half, source)); + node = nodes.AddAfter(node, new Operation(Instruction.Copy, lsb, source)); + node = nodes.AddAfter(node, new Operation(Instruction.Copy, half, source)); - temp = nodes.AddAfter(temp, new Operation(Instruction.BitwiseAnd, lsb, lsb, Const(1L))); - temp = nodes.AddAfter(temp, new Operation(Instruction.ShiftRightUI, half, half, Const(1))); + node = nodes.AddAfter(node, new Operation(Instruction.BitwiseAnd, lsb, lsb, Const(1L))); + node = nodes.AddAfter(node, new Operation(Instruction.ShiftRightUI, half, half, Const(1))); - temp = nodes.AddAfter(temp, new Operation(Instruction.ConvertToFP, lsbF, lsb)); - temp = nodes.AddAfter(temp, new Operation(Instruction.ConvertToFP, dest, half)); + node = nodes.AddAfter(node, new Operation(Instruction.ConvertToFP, lsbF, lsb)); + node = nodes.AddAfter(node, new Operation(Instruction.ConvertToFP, dest, half)); - temp = nodes.AddAfter(temp, new Operation(Instruction.Add, dest, dest, dest)); - temp = nodes.AddAfter(temp, new Operation(Instruction.Add, dest, dest, lsbF)); + node = nodes.AddAfter(node, new Operation(Instruction.Add, dest, dest, dest)); + node = nodes.AddAfter(node, new Operation(Instruction.Add, dest, dest, lsbF)); } - Delete(node, operation); + Delete(currentNode, operation); + + return node; } - private static void ReplaceNegateWithXor(LinkedListNode node, Operation operation) + private static LLNode HandleNegate(LLNode node, Operation operation) { + // There's no SSE FP negate instruction, so we need to transform that into + // a XOR of the value to be negated with a mask with the highest bit set. + // This also produces -0 for a negation of the value 0. 
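A scalar model of the same trick (illustrative only; the method name is an assumption): the vector code below builds a mask with only the sign bit set by shifting an all-ones register, and XORing with that mask flips the sign, mapping 0 to -0 as noted above.

static float NegateViaXor(float value)
{
    int mask = 1 << 31;   // only the FP32 sign bit set, as PSLLD by 31 produces per lane
    int bits = BitConverter.SingleToInt32Bits(value) ^ mask;

    return BitConverter.Int32BitsToSingle(bits);
}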
Operand dest = operation.Dest; Operand source = operation.GetSource(0); @@ -231,187 +439,103 @@ namespace ARMeilleure.CodeGen.X86 LinkedList nodes = node.List; - LinkedListNode temp = node; + LLNode currentNode = node; Operand res = Local(dest.Type); - temp = nodes.AddAfter(temp, new Operation(Instruction.VectorOne, res)); + node = nodes.AddAfter(node, new Operation(Instruction.VectorOne, res)); if (dest.Type == OperandType.FP32) { - temp = nodes.AddAfter(temp, new IntrinsicOperation(Intrinsic.X86Pslld, res, res, Const(31))); + node = nodes.AddAfter(node, new IntrinsicOperation(Intrinsic.X86Pslld, res, res, Const(31))); } else /* if (dest.Type == OperandType.FP64) */ { - temp = nodes.AddAfter(temp, new IntrinsicOperation(Intrinsic.X86Psllq, res, res, Const(63))); + node = nodes.AddAfter(node, new IntrinsicOperation(Intrinsic.X86Psllq, res, res, Const(63))); } - temp = nodes.AddAfter(temp, new IntrinsicOperation(Intrinsic.X86Xorps, res, res, source)); + node = nodes.AddAfter(node, new IntrinsicOperation(Intrinsic.X86Xorps, res, res, source)); - temp = nodes.AddAfter(temp, new Operation(Instruction.Copy, dest, res)); + node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, res)); - Delete(node, operation); + Delete(currentNode, operation); + + return node; } - private static void HandleFixedRegisterCopy(LinkedListNode node, Operation operation) + private static LLNode HandleVectorInsert8(LLNode node, Operation operation) { - Instruction inst = operation.Inst; - + // Handle vector insertion, when SSE 4.1 is not supported. Operand dest = operation.Dest; + Operand src1 = operation.GetSource(0); // Vector + Operand src2 = operation.GetSource(1); // Value + Operand src3 = operation.GetSource(2); // Index - //Handle the many restrictions of the CPU Id instruction: - //- EAX controls the information returned by this instruction. - //- When EAX is 1, feature information is returned. - //- The information is written to registers EAX, EBX, ECX and EDX. - if (inst == Instruction.CpuId) + Debug.Assert(src3.Kind == OperandKind.Constant); + + byte index = src3.AsByte(); + + Debug.Assert(index < 16); + + LinkedList nodes = node.List; + + LLNode currentNode = node; + + Operand temp = Local(OperandType.I32); + + Operation vextOp = new Operation(Instruction.VectorExtract16, temp, src1, Const(index >> 1)); + + node = nodes.AddAfter(node, vextOp); + + if ((index & 1) != 0) { - Debug.Assert(dest.Type == OperandType.I64); + Operand temp2 = Local(OperandType.I32); - Operand eax = Gpr(X86Register.Rax, OperandType.I32); - Operand ebx = Gpr(X86Register.Rbx, OperandType.I32); - Operand ecx = Gpr(X86Register.Rcx, OperandType.I32); - Operand edx = Gpr(X86Register.Rdx, OperandType.I32); + Operation copyOp = new Operation(Instruction.Copy, temp2, src2); + Operation andOp = new Operation(Instruction.ZeroExtend8, temp, temp); + Operation shlOp = new Operation(Instruction.ShiftLeft, temp2, temp2, Const(8)); + Operation orOp = new Operation(Instruction.BitwiseOr, temp, temp, temp2); - // Value 0x01 = Version, family and feature information. - node.List.AddBefore(node, new Operation(Instruction.Copy, eax, Const(1))); + node = nodes.AddAfter(node, copyOp); + node = nodes.AddAfter(node, andOp); + node = nodes.AddAfter(node, shlOp); + node = nodes.AddAfter(node, orOp); + } + else + { + Operation andOp = new Operation(Instruction.BitwiseAnd, temp, temp, Const(0xff00)); + Operation orOp = new Operation(Instruction.BitwiseOr, temp, temp, src2); - // Copy results to the destination register. 
- // The values are split into 2 32-bits registers, we merge them - // into a single 64-bits register. - Operand rcx = Gpr(X86Register.Rcx, OperandType.I64); - - node.List.AddAfter(node, new Operation(Instruction.BitwiseOr, dest, dest, rcx)); - node.List.AddAfter(node, new Operation(Instruction.ShiftLeft, dest, dest, Const(32))); - node.List.AddAfter(node, new Operation(Instruction.ZeroExtend32, dest, edx)); - - // We don't care about those two, but their values are overwritten, - // so we need to take that into account. - node.List.AddAfter(node, new Operation(Instruction.Clobber, ebx)); - node.List.AddAfter(node, new Operation(Instruction.Clobber, eax)); - - operation.Dest = null; + node = nodes.AddAfter(node, andOp); + node = nodes.AddAfter(node, orOp); } - if (operation.SourcesCount == 0) - { - return; - } + Operation vinsOp = new Operation(Instruction.VectorInsert16, dest, src1, temp, Const(index >> 1)); - Operand src1 = operation.GetSource(0); + node = nodes.AddAfter(node, vinsOp); - //Handle the many restrictions of the division instructions: - //- The dividend is always in RDX:RAX. - //- The result is always in RAX. - //- Additionally it also writes the remainder in RDX. - if (inst == Instruction.Divide || inst == Instruction.DivideUI) - { - Operand rax = Gpr(X86Register.Rax, src1.Type); - Operand rdx = Gpr(X86Register.Rdx, src1.Type); + Delete(currentNode, operation); - node.List.AddBefore(node, new Operation(Instruction.Copy, rax, src1)); - - operation.SetSource(0, rax); - - node.List.AddBefore(node, new Operation(Instruction.Clobber, rdx)); - - node.List.AddAfter(node, new Operation(Instruction.Copy, dest, rax)); - - operation.Dest = rax; - } - - //Handle the many restrictions of the i64 * i64 = i128 multiply instructions: - //- The multiplicand is always in RAX. - //- The lower 64-bits of the result is always in RAX. - //- The higher 64-bits of the result is always in RDX. - if (inst == Instruction.Multiply64HighSI || inst == Instruction.Multiply64HighUI) - { - Operand rax = Gpr(X86Register.Rax, src1.Type); - Operand rdx = Gpr(X86Register.Rdx, src1.Type); - - node.List.AddBefore(node, new Operation(Instruction.Copy, rax, src1)); - - operation.SetSource(0, rax); - - node.List.AddAfter(node, new Operation(Instruction.Copy, dest, rdx)); - - operation.Dest = rdx; - } - - //Handle the many restrictions of the compare and exchange (16 bytes) instruction: - //- The expected value should be in RDX:RAX. - //- The new value to be written should be in RCX:RBX. - //- The value at the memory location is loaded to RDX:RAX. 
- if (inst == Instruction.CompareAndSwap128) - { - void SplitOperand(Operand source, X86Register lowReg, X86Register highReg) - { - Operand lr = Gpr(lowReg, OperandType.I64); - Operand hr = Gpr(highReg, OperandType.I64); - - node.List.AddBefore(node, new Operation(Instruction.VectorExtract, lr, source, Const(0))); - node.List.AddBefore(node, new Operation(Instruction.VectorExtract, hr, source, Const(1))); - } - - SplitOperand(operation.GetSource(1), X86Register.Rax, X86Register.Rdx); - SplitOperand(operation.GetSource(2), X86Register.Rbx, X86Register.Rcx); - - Operand rax = Gpr(X86Register.Rax, OperandType.I64); - Operand rdx = Gpr(X86Register.Rdx, OperandType.I64); - - node.List.AddAfter(node, new Operation(Instruction.VectorInsert, dest, dest, rdx, Const(1))); - node.List.AddAfter(node, new Operation(Instruction.VectorCreateScalar, dest, rax)); - - operation.SetSource(1, Undef()); - operation.SetSource(2, Undef()); - - operation.Dest = null; - } - - //The shift register is always implied to be CL (low 8-bits of RCX or ECX). - if (inst.IsShift() && operation.GetSource(1).Kind == OperandKind.LocalVariable) - { - Operand rcx = Gpr(X86Register.Rcx, OperandType.I32); - - node.List.AddBefore(node, new Operation(Instruction.Copy, rcx, operation.GetSource(1))); - - operation.SetSource(1, rcx); - } - - //Handle intrinsics. - if (IsIntrinsic(inst)) - { - IntrinsicOperation intrinOp = (IntrinsicOperation)operation; - - //PBLENDVB last operand is always implied to be XMM0 when VEX is not supported. - if (intrinOp.Intrinsic == Intrinsic.X86Pblendvb && !HardwareCapabilities.SupportsVexEncoding) - { - Operand xmm0 = Xmm(X86Register.Xmm0, OperandType.V128); - - node.List.AddBefore(node, new Operation(Instruction.Copy, xmm0, operation.GetSource(2))); - - operation.SetSource(2, xmm0); - } - } + return node; } - private static void HandleCallWindowsAbi( - StackAllocator stackAlloc, - LinkedListNode node, - Operation operation) + private static LLNode HandleCallWindowsAbi(StackAllocator stackAlloc, LLNode node, Operation operation) { Operand dest = operation.Dest; - //Handle struct arguments. + LinkedList nodes = node.List; + + // Handle struct arguments. int retArgs = 0; int stackAllocOffset = 0; int AllocateOnStack(int size) { - //We assume that the stack allocator is initially empty (TotalSize = 0). - //Taking that into account, we can reuse the space allocated for other - //calls by keeping track of our own allocated size (stackAllocOffset). - //If the space allocated is not big enough, then we just expand it. + // We assume that the stack allocator is initially empty (TotalSize = 0). + // Taking that into account, we can reuse the space allocated for other + // calls by keeping track of our own allocated size (stackAllocOffset). + // If the space allocated is not big enough, then we just expand it. 
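// For example, a call that spills two V128 arguments takes offsets 0 and 16 and grows the
// total size to at least 32 bytes; a later call in the same function starts again at offset 0
// and only grows the shared area if it needs more space than any previous call did.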
int offset = stackAllocOffset; if (stackAllocOffset + size > stackAlloc.TotalSize) @@ -432,7 +556,7 @@ namespace ARMeilleure.CodeGen.X86 Operation allocOp = new Operation(Instruction.StackAlloc, arg0Reg, Const(stackOffset)); - node.List.AddBefore(node, allocOp); + nodes.AddBefore(node, allocOp); retArgs = 1; } @@ -449,17 +573,17 @@ namespace ARMeilleure.CodeGen.X86 Operation allocOp = new Operation(Instruction.StackAlloc, stackAddr, Const(stackOffset)); - node.List.AddBefore(node, allocOp); + nodes.AddBefore(node, allocOp); Operation storeOp = new Operation(Instruction.Store, null, stackAddr, source); - HandleConstantCopy(node.List.AddBefore(node, storeOp), storeOp); + HandleConstantCopy(nodes.AddBefore(node, storeOp), storeOp); operation.SetSource(index, stackAddr); } } - //Handle arguments passed on registers. + // Handle arguments passed on registers. int argsCount = operation.SourcesCount - 1; int maxArgs = CallingConvention.GetArgumentsOnRegsCount() - retArgs; @@ -490,13 +614,13 @@ namespace ARMeilleure.CodeGen.X86 Operation srcCopyOp = new Operation(Instruction.Copy, argReg, source); - HandleConstantCopy(node.List.AddBefore(node, srcCopyOp), srcCopyOp); + HandleConstantCopy(nodes.AddBefore(node, srcCopyOp), srcCopyOp); operation.SetSource(index + 1, argReg); } - //The remaining arguments (those that are not passed on registers) - //should be passed on the stack, we write them to the stack with "SpillArg". + // The remaining arguments (those that are not passed on registers) + // should be passed on the stack, we write them to the stack with "SpillArg". for (int index = argsCount; index < operation.SourcesCount - 1; index++) { Operand source = operation.GetSource(index + 1); @@ -505,7 +629,7 @@ namespace ARMeilleure.CodeGen.X86 Operation spillOp = new Operation(Instruction.SpillArg, null, offset, source); - HandleConstantCopy(node.List.AddBefore(node, spillOp), spillOp); + HandleConstantCopy(nodes.AddBefore(node, spillOp), spillOp); operation.SetSource(index + 1, new Operand(OperandKind.Undefined)); } @@ -520,11 +644,11 @@ namespace ARMeilleure.CodeGen.X86 Operation copyOp = new Operation(Instruction.Copy, retValueAddr, arg0Reg); - node.List.AddBefore(node, copyOp); + nodes.AddBefore(node, copyOp); Operation loadOp = new Operation(Instruction.Load, dest, retValueAddr); - node.List.AddAfter(node, loadOp); + node = nodes.AddAfter(node, loadOp); operation.Dest = null; } @@ -545,71 +669,18 @@ namespace ARMeilleure.CodeGen.X86 Operation destCopyOp = new Operation(Instruction.Copy, dest, retReg); - node.List.AddAfter(node, destCopyOp); + node = nodes.AddAfter(node, destCopyOp); operation.Dest = retReg; } } - } - private static void HandleReturnWindowsAbi( - CompilerContext cctx, - LinkedListNode node, - Operand[] preservedArgs, - Operation operation) - { - if (operation.SourcesCount == 0) - { - return; - } - - Operand source = operation.GetSource(0); - - Operand retReg; - - if (source.Type.IsInteger()) - { - retReg = Gpr(CallingConvention.GetIntReturnRegister(), source.Type); - } - else if (source.Type == OperandType.V128) - { - if (preservedArgs[0] == null) - { - Operand preservedArg = Local(OperandType.I64); - - Operand arg0 = Gpr(CallingConvention.GetIntArgumentRegister(0), OperandType.I64); - - Operation copyOp = new Operation(Instruction.Copy, preservedArg, arg0); - - cctx.Cfg.Entry.Operations.AddFirst(copyOp); - - preservedArgs[0] = preservedArg; - } - - retReg = preservedArgs[0]; - } - else /* if (regType == RegisterType.Vector) */ - { - retReg = 
Xmm(CallingConvention.GetVecReturnRegister(), source.Type); - } - - if (source.Type == OperandType.V128) - { - Operation retStoreOp = new Operation(Instruction.Store, null, retReg, source); - - node.List.AddBefore(node, retStoreOp); - } - else - { - Operation retCopyOp = new Operation(Instruction.Copy, retReg, source); - - node.List.AddBefore(node, retCopyOp); - } + return node; } private static void HandleLoadArgumentWindowsAbi( CompilerContext cctx, - LinkedListNode node, + LLNode node, Operand[] preservedArgs, Operation operation) { @@ -667,43 +738,62 @@ namespace ARMeilleure.CodeGen.X86 } } - private static void HandleSameDestSrc1Copy(LinkedListNode node, Operation operation) + private static void HandleReturnWindowsAbi( + CompilerContext cctx, + LLNode node, + Operand[] preservedArgs, + Operation operation) { - if (operation.Dest == null || operation.SourcesCount == 0) + if (operation.SourcesCount == 0) { return; } - Instruction inst = operation.Inst; + Operand source = operation.GetSource(0); - Operand dest = operation.Dest; - Operand src1 = operation.GetSource(0); + Operand retReg; - //The multiply instruction (that maps to IMUL) is somewhat special, it has - //a three operand form where the second source is a immediate value. - bool threeOperandForm = inst == Instruction.Multiply && operation.GetSource(1).Kind == OperandKind.Constant; - - if (IsSameOperandDestSrc1(operation) && src1.Kind == OperandKind.LocalVariable && !threeOperandForm) + if (source.Type.IsInteger()) { - Operation copyOp = new Operation(Instruction.Copy, dest, src1); - - node.List.AddBefore(node, copyOp); - - operation.SetSource(0, dest); + retReg = Gpr(CallingConvention.GetIntReturnRegister(), source.Type); } - else if (inst == Instruction.ConditionalSelect) + else if (source.Type == OperandType.V128) { - Operand src3 = operation.GetSource(2); + if (preservedArgs[0] == null) + { + Operand preservedArg = Local(OperandType.I64); - Operation copyOp = new Operation(Instruction.Copy, dest, src3); + Operand arg0 = Gpr(CallingConvention.GetIntArgumentRegister(0), OperandType.I64); - node.List.AddBefore(node, copyOp); + Operation copyOp = new Operation(Instruction.Copy, preservedArg, arg0); - operation.SetSource(2, dest); + cctx.Cfg.Entry.Operations.AddFirst(copyOp); + + preservedArgs[0] = preservedArg; + } + + retReg = preservedArgs[0]; + } + else /* if (regType == RegisterType.Vector) */ + { + retReg = Xmm(CallingConvention.GetVecReturnRegister(), source.Type); + } + + if (source.Type == OperandType.V128) + { + Operation retStoreOp = new Operation(Instruction.Store, null, retReg, source); + + node.List.AddBefore(node, retStoreOp); + } + else + { + Operation retCopyOp = new Operation(Instruction.Copy, retReg, source); + + node.List.AddBefore(node, retCopyOp); } } - private static Operand AddXmmCopy(LinkedListNode node, Operand source) + private static Operand AddXmmCopy(LLNode node, Operand source) { Operand temp = Local(source.Type); @@ -716,7 +806,7 @@ namespace ARMeilleure.CodeGen.X86 return temp; } - private static Operand AddCopy(LinkedListNode node, Operand source) + private static Operand AddCopy(LLNode node, Operand source) { Operand temp = Local(source.Type); @@ -755,7 +845,7 @@ namespace ARMeilleure.CodeGen.X86 return value == (int)value; } - private static void Delete(LinkedListNode node, Operation operation) + private static void Delete(LLNode node, Operation operation) { operation.Dest = null;