diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
index 907c57459f..105b79aa0a 100644
--- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs
@@ -16,6 +16,7 @@ namespace ARMeilleure.Instructions
         public static void Vabs_S(ArmEmitterContext context)
         {
             OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
+
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarUnaryOpSimd32(context, (m) =>
@@ -36,7 +37,6 @@ namespace ARMeilleure.Instructions
             {
                 EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1));
             }
-
         }

         public static void Vabs_V(ArmEmitterContext context)
@@ -113,7 +113,15 @@ namespace ARMeilleure.Instructions

         public static void Vadd_I(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2));
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
+            }
         }

         public static void Vdup(ArmEmitterContext context)
@@ -223,6 +231,7 @@ namespace ARMeilleure.Instructions
         public static void Vneg_S(ArmEmitterContext context)
         {
             OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
+
             if (Optimizations.UseSse2)
             {
                 EmitScalarUnaryOpSimd32(context, (m) =>
@@ -248,6 +257,7 @@ namespace ARMeilleure.Instructions
         public static void Vnmul_S(ArmEmitterContext context)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
+
             if (Optimizations.UseSse2)
             {
                 EmitScalarBinaryOpSimd32(context, (n, m) =>
@@ -275,6 +285,7 @@ namespace ARMeilleure.Instructions
         public static void Vnmla_S(ArmEmitterContext context)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
+
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@@ -314,6 +325,7 @@ namespace ARMeilleure.Instructions
         public static void Vnmls_S(ArmEmitterContext context)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
+
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@@ -472,11 +484,25 @@ namespace ARMeilleure.Instructions

             if (op.U)
             {
-                EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
+                }
             }
             else
             {
-                EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
+                }
             }
         }

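Note on the new packed integer paths above and below: they index per-size intrinsic tables (X86PaddInstruction, X86PmaxuInstruction, X86PmaxsInstruction and the min equivalents). Plain SSE2 only provides PMAXUB/PMINUB (unsigned bytes) and PMAXSW/PMINSW (signed words); the other size/sign combinations (PMAXSB, PMAXUW, PMAXUD, PMINSD and friends) are SSE4.1 instructions. If the intrinsic emitter does not already force SSE4.1 for those table entries, a stricter guard may be needed. The helper below is a sketch only, not part of this diff; CanUseSsePackedMinMax is a hypothetical name and it assumes the UseSse41 flag used elsewhere in ARMeilleure:

    // Hypothetical helper (illustration only): picks the SSE level actually required
    // for a packed min/max of the given element size and signedness.
    private static bool CanUseSsePackedMinMax(bool unsigned, int size)
    {
        // size: 0 = byte, 1 = halfword, 2 = word.
        bool sse2HasIt = (unsigned && size == 0) || (!unsigned && size == 1);

        return sse2HasIt ? Optimizations.UseSse2 : Optimizations.UseSse41;
    }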
@@ -501,11 +527,25 @@ namespace ARMeilleure.Instructions

             if (op.U)
             {
-                EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
+                }
             }
             else
             {
-                EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminsInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
+                }
             }
         }

@@ -559,7 +599,11 @@ namespace ARMeilleure.Instructions

             if (op.F)
             {
-                if (Optimizations.FastFP)
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+                }
+                else if (Optimizations.FastFP)
                 {
                     EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
                 }
@@ -626,7 +670,11 @@ namespace ARMeilleure.Instructions

             if (op.F)
             {
-                if (Optimizations.FastFP)
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                }
+                else if (Optimizations.FastFP)
                 {
                     EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
                 }
@@ -693,7 +741,11 @@ namespace ARMeilleure.Instructions

             if (op.F)
             {
-                if (Optimizations.FastFP)
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+                }
+                else if (Optimizations.FastFP)
                 {
                     EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
                 }
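For reference, the by-scalar forms handled above (VMUL/VMLA/VMLS with a Dm[x] operand) apply one lane of m to every element of n, and A32 VMLA/VMLS are not fused, so a separate multiply followed by an add or subtract (mulps plus addps/subps) matches the architectural rounding. A plain C# sketch of the per-element math, illustrative only and not JIT code:

    // Reference semantics for VMLA.F32 / VMLS.F32 (by scalar): d[e] = d[e] +/- n[e] * m[index].
    private static void MlaByScalarReference(float[] d, float[] n, float[] m, int index, bool subtract)
    {
        float scalar = m[index];

        for (int e = 0; e < d.Length; e++)
        {
            float product = n[e] * scalar; // rounded once here...

            d[e] = subtract ? d[e] - product : d[e] + product; // ...and once here, no fusion.
        }
    }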
@@ -710,58 +762,118 @@ namespace ARMeilleure.Instructions

         public static void Vpadd_V(ArmEmitterContext context)
         {
-            EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+            if (Optimizations.FastFP && Optimizations.UseSse2 && false)
+            {
+                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+            }
+            else
+            {
+                EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+            }
+
         }

         public static void Vpadd_I(ArmEmitterContext context)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

-            EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
+            if (Optimizations.UseSsse3)
+            {
+                EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction);
+            }
+            else
+            {
+                EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
+            }
         }

         public static void Vrev(ArmEmitterContext context)
         {
-            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
-
-            EmitVectorUnaryOpZx32(context, (op1) =>
+            OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;
+
+            if (Optimizations.UseSsse3)
             {
-                switch (op.Opc)
+                EmitVectorUnaryOpSimd32(context, (op1) =>
                 {
-                    case 0:
-                        switch (op.Size) // Swap bytes.
-                        {
-                            default:
-                                return op1;
-                            case 1:
-                                return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
-                            case 2:
-                            case 3:
-                                return context.ByteSwap(op1);
-                        }
-                    case 1:
-                        switch (op.Size)
-                        {
-                            default:
-                                return op1;
-                            case 2:
-                                return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
-                                                         context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
-                            case 3:
-                                return context.BitwiseOr(
-                                    context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
-                                                      context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
-                                    context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
-                                                      context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
-                        }
-                    case 2:
-                        // Swap upper and lower halves.
-                        return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
-                                                 context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
-                }
+                    Operand mask;
+                    switch (op.Size)
+                    {
+                        case 3:
+                            // rev64
+                            switch (op.Opc)
+                            {
+                                case 0:
+                                    mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                                case 1:
+                                    mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                                case 2:
+                                    return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
+                            }
+                            break;
+                        case 2:
+                            // rev32
+                            switch (op.Opc)
+                            {
+                                case 0:
+                                    mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                                case 1:
+                                    mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                            }
+                            break;
+                        case 1:
+                            // rev16
+                            mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x0607_0405_0203_0001L);
+                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                    }

-                return op1;
-            });
+                    throw new InvalidOperationException("Unknown VREV Opcode+Size combo.");
+                });
+            }
+            else
+            {
+                EmitVectorUnaryOpZx32(context, (op1) =>
+                {
+                    switch (op.Opc)
+                    {
+                        case 0:
+                            switch (op.Size) // Swap bytes.
+                            {
+                                default:
+                                    return op1;
+                                case 1:
+                                    return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
+                                case 2:
+                                case 3:
+                                    return context.ByteSwap(op1);
+                            }
+                        case 1:
+                            switch (op.Size)
+                            {
+                                default:
+                                    return op1;
+                                case 2:
+                                    return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
+                                                             context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
+                                case 3:
+                                    return context.BitwiseOr(
+                                        context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
+                                                          context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
+                                        context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
+                                                          context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
+                            }
+                        case 2:
+                            // Swap upper and lower halves.
+                            return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
+                                                     context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
+                    }
+
+                    return op1;
+                });
+            }
         }

         public static void Vrecpe(ArmEmitterContext context)
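The VREV masks above are PSHUFB control vectors: each control byte selects a source byte by index, and X86GetElements places its second argument in the low quadword and its first in the high one (the same convention as the "0:even from m:n" masks later in this diff). A minimal model of PSHUFB for checking the masks by hand, illustrative only:

    // PSHUFB model: result[i] = (mask[i] & 0x80) != 0 ? 0 : source[mask[i] & 0x0F].
    // E.g. the rev64/byte mask (low qword 0x0001020304050607, high qword 0x08090a0b0c0d0e0f)
    // maps result byte 0 to source byte 7, byte 1 to byte 6, and so on, reversing each doubleword.
    private static byte[] PshufbReference(byte[] source, byte[] mask)
    {
        byte[] result = new byte[16];

        for (int i = 0; i < 16; i++)
        {
            result[i] = (mask[i] & 0x80) != 0 ? (byte)0 : source[mask[i] & 0x0F];
        }

        return result;
    }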
@@ -772,7 +884,7 @@ namespace ARMeilleure.Instructions
             {
                 int sizeF = op.Size & 1;

-                if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+                if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
                 {
                     EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0);
                 }
@@ -792,10 +904,38 @@ namespace ARMeilleure.Instructions

         public static void Vrecps(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpF32(context, (op1, op2) =>
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
-            });
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                bool single = (op.Size & 1) == 0;
+                // (2 - (n*m))
+                EmitVectorBinaryOpSimd32(context, (n, m) =>
+                {
+                    if (single)
+                    {
+                        Operand maskTwo = X86GetAllElements(context, 2f);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+
+                        return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res);
+                    }
+                    else
+                    {
+                        Operand maskTwo = X86GetAllElements(context, 2d);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+
+                        return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res);
+                    }
+                });
+            }
+            else
+            {
+                EmitVectorBinaryOpF32(context, (op1, op2) =>
+                {
+                    return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
+                });
+            }
         }

         public static void Vrsqrte(ArmEmitterContext context)
@@ -806,7 +946,7 @@ namespace ARMeilleure.Instructions
             {
                 int sizeF = op.Size & 1;

-                if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+                if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
                 {
                     EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0);
                 }
@@ -826,10 +966,42 @@ namespace ARMeilleure.Instructions

         public static void Vrsqrts(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpF32(context, (op1, op2) =>
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
-            });
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                bool single = (op.Size & 1) == 0;
+                // (3 - (n*m)) / 2
+                EmitVectorBinaryOpSimd32(context, (n, m) =>
+                {
+                    if (single)
+                    {
+                        Operand maskHalf = X86GetAllElements(context, 0.5f);
+                        Operand maskThree = X86GetAllElements(context, 3f);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
+                        return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
+                    }
+                    else
+                    {
+                        Operand maskHalf = X86GetAllElements(context, 0.5d);
+                        Operand maskThree = X86GetAllElements(context, 3d);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
+                        return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
+                    }
+                });
+            }
+            else
+            {
+                EmitVectorBinaryOpF32(context, (op1, op2) =>
+                {
+                    return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
+                });
+            }
         }

         public static void Vsel(ArmEmitterContext context)
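The fast paths above only compute the architectural step values: FPRecipStep is 2 - n*m and FPRSqrtStep is (3 - n*m) / 2; guest code multiplies them back into its running estimate to perform Newton-Raphson refinement. The SSE versions skip the zero/infinity/NaN special cases that SoftFloat32.FPRecipStep and FPRSqrtStep handle, which is why they sit behind FastFP. A scalar sketch of one refinement round, illustrative only:

    // One Newton-Raphson round, as guest code would perform it:
    //   recip':  r * (2 - x*r)            converges to 1/x       (VRECPE then VRECPS)
    //   rsqrt':  r * (3 - (x*r)*r) / 2    converges to 1/sqrt(x) (VRSQRTE then VRSQRTS, fed x*r and r)
    private static (float recip, float rsqrt) RefineEstimates(float x, float recip, float rsqrt)
    {
        float recipStep = 2f - x * recip;                  // VRECPS(x, recip)
        float rsqrtStep = (3f - (x * rsqrt) * rsqrt) / 2f; // VRSQRTS(x*rsqrt, rsqrt)

        return (recip * recipStep, rsqrt * rsqrtStep);
    }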
@@ -853,10 +1025,35 @@ namespace ARMeilleure.Instructions
                     break;
             }

-            EmitScalarBinaryOpI32(context, (op1, op2) =>
+            if (false && Optimizations.UseSse2)
             {
-                return context.ConditionalSelect(condition, op1, op2);
-            });
+                Operand falseLabel = Label();
+                Operand doneLabel = Label();
+
+                context.BranchIfFalse(condition, falseLabel);
+
+                EmitScalarBinaryOpSimd32(context, (op1, op2) =>
+                {
+                    return op1;
+                });
+
+                context.Branch(doneLabel);
+                context.MarkLabel(falseLabel);
+
+                EmitScalarBinaryOpSimd32(context, (op1, op2) =>
+                {
+                    return op2;
+                });
+
+                context.MarkLabel(doneLabel);
+            }
+            else
+            {
+                EmitScalarBinaryOpI32(context, (op1, op2) =>
+                {
+                    return context.ConditionalSelect(condition, op1, op2);
+                });
+            }
         }

         public static void Vsqrt_S(ArmEmitterContext context)
@@ -900,7 +1097,15 @@ namespace ARMeilleure.Instructions

         public static void Vsub_I(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2));
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
+            }
         }

         private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
index a87dac015a..d45b55fcf5 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs
@@ -31,7 +31,7 @@ namespace ARMeilleure.Instructions
             15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // S
         };

-        private static readonly long _zeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
+        public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
 #endregion

 #region "X86 SSE Intrinsics"
@@ -1026,8 +1026,8 @@ namespace ARMeilleure.Instructions

             if (op.RegisterSize == RegisterSize.Simd64)
             {
-                Operand zeroEvenMask = X86GetElements(context, _zeroMask, EvenMasks[op.Size]);
-                Operand zeroOddMask  = X86GetElements(context, _zeroMask, OddMasks [op.Size]);
+                Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
+                Operand zeroOddMask  = X86GetElements(context, ZeroMask, OddMasks [op.Size]);

                 Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n

diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index f1dd34630a..1f4f10c710 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -589,7 +589,7 @@ namespace ARMeilleure.Instructions
             EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m));
         }

-        public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
+        public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

@@ -597,16 +597,19 @@ namespace ARMeilleure.Instructions
             Operand m = GetVecA32(op.Qm);
             Operand d = GetVecA32(op.Qd);

+            if (side == -1) side = op.Vd;
+
             if (!op.Q) //register swap: move relevant doubleword to destination side
             {
-                n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
-                m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd);
+                n = EmitSwapDoubleWordToSide(context, n, op.Vn, side);
+                m = EmitSwapDoubleWordToSide(context, m, op.Vm, side);
             }

             Operand res = vectorFunc(n, m);

             if (!op.Q) //register insert
             {
+                if (side != op.Vd) res = EmitSwapDoubleWordToSide(context, res, side, op.Vd);
                 res = EmitDoubleWordInsert(context, d, res, op.Vd);
             }

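On the new side parameter: A32 D registers alias the Q (XMM) register file, so a 64-bit operation really works on one half of an XMM register. EmitVectorBinaryOpSimd32 normally stages the inputs on the destination's half; the pairwise helpers added below pass side: 0 so their shuffles can assume the low half, and the result is swapped back to op.Vd's half before the doubleword insert. The register mapping this relies on, as a sketch (side is passed around as a register number whose low bit names the doubleword):

    // D register d lives in XMM register (d >> 1), doubleword (d & 1).
    // The pairwise helpers pass side 0, i.e. they build their result in the low doubleword.
    private static (int xmm, int doubleword) MapDRegister(int d)
    {
        return (d >> 1, d & 1);
    }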
@@ -771,6 +774,169 @@ namespace ARMeilleure.Instructions
             });
         }

+        // By Scalar
+
+        public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Operand n = GetVecA32(op.Qn);
+            Operand d = GetVecA32(op.Qd);
+
+            int index = op.Vm & 3;
+            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
+            Operand m = GetVecA32(op.Vm >> 2);
+            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
+
+            if (!op.Q) //register swap: move relevant doubleword to destination side
+            {
+                n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
+            }
+
+            Operand res = vectorFunc(n, m);
+
+            if (!op.Q) //register insert
+            {
+                res = EmitDoubleWordInsert(context, d, res, op.Vd);
+            }
+
+            context.Copy(d, res);
+        }
+
+        public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
+            EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
+        }
+
+        public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Operand n = GetVecA32(op.Qn);
+            Operand d = GetVecA32(op.Qd);
+            Operand initialD = d;
+
+            int index = op.Vm & 3;
+            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
+            Operand m = GetVecA32(op.Vm >> 2);
+            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
+
+            if (!op.Q) //register swap: move relevant doubleword to destination side
+            {
+                n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
+            }
+
+            Operand res = vectorFunc(d, n, m);
+
+            if (!op.Q) //register insert
+            {
+                res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
+            }
+
+            context.Copy(initialD, res);
+        }
+
+        public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
+            Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;
+
+            EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
+            {
+                Operand res = context.AddIntrinsic(inst1, n, m);
+                return context.AddIntrinsic(inst2, d, res);
+            });
+        }
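The scalar operand in the helpers above is broadcast with SHUFPS: index = op.Vm & 3 is the 32-bit lane inside the XMM register op.Vm >> 2, and the dupeMask immediate repeats that 2-bit lane index in all four selector fields. A worked example of the immediate arithmetic, illustrative only:

    // SHUFPS immediate: four 2-bit fields, one per destination lane.
    // Repeating the lane index broadcasts it, e.g. index 3 gives 0b11_11_11_11 = 0xFF,
    // which copies lane 3 of the source into every lane of the result.
    private static byte BroadcastImmediate(int index)
    {
        return (byte)((index << 6) | (index << 4) | (index << 2) | index);
    }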
+
+        // Pairwise
+
+        public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            EmitVectorBinaryOpSimd32(context, (n, m) =>
+            {
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);
+
+                        Operand zero = context.VectorZero();
+
+                        Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero);
+                        Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck);
+
+                        return context.AddIntrinsic(inst32, part0, part1);
+                    }
+                    else /* if (op.RegisterSize == RegisterSize.Simd128) */
+                    {
+                        const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0;
+                        const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0;
+
+                        Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm0));
+                        Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm1));
+
+                        return context.AddIntrinsic(inst32, part0, part1);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, n, m);
+                    Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, n, m);
+
+                    return context.AddIntrinsic(inst64, part0, part1);
+                }
+            }, 0);
+        }
+
+        public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            EmitVectorBinaryOpSimd32(context, (n, m) =>
+            {
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
+                    Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks[op.Size]);
+
+                    Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
+
+                    Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
+                    Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd from m:n
+
+                    return context.AddIntrinsic(inst[op.Size], left, right);
+                }
+                else if (op.Size < 3)
+                {
+                    Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);
+
+                    Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
+                    Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m
+
+                    Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
+                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);
+
+                    return context.AddIntrinsic(inst[op.Size], left, right);
+                }
+                else
+                {
+                    Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
+                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);
+
+                    return context.AddIntrinsic(inst[3], left, right);
+                }
+            }, 0);
+        }
+
         // Generic Functions

         public static Operand EmitSoftFloatCallDefaultFpscr(