Intrinsics for more Arithmetic instructions.

This commit is contained in:
riperiperi 2020-01-22 01:03:28 +00:00
parent 1d8c595cd4
commit 6784041926
3 changed files with 437 additions and 66 deletions

View file

@ -16,6 +16,7 @@ namespace ARMeilleure.Instructions
public static void Vabs_S(ArmEmitterContext context)
{
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarUnaryOpSimd32(context, (m) =>
@ -36,7 +37,6 @@ namespace ARMeilleure.Instructions
{
EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1));
}
}
public static void Vabs_V(ArmEmitterContext context)
@ -113,7 +113,15 @@ namespace ARMeilleure.Instructions
public static void Vadd_I(ArmEmitterContext context)
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
if (Optimizations.UseSse2)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
}
}
public static void Vdup(ArmEmitterContext context)
@ -223,6 +231,7 @@ namespace ARMeilleure.Instructions
public static void Vneg_S(ArmEmitterContext context)
{
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
if (Optimizations.UseSse2)
{
EmitScalarUnaryOpSimd32(context, (m) =>
@ -248,6 +257,7 @@ namespace ARMeilleure.Instructions
public static void Vnmul_S(ArmEmitterContext context)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
if (Optimizations.UseSse2)
{
EmitScalarBinaryOpSimd32(context, (n, m) =>
@ -275,6 +285,7 @@ namespace ARMeilleure.Instructions
public static void Vnmla_S(ArmEmitterContext context)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@ -314,6 +325,7 @@ namespace ARMeilleure.Instructions
public static void Vnmls_S(ArmEmitterContext context)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@ -472,11 +484,25 @@ namespace ARMeilleure.Instructions
if (op.U)
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
}
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
}
}
}
@ -501,11 +527,25 @@ namespace ARMeilleure.Instructions
if (op.U)
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
}
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
if (Optimizations.UseSse2)
{
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
}
}
}
@ -559,7 +599,11 @@ namespace ARMeilleure.Instructions
if (op.F)
{
if (Optimizations.FastFP)
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
}
else if (Optimizations.FastFP)
{
EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
}
@ -626,7 +670,11 @@ namespace ARMeilleure.Instructions
if (op.F)
{
if (Optimizations.FastFP)
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else if (Optimizations.FastFP)
{
EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
}
@ -693,7 +741,11 @@ namespace ARMeilleure.Instructions
if (op.F)
{
if (Optimizations.FastFP)
if (Optimizations.FastFP && Optimizations.UseSse2)
{
EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
}
else if (Optimizations.FastFP)
{
EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
}
@ -710,58 +762,118 @@ namespace ARMeilleure.Instructions
public static void Vpadd_V(ArmEmitterContext context)
{
EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
if (Optimizations.FastFP && Optimizations.UseSse2 && false)
{
EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
}
else
{
EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
}
}
public static void Vpadd_I(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
if (Optimizations.UseSsse3)
{
EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction);
}
else
{
EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
}
}
public static void Vrev(ArmEmitterContext context)
{
OpCode32Simd op = (OpCode32Simd)context.CurrOp;
EmitVectorUnaryOpZx32(context, (op1) =>
OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;
if (Optimizations.UseSsse3)
{
switch (op.Opc)
EmitVectorUnaryOpSimd32(context, (op1) =>
{
case 0:
switch (op.Size) // Swap bytes.
{
default:
return op1;
case 1:
return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
case 2:
case 3:
return context.ByteSwap(op1);
}
case 1:
switch (op.Size)
{
default:
return op1;
case 2:
return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
case 3:
return context.BitwiseOr(
context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
}
case 2:
// Swap upper and lower halves.
return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
}
Operand mask;
switch (op.Size)
{
case 3:
// rev64
switch (op.Opc)
{
case 0:
mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
case 1:
mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
case 2:
return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
}
break;
case 2:
// rev32
switch (op.Opc)
{
case 0:
mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
case 1:
mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
}
break;
case 1:
// rev16
mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x_0607_0405_0203_0001L);
return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
}
return op1;
});
throw new InvalidOperationException("Unknown VREV Opcode+Size combo.");
});
}
else
{
EmitVectorUnaryOpZx32(context, (op1) =>
{
switch (op.Opc)
{
case 0:
switch (op.Size) // Swap bytes.
{
default:
return op1;
case 1:
return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
case 2:
case 3:
return context.ByteSwap(op1);
}
case 1:
switch (op.Size)
{
default:
return op1;
case 2:
return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
case 3:
return context.BitwiseOr(
context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
}
case 2:
// Swap upper and lower halves.
return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
}
return op1;
});
}
}
public static void Vrecpe(ArmEmitterContext context)
@ -772,7 +884,7 @@ namespace ARMeilleure.Instructions
{
int sizeF = op.Size & 1;
if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
{
EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0);
}
@ -792,10 +904,38 @@ namespace ARMeilleure.Instructions
public static void Vrecps(ArmEmitterContext context)
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
if (Optimizations.FastFP && Optimizations.UseSse2)
{
return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
});
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
bool single = (op.Size & 1) == 0;
// (2 - (n*m))
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
if (single)
{
Operand maskTwo = X86GetAllElements(context, 2f);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res);
}
else
{
Operand maskTwo = X86GetAllElements(context, 2d);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res);
}
});
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
});
}
}
public static void Vrsqrte(ArmEmitterContext context)
@ -806,7 +946,7 @@ namespace ARMeilleure.Instructions
{
int sizeF = op.Size & 1;
if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
{
EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0);
}
@ -826,10 +966,42 @@ namespace ARMeilleure.Instructions
public static void Vrsqrts(ArmEmitterContext context)
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
if (Optimizations.FastFP && Optimizations.UseSse2)
{
return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
});
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
bool single = (op.Size & 1) == 0;
// (3 - (n*m)) / 2
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
if (single)
{
Operand maskHalf = X86GetAllElements(context, 0.5f);
Operand maskThree = X86GetAllElements(context, 3f);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
}
else
{
Operand maskHalf = X86GetAllElements(context, 0.5d);
Operand maskThree = X86GetAllElements(context, 3d);
Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
}
});
}
else
{
EmitVectorBinaryOpF32(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
});
}
}
public static void Vsel(ArmEmitterContext context)
@ -853,10 +1025,35 @@ namespace ARMeilleure.Instructions
break;
}
EmitScalarBinaryOpI32(context, (op1, op2) =>
if (false && Optimizations.UseSse2)
{
return context.ConditionalSelect(condition, op1, op2);
});
Operand falseLabel = Label();
Operand doneLabel = Label();
context.BranchIfFalse(condition, falseLabel);
EmitScalarBinaryOpSimd32(context, (op1, op2) =>
{
return op1;
});
context.Branch(doneLabel);
context.MarkLabel(falseLabel);
EmitScalarBinaryOpSimd32(context, (op1, op2) =>
{
return op2;
});
context.MarkLabel(doneLabel);
}
else
{
EmitScalarBinaryOpI32(context, (op1, op2) =>
{
return context.ConditionalSelect(condition, op1, op2);
});
}
}
public static void Vsqrt_S(ArmEmitterContext context)
@ -900,7 +1097,15 @@ namespace ARMeilleure.Instructions
public static void Vsub_I(ArmEmitterContext context)
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
if (Optimizations.UseSse2)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2));
}
else
{
EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
}
}
private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)

View file

@ -31,7 +31,7 @@ namespace ARMeilleure.Instructions
15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // S
};
private static readonly long _zeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
#endregion
#region "X86 SSE Intrinsics"
@ -1026,8 +1026,8 @@ namespace ARMeilleure.Instructions
if (op.RegisterSize == RegisterSize.Simd64)
{
Operand zeroEvenMask = X86GetElements(context, _zeroMask, EvenMasks[op.Size]);
Operand zeroOddMask = X86GetElements(context, _zeroMask, OddMasks [op.Size]);
Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks [op.Size]);
Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n

View file

@ -589,7 +589,7 @@ namespace ARMeilleure.Instructions
EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m));
}
public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
@ -597,16 +597,19 @@ namespace ARMeilleure.Instructions
Operand m = GetVecA32(op.Qm);
Operand d = GetVecA32(op.Qd);
if (side == -1) side = op.Vd;
if (!op.Q) //register swap: move relevant doubleword to destination side
{
n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd);
n = EmitSwapDoubleWordToSide(context, n, op.Vn, side);
m = EmitSwapDoubleWordToSide(context, m, op.Vm, side);
}
Operand res = vectorFunc(n, m);
if (!op.Q) //register insert
{
if (side != op.Vd) EmitSwapDoubleWordToSide(context, m, side, op.Vd);
res = EmitDoubleWordInsert(context, d, res, op.Vd);
}
@ -771,6 +774,169 @@ namespace ARMeilleure.Instructions
});
}
// By Scalar
public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
Operand n = GetVecA32(op.Qn);
Operand d = GetVecA32(op.Qd);
int index = op.Vm & 3;
int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
Operand m = GetVecA32(op.Vm >> 2);
m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
if (!op.Q) //register swap: move relevant doubleword to destination side
{
n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
}
Operand res = vectorFunc(n, m);
if (!op.Q) //register insert
{
res = EmitDoubleWordInsert(context, d, res, op.Vd);
}
context.Copy(d, res);
}
public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
}
public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
Operand n = GetVecA32(op.Qn);
Operand d = GetVecA32(op.Qd);
Operand initialD = d;
int index = op.Vm & 3;
int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
Operand m = GetVecA32(op.Vm >> 2);
m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
if (!op.Q) //register swap: move relevant doubleword to destination side
{
n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
}
Operand res = vectorFunc(d, n, m);
if (!op.Q) //register insert
{
res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
}
context.Copy(initialD, res);
}
public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
{
OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;
EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
{
Operand res = context.AddIntrinsic(inst1, n, m);
return res = context.AddIntrinsic(inst2, d, res);
});
}
// Pairwise
public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
int sizeF = op.Size & 1;
if (sizeF == 0)
{
if (op.RegisterSize == RegisterSize.Simd64)
{
Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);
Operand zero = context.VectorZero();
Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero);
Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck);
return context.AddIntrinsic(inst32, part0, part1);
}
else /* if (op.RegisterSize == RegisterSize.Simd128) */
{
const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0;
const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0;
Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm0));
Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm1));
return context.AddIntrinsic(inst32, part0, part1);
}
}
else /* if (sizeF == 1) */
{
Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, n, m);
Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, n, m);
return context.AddIntrinsic(inst64, part0, part1);
}
}, 0);
}
public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
EmitVectorBinaryOpSimd32(context, (n, m) =>
{
if (op.RegisterSize == RegisterSize.Simd64)
{
Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks[op.Size]);
Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd from m:n
return context.AddIntrinsic(inst[op.Size], left, right);
}
else if (op.Size < 3)
{
Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);
Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m
Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);
return context.AddIntrinsic(inst[op.Size], left, right);
}
else
{
Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);
return context.AddIntrinsic(inst[3], left, right);
}
}, 0);
}
// Generic Functions
public static Operand EmitSoftFloatCallDefaultFpscr(