Intrinsics for more Arithmetic instructions.
parent 1d8c595cd4
commit 6784041926

3 changed files with 437 additions and 66 deletions
@@ -16,6 +16,7 @@ namespace ARMeilleure.Instructions
 public static void Vabs_S(ArmEmitterContext context)
 {
+    OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

     if (Optimizations.FastFP && Optimizations.UseSse2)
     {
         EmitScalarUnaryOpSimd32(context, (m) =>
@@ -36,7 +37,6 @@ namespace ARMeilleure.Instructions
     {
         EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1));
     }
-
 }

 public static void Vabs_V(ArmEmitterContext context)
@@ -113,7 +113,15 @@ namespace ARMeilleure.Instructions

 public static void Vadd_I(ArmEmitterContext context)
 {
-    EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
+    if (Optimizations.UseSse2)
+    {
+        OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+        EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2));
+    }
+    else
+    {
+        EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
+    }
 }

 public static void Vdup(ArmEmitterContext context)
@@ -223,6 +231,7 @@ namespace ARMeilleure.Instructions
 public static void Vneg_S(ArmEmitterContext context)
 {
+    OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

     if (Optimizations.UseSse2)
     {
         EmitScalarUnaryOpSimd32(context, (m) =>
@@ -248,6 +257,7 @@ namespace ARMeilleure.Instructions
 public static void Vnmul_S(ArmEmitterContext context)
 {
+    OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

     if (Optimizations.UseSse2)
     {
         EmitScalarBinaryOpSimd32(context, (n, m) =>
@@ -275,6 +285,7 @@ namespace ARMeilleure.Instructions
 public static void Vnmla_S(ArmEmitterContext context)
 {
+    OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

     if (Optimizations.FastFP && Optimizations.UseSse2)
     {
         EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@@ -314,6 +325,7 @@ namespace ARMeilleure.Instructions
 public static void Vnmls_S(ArmEmitterContext context)
 {
+    OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

     if (Optimizations.FastFP && Optimizations.UseSse2)
     {
         EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@@ -472,11 +484,25 @@ namespace ARMeilleure.Instructions

     if (op.U)
     {
-        EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
+        if (Optimizations.UseSse2)
+        {
+            EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2));
+        }
+        else
+        {
+            EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
+        }
     }
     else
     {
-        EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
+        if (Optimizations.UseSse2)
+        {
+            EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2));
+        }
+        else
+        {
+            EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
+        }
     }
 }

@@ -501,11 +527,25 @@ namespace ARMeilleure.Instructions

     if (op.U)
     {
-        EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
+        if (Optimizations.UseSse2)
+        {
+            EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
+        }
+        else
+        {
+            EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
+        }
     }
     else
     {
-        EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
+        if (Optimizations.UseSse2)
+        {
+            EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminsInstruction[op.Size], op1, op2));
+        }
+        else
+        {
+            EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
+        }
     }
 }

@@ -559,7 +599,11 @@ namespace ARMeilleure.Instructions

     if (op.F)
     {
-        if (Optimizations.FastFP)
+        if (Optimizations.FastFP && Optimizations.UseSse2)
         {
+            EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+        }
+        else if (Optimizations.FastFP)
+        {
             EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
         }
@@ -626,7 +670,11 @@ namespace ARMeilleure.Instructions

     if (op.F)
     {
-        if (Optimizations.FastFP)
+        if (Optimizations.FastFP && Optimizations.UseSse2)
         {
+            EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+        }
+        else if (Optimizations.FastFP)
+        {
             EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
         }
@@ -693,7 +741,11 @@ namespace ARMeilleure.Instructions

     if (op.F)
     {
-        if (Optimizations.FastFP)
+        if (Optimizations.FastFP && Optimizations.UseSse2)
         {
+            EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+        }
+        else if (Optimizations.FastFP)
+        {
             EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
         }
@@ -710,58 +762,118 @@ namespace ARMeilleure.Instructions

 public static void Vpadd_V(ArmEmitterContext context)
 {
-    EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+    if (Optimizations.FastFP && Optimizations.UseSse2 && false)
+    {
+        EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+    }
+    else
+    {
+        EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+    }
 }

 public static void Vpadd_I(ArmEmitterContext context)
 {
     OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

-    EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
+    if (Optimizations.UseSsse3)
+    {
+        EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction);
+    }
+    else
+    {
+        EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
+    }
 }

 public static void Vrev(ArmEmitterContext context)
 {
-    OpCode32Simd op = (OpCode32Simd)context.CurrOp;
-
-    EmitVectorUnaryOpZx32(context, (op1) =>
-    {
-        switch (op.Opc)
-        {
-            case 0:
-                switch (op.Size) // Swap bytes.
-                {
-                    default:
-                        return op1;
-                    case 1:
-                        return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
-                    case 2:
-                    case 3:
-                        return context.ByteSwap(op1);
-                }
-            case 1:
-                switch (op.Size)
-                {
-                    default:
-                        return op1;
-                    case 2:
-                        return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
-                                                 context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
-                    case 3:
-                        return context.BitwiseOr(
-                            context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
-                                              context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
-                            context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
-                                              context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
-                }
-            case 2:
-                // Swap upper and lower halves.
-                return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
-                                         context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
-        }
-
-        return op1;
-    });
+    OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;
+
+    if (Optimizations.UseSsse3)
+    {
+        EmitVectorUnaryOpSimd32(context, (op1) =>
+        {
+            Operand mask;
+            switch (op.Size)
+            {
+                case 3:
+                    // rev64
+                    switch (op.Opc)
+                    {
+                        case 0:
+                            mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
+                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                        case 1:
+                            mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
+                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                        case 2:
+                            return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
+                    }
+                    break;
+                case 2:
+                    // rev32
+                    switch (op.Opc)
+                    {
+                        case 0:
+                            mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
+                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                        case 1:
+                            mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
+                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                    }
+                    break;
+                case 1:
+                    // rev16
+                    mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x_0607_0405_0203_0001L);
+                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+            }
+
+            throw new InvalidOperationException("Unknown VREV Opcode+Size combo.");
+        });
+    }
+    else
+    {
+        EmitVectorUnaryOpZx32(context, (op1) =>
+        {
+            switch (op.Opc)
+            {
+                case 0:
+                    switch (op.Size) // Swap bytes.
+                    {
+                        default:
+                            return op1;
+                        case 1:
+                            return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
+                        case 2:
+                        case 3:
+                            return context.ByteSwap(op1);
+                    }
+                case 1:
+                    switch (op.Size)
+                    {
+                        default:
+                            return op1;
+                        case 2:
+                            return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
+                                                     context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
+                        case 3:
+                            return context.BitwiseOr(
+                                context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
+                                                  context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
+                                context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
+                                                  context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
+                    }
+                case 2:
+                    // Swap upper and lower halves.
+                    return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
+                                             context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
+            }
+
+            return op1;
+        });
+    }
 }

 public static void Vrecpe(ArmEmitterContext context)
@@ -772,7 +884,7 @@ namespace ARMeilleure.Instructions
 {
     int sizeF = op.Size & 1;

-    if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+    if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
     {
         EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0);
     }
@@ -792,10 +904,38 @@ namespace ARMeilleure.Instructions

 public static void Vrecps(ArmEmitterContext context)
 {
-    EmitVectorBinaryOpF32(context, (op1, op2) =>
-    {
-        return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
-    });
+    if (Optimizations.FastFP && Optimizations.UseSse2)
+    {
+        OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+        bool single = (op.Size & 1) == 0;
+        // (2 - (n*m))
+        EmitVectorBinaryOpSimd32(context, (n, m) =>
+        {
+            if (single)
+            {
+                Operand maskTwo = X86GetAllElements(context, 2f);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+
+                return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res);
+            }
+            else
+            {
+                Operand maskTwo = X86GetAllElements(context, 2d);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+
+                return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res);
+            }
+        });
+    }
+    else
+    {
+        EmitVectorBinaryOpF32(context, (op1, op2) =>
+        {
+            return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
+        });
+    }
 }

 public static void Vrsqrte(ArmEmitterContext context)
@@ -806,7 +946,7 @@ namespace ARMeilleure.Instructions
 {
     int sizeF = op.Size & 1;

-    if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+    if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
     {
         EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0);
     }
@@ -826,10 +966,42 @@ namespace ARMeilleure.Instructions

 public static void Vrsqrts(ArmEmitterContext context)
 {
-    EmitVectorBinaryOpF32(context, (op1, op2) =>
-    {
-        return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
-    });
+    if (Optimizations.FastFP && Optimizations.UseSse2)
+    {
+        OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+        bool single = (op.Size & 1) == 0;
+        // (3 - (n*m)) / 2
+        EmitVectorBinaryOpSimd32(context, (n, m) =>
+        {
+            if (single)
+            {
+                Operand maskHalf = X86GetAllElements(context, 0.5f);
+                Operand maskThree = X86GetAllElements(context, 3f);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+
+                res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
+                return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
+            }
+            else
+            {
+                Operand maskHalf = X86GetAllElements(context, 0.5d);
+                Operand maskThree = X86GetAllElements(context, 3d);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+
+                res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
+                return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
+            }
+        });
+    }
+    else
+    {
+        EmitVectorBinaryOpF32(context, (op1, op2) =>
+        {
+            return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
+        });
+    }
 }

 public static void Vsel(ArmEmitterContext context)
@@ -853,10 +1025,35 @@ namespace ARMeilleure.Instructions
             break;
     }

-    EmitScalarBinaryOpI32(context, (op1, op2) =>
-    {
-        return context.ConditionalSelect(condition, op1, op2);
-    });
+    if (false && Optimizations.UseSse2)
+    {
+        Operand falseLabel = Label();
+        Operand doneLabel = Label();
+
+        context.BranchIfFalse(condition, falseLabel);
+
+        EmitScalarBinaryOpSimd32(context, (op1, op2) =>
+        {
+            return op1;
+        });
+
+        context.Branch(doneLabel);
+        context.MarkLabel(falseLabel);
+
+        EmitScalarBinaryOpSimd32(context, (op1, op2) =>
+        {
+            return op2;
+        });
+
+        context.MarkLabel(doneLabel);
+    }
+    else
+    {
+        EmitScalarBinaryOpI32(context, (op1, op2) =>
+        {
+            return context.ConditionalSelect(condition, op1, op2);
+        });
+    }
 }

 public static void Vsqrt_S(ArmEmitterContext context)
@@ -900,7 +1097,15 @@ namespace ARMeilleure.Instructions

 public static void Vsub_I(ArmEmitterContext context)
 {
-    EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
+    if (Optimizations.UseSse2)
+    {
+        OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+        EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2));
+    }
+    else
+    {
+        EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
+    }
 }

 private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)
@@ -31,7 +31,7 @@ namespace ARMeilleure.Instructions
     15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // S
 };

-private static readonly long _zeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
+public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
 #endregion

 #region "X86 SSE Intrinsics"

@@ -1026,8 +1026,8 @@ namespace ARMeilleure.Instructions

     if (op.RegisterSize == RegisterSize.Simd64)
     {
-        Operand zeroEvenMask = X86GetElements(context, _zeroMask, EvenMasks[op.Size]);
-        Operand zeroOddMask = X86GetElements(context, _zeroMask, OddMasks [op.Size]);
+        Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
+        Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks [op.Size]);

         Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n

@@ -589,7 +589,7 @@ namespace ARMeilleure.Instructions
     EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m));
 }

-public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
+public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1)
 {
     OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

@@ -597,16 +597,19 @@ namespace ARMeilleure.Instructions
     Operand m = GetVecA32(op.Qm);
     Operand d = GetVecA32(op.Qd);

+    if (side == -1) side = op.Vd;
+
     if (!op.Q) //register swap: move relevant doubleword to destination side
     {
-        n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
-        m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd);
+        n = EmitSwapDoubleWordToSide(context, n, op.Vn, side);
+        m = EmitSwapDoubleWordToSide(context, m, op.Vm, side);
     }

     Operand res = vectorFunc(n, m);

     if (!op.Q) //register insert
     {
+        if (side != op.Vd) EmitSwapDoubleWordToSide(context, m, side, op.Vd);
         res = EmitDoubleWordInsert(context, d, res, op.Vd);
     }

@@ -771,6 +774,169 @@ namespace ARMeilleure.Instructions
     });
 }

+// By Scalar
+
+public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
+{
+    OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+    Operand n = GetVecA32(op.Qn);
+    Operand d = GetVecA32(op.Qd);
+
+    int index = op.Vm & 3;
+    int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
+    Operand m = GetVecA32(op.Vm >> 2);
+    m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
+
+    if (!op.Q) //register swap: move relevant doubleword to destination side
+    {
+        n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
+    }
+
+    Operand res = vectorFunc(n, m);
+
+    if (!op.Q) //register insert
+    {
+        res = EmitDoubleWordInsert(context, d, res, op.Vd);
+    }
+
+    context.Copy(d, res);
+}
+
+public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+{
+    OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+    Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
+    EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
+}
+
+public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
+{
+    OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+    Operand n = GetVecA32(op.Qn);
+    Operand d = GetVecA32(op.Qd);
+    Operand initialD = d;
+
+    int index = op.Vm & 3;
+    int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
+    Operand m = GetVecA32(op.Vm >> 2);
+    m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
+
+    if (!op.Q) //register swap: move relevant doubleword to destination side
+    {
+        n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
+    }
+
+    Operand res = vectorFunc(d, n, m);
+
+    if (!op.Q) //register insert
+    {
+        res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
+    }
+
+    context.Copy(initialD, res);
+}
+
+public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
+{
+    OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+    Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
+    Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;
+
+    EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
+    {
+        Operand res = context.AddIntrinsic(inst1, n, m);
+        return res = context.AddIntrinsic(inst2, d, res);
+    });
+}
+
+// Pairwise
+
+public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+{
+    OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+    EmitVectorBinaryOpSimd32(context, (n, m) =>
+    {
+        int sizeF = op.Size & 1;
+
+        if (sizeF == 0)
+        {
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);
+
+                Operand zero = context.VectorZero();
+
+                Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero);
+                Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck);
+
+                return context.AddIntrinsic(inst32, part0, part1);
+            }
+            else /* if (op.RegisterSize == RegisterSize.Simd128) */
+            {
+                const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0;
+                const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0;
+
+                Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm0));
+                Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm1));
+
+                return context.AddIntrinsic(inst32, part0, part1);
+            }
+        }
+        else /* if (sizeF == 1) */
+        {
+            Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, n, m);
+            Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, n, m);
+
+            return context.AddIntrinsic(inst64, part0, part1);
+        }
+    }, 0);
+}
+
+public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst)
+{
+    OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+    EmitVectorBinaryOpSimd32(context, (n, m) =>
+    {
+        if (op.RegisterSize == RegisterSize.Simd64)
+        {
+            Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
+            Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks[op.Size]);
+
+            Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
+
+            Operand left = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
+            Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd from m:n
+
+            return context.AddIntrinsic(inst[op.Size], left, right);
+        }
+        else if (op.Size < 3)
+        {
+            Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);
+
+            Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
+            Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m
+
+            Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
+            Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);
+
+            return context.AddIntrinsic(inst[op.Size], left, right);
+        }
+        else
+        {
+            Operand left = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
+            Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);
+
+            return context.AddIntrinsic(inst[3], left, right);
+        }
+    }, 0);
+}
+
 // Generic Functions

 public static Operand EmitSoftFloatCallDefaultFpscr(
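A side note on the new Vrecps and Vrsqrts fast paths above: with FastFP and SSE2 enabled they evaluate the reciprocal-step and reciprocal-square-root-step formulas directly per lane, 2 - (n*m) and (3 - (n*m)) / 2, instead of calling the SoftFloat32.FPRecipStep / FPRSqrtStep helpers (which also cover the architected special cases). The snippet below is an illustrative scalar C# sketch of what each packed sequence computes; StepSketch, RecipStep and RSqrtStep are made-up names, not part of ARMeilleure.

using System;

static class StepSketch
{
    // What the SSE2 path of Vrecps computes per lane: 2 - (n * m).
    static float RecipStep(float n, float m) => 2.0f - (n * m);

    // What the SSE2 path of Vrsqrts computes per lane: (3 - (n * m)) / 2.
    static float RSqrtStep(float n, float m) => (3.0f - (n * m)) / 2.0f;

    static void Main()
    {
        // Typical Newton-Raphson use: x * RecipStep(d, x) refines x toward 1/d.
        float d = 3.0f;
        float x = 0.3f; // rough initial estimate of 1/3
        for (int i = 0; i < 3; i++)
        {
            x *= RecipStep(d, x);
        }
        Console.WriteLine(x); // ~0.3333333

        // Likewise, x * RSqrtStep(d * x, x) refines x toward 1/sqrt(d).
        float y = 0.5f; // rough initial estimate of 1/sqrt(3)
        for (int i = 0; i < 3; i++)
        {
            y *= RSqrtStep(d * y, y);
        }
        Console.WriteLine(y); // ~0.5773503
    }
}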
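Similarly, the pairwise helpers added at the end (EmitSse2VectorPairwiseOpF32 and EmitSsse3VectorPairwiseOp32) use shuffles to split the concatenation of n and m into its even-indexed and odd-indexed elements and then apply the operation lane by lane, which is the usual decomposition of an ARM pairwise op such as VPADD. A small scalar C# sketch of that decomposition; Pairwise and PairwiseSketch are illustrative names, not from the codebase.

using System;

static class PairwiseSketch
{
    // Pairwise op over the concatenation n ++ m: out[i] = op(in[2*i], in[2*i + 1]).
    static int[] Pairwise(int[] n, int[] m, Func<int, int, int> op)
    {
        int[] cat = new int[n.Length + m.Length];
        n.CopyTo(cat, 0);
        m.CopyTo(cat, n.Length);

        // Equivalent to the shuffle-based path: gather even-indexed and
        // odd-indexed elements, then combine them element by element.
        int[] res = new int[cat.Length / 2];
        for (int i = 0; i < res.Length; i++)
        {
            res[i] = op(cat[2 * i], cat[2 * i + 1]);
        }
        return res;
    }

    static void Main()
    {
        int[] n = { 1, 2, 3, 4 };
        int[] m = { 10, 20, 30, 40 };
        Console.WriteLine(string.Join(", ", Pairwise(n, m, (a, b) => a + b)));
        // Prints: 3, 7, 30, 70
    }
}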