Intrinsics for more Arithmetic instructions.
parent 1d8c595cd4
commit 6784041926
3 changed files with 437 additions and 66 deletions
@@ -16,6 +16,7 @@ namespace ARMeilleure.Instructions
         public static void Vabs_S(ArmEmitterContext context)
         {
             OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
+
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarUnaryOpSimd32(context, (m) =>
@@ -36,7 +37,6 @@ namespace ARMeilleure.Instructions
             {
                 EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1));
             }
-
         }
 
         public static void Vabs_V(ArmEmitterContext context)
@@ -113,7 +113,15 @@ namespace ARMeilleure.Instructions
 
         public static void Vadd_I(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2));
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
+            }
         }
 
         public static void Vdup(ArmEmitterContext context)
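Context for the hunk above: X86PaddInstruction is indexed by the instruction's element-size field, so one emitter line covers every NEON arrangement of VADD. The table itself is not part of this diff; a sketch of its presumed shape, inferred from the usage here:

    // Presumed per-size lookup (op.Size 0..3): packed add at byte, word,
    // doubleword and quadword granularity.
    public static readonly Intrinsic[] X86PaddInstruction = new Intrinsic[]
    {
        Intrinsic.X86Paddb, // Size == 0: 16 x i8
        Intrinsic.X86Paddw, // Size == 1:  8 x i16
        Intrinsic.X86Paddd, // Size == 2:  4 x i32
        Intrinsic.X86Paddq  // Size == 3:  2 x i64
    };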
@@ -223,6 +231,7 @@ namespace ARMeilleure.Instructions
         public static void Vneg_S(ArmEmitterContext context)
         {
             OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
+
             if (Optimizations.UseSse2)
             {
                 EmitScalarUnaryOpSimd32(context, (m) =>
@@ -248,6 +257,7 @@ namespace ARMeilleure.Instructions
         public static void Vnmul_S(ArmEmitterContext context)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
+
             if (Optimizations.UseSse2)
             {
                 EmitScalarBinaryOpSimd32(context, (n, m) =>
@@ -275,6 +285,7 @@ namespace ARMeilleure.Instructions
         public static void Vnmla_S(ArmEmitterContext context)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
+
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@@ -314,6 +325,7 @@ namespace ARMeilleure.Instructions
         public static void Vnmls_S(ArmEmitterContext context)
         {
             OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
+
             if (Optimizations.FastFP && Optimizations.UseSse2)
             {
                 EmitScalarTernaryOpSimd32(context, (d, n, m) =>
@@ -472,11 +484,25 @@ namespace ARMeilleure.Instructions
 
             if (op.U)
             {
-                EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
+                }
             }
             else
             {
-                EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
+                }
             }
         }
 
@@ -501,11 +527,25 @@ namespace ARMeilleure.Instructions
 
             if (op.U)
             {
-                EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
+                }
             }
             else
             {
-                EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
+                if (Optimizations.UseSse2)
+                {
+                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
+                }
+                else
+                {
+                    EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
+                }
             }
         }
 
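Context for the two hunks above: the fallback path is the element-wise definition of VMAX/VMIN, where only the comparison differs between the unsigned and signed forms. A scalar model of that path (illustrative only, not from the commit):

    // Compare, then select; the ICompareGreaterUI / ICompareGreater split in
    // the emitters corresponds to the choice of comparison here.
    static ulong VmaxElem(ulong a, ulong b, bool unsigned)
    {
        bool greater = unsigned ? a > b : (long)a > (long)b;
        return greater ? a : b;
    }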
@@ -559,7 +599,11 @@ namespace ARMeilleure.Instructions
 
             if (op.F)
             {
-                if (Optimizations.FastFP)
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
+                }
+                else if (Optimizations.FastFP)
                 {
                     EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
                 }
@@ -626,7 +670,11 @@ namespace ARMeilleure.Instructions
 
             if (op.F)
             {
-                if (Optimizations.FastFP)
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+                }
+                else if (Optimizations.FastFP)
                 {
                     EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
                 }
@@ -693,7 +741,11 @@ namespace ARMeilleure.Instructions
 
             if (op.F)
             {
-                if (Optimizations.FastFP)
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
+                }
+                else if (Optimizations.FastFP)
                 {
                     EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
                 }
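Context for the three hunks above: the EmitVector(s)ByScalarOp* helpers introduced at the bottom of this diff broadcast the selected scalar element across the vector with SHUFPS before the packed multiply. A sketch of how that selector immediate works:

    // SHUFPS takes a two-bit source-lane index per destination lane; repeating
    // the same index in all four fields broadcasts that float lane. This is
    // the helpers' dupeMask: index 2 yields 0b10_10_10_10 == 0xAA.
    static int BroadcastSelector(int index)
    {
        return (index << 6) | (index << 4) | (index << 2) | index;
    }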
@@ -710,58 +762,118 @@ namespace ARMeilleure.Instructions
 
         public static void Vpadd_V(ArmEmitterContext context)
         {
-            EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+            if (Optimizations.FastFP && Optimizations.UseSse2 && false)
+            {
+                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
+            }
+            else
+            {
+                EmitVectorPairwiseOpF32(context, (op1, op2) => context.Add(op1, op2));
+            }
+
         }
 
         public static void Vpadd_I(ArmEmitterContext context)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 
-            EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
+            if (Optimizations.UseSsse3)
+            {
+                EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction);
+            }
+            else
+            {
+                EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
+            }
         }
 
         public static void Vrev(ArmEmitterContext context)
         {
-            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
+            OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;
 
-            EmitVectorUnaryOpZx32(context, (op1) =>
-            {
-                switch (op.Opc)
-                {
-                    case 0:
-                        switch (op.Size) // Swap bytes.
-                        {
-                            default:
-                                return op1;
-                            case 1:
-                                return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
-                            case 2:
-                            case 3:
-                                return context.ByteSwap(op1);
-                        }
-                    case 1:
-                        switch (op.Size)
-                        {
-                            default:
-                                return op1;
-                            case 2:
-                                return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
-                                                         context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
-                            case 3:
-                                return context.BitwiseOr(
-                                    context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
-                                                      context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
-                                    context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
-                                                      context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
-                        }
-                    case 2:
-                        // Swap upper and lower halves.
-                        return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
-                                                 context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
-                }
-
-                return op1;
-            });
+            if (Optimizations.UseSsse3)
+            {
+                EmitVectorUnaryOpSimd32(context, (op1) =>
+                {
+                    Operand mask;
+                    switch (op.Size)
+                    {
+                        case 3:
+                            // rev64
+                            switch (op.Opc)
+                            {
+                                case 0:
+                                    mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                                case 1:
+                                    mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                                case 2:
+                                    return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
+                            }
+                            break;
+                        case 2:
+                            // rev32
+                            switch (op.Opc)
+                            {
+                                case 0:
+                                    mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                                case 1:
+                                    mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
+                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                            }
+                            break;
+                        case 1:
+                            // rev16
+                            mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x_0607_0405_0203_0001L);
+                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
+                    }
+
+                    throw new InvalidOperationException("Unknown VREV Opcode+Size combo.");
+                });
+            }
+            else
+            {
+                EmitVectorUnaryOpZx32(context, (op1) =>
+                {
+                    switch (op.Opc)
+                    {
+                        case 0:
+                            switch (op.Size) // Swap bytes.
+                            {
+                                default:
+                                    return op1;
+                                case 1:
+                                    return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
+                                case 2:
+                                case 3:
+                                    return context.ByteSwap(op1);
+                            }
+                        case 1:
+                            switch (op.Size)
+                            {
+                                default:
+                                    return op1;
+                                case 2:
+                                    return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
+                                                             context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
+                                case 3:
+                                    return context.BitwiseOr(
+                                        context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
+                                                          context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
+                                        context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
+                                                          context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
+                            }
+                        case 2:
+                            // Swap upper and lower halves.
+                            return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
+                                                     context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
+                    }
+
+                    return op1;
+                });
+            }
         }
 
         public static void Vrecpe(ArmEmitterContext context)
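Context for the Vrev hunk above: each mask is a PSHUFB control vector built by X86GetElements (high quadword first, low quadword second). A plain-C# model of the shuffle under the usual PSHUFB semantics (illustrative, not from the commit):

    // dst[i] = src[mask[i] & 0x0F]; a set bit 7 in mask[i] would zero the
    // byte, but the VREV masks never use that. With the rev64/Opc==0 mask
    // (0x0001020304050607 low, 0x08090a0b0c0d0e0f high), the bytes of each
    // 64-bit half are reversed in place.
    static byte[] Pshufb(byte[] src, byte[] mask)
    {
        byte[] dst = new byte[16];
        for (int i = 0; i < 16; i++)
        {
            dst[i] = (mask[i] & 0x80) != 0 ? (byte)0 : src[mask[i] & 0x0F];
        }
        return dst;
    }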
@@ -772,7 +884,7 @@ namespace ARMeilleure.Instructions
         {
             int sizeF = op.Size & 1;
 
-            if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+            if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
             {
                 EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0);
             }
@@ -792,10 +904,38 @@ namespace ARMeilleure.Instructions
 
         public static void Vrecps(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpF32(context, (op1, op2) =>
-            {
-                return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
-            });
+            if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                bool single = (op.Size & 1) == 0;
+                // (2 - (n*m))
+                EmitVectorBinaryOpSimd32(context, (n, m) =>
+                {
+                    if (single)
+                    {
+                        Operand maskTwo = X86GetAllElements(context, 2f);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+
+                        return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res);
+                    }
+                    else
+                    {
+                        Operand maskTwo = X86GetAllElements(context, 2d);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+
+                        return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res);
+                    }
+                });
+            }
+            else
+            {
+                EmitVectorBinaryOpF32(context, (op1, op2) =>
+                {
+                    return EmitSoftFloatCall(context, SoftFloat32.FPRecipStep, SoftFloat64.FPRecipStep, op1, op2);
+                });
+            }
         }
 
         public static void Vrsqrte(ArmEmitterContext context)
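Context for the Vrecps hunk above: VRECPS is the correction factor of a Newton-Raphson reciprocal iteration, which the (2 - (n*m)) comment summarizes. A minimal scalar sketch of the math (hypothetical names; the 0 * infinity special cases are what the SoftFloat fallback still handles):

    // One refinement toward 1/d from an estimate x0 (e.g. from VRECPE):
    // x1 = x0 * (2 - d * x0). The fast path emits only the step factor;
    // the guest code supplies the surrounding multiplies.
    static float RecipStep(float n, float m) => 2.0f - n * m;

    static float RecipRefine(float d, float x0) => x0 * RecipStep(d, x0);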
@@ -806,7 +946,7 @@ namespace ARMeilleure.Instructions
         {
             int sizeF = op.Size & 1;
 
-            if (Optimizations.FastFP && Optimizations.UseSse && sizeF == 0)
+            if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
             {
                 EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0);
             }
@@ -826,10 +966,42 @@ namespace ARMeilleure.Instructions
 
         public static void Vrsqrts(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpF32(context, (op1, op2) =>
-            {
-                return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
-            });
+            if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                bool single = (op.Size & 1) == 0;
+                // (3 - (n*m)) / 2
+                EmitVectorBinaryOpSimd32(context, (n, m) =>
+                {
+                    if (single)
+                    {
+                        Operand maskHalf = X86GetAllElements(context, 0.5f);
+                        Operand maskThree = X86GetAllElements(context, 3f);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
+                        return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
+                    }
+                    else
+                    {
+                        Operand maskHalf = X86GetAllElements(context, 0.5d);
+                        Operand maskThree = X86GetAllElements(context, 3d);
+
+                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
+
+                        res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
+                        return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
+                    }
+                });
+            }
+            else
+            {
+                EmitVectorBinaryOpF32(context, (op1, op2) =>
+                {
+                    return EmitSoftFloatCall(context, SoftFloat32.FPRSqrtStep, SoftFloat64.FPRSqrtStep, op1, op2);
+                });
+            }
         }
 
         public static void Vsel(ArmEmitterContext context)
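Context for the Vrsqrts hunk above: the same idea for reciprocal square root, with step (3 - (n*m)) / 2. A minimal scalar sketch (hypothetical names; special cases again left to the SoftFloat path):

    // One refinement toward 1/sqrt(d) from an estimate x0 (e.g. from VRSQRTE):
    // x1 = x0 * (3 - d * x0 * x0) / 2, where the guest passes n = d * x0, m = x0.
    static float RSqrtStep(float n, float m) => (3.0f - n * m) * 0.5f;

    static float RSqrtRefine(float d, float x0) => x0 * RSqrtStep(d * x0, x0);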
@@ -853,10 +1025,35 @@ namespace ARMeilleure.Instructions
                     break;
             }
 
-            EmitScalarBinaryOpI32(context, (op1, op2) =>
-            {
-                return context.ConditionalSelect(condition, op1, op2);
-            });
+            if (false && Optimizations.UseSse2)
+            {
+                Operand falseLabel = Label();
+                Operand doneLabel = Label();
+
+                context.BranchIfFalse(condition, falseLabel);
+
+                EmitScalarBinaryOpSimd32(context, (op1, op2) =>
+                {
+                    return op1;
+                });
+
+                context.Branch(doneLabel);
+                context.MarkLabel(falseLabel);
+
+                EmitScalarBinaryOpSimd32(context, (op1, op2) =>
+                {
+                    return op2;
+                });
+
+                context.MarkLabel(doneLabel);
+            }
+            else
+            {
+                EmitScalarBinaryOpI32(context, (op1, op2) =>
+                {
+                    return context.ConditionalSelect(condition, op1, op2);
+                });
+            }
         }
 
         public static void Vsqrt_S(ArmEmitterContext context)
@@ -900,7 +1097,15 @@ namespace ARMeilleure.Instructions
 
         public static void Vsub_I(ArmEmitterContext context)
         {
-            EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
+            if (Optimizations.UseSse2)
+            {
+                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+                EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2));
+            }
+            else
+            {
+                EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
+            }
         }
 
         private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)
@@ -31,7 +31,7 @@ namespace ARMeilleure.Instructions
             15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // S
         };
 
-        private static readonly long _zeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
+        public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
         #endregion
 
         #region "X86 SSE Intrinsics"
@@ -1026,8 +1026,8 @@ namespace ARMeilleure.Instructions
 
             if (op.RegisterSize == RegisterSize.Simd64)
             {
-                Operand zeroEvenMask = X86GetElements(context, _zeroMask, EvenMasks[op.Size]);
-                Operand zeroOddMask  = X86GetElements(context, _zeroMask, OddMasks [op.Size]);
+                Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
+                Operand zeroOddMask  = X86GetElements(context, ZeroMask, OddMasks [op.Size]);
 
                 Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
 
@@ -589,7 +589,7 @@ namespace ARMeilleure.Instructions
             EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m));
         }
 
-        public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
+        public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc, int side = -1)
         {
             OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
 
@@ -597,16 +597,19 @@ namespace ARMeilleure.Instructions
             Operand m = GetVecA32(op.Qm);
             Operand d = GetVecA32(op.Qd);
 
+            if (side == -1) side = op.Vd;
+
             if (!op.Q) //register swap: move relevant doubleword to destination side
             {
-                n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
-                m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd);
+                n = EmitSwapDoubleWordToSide(context, n, op.Vn, side);
+                m = EmitSwapDoubleWordToSide(context, m, op.Vm, side);
             }
 
             Operand res = vectorFunc(n, m);
 
             if (!op.Q) //register insert
             {
+                if (side != op.Vd) EmitSwapDoubleWordToSide(context, m, side, op.Vd);
                 res = EmitDoubleWordInsert(context, d, res, op.Vd);
             }
 
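Context for the hunk above: the new side parameter lets a caller pin both doubleword operands to a chosen half of the SSE vector instead of the destination's half; the pairwise helpers below pass 0 for it. A hypothetical call site (illustrative only):

    // Pin both source doublewords to side 0 (the low quadword), the layout
    // the pairwise shuffles assume, before running the packed operation.
    public static void EmitPairwiseAddExample(ArmEmitterContext context)
    {
        EmitVectorBinaryOpSimd32(context, (n, m) =>
        {
            return context.AddIntrinsic(Intrinsic.X86Paddd, n, m);
        }, 0);
    }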
@@ -771,6 +774,169 @@ namespace ARMeilleure.Instructions
             });
         }
 
+        // By Scalar
+
+        public static void EmitVectorByScalarOpSimd32(ArmEmitterContext context, Func2I vectorFunc)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Operand n = GetVecA32(op.Qn);
+            Operand d = GetVecA32(op.Qd);
+
+            int index = op.Vm & 3;
+            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
+            Operand m = GetVecA32(op.Vm >> 2);
+            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
+
+            if (!op.Q) //register swap: move relevant doubleword to destination side
+            {
+                n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
+            }
+
+            Operand res = vectorFunc(n, m);
+
+            if (!op.Q) //register insert
+            {
+                res = EmitDoubleWordInsert(context, d, res, op.Vd);
+            }
+
+            context.Copy(d, res);
+        }
+
+        public static void EmitVectorByScalarOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;
+            EmitVectorByScalarOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m));
+        }
+
+        public static void EmitVectorsByScalarOpSimd32(ArmEmitterContext context, Func3I vectorFunc)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Operand n = GetVecA32(op.Qn);
+            Operand d = GetVecA32(op.Qd);
+            Operand initialD = d;
+
+            int index = op.Vm & 3;
+            int dupeMask = (index << 6) | (index << 4) | (index << 2) | index;
+            Operand m = GetVecA32(op.Vm >> 2);
+            m = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(dupeMask));
+
+            if (!op.Q) //register swap: move relevant doubleword to destination side
+            {
+                n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd);
+            }
+
+            Operand res = vectorFunc(d, n, m);
+
+            if (!op.Q) //register insert
+            {
+                res = EmitDoubleWordInsert(context, initialD, res, op.Vd);
+            }
+
+            context.Copy(initialD, res);
+        }
+
+        public static void EmitVectorsByScalarOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
+        {
+            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;
+
+            Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1;
+            Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2;
+
+            EmitVectorsByScalarOpSimd32(context, (d, n, m) =>
+            {
+                Operand res = context.AddIntrinsic(inst1, n, m);
+                return res = context.AddIntrinsic(inst2, d, res);
+            });
+        }
+
+        // Pairwise
+
+        public static void EmitSse2VectorPairwiseOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            EmitVectorBinaryOpSimd32(context, (n, m) =>
+            {
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, n, m);
+
+                        Operand zero = context.VectorZero();
+
+                        Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero);
+                        Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck);
+
+                        return context.AddIntrinsic(inst32, part0, part1);
+                    }
+                    else /* if (op.RegisterSize == RegisterSize.Simd128) */
+                    {
+                        const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0;
+                        const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0;
+
+                        Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm0));
+                        Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, n, m, Const(sm1));
+
+                        return context.AddIntrinsic(inst32, part0, part1);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, n, m);
+                    Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, n, m);
+
+                    return context.AddIntrinsic(inst64, part0, part1);
+                }
+            }, 0);
+        }
+
+        public static void EmitSsse3VectorPairwiseOp32(ArmEmitterContext context, Intrinsic[] inst)
+        {
+            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
+
+            EmitVectorBinaryOpSimd32(context, (n, m) =>
+            {
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
+                    Operand zeroOddMask  = X86GetElements(context, ZeroMask, OddMasks[op.Size]);
+
+                    Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
+
+                    Operand left  = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
+                    Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask); // 0:odd from m:n
+
+                    return context.AddIntrinsic(inst[op.Size], left, right);
+                }
+                else if (op.Size < 3)
+                {
+                    Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);
+
+                    Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
+                    Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m
+
+                    Operand left  = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
+                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);
+
+                    return context.AddIntrinsic(inst[op.Size], left, right);
+                }
+                else
+                {
+                    Operand left  = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
+                    Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);
+
+                    return context.AddIntrinsic(inst[3], left, right);
+                }
+            }, 0);
+        }
+
         // Generic Functions
 
         public static Operand EmitSoftFloatCallDefaultFpscr(
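Context for EmitSse2VectorPairwiseOpF32 above: in the 128-bit float case, the two SHUFPS selectors gather the even lanes (sm0 = 0b10_00_10_00) and odd lanes (sm1 = 0b11_01_11_01) of n and m, so one packed op produces all the pairwise results. A plain-C# model of the lane arithmetic (illustrative only):

    // part0 = {n0, n2, m0, m2}, part1 = {n1, n3, m1, m3};
    // inst32 combines them lane by lane, shown here for an add.
    static float[] PairwiseAdd128(float[] n, float[] m)
    {
        return new float[] { n[0] + n[1], n[2] + n[3], m[0] + m[1], m[2] + m[3] };
    }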