diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs index 4603ae0b17..557e707e10 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -3121,7 +3121,7 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), res); } - private static Operand EmitSse2VectorIsQNaNOpF(ArmEmitterContext context, Operand opF) + public static Operand EmitSse2VectorIsQNaNOpF(ArmEmitterContext context, Operand opF) { IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs index afcfb52c27..907c57459f 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic32.cs @@ -15,7 +15,28 @@ namespace ARMeilleure.Instructions { public static void Vabs_S(ArmEmitterContext context) { - EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1)); + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarUnaryOpSimd32(context, (m) => + { + if ((op.Size & 1) == 0) + { + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Andnps, mask, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Andnpd, mask, m); + } + }); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1)); + } + } public static void Vabs_V(ArmEmitterContext context) @@ -24,7 +45,26 @@ namespace ARMeilleure.Instructions if (op.F) { - EmitVectorUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1)); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + if ((op.Size & 1) == 0) + { + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Andnps, mask, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Andnpd, mask, m); + } + }); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Abs, Math.Abs, op1)); + } } else { @@ -182,18 +222,80 @@ namespace ARMeilleure.Instructions public static void Vneg_S(ArmEmitterContext context) { - //TODO: intrinsic that XORs the sign bit - EmitScalarUnaryOpF32(context, (op1) => context.Negate(op1)); + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + if (Optimizations.UseSse2) + { + EmitScalarUnaryOpSimd32(context, (m) => + { + if ((op.Size & 1) == 0) + { + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m); + } + }); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => context.Negate(op1)); + } } public static void Vnmul_S(ArmEmitterContext context) { - EmitScalarBinaryOpF32(context, (op1, op2) => context.Negate(context.Multiply(op1, op2))); + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + if (Optimizations.UseSse2) + { + EmitScalarBinaryOpSimd32(context, (n, m) => + { + if ((op.Size & 1) == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res); + } + else + { + Operand res = 
context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res); + } + }); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => context.Negate(context.Multiply(op1, op2))); + } } public static void Vnmla_S(ArmEmitterContext context) { - if (Optimizations.FastFP) + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpSimd32(context, (d, n, m) => + { + if ((op.Size & 1) == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addss, d, res); + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res); + } + else + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addsd, d, res); + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res); + } + }); + } + else if (Optimizations.FastFP) { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { @@ -211,7 +313,29 @@ namespace ARMeilleure.Instructions public static void Vnmls_S(ArmEmitterContext context) { - if (Optimizations.FastFP) + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitScalarTernaryOpSimd32(context, (d, n, m) => + { + if ((op.Size & 1) == 0) + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + Operand mask = X86GetScalar(context, -0f); + d = context.AddIntrinsic(Intrinsic.X86Xorps, mask, d); + return context.AddIntrinsic(Intrinsic.X86Addss, d, res); + } + else + { + Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + Operand mask = X86GetScalar(context, -0d); + d = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, d); + return context.AddIntrinsic(Intrinsic.X86Addsd, d, res); + } + }); + } + else if (Optimizations.FastFP) { EmitScalarTernaryOpF32(context, (op1, op2, op3) => { @@ -229,9 +353,30 @@ namespace ARMeilleure.Instructions public static void Vneg_V(ArmEmitterContext context) { - if ((context.CurrOp as OpCode32Simd).F) + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + if (op.F) { - EmitVectorUnaryOpF32(context, (op1) => context.Negate(op1)); + if (Optimizations.UseSse2) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + if ((op.Size & 1) == 0) + { + Operand mask = X86GetScalar(context, -0f); + return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m); + } + }); + } + else + { + EmitVectorUnaryOpF32(context, (op1) => context.Negate(op1)); + } } else { @@ -260,22 +405,50 @@ namespace ARMeilleure.Instructions public static void Vmaxnm_S(ArmEmitterContext context) { - EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, SoftFloat32.FPMaxNum, SoftFloat64.FPMaxNum, op1, op2)); + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, true, true); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, SoftFloat32.FPMaxNum, SoftFloat64.FPMaxNum, op1, op2)); + } } public static void Vmaxnm_V(ArmEmitterContext context) { - EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMaxNumFpscr, SoftFloat64.FPMaxNumFpscr, op1, op2)); + if (Optimizations.FastFP && Optimizations.UseSse41) + { +
EmitSse41MaxMinNumOpF32(context, true, false); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMaxNumFpscr, SoftFloat64.FPMaxNumFpscr, op1, op2)); + } } public static void Vminnm_S(ArmEmitterContext context) { - EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, SoftFloat32.FPMinNum, SoftFloat64.FPMinNum, op1, op2)); + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, false, true); + } + else + { + EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, SoftFloat32.FPMinNum, SoftFloat64.FPMinNum, op1, op2)); + } } public static void Vminnm_V(ArmEmitterContext context) { - EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMinNumFpscr, SoftFloat64.FPMinNumFpscr, op1, op2)); + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse41MaxMinNumOpF32(context, false, false); + } + else + { + EmitVectorBinaryOpSx32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMinNumFpscr, SoftFloat64.FPMinNumFpscr, op1, op2)); + } } public static void Vmax_V(ArmEmitterContext context) @@ -291,12 +464,12 @@ namespace ARMeilleure.Instructions return EmitSoftFloatCallDefaultFpscr(context, SoftFloat32.FPMaxFpscr, SoftFloat64.FPMaxFpscr, op1, op2); }); } - } public static void Vmax_I(ArmEmitterContext context) { OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + if (op.U) { EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2)); @@ -325,6 +498,7 @@ namespace ARMeilleure.Instructions public static void Vmin_I(ArmEmitterContext context) { OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + if (op.U) { EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2)); @@ -728,5 +902,56 @@ namespace ARMeilleure.Instructions { EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2)); } + + private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar) + { + IOpCode32Simd op = (IOpCode32Simd)context.CurrOp; + + Func2I genericEmit = (n, m) => + { + Operand nNum = context.Copy(n); + Operand mNum = context.Copy(m); + + Operand nQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, nNum); + Operand mQNaNMask = InstEmit.EmitSse2VectorIsQNaNOpF(context, mNum); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + Operand negInfMask = X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity); + + Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask); + Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask); + + nNum = context.AddIntrinsic(Intrinsic.X86Blendvps, nNum, negInfMask, nMask); + mNum = context.AddIntrinsic(Intrinsic.X86Blendvps, mNum, negInfMask, mMask); + + return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxps : Intrinsic.X86Minps, nNum, mNum); + } + else /* if (sizeF == 1) */ + { + Operand negInfMask = X86GetAllElements(context, isMaxNum ?
double.NegativeInfinity : double.PositiveInfinity); + + Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask); + Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask); + + nNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, nNum, negInfMask, nMask); + mNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, mNum, negInfMask, mMask); + + return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, nNum, mNum); + } + }; + + if (scalar) + { + EmitScalarBinaryOpSimd32(context, genericEmit); + } + else + { + EmitVectorBinaryOpSimd32(context, genericEmit); + } + + } } } diff --git a/ARMeilleure/Instructions/InstEmitSimdCmp32.cs b/ARMeilleure/Instructions/InstEmitSimdCmp32.cs index 3b2483ce57..672dfd81bb 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCmp32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCmp32.cs @@ -5,6 +5,7 @@ using ARMeilleure.Translation; using System; using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; using static ARMeilleure.Instructions.InstEmitSimdHelper32; using static ARMeilleure.IntermediateRepresentation.OperandHelper; @@ -16,7 +17,14 @@ namespace ARMeilleure.Instructions { public static void Vceq_V(ArmEmitterContext context) { - EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, false); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.Equal, false); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, false); + } } public static void Vceq_I(ArmEmitterContext context) @@ -30,7 +38,14 @@ namespace ARMeilleure.Instructions if (op.F) { - EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, true); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.Equal, true); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, true); + } } else { @@ -40,7 +55,14 @@ namespace ARMeilleure.Instructions public static void Vcge_V(ArmEmitterContext context) { - EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, false); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.GreaterThanOrEqual, false); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, false); + } } public static void Vcge_I(ArmEmitterContext context) @@ -56,7 +78,14 @@ namespace ARMeilleure.Instructions if (op.F) { - EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, true); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.GreaterThanOrEqual, true); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, true); + } } else { @@ -66,7 +95,14 @@ namespace ARMeilleure.Instructions public static void Vcgt_V(ArmEmitterContext context) { - EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, false); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.GreaterThan, false); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, false); + } } public static void Vcgt_I(ArmEmitterContext context) @@ -82,7 +118,14 @@ namespace ARMeilleure.Instructions if (op.F) { - EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, 
SoftFloat64.FPCompareGTFpscr, true); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.GreaterThan, true); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, true); + } } else { @@ -96,7 +139,14 @@ namespace ARMeilleure.Instructions if (op.F) { - EmitCmpOpF32(context, SoftFloat32.FPCompareLEFpscr, SoftFloat64.FPCompareLEFpscr, true); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.LessThanOrEqual, true); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareLEFpscr, SoftFloat64.FPCompareLEFpscr, true); + } } else { @@ -110,7 +160,14 @@ namespace ARMeilleure.Instructions if (op.F) { - EmitCmpOpF32(context, SoftFloat32.FPCompareLTFpscr, SoftFloat64.FPCompareLTFpscr, true); + if (Optimizations.FastFP && Optimizations.UseSse2) + { + EmitSse2CmpOpF32(context, CmpCondition.LessThan, true); + } + else + { + EmitCmpOpF32(context, SoftFloat32.FPCompareLTFpscr, SoftFloat64.FPCompareLTFpscr, true); + } } else { @@ -224,8 +281,74 @@ namespace ARMeilleure.Instructions OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; bool cmpWithZero = (op.Opc & 2) != 0; + int fSize = op.Size & 1; + + if (Optimizations.FastFP && (signalNaNs ? Optimizations.UseAvx : Optimizations.UseSse2)) + { + CmpCondition cmpOrdered = signalNaNs ? CmpCondition.OrderedS : CmpCondition.OrderedQ; + + bool doubleSize = fSize != 0; + int shift = doubleSize ? 1 : 2; + Operand m = GetVecA32(op.Vm >> shift); + Operand n = GetVecA32(op.Vd >> shift); + + n = EmitSwapScalar(context, n, op.Vd, doubleSize); + m = cmpWithZero ? context.VectorZero() : EmitSwapScalar(context, m, op.Vm, doubleSize); + + Operand lblNaN = Label(); + Operand lblEnd = Label(); + + if (!doubleSize) + { + Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpss, n, m, Const((int)cmpOrdered)); + + Operand isOrdered = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, ordMask); + + context.BranchIfFalse(lblNaN, isOrdered); + + Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comissge, n, m); + Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisseq, n, m); + Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisslt, n, m); + + EmitSetFPSCRFlags(context, context.BitwiseOr( + context.ShiftLeft(cf, Const(1)), + context.BitwiseOr( + context.ShiftLeft(zf, Const(2)), + context.ShiftLeft(nf, Const(3)) + ) + )); + } + else + { + Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, m, Const((int)cmpOrdered)); + + Operand isOrdered = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, ordMask); + + context.BranchIfFalse(lblNaN, isOrdered); + + Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comisdge, n, m); + Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisdeq, n, m); + Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisdlt, n, m); + + EmitSetFPSCRFlags(context, context.BitwiseOr( + context.ShiftLeft(cf, Const(1)), + context.BitwiseOr( + context.ShiftLeft(zf, Const(2)), + context.ShiftLeft(nf, Const(3)) + ) + )); + } + + context.Branch(lblEnd); + + context.MarkLabel(lblNaN); + + EmitSetFPSCRFlags(context, Const(3)); + + context.MarkLabel(lblEnd); + } + else { - int fSize = op.Size & 1; OperandType type = fSize != 0 ?
OperandType.FP64 : OperandType.FP32; Operand ne = ExtractScalar(context, type, op.Vd); @@ -269,5 +392,28 @@ namespace ARMeilleure.Instructions SetFpFlag(context, FPState.ZFlag, Extract(nzcv, 2)); SetFpFlag(context, FPState.NFlag, Extract(nzcv, 3)); } + + private static void EmitSse2CmpOpF32(ArmEmitterContext context, CmpCondition cond, bool zero) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + int sizeF = op.Size & 1; + Intrinsic inst = (sizeF == 0) ? Intrinsic.X86Cmpps : Intrinsic.X86Cmppd; + + if (zero) + { + EmitVectorUnaryOpSimd32(context, (m) => + { + return context.AddIntrinsic(inst, m, context.VectorZero(), Const((int)cond)); + }); + } + else + { + EmitVectorBinaryOpSimd32(context, (n, m) => + { + return context.AddIntrinsic(inst, n, m, Const((int)cond)); + }); + } + } } } diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs index 9947822ac4..f1dd34630a 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs @@ -475,7 +475,7 @@ namespace ARMeilleure.Instructions // Intrinsic Emits - private static Operand EmitSwapDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV) + public static Operand EmitSwapDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV) { int originalSide = originalV & 1; int targetSide = targetV & 1; @@ -495,7 +495,7 @@ namespace ARMeilleure.Instructions } } - private static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV) + public static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV) { int targetSide = targetV & 1; int shuffleMask = 2 | 0; @@ -510,7 +510,7 @@ namespace ARMeilleure.Instructions } } - private static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth) + public static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth) { // index into 0, 0 into index. This swap happens at the start and end of an A32 scalar op if required. int index = reg & (doubleWidth ? 1 : 3); @@ -530,7 +530,7 @@ namespace ARMeilleure.Instructions } } - private static Operand EmitInsertScalar(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth) + public static Operand EmitInsertScalar(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth) { // insert from index 0 in value to index in target int index = reg & (doubleWidth ? 1 : 3); @@ -556,21 +556,54 @@ namespace ARMeilleure.Instructions } } - public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + // Vector Operand Templates + + public static void EmitVectorUnaryOpSimd32(ArmEmitterContext context, Func1I vectorFunc) { OpCode32Simd op = (OpCode32Simd)context.CurrOp; Operand m = GetVecA32(op.Qm); Operand d = GetVecA32(op.Qd); - Intrinsic inst = (op.Size & 1) != 0 ? 
inst64 : inst32; - if (!op.Q) //register swap: move relevant doubleword to destination side { m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd); } - Operand res = context.AddIntrinsic(inst, m); + Operand res = vectorFunc(m); + + if (!op.Q) //register insert + { + res = EmitDoubleWordInsert(context, d, res, op.Vd); + } + + context.Copy(d, res); + } + + public static void EmitVectorUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32Simd op = (OpCode32Simd)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + EmitVectorUnaryOpSimd32(context, (m) => context.AddIntrinsic(inst, m)); + } + + public static void EmitVectorBinaryOpSimd32(ArmEmitterContext context, Func2I vectorFunc) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Operand n = GetVecA32(op.Qn); + Operand m = GetVecA32(op.Qm); + Operand d = GetVecA32(op.Qd); + + if (!op.Q) //register swap: move relevant doubleword to destination side + { + n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd); + m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd); + } + + Operand res = vectorFunc(n, m); if (!op.Q) //register insert { @@ -584,29 +617,11 @@ namespace ARMeilleure.Instructions { OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; - Operand n = GetVecA32(op.Qn); - Operand m = GetVecA32(op.Qm); - Operand d = GetVecA32(op.Qd); - Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; - - if (!op.Q) //register swap: move relevant doubleword to destination side - { - n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd); - m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd); - } - - Operand res = context.AddIntrinsic(inst, n, m); - - if (!op.Q) //register insert - { - res = EmitDoubleWordInsert(context, d, res, op.Vd); - } - - context.Copy(d, res); + EmitVectorBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); } - public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + public static void EmitVectorTernaryOpSimd32(ArmEmitterContext context, Func3I vectorFunc) { OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; @@ -615,17 +630,13 @@ namespace ARMeilleure.Instructions Operand d = GetVecA32(op.Qd); Operand initialD = d; - Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1; - Intrinsic inst2 = (op.Size & 1) != 0 ? inst64pt2 : inst32pt2; - if (!op.Q) //register swap: move relevant doubleword to destination side { n = EmitSwapDoubleWordToSide(context, n, op.Vn, op.Vd); m = EmitSwapDoubleWordToSide(context, m, op.Vm, op.Vd); } - Operand res = context.AddIntrinsic(inst1, n, m); - res = context.AddIntrinsic(inst2, d, res); + Operand res = vectorFunc(d, n, m); if (!op.Q) //register insert { @@ -635,7 +646,21 @@ namespace ARMeilleure.Instructions context.Copy(initialD, res); } - public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + public static void EmitVectorTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + { + OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; + + Intrinsic inst1 = (op.Size & 1) != 0 ? inst64pt1 : inst32pt1; + Intrinsic inst2 = (op.Size & 1) != 0 ? 
inst64pt2 : inst32pt2; + + EmitVectorTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(inst1, n, m); + return res = context.AddIntrinsic(inst2, d, res); + }); + } + + public static void EmitScalarUnaryOpSimd32(ArmEmitterContext context, Func1I scalarFunc) { OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; @@ -646,9 +671,8 @@ namespace ARMeilleure.Instructions m = EmitSwapScalar(context, m, op.Vm, doubleSize); - Intrinsic inst = doubleSize ? inst64 : inst32; + Operand res = scalarFunc(m); - Operand res = (inst == 0) ? m : context.AddIntrinsic(inst, m); if (false) // op.Vd == op.Vm) //small optimisation: can just swap it back for the result { res = EmitSwapScalar(context, res, op.Vd, doubleSize); @@ -662,7 +686,16 @@ namespace ARMeilleure.Instructions context.Copy(d, res); } - public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + public static void EmitScalarUnaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + EmitScalarUnaryOpSimd32(context, (m) => (inst == 0) ? m : context.AddIntrinsic(inst, m)); + } + + public static void EmitScalarBinaryOpSimd32(ArmEmitterContext context, Func2I scalarFunc) { OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; @@ -675,9 +708,7 @@ namespace ARMeilleure.Instructions n = EmitSwapScalar(context, n, op.Vn, doubleSize); m = EmitSwapScalar(context, m, op.Vm, doubleSize); - Intrinsic inst = doubleSize ? inst64 : inst32; - - Operand res = context.AddIntrinsic(inst, n, m); + Operand res = scalarFunc(n, m); if (false) // //small optimisation: can just swap it back for the result { @@ -692,7 +723,16 @@ namespace ARMeilleure.Instructions context.Copy(d, res); } - public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + public static void EmitScalarBinaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32; + + EmitScalarBinaryOpSimd32(context, (n, m) => context.AddIntrinsic(inst, n, m)); + } + + public static void EmitScalarTernaryOpSimd32(ArmEmitterContext context, Func3I scalarFunc) { OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; @@ -707,11 +747,7 @@ namespace ARMeilleure.Instructions m = EmitSwapScalar(context, m, op.Vm, doubleSize); d = EmitSwapScalar(context, d, op.Vd, doubleSize); - Intrinsic inst1 = doubleSize ? inst64pt1 : inst32pt1; - Intrinsic inst2 = doubleSize ? inst64pt2 : inst32pt2; - - Operand res = context.AddIntrinsic(inst1, n, m); - res = context.AddIntrinsic(inst2, d, res); + Operand res = scalarFunc(d, n, m); // insert scalar into vector res = EmitInsertScalar(context, initialD, res, op.Vd, doubleSize); @@ -719,6 +755,22 @@ namespace ARMeilleure.Instructions context.Copy(initialD, res); } + public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2) + { + OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp; + + bool doubleSize = (op.Size & 1) != 0; + int shift = doubleSize ? 1 : 2; + Intrinsic inst1 = doubleSize ? inst64pt1 : inst32pt1; + Intrinsic inst2 = doubleSize ? 
inst64pt2 : inst32pt2; + + EmitScalarTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(inst1, n, m); + return context.AddIntrinsic(inst2, d, res); + }); + } + // Generic Functions public static Operand EmitSoftFloatCallDefaultFpscr( diff --git a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs index e2e9e18ee1..fef40a17ea 100644 --- a/ARMeilleure/Instructions/InstEmitSimdLogical32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdLogical32.cs @@ -1,4 +1,5 @@ using ARMeilleure.Decoders; +using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; using static ARMeilleure.Instructions.InstEmitSimdHelper32; @@ -9,7 +10,14 @@ namespace ARMeilleure.Instructions { public static void Vand_I(ArmEmitterContext context) { - EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2)); + if (Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Pand, Intrinsic.X86Pand); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseAnd(op1, op2)); + } } public static void Vbif(ArmEmitterContext context) @@ -24,33 +32,64 @@ namespace ARMeilleure.Instructions public static void Vbsl(ArmEmitterContext context) { - EmitVectorTernaryOpZx32(context, (op1, op2, op3) => + if (Optimizations.UseSse2) { - return context.BitwiseExclusiveOr( - context.BitwiseAnd(op1, - context.BitwiseExclusiveOr(op2, op3)), op3); - }); + EmitVectorTernaryOpSimd32(context, (d, n, m) => + { + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, m); + res = context.AddIntrinsic(Intrinsic.X86Pand, res, d); + return context.AddIntrinsic(Intrinsic.X86Pxor, res, m); + }); + } + else + { + EmitVectorTernaryOpZx32(context, (op1, op2, op3) => + { + return context.BitwiseExclusiveOr( + context.BitwiseAnd(op1, + context.BitwiseExclusiveOr(op2, op3)), op3); + }); + } } public static void Vorr_I(ArmEmitterContext context) { - EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2)); + if (Optimizations.UseSse2) + { + EmitVectorBinaryOpF32(context, Intrinsic.X86Por, Intrinsic.X86Por); + } + else + { + EmitVectorBinaryOpZx32(context, (op1, op2) => context.BitwiseOr(op1, op2)); + } } private static void EmitBifBit(ArmEmitterContext context, bool notRm) { OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp; - EmitVectorTernaryOpZx32(context, (d, n, m) => + if (Optimizations.UseSse2) { - if (notRm) + EmitVectorTernaryOpSimd32(context, (d, n, m) => { - m = context.BitwiseNot(m); - } - return context.BitwiseExclusiveOr( - context.BitwiseAnd(m, - context.BitwiseExclusiveOr(d, n)), d); - }); + Operand res = context.AddIntrinsic(Intrinsic.X86Pxor, n, d); + res = context.AddIntrinsic((notRm) ? Intrinsic.X86Pandn : Intrinsic.X86Pand, m, res); + return context.AddIntrinsic(Intrinsic.X86Pxor, d, res); + }); + } + else + { + EmitVectorTernaryOpZx32(context, (d, n, m) => + { + if (notRm) + { + m = context.BitwiseNot(m); + } + return context.BitwiseExclusiveOr( + context.BitwiseAnd(m, + context.BitwiseExclusiveOr(d, n)), d); + }); + } } } }
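
Note on the sign-bit trick used throughout the VABS/VNEG/VNMUL paths above (illustrative only, not part of the patch): -0.0 has only the sign bit set, so AND-NOT with a -0.0 mask clears the sign bit (absolute value) and XOR with it flips the sign bit (negation). The sketch below checks both identities with .NET's managed SSE intrinsics rather than ARMeilleure's emitter; the class and method names are invented for the example.

// Sketch only (assumes an x86 host with SSE): verifies that ANDN with a -0.0
// mask computes abs() and XOR with it computes negation, which is what the
// X86Andnps/X86Xorps (and *pd) paths in the diff rely on.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class SignMaskDemo
{
    static void Main()
    {
        if (!Sse.IsSupported)
        {
            Console.WriteLine("SSE not supported on this host.");
            return;
        }

        // -0.0f is 0x80000000: only the sign bit is set in each lane.
        Vector128<float> mask = Vector128.Create(-0.0f);
        Vector128<float> x    = Vector128.Create(-1.5f, 2.0f, -0.0f, float.NegativeInfinity);

        // ANDNPS computes ~mask & x, clearing the sign bit (VABS path).
        Vector128<float> abs = Sse.AndNot(mask, x);

        // XORPS flips the sign bit (VNEG / VNMUL path).
        Vector128<float> neg = Sse.Xor(mask, x);

        Console.WriteLine(abs); // lanes: 1.5, 2, 0, +infinity
        Console.WriteLine(neg); // lanes: 1.5, -2, 0, +infinity
    }
}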