diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index a8d62a01fc..c003eff309 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -63,6 +63,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Minss, new IntrinsicInfo(X86Instruction.Minss, IntrinsicType.Binary)); Add(Intrinsic.X86Movhlps, new IntrinsicInfo(X86Instruction.Movhlps, IntrinsicType.Binary)); Add(Intrinsic.X86Movlhps, new IntrinsicInfo(X86Instruction.Movlhps, IntrinsicType.Binary)); + Add(Intrinsic.X86Movss, new IntrinsicInfo(X86Instruction.Movss, IntrinsicType.Binary)); Add(Intrinsic.X86Mulpd, new IntrinsicInfo(X86Instruction.Mulpd, IntrinsicType.Binary)); Add(Intrinsic.X86Mulps, new IntrinsicInfo(X86Instruction.Mulps, IntrinsicType.Binary)); Add(Intrinsic.X86Mulsd, new IntrinsicInfo(X86Instruction.Mulsd, IntrinsicType.Binary)); diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs index d60aefb059..c49130d63c 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs @@ -173,7 +173,6 @@ namespace ARMeilleure.Instructions // TODO: Fast Path. if (roundWithFpscr) { - // These need to get the FPSCR value, so it's worth noting we'd need to do a c# call at some point. if (floatSize == OperandType.FP64) { if (unsigned) @@ -363,7 +362,6 @@ namespace ARMeilleure.Instructions { EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Truncate, Math.Truncate, op1)); } - } private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed) @@ -382,7 +380,7 @@ namespace ARMeilleure.Instructions private static void EmitSse41ConvertInt32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed) { - // a port of the similar round function in InstEmitSimdCvt + // A port of the similar round function in InstEmitSimdCvt. OpCode32SimdS op = (OpCode32SimdS)context.CurrOp; bool doubleSize = (op.Size & 1) != 0; @@ -457,7 +455,7 @@ namespace ARMeilleure.Instructions nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); } - long fpMaxVal = 0x41E0000000000000L; // 2147483648.0000000d (2147483648) + long fpMaxVal = 0x41E0000000000000L; // 2147483648.0000000d (2147483648) Operand fpMaxValMask = X86GetScalar(context, fpMaxVal); diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs index 14229b8310..9ee7c39433 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs @@ -473,10 +473,12 @@ namespace ARMeilleure.Instructions context.Copy(GetVecA32(op.Qd), res); } - // Intrinsic Emits + // Intrinsic Helpers public static Operand EmitSwapDoubleWordToSide(ArmEmitterContext context, Operand input, int originalV, int targetV) { + Debug.Assert(input.Type == OperandType.V128); + int originalSide = originalV & 1; int targetSide = targetV & 1; @@ -487,16 +489,18 @@ namespace ARMeilleure.Instructions if (targetSide == 1) { - return context.AddIntrinsic(Intrinsic.X86Movlhps, input, input); // low to high + return context.AddIntrinsic(Intrinsic.X86Movlhps, input, input); // Low to high. } else { - return context.AddIntrinsic(Intrinsic.X86Movhlps, input, input); // high to low + return context.AddIntrinsic(Intrinsic.X86Movhlps, input, input); // High to low. } } public static Operand EmitDoubleWordInsert(ArmEmitterContext context, Operand target, Operand value, int targetV) { + Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128); + int targetSide = targetV & 1; int shuffleMask = 2 | 0; @@ -510,45 +514,56 @@ namespace ARMeilleure.Instructions } } - public static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth) + public static Operand EmitScalarInsert(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth) { - // index into 0, 0 into index. This swap happens at the start and end of an A32 scalar op if required. - int index = reg & (doubleWidth ? 1 : 3); - if (index == 0) return target; + Debug.Assert(target.Type == OperandType.V128 && value.Type == OperandType.V128); - if (doubleWidth) - { - int shuffleMask = 1; // swap top and bottom (b0 = 1, b1 = 0) - return context.AddIntrinsic(Intrinsic.X86Shufpd, target, target, Const(shuffleMask)); - } - else - { - int shuffleMask = (3 << 6) | (2 << 4) | (1 << 2) | index; // swap index and 0 (others remain) - shuffleMask &= ~(3 << (index * 2)); - - return context.AddIntrinsic(Intrinsic.X86Shufps, target, target, Const(shuffleMask)); - } - } - - public static Operand EmitInsertScalar(ArmEmitterContext context, Operand target, Operand value, int reg, bool doubleWidth) - { - // insert from index 0 in value to index in target + // Insert from index 0 in value to index in target. int index = reg & (doubleWidth ? 1 : 3); if (doubleWidth) { if (index == 1) { - return context.AddIntrinsic(Intrinsic.X86Movlhps, target, value); // low to high + return context.AddIntrinsic(Intrinsic.X86Movlhps, target, value); // Low to high. } else { - return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(2)); // low to low, keep high from original + return context.AddIntrinsic(Intrinsic.X86Shufpd, value, target, Const(2)); // Low to low, keep high from original. } } else { - return context.AddIntrinsic(Intrinsic.X86Insertps, target, value, Const(index << 4)); + if (Optimizations.UseSse41) + { + return context.AddIntrinsic(Intrinsic.X86Insertps, target, value, Const(index << 4)); + } + else + { + target = EmitSwapScalar(context, target, index, doubleWidth); // Swap value to replace into element 0. + target = context.AddIntrinsic(Intrinsic.X86Movss, target, value); // Move the value into element 0 of the vector. + return EmitSwapScalar(context, target, index, doubleWidth); // Swap new value back to the correct index. + } + } + } + + public static Operand EmitSwapScalar(ArmEmitterContext context, Operand target, int reg, bool doubleWidth) + { + // Index into 0, 0 into index. This swap happens at the start of an A32 scalar op if required. + int index = reg & (doubleWidth ? 1 : 3); + if (index == 0) return target; + + if (doubleWidth) + { + int shuffleMask = 1; // Swap top and bottom. (b0 = 1, b1 = 0) + return context.AddIntrinsic(Intrinsic.X86Shufpd, target, target, Const(shuffleMask)); + } + else + { + int shuffleMask = (3 << 6) | (2 << 4) | (1 << 2) | index; // Swap index and 0. (others remain) + shuffleMask &= ~(3 << (index * 2)); + + return context.AddIntrinsic(Intrinsic.X86Shufps, target, target, Const(shuffleMask)); } } @@ -679,7 +694,7 @@ namespace ARMeilleure.Instructions Operand res = scalarFunc(m); // Insert scalar into vector. - res = EmitInsertScalar(context, d, res, op.Vd, doubleSize); + res = EmitScalarInsert(context, d, res, op.Vd, doubleSize); context.Copy(d, res); } @@ -709,7 +724,7 @@ namespace ARMeilleure.Instructions Operand res = scalarFunc(n, m); // Insert scalar into vector. - res = EmitInsertScalar(context, d, res, op.Vd, doubleSize); + res = EmitScalarInsert(context, d, res, op.Vd, doubleSize); context.Copy(d, res); } @@ -741,7 +756,7 @@ namespace ARMeilleure.Instructions Operand res = scalarFunc(d, n, m); // Insert scalar into vector. - res = EmitInsertScalar(context, initialD, res, op.Vd, doubleSize); + res = EmitScalarInsert(context, initialD, res, op.Vd, doubleSize); context.Copy(initialD, res); } diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index e2e11e9f40..c60e80cf08 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -52,6 +52,7 @@ namespace ARMeilleure.IntermediateRepresentation X86Minss, X86Movhlps, X86Movlhps, + X86Movss, X86Mulpd, X86Mulps, X86Mulsd,