From 69450534acd0b6678b79c8acf6a3bf14470d422e Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Sun, 10 Mar 2019 00:29:23 +0100
Subject: [PATCH] Add (R)shrn_V Sse opt.; add "Part" & "Shift" opt.. Remove Tmp stuff; remove Dup. Nits.

---
Notes (ignored by git-am; everything below is grounded in the hunks that follow):

 * Rshrn_V / Shrn_V: when Optimizations.UseSsse3 is set, an SSE sequence is
   emitted instead of calling EmitVectorShrImmNarrowOpZx. The sequence is:
   Sse2.ShiftRightLogical on Rn (Rshrn_V first adds the constant
   1L << (shift - 1) broadcast with Sse2.SetAllVector128), then Ssse3.Shuffle
   with a byte mask taken from _masks_RshrnShrn[op.Size] and duplicated into
   both 64-bit halves via Sse2.SetVector128(long, long), then
   Sse.MoveLowToHigh (RegisterSize.Simd128) or Sse.MoveHighToLow (otherwise)
   against "Rd MoveLowToHigh zero". Without SSSE3 the previous helper call is
   kept unchanged.
 * The Shuffle overload is looked up with Vector128<sbyte> operands; the
   non-generic Vector128 type matches no Ssse3.Shuffle overload, so GetMethod
   would return null.
 * Widening paths (presumably the "Part" optimisation of the subject):
   ShiftRightLogical128BitLane by 8 bytes is emitted only when
   op.RegisterSize == RegisterSize.Simd128, instead of always emitting it
   with a byte count of 8 or 0.
 * Presumably the "Shift" optimisation: Sse2.ShiftLeftLogical is skipped when
   shift == 0.
 * Tmp/Dup removal: op.Rn is reloaded with EmitLdvec instead of
   Dup + EmitStvectmp ... EmitLdvectmp, and the scalar fallback inserts
   directly into op.Rd with EmitVectorInsert instead of going through the
   temporary vector.
 * Nits: the two-line "UseSse2 && op.Size > 0 && op.Size < 3" conditions are
   joined onto one line.

 ChocolArm64/Instructions/InstEmitSimdShift.cs | 177 +++++++++++++-----
 1 file changed, 131 insertions(+), 46 deletions(-)

diff --git a/ChocolArm64/Instructions/InstEmitSimdShift.cs b/ChocolArm64/Instructions/InstEmitSimdShift.cs
index c0b20d7ea6..092d291524 100644
--- a/ChocolArm64/Instructions/InstEmitSimdShift.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdShift.cs
@@ -5,6 +5,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
 using static ChocolArm64.Instructions.InstEmitSimdHelper;
@@ -13,9 +14,65 @@ namespace ChocolArm64.Instructions
 {
     static partial class InstEmit
     {
+#region "Masks"
+        private static readonly long[] _masks_RshrnShrn = new long[]
+        {
+            14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
+            13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
+            11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
+        };
+#endregion
+
         public static void Rshrn_V(ILEmitterCtx context)
         {
-            EmitVectorShrImmNarrowOpZx(context, round: true);
+            if (Optimizations.UseSsse3)
+            {
+                OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+                Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], VectorUIntTypesPerSizeLog2[op.Size + 1] };
+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], typeof(byte) };
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+                Type[] typesSav = new Type[] { UIntTypesPerSizeLog2[op.Size + 1] };
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameMov = op.RegisterSize == RegisterSize.Simd128
+                    ? nameof(Sse.MoveLowToHigh)
+                    : nameof(Sse.MoveHighToLow);
+
+                int shift = GetImmShr(op);
+
+                long roundConst = 1L << (shift - 1);
+
+                context.EmitLdvec(op.Rd);
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
+
+                context.EmitLdvec(op.Rn);
+
+                context.EmitLdc_I8(roundConst);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+                context.EmitLdc_I4(shift);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl)); // value
+
+                context.EmitLdc_I8(_masks_RshrnShrn[op.Size]); // mask
+                context.Emit(OpCodes.Dup); // mask
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
+
+                context.EmitStvec(op.Rd);
+            }
+            else
+            {
+                EmitVectorShrImmNarrowOpZx(context, round: true);
+            }
         }
 
         public static void Shl_S(ILEmitterCtx context)
@@ -80,12 +137,13 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
@@ -102,7 +160,45 @@ namespace ChocolArm64.Instructions
 
         public static void Shrn_V(ILEmitterCtx context)
         {
-            EmitVectorShrImmNarrowOpZx(context, round: false);
+            if (Optimizations.UseSsse3)
+            {
+                OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
+
+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1], typeof(byte) };
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+
+                string nameMov = op.RegisterSize == RegisterSize.Simd128
+                    ? nameof(Sse.MoveLowToHigh)
+                    : nameof(Sse.MoveHighToLow);
+
+                int shift = GetImmShr(op);
+
+                context.EmitLdvec(op.Rd);
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
+
+                context.EmitLdvec(op.Rn);
+
+                context.EmitLdc_I4(shift);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesSrl)); // value
+
+                context.EmitLdc_I8(_masks_RshrnShrn[op.Size]); // mask
+                context.Emit(OpCodes.Dup); // mask
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameMov));
+
+                context.EmitStvec(op.Rd);
+            }
+            else
+            {
+                EmitVectorShrImmNarrowOpZx(context, round: false);
+            }
         }
 
         public static void Sli_V(ILEmitterCtx context)
@@ -271,8 +367,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesShs = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
                 Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
@@ -282,16 +377,13 @@ namespace ChocolArm64.Instructions
 
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesShs));
@@ -320,8 +412,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesShs = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
                 Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
@@ -332,16 +423,13 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightArithmetic), typesShs));
@@ -403,17 +491,21 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
-                context.EmitLdc_I4(shift);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                if (shift != 0)
+                {
+                    context.EmitLdc_I4(shift);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                }
 
                 context.EmitStvec(op.Rd);
             }
@@ -432,8 +524,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
 
@@ -464,8 +555,7 @@ namespace ChocolArm64.Instructions
         {
             OpCodeSimdShImm64 op = (OpCodeSimdShImm64)context.CurrOp;
 
-            if (Optimizations.UseSse2 && op.Size > 0
-                                      && op.Size < 3)
+            if (Optimizations.UseSse2 && op.Size > 0 && op.Size < 3)
             {
                 Type[] typesSra = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
                 Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
@@ -612,16 +702,13 @@ namespace ChocolArm64.Instructions
 
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
@@ -661,16 +748,13 @@ namespace ChocolArm64.Instructions
                 context.EmitLdvec(op.Rd);
                 context.EmitLdvec(op.Rn);
 
-                context.Emit(OpCodes.Dup);
-                context.EmitStvectmp();
-
                 context.EmitLdc_I4(eSize - shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesShs));
 
                 context.EmitLdc_I4(eSize - 1);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
 
-                context.EmitLdvectmp();
+                context.EmitLdvec(op.Rn);
 
                 context.EmitLdc_I4(shift);
                 context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical), typesShs));
@@ -732,17 +816,21 @@ namespace ChocolArm64.Instructions
                                                    nameof(Sse41.ConvertToVector128Int32),
                                                    nameof(Sse41.ConvertToVector128Int64) };
 
-                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
-
                 context.EmitLdvec(op.Rn);
 
-                context.EmitLdc_I4(numBytes);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                if (op.RegisterSize == RegisterSize.Simd128)
+                {
+                    context.Emit(OpCodes.Ldc_I4_8);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSll));
+                }
 
                 context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
 
-                context.EmitLdc_I4(shift);
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                if (shift != 0)
+                {
+                    context.EmitLdc_I4(shift);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical), typesSll));
+                }
 
                 context.EmitStvec(op.Rd);
             }
@@ -899,12 +987,9 @@ namespace ChocolArm64.Instructions
                     context.Emit(OpCodes.Add);
                 }
 
-                EmitVectorInsertTmp(context, index, op.Size);
+                EmitVectorInsert(context, op.Rd, index, op.Size);
             }
 
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
-
             if ((op.RegisterSize == RegisterSize.Simd64) || scalar)
             {
                 EmitVectorZeroUpper(context, op.Rd);