diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index 55cd5cb610..ced61882b2 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -485,6 +485,8 @@ namespace ARMeilleure.Decoders
             SetA64("01111110<<100001001010xxxxxxxxxx", InstName.Sqxtun_S, InstEmit.Sqxtun_S, typeof(OpCodeSimd));
             SetA64("0x101110<<100001001010xxxxxxxxxx", InstName.Sqxtun_V, InstEmit.Sqxtun_V, typeof(OpCodeSimd));
             SetA64("0x001110<<1xxxxx000101xxxxxxxxxx", InstName.Srhadd_V, InstEmit.Srhadd_V, typeof(OpCodeSimdReg));
+            SetA64("0x10111100>>>xxx010001xxxxxxxxxx", InstName.Sri_V, InstEmit.Sri_V, typeof(OpCodeSimdShImm)); // e.g. 0x2F084400 = SRI V0.8B, V0.8B, #8
+            SetA64("0110111101xxxxxx010001xxxxxxxxxx", InstName.Sri_V, InstEmit.Sri_V, typeof(OpCodeSimdShImm)); // e.g. 0x6F404400 = SRI V0.2D, V0.2D, #64
             SetA64("0>001110<<1xxxxx010101xxxxxxxxxx", InstName.Srshl_V, InstEmit.Srshl_V, typeof(OpCodeSimdReg));
             SetA64("0101111101xxxxxx001001xxxxxxxxxx", InstName.Srshr_S, InstEmit.Srshr_S, typeof(OpCodeSimdShImm));
             SetA64("0x00111100>>>xxx001001xxxxxxxxxx", InstName.Srshr_V, InstEmit.Srshr_V, typeof(OpCodeSimdShImm));
diff --git a/ARMeilleure/Instructions/InstEmitSimdShift.cs b/ARMeilleure/Instructions/InstEmitSimdShift.cs
index 1aae491dfd..77c08ff07f 100644
--- a/ARMeilleure/Instructions/InstEmitSimdShift.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdShift.cs
@@ -22,6 +22,11 @@ namespace ARMeilleure.Instructions
             13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
             11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
         };
+
+        private static readonly long[] _masks_SliSri = new long[] // Replication masks.
+        {
+            0x0101010101010101L, 0x0001000100010001L, 0x0000000100000001L, 0x0000000000000001L
+        };
 #endregion
 
         public static void Rshrn_V(ArmEmitterContext context)
@@ -66,7 +71,7 @@ namespace ARMeilleure.Instructions
 
                 res = context.AddIntrinsic(movInst, dLow, res);
 
-                context.Copy(GetVec(op.Rd), res);
+                context.Copy(d, res);
             }
             else
             {
@@ -106,7 +111,7 @@ namespace ARMeilleure.Instructions
             }
             else
             {
-                EmitVectorUnaryOpZx(context, (op1) =>  context.ShiftLeft(op1, Const(shift)));
+                EmitVectorUnaryOpZx(context, (op1) => context.ShiftLeft(op1, Const(shift)));
             }
         }
 
@@ -149,8 +154,6 @@ namespace ARMeilleure.Instructions
 
                 int shift = GetImmShr(op);
 
-                long roundConst = 1L << (shift - 1);
-
                 Operand d = GetVec(op.Rd);
                 Operand n = GetVec(op.Rn);
 
@@ -170,7 +173,7 @@ namespace ARMeilleure.Instructions
 
                 res = context.AddIntrinsic(movInst, dLow, res);
 
-                context.Copy(GetVec(op.Rd), res);
+                context.Copy(d, res);
             }
             else
             {
@@ -182,30 +185,55 @@ namespace ARMeilleure.Instructions
         {
             OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
 
-            Operand res = context.VectorZero();
-
-            int elems = op.GetBytesCount() >> op.Size;
-
             int shift = GetImmShl(op);
 
-            ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0;
+            ulong mask = shift != 0 ? ulong.MaxValue >> (64 - shift) : 0UL; // Low "shift" bits: the part of each Rd element that is kept.
 
-            for (int index = 0; index < elems; index++)
+            if (Optimizations.UseSse2 && op.Size > 0) // Size 0 takes the per-element path in the else branch.
             {
-                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
 
-                Operand neShifted = context.ShiftLeft(ne, Const(shift));
+                Intrinsic sllInst = X86PsllInstruction[op.Size];
 
-                Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
+                Operand nShifted = context.AddIntrinsic(sllInst, n, Const(shift));
 
-                Operand deMasked = context.BitwiseAnd(de, Const(mask));
+                Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); // Repeats the element mask in each element slot of the 64-bit constant (assumes shift < element size - confirm).
 
-                Operand e = context.BitwiseOr(neShifted, deMasked);
+                Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask);
 
-                res = EmitVectorInsert(context, res, e, index, op.Size);
+                Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); // Shifted Rn bits combined with the kept Rd bits.
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(d, res);
             }
+            else
+            {
+                Operand res = context.VectorZero();
 
-            context.Copy(GetVec(op.Rd), res);
+                int elems = op.GetBytesCount() >> op.Size;
+
+                for (int index = 0; index < elems; index++)
+                {
+                    Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+                    Operand neShifted = context.ShiftLeft(ne, Const(shift));
+
+                    Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
+
+                    Operand deMasked = context.BitwiseAnd(de, Const(mask));
+
+                    Operand e = context.BitwiseOr(neShifted, deMasked);
+
+                    res = EmitVectorInsert(context, res, e, index, op.Size);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
         }
 
         public static void Sqrshl_V(ArmEmitterContext context)
@@ -290,6 +318,62 @@ namespace ARMeilleure.Instructions
             EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
         }
 
+        public static void Sri_V(ArmEmitterContext context) // Emits SRI: each Rn element shifted right by the immediate, merged with the top bits of the matching Rd element.
+        {
+            OpCodeSimdShImm op = (OpCodeSimdShImm)context.CurrOp;
+
+            int shift = GetImmShr(op);
+            int eSize = 8 << op.Size; // Element width in bits: 8, 16, 32 or 64 for Size 0..3.
+
+            ulong mask = (ulong.MaxValue << (eSize - shift)) & (ulong.MaxValue >> (64 - eSize)); // Top "shift" bits of an eSize-wide element: the Rd bits that are kept.
+
+            if (Optimizations.UseSse2 && op.Size > 0) // Size 0 takes the per-element path in the else branch.
+            {
+                Operand d = GetVec(op.Rd);
+                Operand n = GetVec(op.Rn);
+
+                Intrinsic srlInst = X86PsrlInstruction[op.Size];
+
+                Operand nShifted = context.AddIntrinsic(srlInst, n, Const(shift));
+
+                Operand dMask = X86GetAllElements(context, (long)mask * _masks_SliSri[op.Size]); // Repeats the element mask in each element slot of the 64-bit constant.
+
+                Operand dMasked = context.AddIntrinsic(Intrinsic.X86Pand, d, dMask);
+
+                Operand res = context.AddIntrinsic(Intrinsic.X86Por, nShifted, dMasked); // Shifted Rn bits combined with the kept Rd bits.
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    res = context.VectorZeroUpper64(res);
+                }
+
+                context.Copy(d, res);
+            }
+            else
+            {
+                Operand res = context.VectorZero();
+
+                int elems = op.GetBytesCount() >> op.Size;
+
+                for (int index = 0; index < elems; index++)
+                {
+                    Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
+
+                    Operand neShifted = shift != 64 ? context.ShiftRightUI(ne, Const(shift)) : Const(0UL); // A shift of 64 is special-cased to a constant zero instead of emitting the shift.
+
+                    Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
+
+                    Operand deMasked = context.BitwiseAnd(de, Const(mask));
+
+                    Operand e = context.BitwiseOr(neShifted, deMasked);
+
+                    res = EmitVectorInsert(context, res, e, index, op.Size);
+                }
+
+                context.Copy(GetVec(op.Rd), res);
+            }
+        }
+
         public static void Srshl_V(ArmEmitterContext context)
         {
             OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
@@ -395,7 +479,7 @@ namespace ARMeilleure.Instructions
                     res = context.VectorZeroUpper64(res);
                 }
 
-                context.Copy(GetVec(op.Rd), res);
+                context.Copy(d, res);
             }
             else
             {
@@ -690,7 +774,7 @@ namespace ARMeilleure.Instructions
                     res = context.VectorZeroUpper64(res);
                 }
 
-                context.Copy(GetVec(op.Rd), res);
+                context.Copy(d, res);
             }
             else
             {
diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs
index e03ab6166a..943a9674f5 100644
--- a/ARMeilleure/Instructions/InstName.cs
+++ b/ARMeilleure/Instructions/InstName.cs
@@ -354,6 +354,7 @@ namespace ARMeilleure.Instructions
         Sqxtun_S,
         Sqxtun_V,
         Srhadd_V,
+        Sri_V,
         Srshl_V,
         Srshr_S,
         Srshr_V,
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
index fbbc9f9fbe..327900c92b 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdShImm.cs
@@ -285,10 +285,11 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
-        private static uint[] _ShrImm_S_D_()
+        private static uint[] _ShrImm_Sri_S_D_()
         {
             return new uint[]
             {
+                //0x7F404400u, // SRI D0, D0, #64
                 0x5F402400u, // SRSHR D0, D0, #64
                 0x5F403400u, // SRSRA D0, D0, #64
                 0x5F400400u, // SSHR D0, D0, #64
@@ -300,10 +301,11 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
-        private static uint[] _ShrImm_V_8B_16B_()
+        private static uint[] _ShrImm_Sri_V_8B_16B_()
         {
             return new uint[]
             {
+                0x2F084400u, // SRI V0.8B, V0.8B, #8
                 0x0F082400u, // SRSHR V0.8B, V0.8B, #8
                 0x0F083400u, // SRSRA V0.8B, V0.8B, #8
                 0x0F080400u, // SSHR V0.8B, V0.8B, #8
@@ -315,10 +317,11 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
-        private static uint[] _ShrImm_V_4H_8H_()
+        private static uint[] _ShrImm_Sri_V_4H_8H_()
         {
             return new uint[]
             {
+                0x2F104400u, // SRI V0.4H, V0.4H, #16
                 0x0F102400u, // SRSHR V0.4H, V0.4H, #16
                 0x0F103400u, // SRSRA V0.4H, V0.4H, #16
                 0x0F100400u, // SSHR V0.4H, V0.4H, #16
@@ -330,10 +333,11 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
-        private static uint[] _ShrImm_V_2S_4S_()
+        private static uint[] _ShrImm_Sri_V_2S_4S_()
         {
             return new uint[]
             {
+                0x2F204400u, // SRI V0.2S, V0.2S, #32
                 0x0F202400u, // SRSHR V0.2S, V0.2S, #32
                 0x0F203400u, // SRSRA V0.2S, V0.2S, #32
                 0x0F200400u, // SSHR V0.2S, V0.2S, #32
@@ -345,10 +349,11 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
-        private static uint[] _ShrImm_V_2D_()
+        private static uint[] _ShrImm_Sri_V_2D_()
         {
             return new uint[]
             {
+                0x6F404400u, // SRI V0.2D, V0.2D, #64
                 0x4F402400u, // SRSHR V0.2D, V0.2D, #64
                 0x4F403400u, // SRSRA V0.2D, V0.2D, #64
                 0x4F400400u, // SSHR V0.2D, V0.2D, #64
@@ -743,12 +748,12 @@ namespace Ryujinx.Tests.Cpu
         }
 
         [Test, Pairwise]
-        public void ShrImm_S_D([ValueSource("_ShrImm_S_D_")] uint opcodes,
-                               [Values(0u)] uint rd,
-                               [Values(1u, 0u)] uint rn,
-                               [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
-                               [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                               [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift)
+        public void ShrImm_Sri_S_D([ValueSource("_ShrImm_Sri_S_D_")] uint opcodes,
+                                   [Values(0u)] uint rd,
+                                   [Values(1u, 0u)] uint rn,
+                                   [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
+                                   [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
+                                   [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift)
         {
             uint immHb = (128 - shift) & 0x7F;
 
@@ -764,13 +769,13 @@ namespace Ryujinx.Tests.Cpu
         }
 
         [Test, Pairwise]
-        public void ShrImm_V_8B_16B([ValueSource("_ShrImm_V_8B_16B_")] uint opcodes,
-                                    [Values(0u)] uint rd,
-                                    [Values(1u, 0u)] uint rn,
-                                    [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
-                                    [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
-                                    [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift,
-                                    [Values(0b0u, 0b1u)] uint q) // <8B, 16B>
+        public void ShrImm_Sri_V_8B_16B([ValueSource("_ShrImm_Sri_V_8B_16B_")] uint opcodes,
+                                        [Values(0u)] uint rd,
+                                        [Values(1u, 0u)] uint rn,
+                                        [ValueSource("_8B_")] [Random(RndCnt)] ulong z,
+                                        [ValueSource("_8B_")] [Random(RndCnt)] ulong a,
+                                        [Values(1u, 8u)] [Random(2u, 7u, RndCntShift)] uint shift,
+                                        [Values(0b0u, 0b1u)] uint q) // <8B, 16B>
         {
             uint immHb = (16 - shift) & 0x7F;
 
@@ -787,13 +792,13 @@ namespace Ryujinx.Tests.Cpu
         }
 
         [Test, Pairwise]
-        public void ShrImm_V_4H_8H([ValueSource("_ShrImm_V_4H_8H_")] uint opcodes,
-                                   [Values(0u)] uint rd,
-                                   [Values(1u, 0u)] uint rn,
-                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
-                                   [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
-                                   [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift,
-                                   [Values(0b0u, 0b1u)] uint q) // <4H, 8H>
+        public void ShrImm_Sri_V_4H_8H([ValueSource("_ShrImm_Sri_V_4H_8H_")] uint opcodes,
+                                       [Values(0u)] uint rd,
+                                       [Values(1u, 0u)] uint rn,
+                                       [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
+                                       [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
+                                       [Values(1u, 16u)] [Random(2u, 15u, RndCntShift)] uint shift,
+                                       [Values(0b0u, 0b1u)] uint q) // <4H, 8H>
         {
             uint immHb = (32 - shift) & 0x7F;
 
@@ -810,13 +815,13 @@ namespace Ryujinx.Tests.Cpu
         }
 
         [Test, Pairwise]
-        public void ShrImm_V_2S_4S([ValueSource("_ShrImm_V_2S_4S_")] uint opcodes,
-                                   [Values(0u)] uint rd,
-                                   [Values(1u, 0u)] uint rn,
-                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
-                                   [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
-                                   [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift,
-                                   [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
+        public void ShrImm_Sri_V_2S_4S([ValueSource("_ShrImm_Sri_V_2S_4S_")] uint opcodes,
+                                       [Values(0u)] uint rd,
+                                       [Values(1u, 0u)] uint rn,
+                                       [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
+                                       [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
+                                       [Values(1u, 32u)] [Random(2u, 31u, RndCntShift)] uint shift,
+                                       [Values(0b0u, 0b1u)] uint q) // <2S, 4S>
         {
             uint immHb = (64 - shift) & 0x7F;
 
@@ -833,12 +838,12 @@ namespace Ryujinx.Tests.Cpu
         }
 
         [Test, Pairwise]
-        public void ShrImm_V_2D([ValueSource("_ShrImm_V_2D_")] uint opcodes,
-                                [Values(0u)] uint rd,
-                                [Values(1u, 0u)] uint rn,
-                                [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
-                                [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
-                                [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift)
+        public void ShrImm_Sri_V_2D([ValueSource("_ShrImm_Sri_V_2D_")] uint opcodes,
+                                    [Values(0u)] uint rd,
+                                    [Values(1u, 0u)] uint rn,
+                                    [ValueSource("_1D_")] [Random(RndCnt)] ulong z,
+                                    [ValueSource("_1D_")] [Random(RndCnt)] ulong a,
+                                    [Values(1u, 64u)] [Random(2u, 63u, RndCntShift)] uint shift)
         {
             uint immHb = (128 - shift) & 0x7F;
 