diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
index 83f22a527f..3e1276478a 100644
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@@ -1065,11 +1065,30 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 		break;
 	case GX_TF_I8:  // speed critical
 		{
-			// JSD optimized with SSE2 intrinsics.
-			// Produces an ~86% speed increase over reference C implementation.
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0; x < width; x += 8)
 				{
+#if _M_SSE >= 0x401
+					// SSE4 intrinsics: About 5-10% faster than SSE2 version
+					for (int iy = 0; iy < 4; ++iy, src+=8)
+					{
+						const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
+
+						const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
+						__m128i *quaddst, r, rgba0, rgba1;
+						// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
+						r = _mm_loadl_epi64((const __m128i *)src);
+						// Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
+						rgba0 = _mm_shuffle_epi8(r, mask3210);	// (dddd cccc bbbb aaaa)
+						rgba1 = _mm_shuffle_epi8(r, mask7654);	// (hhhh gggg ffff eeee)
+
+						quaddst = (__m128i *)(dst + (y + iy)*width + x);
+						_mm_storeu_si128(quaddst, rgba0);
+						_mm_storeu_si128(quaddst+1, rgba1);
+					}
+#else
+					// JSD optimized with SSE2 intrinsics.
+					// Produces an ~86% speed increase over reference C implementation.
 					// Each loop iteration processes 4 rows from 4 64-bit reads.
 
 					// TODO: is it more efficient to group the loads together sequentially and also the stores at the end?
@@ -1146,6 +1165,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 					_mm_storeu_si128(quaddst+1, rgba7);
 
 					src += 8;
+#endif
 				}
 #if 0
 				// Reference C implementation
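Note on the technique: GX_TF_I8 stores one intensity byte per texel, and decoding to RGBA8 simply replicates that byte into all four channels. The new path loads 8 intensity bytes with a 64-bit load and uses _mm_shuffle_epi8 with two replication masks to expand them into two 16-byte RGBA quads (as far as I can tell, _mm_shuffle_epi8/pshufb is actually an SSSE3 instruction, even though the patch gates it at the _M_SSE >= 0x401 level). Below is a minimal standalone sketch of that byte-replication trick, not part of the patch; the function names (decode_i8_span_ref, decode_i8_span_ssse3), the test values, and the build command are made up for illustration.

// Standalone sketch of the byte-replication shuffle used above (hypothetical names, not Dolphin code).
// Build with e.g.: g++ -mssse3 -O2 i8_sketch.cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Reference decode: each GX_TF_I8 texel is one intensity byte, replicated into R, G, B and A.
static void decode_i8_span_ref(uint32_t* dst, const uint8_t* src, int count)
{
	for (int i = 0; i < count; ++i)
	{
		uint32_t I = src[i];
		dst[i] = (I << 24) | (I << 16) | (I << 8) | I;  // all four bytes are I, so byte order is irrelevant
	}
}

// SSSE3 version, same idea as the #if _M_SSE >= 0x401 path: load 8 intensity bytes,
// then shuffle them into two 16-byte RGBA quads.
static void decode_i8_span_ssse3(uint32_t* dst, const uint8_t* src)
{
	// Each output byte selects the source byte whose index appears in the mask
	// (arguments to _mm_set_epi8 run from byte 15 down to byte 0).
	const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
	const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);

	// 64-bit load: bytes a..h in the low half, upper half zeroed.
	const __m128i r = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
	const __m128i rgba0 = _mm_shuffle_epi8(r, mask3210);  // aaaa bbbb cccc dddd in memory order
	const __m128i rgba1 = _mm_shuffle_epi8(r, mask7654);  // eeee ffff gggg hhhh in memory order

	_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), rgba0);
	_mm_storeu_si128(reinterpret_cast<__m128i*>(dst) + 1, rgba1);
}

int main()
{
	const uint8_t src[8] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 };
	uint32_t ref[8], simd[8];

	decode_i8_span_ref(ref, src, 8);
	decode_i8_span_ssse3(simd, src);

	printf("match: %s\n", memcmp(ref, simd, sizeof(ref)) == 0 ? "yes" : "no");
	return 0;
}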