diff --git a/Source/Core/VideoBackends/D3D/main.cpp b/Source/Core/VideoBackends/D3D/main.cpp index a21b03a03c..e78d809e1b 100644 --- a/Source/Core/VideoBackends/D3D/main.cpp +++ b/Source/Core/VideoBackends/D3D/main.cpp @@ -72,7 +72,6 @@ void InitBackendInfo() } g_Config.backend_info.APIType = API_D3D; - g_Config.backend_info.bUseRGBATextures = true; // the GX formats barely match any D3D11 formats g_Config.backend_info.bUseMinimalMipCount = true; g_Config.backend_info.bSupportsExclusiveFullscreen = true; g_Config.backend_info.bSupportsDualSourceBlend = true; diff --git a/Source/Core/VideoBackends/OGL/main.cpp b/Source/Core/VideoBackends/OGL/main.cpp index f2741cc9ba..472bb19f43 100644 --- a/Source/Core/VideoBackends/OGL/main.cpp +++ b/Source/Core/VideoBackends/OGL/main.cpp @@ -132,7 +132,6 @@ static void GetShaders(std::vector &shaders) static void InitBackendInfo() { g_Config.backend_info.APIType = API_OPENGL; - g_Config.backend_info.bUseRGBATextures = true; g_Config.backend_info.bUseMinimalMipCount = false; g_Config.backend_info.bSupportsExclusiveFullscreen = false; //g_Config.backend_info.bSupportsDualSourceBlend = true; // is gpu dependent and must be set in renderer diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp index 0bbc336a31..ad414e36cc 100644 --- a/Source/Core/VideoCommon/TextureCacheBase.cpp +++ b/Source/Core/VideoCommon/TextureCacheBase.cpp @@ -490,8 +490,7 @@ TextureCache::TCacheEntryBase* TextureCache::Load(unsigned int const stage, { if (!(texformat == GX_TF_RGBA8 && from_tmem)) { - pcfmt = TexDecoder_Decode(temp, src_data, expandedWidth, - expandedHeight, texformat, tlutaddr, tlutfmt, g_ActiveConfig.backend_info.bUseRGBATextures); + pcfmt = TexDecoder_Decode(temp, src_data, expandedWidth, expandedHeight, texformat, tlutaddr, tlutfmt); } else { @@ -567,7 +566,7 @@ TextureCache::TCacheEntryBase* TextureCache::Load(unsigned int const stage, const u8*& mip_src_data = from_tmem ? ((level % 2) ? ptr_odd : ptr_even) : src_data; - TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, tlutaddr, tlutfmt, g_ActiveConfig.backend_info.bUseRGBATextures); + TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, tlutaddr, tlutfmt); mip_src_data += TexDecoder_GetTextureSizeInBytes(expanded_mip_width, expanded_mip_height, texformat); entry->Load(mip_width, mip_height, expanded_mip_width, level); diff --git a/Source/Core/VideoCommon/TextureDecoder.h b/Source/Core/VideoCommon/TextureDecoder.h index efcce209b5..0c5493be71 100644 --- a/Source/Core/VideoCommon/TextureDecoder.h +++ b/Source/Core/VideoCommon/TextureDecoder.h @@ -71,7 +71,7 @@ enum PC_TexFormat PC_TEX_FMT_DXT1, }; -PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly = false); +PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt); void TexDecoder_DecodeTexel(u8 *dst, const u8 *src, int s, int t, int imageWidth, int texformat, int tlutaddr, int tlutfmt); void TexDecoder_DecodeTexelRGBA8FromTmem(u8 *dst, const u8 *src_ar, const u8* src_gb, int s, int t, int imageWidth); PC_TexFormat TexDecoder_DecodeRGBA8FromTmem(u8* dst, const u8 *src_ar, const u8 *src_gb, int width, int height); @@ -79,4 +79,4 @@ PC_TexFormat TexDecoder_DecodeRGBA8FromTmem(u8* dst, const u8 *src_ar, const u8 void TexDecoder_SetTexFmtOverlayOptions(bool enable, bool center); /* Internal method, implemented by TextureDecoder_Generic and TextureDecoder_x64. */ -PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly); +PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt); diff --git a/Source/Core/VideoCommon/TextureDecoder_Common.cpp b/Source/Core/VideoCommon/TextureDecoder_Common.cpp index b65200cb69..63e6dc672e 100644 --- a/Source/Core/VideoCommon/TextureDecoder_Common.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_Common.cpp @@ -242,9 +242,9 @@ static void TexDecoder_DrawOverlay(u8 *dst, int width, int height, int texformat } } -PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly) +PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { - PC_TexFormat pc_texformat = _TexDecoder_DecodeImpl(dst, src, width, height, texformat, tlutaddr, tlutfmt, rgbaOnly); + PC_TexFormat pc_texformat = _TexDecoder_DecodeImpl((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt); if (TexFmt_Overlay_Enable && pc_texformat != PC_TEX_FMT_NONE) TexDecoder_DrawOverlay(dst, width, height, texformat, pc_texformat); diff --git a/Source/Core/VideoCommon/TextureDecoder_Generic.cpp b/Source/Core/VideoCommon/TextureDecoder_Generic.cpp index 2357f7c322..dd23e4f924 100644 --- a/Source/Core/VideoCommon/TextureDecoder_Generic.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_Generic.cpp @@ -17,26 +17,6 @@ // Decodes all known GameCube/Wii texture formats. // by ector -static inline u32 decode5A3(u16 val) -{ - int r,g,b,a; - if ((val & 0x8000)) - { - a = 0xFF; - r = Convert5To8((val >> 10) & 0x1F); - g = Convert5To8((val >> 5) & 0x1F); - b = Convert5To8(val & 0x1F); - } - else - { - a = Convert3To8((val >> 12) & 0x7); - r = Convert4To8((val >> 8) & 0xF); - g = Convert4To8((val >> 4) & 0xF); - b = Convert4To8(val & 0xF); - } - return (a << 24) | (r << 16) | (g << 8) | b; -} - static inline u32 decode5A3RGBA(u16 val) { int r,g,b,a; @@ -80,18 +60,6 @@ struct DXTBlock u8 lines[4]; }; -//inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC4_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val >> 4])); - *dst++ = decode5A3(Common::swap16(tlut[val & 0xF])); - } -} - inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -103,17 +71,6 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC4_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem+tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val >> 4]); - *dst++ = Common::swap16(tlut[val & 0xF]); - } -} - inline void decodebytesC4IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem+tlutaddr); @@ -136,17 +93,6 @@ inline void decodebytesC4RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -//inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val])); - } -} - inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -157,16 +103,6 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val]); - } -} - inline void decodebytesC8IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -185,16 +121,6 @@ inline void decodebytesC8RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = decode5A3(Common::swap16(tlut[(val & 0x3FFF)])); - } -} - inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -205,16 +131,6 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) } } -inline void decodebytesC14X2_To_Raw16(u16* dst, const u16* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = Common::swap16(tlut[(val & 0x3FFF)]); - } -} - inline void decodebytesC14X2IA8_To_RGBA(u32* dst, const u16* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -235,18 +151,6 @@ inline void decodebytesC14X2rgb565_To_RGBA(u32* dst, const u16* src, int tlutadd } } -// Needs more speed. -inline void decodebytesIA4(u16 *dst, const u8 *src) -{ - for (int x = 0; x < 8; x++) - { - const u8 val = src[x]; - u8 a = Convert4To8(val >> 4); - u8 l = Convert4To8(val & 0xF); - dst[x] = (a << 8) | l; - } -} - inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) { for (int x = 0; x < 8; x++) @@ -258,19 +162,6 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) } } -inline void decodebytesRGB5A3(u32 *dst, const u16 *src) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = decode5A3(Common::swap16(src[x])); -#else - dst[0] = decode5A3(Common::swap16(src[0])); - dst[1] = decode5A3(Common::swap16(src[1])); - dst[2] = decode5A3(Common::swap16(src[2])); - dst[3] = decode5A3(Common::swap16(src[3])); -#endif -} - inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) { #if 0 @@ -284,29 +175,6 @@ inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) #endif } -// This one is used by many video formats. It'd therefore be good if it was fast. -// Needs more speed. -inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = Common::swap32((src2[x] << 16) | src[x]); -#else - dst[0] = Common::swap32((src2[0] << 16) | src[0]); - dst[1] = Common::swap32((src2[1] << 16) | src[1]); - dst[2] = Common::swap32((src2[2] << 16) | src[2]); - dst[3] = Common::swap32((src2[3] << 16) | src[3]); -#endif - - // This can probably be done in a few SSE pack/unpack instructions + pshufb - // some unpack instruction x2: - // ABABABABABABABAB 1212121212121212 -> - // AB12AB12AB12AB12 AB12AB12AB12AB12 - // 2x pshufb-> - // 21BA21BA21BA21BA 21BA21BA21BA21BA - // and we are done. -} - inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) { #if 0 @@ -322,59 +190,11 @@ inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) #endif } -inline u32 makecol(int r, int g, int b, int a) -{ - return (a << 24)|(r << 16)|(g << 8)|b; -} - inline u32 makeRGBA(int r, int g, int b, int a) { return (a<<24)|(b<<16)|(g<<8)|r; } -void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch) -{ - // S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support) - // Needs more speed. - u16 c1 = Common::swap16(src->color1); - u16 c2 = Common::swap16(src->color2); - int blue1 = Convert5To8(c1 & 0x1F); - int blue2 = Convert5To8(c2 & 0x1F); - int green1 = Convert6To8((c1 >> 5) & 0x3F); - int green2 = Convert6To8((c2 >> 5) & 0x3F); - int red1 = Convert5To8((c1 >> 11) & 0x1F); - int red2 = Convert5To8((c2 >> 11) & 0x1F); - int colors[4]; - colors[0] = makecol(red1, green1, blue1, 255); - colors[1] = makecol(red2, green2, blue2, 255); - if (c1 > c2) - { - int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3); - int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3); - int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3); - colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255); - colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255); - } - else - { - colors[2] = makecol((red1 + red2 + 1) / 2, // Average - (green1 + green2 + 1) / 2, - (blue1 + blue2 + 1) / 2, 255); - colors[3] = makecol(red2, green2, blue2, 0); // Color2 but transparent - } - - for (int y = 0; y < 4; y++) - { - int val = src->lines[y]; - for (int x = 0; x < 4; x++) - { - dst[x] = colors[(val >> 6) & 3]; - val <<= 2; - } - dst += pitch; - } -} - void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) { // S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support) @@ -418,210 +238,6 @@ void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) } } -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 -static void copyDXTBlock(u8* dst, const u8* src) -{ - ((u16*)dst)[0] = Common::swap16(((u16*)src)[0]); - ((u16*)dst)[1] = Common::swap16(((u16*)src)[1]); - u32 pixels = ((u32*)src)[1]; - // A bit of trickiness here: the row are in the same order - // between the two formats, but the ordering within the rows - // is reversed. - pixels = ((pixels >> 4) & 0x0F0F0F0F) | ((pixels << 4) & 0xF0F0F0F0); - pixels = ((pixels >> 2) & 0x33333333) | ((pixels << 2) & 0xCCCCCCCC); - ((u32*)dst)[1] = pixels; -} -#endif - -//switch endianness, unswizzle -//TODO: to save memory, don't blindly convert everything to argb8888 -//also ARGB order needs to be swapped later, to accommodate modern hardware better -//need to add DXT support too -PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) -{ - const int Wsteps4 = (width + 3) / 4; - const int Wsteps8 = (width + 7) / 8; - - switch (texformat) - { - case GX_TF_C4: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - else - { - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_I4: - { - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,xStep++) - for (int ix = 0; ix < 4; ix++) - { - int val = src[4 * xStep + ix]; - dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4); - dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF); - } - } - return PC_TEX_FMT_I4_AS_I8; - case GX_TF_I8: // speed critical - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - ((u64*)(dst + (y + iy) * width + x))[0] = ((u64*)(src + 8 * xStep))[0]; - } - return PC_TEX_FMT_I8; - case GX_TF_C8: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - else - { - - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_IA4: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep); - } - return PC_TEX_FMT_IA4_AS_IA8; - case GX_TF_IA8: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_IA8; - case GX_TF_C14X2: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr); - } - else - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_RGB565: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_RGB565; - case GX_TF_RGB5A3: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); - decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep)); - } - return PC_TEX_FMT_BGRA32; - case GX_TF_RGBA8: // speed critical - { - for (int y = 0; y < height; y += 4) - { - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - { - const u8* src2 = src + 64 * yStep; - for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16); - } - } - } - return PC_TEX_FMT_BGRA32; - case GX_TF_CMPR: // speed critical - // The metroid games use this format almost exclusively. - { -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 - // 11111111 22222222 55555555 66666666 - // 33333333 44444444 77777777 88888888 - for (int y = 0; y < height; y += 8) - { - for (int x = 0; x < width; x += 8) - { - copyDXTBlock(dst+(y/2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2)*width+x*2+8, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2+8, src); - src += 8; - } - } - return PC_TEX_FMT_DXT1; -#else - for (int y = 0; y < height; y += 8) - { - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - { - const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep; - decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width); - } - } -#endif - return PC_TEX_FMT_BGRA32; - } - } - - // The "copy" texture formats, too? - return PC_TEX_FMT_NONE; -} - - - // JSD 01/06/11: // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128 @@ -630,7 +246,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh // TODO: complete SSE2 optimization of less often used texture formats. // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads. -PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) +PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { const int Wsteps4 = (width + 3) / 4; @@ -832,11 +448,3 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he // The "copy" texture formats, too? return PC_TEX_FMT_RGBA32; } - -PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly) -{ - if (rgbaOnly) - return TexDecoder_Decode_RGBA((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt); - else - return TexDecoder_Decode_real(dst, src, width, height, texformat, tlutaddr, tlutfmt); -} diff --git a/Source/Core/VideoCommon/TextureDecoder_x64.cpp b/Source/Core/VideoCommon/TextureDecoder_x64.cpp index d1bb6ceabb..e9119ff1d1 100644 --- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp @@ -37,26 +37,6 @@ // Decodes all known GameCube/Wii texture formats. // by ector -static inline u32 decode5A3(u16 val) -{ - int r,g,b,a; - if ((val & 0x8000)) - { - a = 0xFF; - r = Convert5To8((val >> 10) & 0x1F); - g = Convert5To8((val >> 5) & 0x1F); - b = Convert5To8(val & 0x1F); - } - else - { - a = Convert3To8((val >> 12) & 0x7); - r = Convert4To8((val >> 8) & 0xF); - g = Convert4To8((val >> 4) & 0xF); - b = Convert4To8(val & 0xF); - } - return (a << 24) | (r << 16) | (g << 8) | b; -} - static inline u32 decode5A3RGBA(u16 val) { int r,g,b,a; @@ -103,18 +83,6 @@ struct DXTBlock u8 lines[4]; }; -//inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC4_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val >> 4])); - *dst++ = decode5A3(Common::swap16(tlut[val & 0xF])); - } -} - inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -126,17 +94,6 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC4_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem+tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val >> 4]); - *dst++ = Common::swap16(tlut[val & 0xF]); - } -} - inline void decodebytesC4IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem+tlutaddr); @@ -159,17 +116,6 @@ inline void decodebytesC4RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -//inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val])); - } -} - inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -180,16 +126,6 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val]); - } -} - inline void decodebytesC8IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -208,42 +144,6 @@ inline void decodebytesC8RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); - -inline void decodebytesC8_To_Raw16_SSSE3(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - - // Make 8 16-bits unsigned integer values - __m128i a = _mm_setzero_si128(); - a = _mm_insert_epi16(a, tlut[src[0]], 0); - a = _mm_insert_epi16(a, tlut[src[1]], 1); - a = _mm_insert_epi16(a, tlut[src[2]], 2); - a = _mm_insert_epi16(a, tlut[src[3]], 3); - a = _mm_insert_epi16(a, tlut[src[4]], 4); - a = _mm_insert_epi16(a, tlut[src[5]], 5); - a = _mm_insert_epi16(a, tlut[src[6]], 6); - a = _mm_insert_epi16(a, tlut[src[7]], 7); - - // Apply Common::swap16() to 16-bits unsigned integers at once - const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16); - - // Store values to dst without polluting the caches - _mm_stream_si128((__m128i*)dst, b); -} -#endif - -inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = decode5A3(Common::swap16(tlut[(val & 0x3FFF)])); - } -} - inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -254,16 +154,6 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) } } -inline void decodebytesC14X2_To_Raw16(u16* dst, const u16* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = Common::swap16(tlut[(val & 0x3FFF)]); - } -} - inline void decodebytesC14X2IA8_To_RGBA(u32* dst, const u16* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -284,18 +174,6 @@ inline void decodebytesC14X2rgb565_To_RGBA(u32* dst, const u16* src, int tlutadd } } -// Needs more speed. -inline void decodebytesIA4(u16 *dst, const u8 *src) -{ - for (int x = 0; x < 8; x++) - { - const u8 val = src[x]; - u8 a = Convert4To8(val >> 4); - u8 l = Convert4To8(val & 0xF); - dst[x] = (a << 8) | l; - } -} - inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) { for (int x = 0; x < 8; x++) @@ -307,19 +185,6 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) } } -inline void decodebytesRGB5A3(u32 *dst, const u16 *src) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = decode5A3(Common::swap16(src[x])); -#else - dst[0] = decode5A3(Common::swap16(src[0])); - dst[1] = decode5A3(Common::swap16(src[1])); - dst[2] = decode5A3(Common::swap16(src[2])); - dst[3] = decode5A3(Common::swap16(src[3])); -#endif -} - inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) { #if 0 @@ -333,29 +198,6 @@ inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) #endif } -// This one is used by many video formats. It'd therefore be good if it was fast. -// Needs more speed. -inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = Common::swap32((src2[x] << 16) | src[x]); -#else - dst[0] = Common::swap32((src2[0] << 16) | src[0]); - dst[1] = Common::swap32((src2[1] << 16) | src[1]); - dst[2] = Common::swap32((src2[2] << 16) | src[2]); - dst[3] = Common::swap32((src2[3] << 16) | src[3]); -#endif - - // This can probably be done in a few SSE pack/unpack instructions + pshufb - // some unpack instruction x2: - // ABABABABABABABAB 1212121212121212 -> - // AB12AB12AB12AB12 AB12AB12AB12AB12 - // 2x pshufb-> - // 21BA21BA21BA21BA 21BA21BA21BA21BA - // and we are done. -} - inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) { #if 0 @@ -371,59 +213,11 @@ inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) #endif } -inline u32 makecol(int r, int g, int b, int a) -{ - return (a << 24)|(r << 16)|(g << 8)|b; -} - inline u32 makeRGBA(int r, int g, int b, int a) { return (a<<24)|(b<<16)|(g<<8)|r; } -static void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch) -{ - // S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support) - // Needs more speed. - u16 c1 = Common::swap16(src->color1); - u16 c2 = Common::swap16(src->color2); - int blue1 = Convert5To8(c1 & 0x1F); - int blue2 = Convert5To8(c2 & 0x1F); - int green1 = Convert6To8((c1 >> 5) & 0x3F); - int green2 = Convert6To8((c2 >> 5) & 0x3F); - int red1 = Convert5To8((c1 >> 11) & 0x1F); - int red2 = Convert5To8((c2 >> 11) & 0x1F); - int colors[4]; - colors[0] = makecol(red1, green1, blue1, 255); - colors[1] = makecol(red2, green2, blue2, 255); - if (c1 > c2) - { - int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3); - int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3); - int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3); - colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255); - colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255); - } - else - { - colors[2] = makecol((red1 + red2 + 1) / 2, // Average - (green1 + green2 + 1) / 2, - (blue1 + blue2 + 1) / 2, 255); - colors[3] = makecol(red2, green2, blue2, 0); // Color2 but transparent - } - - for (int y = 0; y < 4; y++) - { - int val = src->lines[y]; - for (int x = 0; x < 4; x++) - { - dst[x] = colors[(val >> 6) & 3]; - val <<= 2; - } - dst += pitch; - } -} - #ifdef CHECK static void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) { @@ -469,21 +263,6 @@ static void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) } #endif -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 -static void copyDXTBlock(u8* dst, const u8* src) -{ - ((u16*)dst)[0] = Common::swap16(((u16*)src)[0]); - ((u16*)dst)[1] = Common::swap16(((u16*)src)[1]); - u32 pixels = ((u32*)src)[1]; - // A bit of trickiness here: the row are in the same order - // between the two formats, but the ordering within the rows - // is reversed. - pixels = ((pixels >> 4) & 0x0F0F0F0F) | ((pixels << 4) & 0xF0F0F0F0); - pixels = ((pixels >> 2) & 0x33333333) | ((pixels << 2) & 0xCCCCCCCC); - ((u32*)dst)[1] = pixels; -} -#endif - inline void SetOpenMPThreadCount(int width, int height) { #ifdef _OPENMP @@ -500,274 +279,6 @@ inline void SetOpenMPThreadCount(int width, int height) #endif } -//switch endianness, unswizzle -//TODO: to save memory, don't blindly convert everything to argb8888 -//also ARGB order needs to be swapped later, to accommodate modern hardware better -//need to add DXT support too -static PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) -{ - SetOpenMPThreadCount(width, height); - - const int Wsteps4 = (width + 3) / 4; - const int Wsteps8 = (width + 7) / 8; - - switch (texformat) - { - case GX_TF_C4: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - else - { - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_I4: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,xStep++) - for (int ix = 0; ix < 4; ix++) - { - int val = src[4 * xStep + ix]; - dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4); - dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF); - } - } - return PC_TEX_FMT_I4_AS_I8; - case GX_TF_I8: // speed critical - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - { - ((u64*)(dst + (y + iy) * width + x))[0] = ((u64*)(src + 8 * xStep))[0]; - } - } - return PC_TEX_FMT_I8; - case GX_TF_C8: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - else - { - -#if _M_SSE >= 0x301 - - if (cpu_info.bSSSE3) - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - else -#endif - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_IA4: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep); - } - return PC_TEX_FMT_IA4_AS_IA8; - case GX_TF_IA8: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - - } - return PC_TEX_FMT_IA8; - case GX_TF_C14X2: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr); - } - else - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_RGB565: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_RGB565; - case GX_TF_RGB5A3: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); - decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep)); - } - return PC_TEX_FMT_BGRA32; - case GX_TF_RGBA8: // speed critical - { - -#if _M_SSE >= 0x301 - - if (cpu_info.bSSSE3) - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - { - __m128i* p = (__m128i*)(src + y * width * 4); - for (int x = 0; x < width; x += 4) - { - // We use _mm_loadu_si128 instead of _mm_load_si128 - // because "p" may not be aligned in 16-bytes alignment. - // See Issue 3493. - const __m128i a0 = _mm_loadu_si128(p++); - const __m128i a1 = _mm_loadu_si128(p++); - const __m128i a2 = _mm_loadu_si128(p++); - const __m128i a3 = _mm_loadu_si128(p++); - - // Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(), - // apply Common::swap32() by _mm_shuffle_epi8() and - // store them by _mm_stream_si128(). - // See decodebytesARGB8_4() about the idea. - - static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L); - - const __m128i b0 = _mm_unpacklo_epi16(a0, a2); - const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0); - - const __m128i b1 = _mm_unpackhi_epi16(a0, a2); - const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1); - - const __m128i b2 = _mm_unpacklo_epi16(a1, a3); - const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2); - - const __m128i b3 = _mm_unpackhi_epi16(a1, a3); - const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3); - } - } - } - else - -#endif - - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - { - const u8* src2 = src + 64 * yStep; - for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16); - } - } - } - return PC_TEX_FMT_BGRA32; - case GX_TF_CMPR: // speed critical - // The metroid games use this format almost exclusively. - { -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 - // 11111111 22222222 55555555 66666666 - // 33333333 44444444 77777777 88888888 - for (int y = 0; y < height; y += 8) - { - for (int x = 0; x < width; x += 8) - { - copyDXTBlock(dst+(y/2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2)*width+x*2+8, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2+8, src); - src += 8; - } - } - return PC_TEX_FMT_DXT1; -#else - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - { - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - { - const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep; - decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width); - } - } -#endif - return PC_TEX_FMT_BGRA32; - } - } - - // The "copy" texture formats, too? - return PC_TEX_FMT_NONE; -} - - - // JSD 01/06/11: // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128 @@ -776,7 +287,7 @@ static PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, in // TODO: complete SSE2 optimization of less often used texture formats. // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads. -static PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) +PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { SetOpenMPThreadCount(width, height); @@ -1844,11 +1355,3 @@ static PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, // The "copy" texture formats, too? return PC_TEX_FMT_RGBA32; } - -PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly) -{ - if (rgbaOnly) - return TexDecoder_Decode_RGBA((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt); - else - return TexDecoder_Decode_real(dst, src, width, height, texformat, tlutaddr, tlutfmt); -} diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index d6e2177b6b..8fd7a9efbb 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -36,7 +36,6 @@ VideoConfig::VideoConfig() // disable all features by default backend_info.APIType = API_NONE; - backend_info.bUseRGBATextures = false; backend_info.bUseMinimalMipCount = false; backend_info.bSupportsExclusiveFullscreen = false; } diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index ea63a61763..4d7f87f0ed 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -137,7 +137,6 @@ struct VideoConfig final std::vector AAModes; std::vector PPShaders; // post-processing shaders - bool bUseRGBATextures; // used for D3D in TextureCache bool bUseMinimalMipCount; bool bSupportsExclusiveFullscreen; bool bSupportsDualSourceBlend;