mirror of
				https://github.com/dolphin-emu/dolphin.git
				synced 2025-10-26 09:59:15 +00:00 
			
		
		
		
	git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6972 8ced0084-cf51-0410-be5f-012b33b47a6e
		
			
				
	
	
		
			419 lines
		
	
	
		
			No EOL
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			419 lines
		
	
	
		
			No EOL
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| // Copyright (C) 2003 Dolphin Project.
 | |
| 
 | |
| // This program is free software: you can redistribute it and/or modify
 | |
| // it under the terms of the GNU General Public License as published by
 | |
| // the Free Software Foundation, version 2.0.
 | |
| 
 | |
| // This program is distributed in the hope that it will be useful,
 | |
| // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| // GNU General Public License 2.0 for more details.
 | |
| 
 | |
| // A copy of the GPL 2.0 should have been included with the program.
 | |
| // If not, see http://www.gnu.org/licenses/
 | |
| 
 | |
| // Official SVN repository and contact information can be found at
 | |
| // http://code.google.com/p/dolphin-emu/
 | |
| 
 | |
| #include "D3DBase.h"
 | |
| #include "D3DTexture.h"
 | |
| 
 | |
| #include "CPUDetect.h"
 | |
| 
 | |
| #if _M_SSE >= 0x401
 | |
| #include <smmintrin.h>
 | |
| #include <emmintrin.h>
 | |
| #elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
 | |
| #include <tmmintrin.h>
 | |
| #endif
 | |
| 
 | |
| namespace DX9
 | |
| {
 | |
| 
 | |
| namespace D3D
 | |
| {
 | |
| 
 | |
| void ConvertRGBA_BGRA_SSE2(u32 *dst, const int dstPitch, u32 *pIn, const int width, const int height, const int pitch)
 | |
| {
 | |
| 	// Converts RGBA to BGRA:
 | |
| 	// TODO: this would be totally unnecessary if we just change the TextureDecoder_RGBA to decode
 | |
| 	// to BGRA instead.
 | |
| 	for (int y = 0; y < height; y++, pIn += pitch)
 | |
| 	{
 | |
| 		u8 *pIn8 = (u8 *)pIn;
 | |
| 		u8 *pBits = (u8 *)((u8*)dst + (y * dstPitch));
 | |
| 
 | |
| 		// Batch up loads/stores into 16 byte chunks to use SSE2 efficiently:
 | |
| 		int sse2blocks = (width * 4) / 16;
 | |
| 		int sse2remainder = (width * 4) & 15;
 | |
| 
 | |
| 		// Do conversions in batches of 16 bytes:
 | |
| 		if (sse2blocks > 0)
 | |
| 		{
 | |
| 			// Generate a constant of all FF bytes:
 | |
| 			const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
 | |
| 			__m128i *src128 = (__m128i *)pIn8;
 | |
| 			__m128i *dst128 = (__m128i *)pBits;
 | |
| 
 | |
| 			// Increment by 16 bytes at a time:
 | |
| 			for (int i = 0; i < sse2blocks; ++i, ++dst128, ++src128)
 | |
| 			{
 | |
| 				// Load up 4 colors simultaneously:
 | |
| 				__m128i rgba = _mm_loadu_si128(src128);
 | |
| 				// Swap the R and B components:
 | |
| 				// Isolate the B component and shift it left 16 bits:
 | |
| 				// ABGR
 | |
| 				const __m128i bMask = _mm_srli_epi32(allFFs128, 24);
 | |
| 				const __m128i bNew = _mm_slli_epi32(_mm_and_si128(rgba, bMask), 16);
 | |
| 				// Isolate the R component and shift it right 16 bits:
 | |
| 				const __m128i rMask = _mm_slli_epi32(bMask, 16);
 | |
| 				const __m128i rNew = _mm_srli_epi32(_mm_and_si128(rgba, rMask), 16);
 | |
| 				// Now mask off the old R and B components from the rgba data to get 0g0a:
 | |
| 				const __m128i _g_a = _mm_or_si128(
 | |
| 					_mm_and_si128(
 | |
| 						rgba,
 | |
| 						_mm_or_si128(
 | |
| 							_mm_slli_epi32(bMask, 8),
 | |
| 							_mm_slli_epi32(rMask, 8)
 | |
| 						)
 | |
| 					),
 | |
| 					_mm_or_si128(rNew, bNew)
 | |
| 				);
 | |
| 				// Finally, OR up all the individual components to get BGRA:
 | |
| 				const __m128i bgra = _mm_or_si128(_g_a, _mm_or_si128(rNew, bNew));
 | |
| 				_mm_storeu_si128(dst128, bgra);
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// Take the remainder colors at the end of the row that weren't able to
 | |
| 		// be included into the last 16 byte chunk:
 | |
| 		if (sse2remainder > 0)
 | |
| 		{
 | |
| 			for (int x = (sse2blocks * 16); x < (width * 4); x += 4)
 | |
| 			{
 | |
| 				pBits[x + 0] = pIn8[x + 2];
 | |
| 				pBits[x + 1] = pIn8[x + 1];
 | |
| 				pBits[x + 2] = pIn8[x + 0];
 | |
| 				pBits[x + 3] = pIn8[x + 3];
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Memory fence to make sure the stores are good:
 | |
| 	_mm_mfence();
 | |
| }
 | |
| 
 | |
| void ConvertRGBA_BGRA_SSSE3(u32 *dst, const int dstPitch, u32 *pIn, const int width, const int height, const int pitch)
 | |
| {
 | |
| 	__m128i mask = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);
 | |
| 	for (int y = 0; y < height; y++, pIn += pitch)
 | |
| 	{
 | |
| 		u8 *pIn8 = (u8 *)pIn;
 | |
| 		u8 *pBits = (u8 *)((u8*)dst + (y * dstPitch));
 | |
| 
 | |
| 		// Batch up loads/stores into 16 byte chunks to use SSE2 efficiently:
 | |
| 		int ssse3blocks = (width * 4) / 16;
 | |
| 		int ssse3remainder = (width * 4) & 15;
 | |
| 
 | |
| 		// Do conversions in batches of 16 bytes:
 | |
| 		if (ssse3blocks > 0)
 | |
| 		{
 | |
| 			__m128i *src128 = (__m128i *)pIn8;
 | |
| 			__m128i *dst128 = (__m128i *)pBits;
 | |
| 
 | |
| 			// Increment by 16 bytes at a time:
 | |
| 			for (int i = 0; i < ssse3blocks; ++i, ++dst128, ++src128)
 | |
| 			{
 | |
| 				_mm_storeu_si128(dst128, _mm_shuffle_epi8(_mm_loadu_si128(src128), mask));
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// Take the remainder colors at the end of the row that weren't able to
 | |
| 		// be included into the last 16 byte chunk:
 | |
| 		if (ssse3remainder > 0)
 | |
| 		{
 | |
| 			for (int x = (ssse3blocks * 16); x < (width * 4); x += 4)
 | |
| 			{
 | |
| 				pBits[x + 0] = pIn8[x + 2];
 | |
| 				pBits[x + 1] = pIn8[x + 1];
 | |
| 				pBits[x + 2] = pIn8[x + 0];
 | |
| 				pBits[x + 3] = pIn8[x + 3];
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Memory fence to make sure the stores are good:
 | |
| 	_mm_mfence();
 | |
| }
 | |
| 
 | |
| LPDIRECT3DTEXTURE9 CreateTexture2D(const u8* buffer, const int width, const int height, const int pitch, D3DFORMAT fmt, bool swap_r_b, int levels)
 | |
| {
 | |
| 	u32* pBuffer = (u32*)buffer;
 | |
| 	LPDIRECT3DTEXTURE9 pTexture;
 | |
| 
 | |
| 	// crazy bitmagic, sorry :)
 | |
| 	bool isPow2 = !((width&(width-1)) || (height&(height-1)));
 | |
| 	bool bExpand = false;
 | |
| 
 | |
| 	if (fmt == D3DFMT_A8P8) {
 | |
| 		fmt = D3DFMT_A8L8;
 | |
| 		bExpand = true;
 | |
| 	}
 | |
| 
 | |
| 	HRESULT hr;
 | |
| 	if (levels > 0)
 | |
| 		hr = dev->CreateTexture(width, height, levels, 0, fmt, D3DPOOL_MANAGED, &pTexture, NULL);
 | |
| 	else
 | |
| 		hr = dev->CreateTexture(width, height, 0, D3DUSAGE_AUTOGENMIPMAP, fmt, D3DPOOL_MANAGED, &pTexture, NULL);
 | |
| 
 | |
| 	if (FAILED(hr))
 | |
| 		return 0;
 | |
| 	int level = 0;
 | |
| 	D3DLOCKED_RECT Lock;
 | |
| 	pTexture->LockRect(level, &Lock, NULL, 0);
 | |
| 	switch (fmt) 
 | |
| 	{
 | |
| 	case D3DFMT_L8:
 | |
| 	case D3DFMT_A8:
 | |
| 	case D3DFMT_A4L4:
 | |
| 		{
 | |
| 			const u8 *pIn = buffer;
 | |
| 			for (int y = 0; y < height; y++)
 | |
| 			{
 | |
| 				u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 				memcpy(pBits, pIn, width);
 | |
| 				pIn += pitch;
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_R5G6B5:
 | |
| 		{
 | |
| 			const u16 *pIn = (u16*)buffer;
 | |
| 			for (int y = 0; y < height; y++)
 | |
| 			{
 | |
| 				u16* pBits = (u16*)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 				memcpy(pBits, pIn, width * 2);
 | |
| 				pIn += pitch;
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_A8L8:
 | |
| 		{
 | |
| 			if (bExpand) { // I8
 | |
| 				const u8 *pIn = buffer;
 | |
| 				// TODO(XK): Find a better way that does not involve either unpacking
 | |
| 				//           or downsampling (i.e. A4L4)
 | |
| 				for (int y = 0; y < height; y++)
 | |
| 				{
 | |
| 					u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 					for(int i = 0; i < width * 2; i += 2) {
 | |
| 						pBits[i] = pIn[i / 2];
 | |
| 						pBits[i + 1] = pIn[i / 2];
 | |
| 					}
 | |
| 					pIn += pitch;
 | |
| 				}
 | |
| 			} else { // IA8
 | |
| 				const u16 *pIn = (u16*)buffer;
 | |
| 
 | |
| 				for (int y = 0; y < height; y++)
 | |
| 				{
 | |
| 					u16* pBits = (u16*)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 					memcpy(pBits, pIn, width * 2);
 | |
| 					pIn += pitch;
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_A8R8G8B8:
 | |
| 		{
 | |
| 			if(pitch * 4 == Lock.Pitch && !swap_r_b)
 | |
| 			{
 | |
| 				memcpy(Lock.pBits,buffer,Lock.Pitch*height);
 | |
| 			}
 | |
| 			else
 | |
| 			{
 | |
| 				u32* pIn = pBuffer;
 | |
| 				if (!swap_r_b) {
 | |
| 					for (int y = 0; y < height; y++)
 | |
| 					{
 | |
| 						u32 *pBits = (u32*)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 						memcpy(pBits, pIn, width * 4);
 | |
| 						pIn += pitch;
 | |
| 					}
 | |
| 				} else {
 | |
| #if _M_SSE >= 0x301
 | |
| 					// Uses SSSE3 intrinsics to optimize RGBA -> BGRA swizzle:
 | |
| 					if (cpu_info.bSSSE3) {
 | |
| 						ConvertRGBA_BGRA_SSSE3((u32 *)Lock.pBits, Lock.Pitch, pIn, width, height, pitch);
 | |
| 					} else
 | |
| #endif
 | |
| 					// Uses SSE2 intrinsics to optimize RGBA -> BGRA swizzle:
 | |
| 					{
 | |
| 						ConvertRGBA_BGRA_SSE2((u32 *)Lock.pBits, Lock.Pitch, pIn, width, height, pitch);
 | |
| 					}
 | |
| #if 0
 | |
| 					for (int y = 0; y < height; y++)
 | |
| 					{
 | |
| 						u8 *pIn8 = (u8 *)pIn;
 | |
| 						u8 *pBits = (u8 *)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 						for (int x = 0; x < width * 4; x += 4) {
 | |
| 							pBits[x + 0] = pIn8[x + 2];
 | |
| 							pBits[x + 1] = pIn8[x + 1];
 | |
| 							pBits[x + 2] = pIn8[x + 0];
 | |
| 							pBits[x + 3] = pIn8[x + 3];
 | |
| 						}
 | |
| 						pIn += pitch;
 | |
| 					}
 | |
| #endif
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_DXT1:
 | |
| 		memcpy(Lock.pBits,buffer,((width+3)/4)*((height+3)/4)*8);
 | |
| 		break;
 | |
| 	default:
 | |
| 		PanicAlert("D3D: Invalid texture format %i", fmt);
 | |
| 	}
 | |
| 	pTexture->UnlockRect(level); 
 | |
| 	return pTexture;
 | |
| }
 | |
| 
 | |
| LPDIRECT3DTEXTURE9 CreateOnlyTexture2D(const int width, const int height, D3DFORMAT fmt)
 | |
| {
 | |
| 	LPDIRECT3DTEXTURE9 pTexture;
 | |
| 	// crazy bitmagic, sorry :)
 | |
| 	bool isPow2 = !((width&(width-1)) || (height&(height-1)));
 | |
| 	bool bExpand = false;
 | |
| 	HRESULT hr;
 | |
| 	// TODO(ector): Allow mipmaps for non-pow textures on newer cards?
 | |
| 	// TODO(ector): Use the game-specified mipmaps?
 | |
| 	if (!isPow2)
 | |
| 		hr = dev->CreateTexture(width, height, 1, 0, fmt, D3DPOOL_MANAGED, &pTexture, NULL);
 | |
| 	else
 | |
| 		hr = dev->CreateTexture(width, height, 0, D3DUSAGE_AUTOGENMIPMAP, fmt, D3DPOOL_MANAGED, &pTexture, NULL);
 | |
| 
 | |
| 	if (FAILED(hr))
 | |
| 		return 0;
 | |
| 	return pTexture;
 | |
| }
 | |
| 
 | |
| void ReplaceTexture2D(LPDIRECT3DTEXTURE9 pTexture, const u8* buffer, const int width, const int height, const int pitch, D3DFORMAT fmt, bool swap_r_b, int level)
 | |
| {
 | |
| 	u32* pBuffer = (u32*)buffer;
 | |
| 	D3DLOCKED_RECT Lock;
 | |
| 	pTexture->LockRect(level, &Lock, NULL, 0);
 | |
| 	u32* pIn = pBuffer;
 | |
| 
 | |
| 	bool bExpand = false;
 | |
| 
 | |
| 	if (fmt == D3DFMT_A8P8) {
 | |
| 		fmt = D3DFMT_A8L8;
 | |
| 		bExpand = true;
 | |
| 	}
 | |
| 	switch (fmt) 
 | |
| 	{
 | |
| 	case D3DFMT_A8R8G8B8:
 | |
| 		if(pitch * 4 == Lock.Pitch && !swap_r_b)
 | |
| 		{
 | |
| 			memcpy(Lock.pBits, pBuffer, Lock.Pitch*height);
 | |
| 		}
 | |
| 		else if (!swap_r_b)
 | |
| 		{
 | |
| 			for (int y = 0; y < height; y++)
 | |
| 			{
 | |
| 				u32 *pBits = (u32*)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 				memcpy(pBits, pIn, width * 4);
 | |
| 				pIn += pitch;
 | |
| 			}
 | |
| 		}
 | |
| 		else
 | |
| 		{
 | |
| #if _M_SSE >= 0x301
 | |
| 			// Uses SSSE3 intrinsics to optimize RGBA -> BGRA swizzle:
 | |
| 			if (cpu_info.bSSSE3) {
 | |
| 				ConvertRGBA_BGRA_SSSE3((u32 *)Lock.pBits, Lock.Pitch, pIn, width, height, pitch);
 | |
| 			} else
 | |
| #endif
 | |
| 			// Uses SSE2 intrinsics to optimize RGBA -> BGRA swizzle:
 | |
| 			{
 | |
| 				ConvertRGBA_BGRA_SSE2((u32 *)Lock.pBits, Lock.Pitch, pIn, width, height, pitch);
 | |
| 			}
 | |
| #if 0
 | |
| 			for (int y = 0; y < height; y++)
 | |
| 			{
 | |
| 				u8 *pIn8 = (u8 *)pIn;
 | |
| 				u8 *pBits = (u8 *)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 				for (int x = 0; x < width * 4; x += 4)
 | |
| 				{
 | |
| 					pBits[x + 0] = pIn8[x + 2];
 | |
| 					pBits[x + 1] = pIn8[x + 1];
 | |
| 					pBits[x + 2] = pIn8[x + 0];
 | |
| 					pBits[x + 3] = pIn8[x + 3];
 | |
| 				}
 | |
| 				pIn += pitch;
 | |
| 			}
 | |
| #endif
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_L8:
 | |
| 	case D3DFMT_A8:
 | |
| 	case D3DFMT_A4L4:
 | |
| 		{
 | |
| 			const u8 *pIn = buffer;
 | |
| 			for (int y = 0; y < height; y++)
 | |
| 			{
 | |
| 				u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 				memcpy(pBits, pIn, width);
 | |
| 				pIn += pitch;
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_R5G6B5:
 | |
| 		{
 | |
| 			const u16 *pIn = (u16*)buffer;
 | |
| 			for (int y = 0; y < height; y++)
 | |
| 			{
 | |
| 				u16* pBits = (u16*)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 				memcpy(pBits, pIn, width * 2);
 | |
| 				pIn += pitch;
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_A8L8:
 | |
| 		{
 | |
| 			if (bExpand) { // I8
 | |
| 				const u8 *pIn = buffer;
 | |
| 				// TODO(XK): Find a better way that does not involve either unpacking
 | |
| 				//           or downsampling (i.e. A4L4)
 | |
| 				for (int y = 0; y < height; y++)
 | |
| 				{
 | |
| 					u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 					for(int i = 0; i < width * 2; i += 2) {
 | |
| 						pBits[i] = pIn[i / 2];
 | |
| 						pBits[i + 1] = pIn[i / 2];
 | |
| 					}
 | |
| 					pIn += pitch;
 | |
| 				}
 | |
| 			} else { // IA8
 | |
| 				const u16 *pIn = (u16*)buffer;
 | |
| 
 | |
| 				for (int y = 0; y < height; y++)
 | |
| 				{
 | |
| 					u16* pBits = (u16*)((u8*)Lock.pBits + (y * Lock.Pitch));
 | |
| 					memcpy(pBits, pIn, width * 2);
 | |
| 					pIn += pitch;
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		break;
 | |
| 	case D3DFMT_DXT1:
 | |
| 		memcpy(Lock.pBits,buffer,((width+3)/4)*((height+3)/4)*8);
 | |
| 		break;
 | |
| 	}
 | |
| 	pTexture->UnlockRect(level); 
 | |
| }
 | |
| 
 | |
| }  // namespace
 | |
| 
 | |
| }  // namespace DX9
 |