rsx: Allow GPU-accelerated stream manipulation when doing texture uploads

2025-04-20 11:36:13 +00:00 · 2019-08-27 17:01:36 +03:00 · 2019-08-27 17:01:36 +03:00 · 99fb6d6a5d
commit 99fb6d6a5d
parent e0a7912d7c
5 changed files with 244 additions and 131 deletions
--- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp
@@ -469,48 +469,51 @@ std::vector<rsx_subresource_layout> get_subresources_layout(const rsx::vertex_te
 	return get_subresources_layout_impl(texture);
 }

-void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, bool vtc_support, size_t dst_row_pitch_multiple_of)
+texture_memory_info upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps)
 {
 	u16 w = src_layout.width_in_block;
 	u16 h = src_layout.height_in_block;
 	u16 depth = src_layout.depth;
 	u32 pitch = src_layout.pitch_in_block;

+	texture_memory_info result{};
+
 	// Ignore when texture width > pitch
 	if (w > pitch)
-		return;
+		return result;
+
+	// Check if we can use a fast path
+	int word_size = 0;
+	int words_per_block;
+	u32 dst_pitch_in_block;

-	// NOTE: Avoid block optimization for formats that can be modified internally by the GPU itself
-	// Since the gpu code does not attempt to do wide translations (e.g WZYX32->XYZW32), only perform, per-channel transform and use proper swizzles to get the proper output
 	switch (format)
 	{
 	case CELL_GCM_TEXTURE_B8:
 	{
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u8>(w, dst_row_pitch_multiple_of));
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u8>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		word_size = words_per_block = 1;
+		dst_pitch_in_block = get_row_pitch_in_block<u8>(w, caps.alignment);
 		break;
 	}

 	case CELL_GCM_TEXTURE_COMPRESSED_B8R8_G8R8:
 	{
-		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
 		break;
 	}

 	case CELL_GCM_TEXTURE_COMPRESSED_R8B8_R8G8:
 	{
-		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u16>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		copy_decoded_rb_rg_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u16>(src_layout.data), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
 		break;
 	}

 	case CELL_GCM_TEXTURE_R6G5B5:
 	{
 		if (is_swizzled)
-			copy_rgb655_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of));
+			copy_rgb655_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, caps.alignment));
 		else
-			copy_rgb655_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_rgb655_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, caps.alignment), src_layout.pitch_in_block);
 		break;
 	}

@@ -526,10 +529,9 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_R5G6B5:
 	case CELL_GCM_TEXTURE_G8B8:
 	{
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of));
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u16>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		word_size = 2;
+		words_per_block = 1;
+		dst_pitch_in_block = get_row_pitch_in_block<u16>(w, caps.alignment);
 		break;
 	}

@@ -538,10 +540,9 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_DEPTH24_D8:
 	case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: // Untested
 	{
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of));
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), 1, w, h, depth, src_layout.border, get_row_pitch_in_block<u32>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+		word_size = 4;
+		words_per_block = 1;
+		dst_pitch_in_block = get_row_pitch_in_block<u32>(w, caps.alignment);
 		break;
 	}

@@ -554,13 +555,9 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_W16_Z16_Y16_X16_FLOAT:
 	{
 		const u16 block_size = get_format_block_size_in_bytes(format);
-		const u16 words_per_block = block_size / 2;
-		const auto dst_pitch_in_block = get_row_pitch_in_block(block_size, w, dst_row_pitch_multiple_of);
-
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+		word_size = 2;
+		words_per_block = block_size / 2;
+		dst_pitch_in_block = get_row_pitch_in_block(block_size, w, caps.alignment);
 		break;
 	}

@@ -568,28 +565,24 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT:
 	{
 		const u16 block_size = get_format_block_size_in_bytes(format);
-		const u16 words_per_block = block_size / 4;
-		const auto dst_pitch_in_block = get_row_pitch_in_block(block_size, w, dst_row_pitch_multiple_of);
-
-		if (is_swizzled)
-			copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-		else
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+		word_size = 4;
+		words_per_block = block_size / 4;
+		dst_pitch_in_block = get_row_pitch_in_block(block_size, w, caps.alignment);
 		break;
 	}

 	case CELL_GCM_TEXTURE_COMPRESSED_DXT1:
 	{
-		if (depth > 1 && !vtc_support)
+		if (depth > 1 && !caps.supports_vtc_decoding)
 		{
 			// PS3 uses the Nvidia VTC memory layout for compressed 3D textures.
 			// This is only supported using Nvidia OpenGL.
 			// Remove the VTC tiling to support ATI and Vulkan.
-			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), w, h, depth, get_row_pitch_in_block<u64>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), w, h, depth, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		else
 		{
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u64>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		break;
 	}
@@ -597,16 +590,16 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT23:
 	case CELL_GCM_TEXTURE_COMPRESSED_DXT45:
 	{
-		if (depth > 1 && !vtc_support)
+		if (depth > 1 && !caps.supports_vtc_decoding)
 		{
 			// PS3 uses the Nvidia VTC memory layout for compressed 3D textures.
 			// This is only supported using Nvidia OpenGL.
 			// Remove the VTC tiling to support ATI and Vulkan.
-			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), w, h, depth, get_row_pitch_in_block<u128>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), w, h, depth, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		else
 		{
-			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u128>(w, dst_row_pitch_multiple_of), src_layout.pitch_in_block);
+			copy_unmodified_block::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
 		}
 		break;
 	}
@@ -614,6 +607,56 @@ void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subre
 	default:
 		fmt::throw_exception("Wrong format 0x%x" HERE, format);
 	}
+
+	if (word_size)
+	{
+		if (word_size == 1)
+		{
+			if (is_swizzled)
+				copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+			else
+				copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+		}
+		else if (caps.supports_byteswap)
+		{
+			result.require_swap = true;
+			result.element_size = word_size;
+
+			if (word_size == 2)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+			else if (word_size == 4)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+		}
+		else
+		{
+			if (word_size == 2)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+			else if (word_size == 4)
+			{
+				if (is_swizzled)
+					copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+				else
+					copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+			}
+		}
+	}
+
+	return result;
 }

 /**
--- a/rpcs3/Emu/RSX/Common/TextureUtils.h
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.h
@@ -110,6 +110,19 @@ struct rsx_subresource_layout
 	u32 pitch_in_block;
 };

+struct texture_memory_info
+{
+	int element_size;
+	bool require_swap;
+};
+
+struct texture_uploader_capabilities
+{
+	bool supports_byteswap;
+	bool supports_vtc_decoding;
+	size_t alignment;
+};
+
 /**
 * Get size to store texture in a linear fashion.
 * Storage is assumed to use a rowPitchAlignment boundary for every row of texture.
@@ -125,7 +138,7 @@ size_t get_placed_texture_storage_size(const rsx::vertex_texture &texture, size_
 std::vector<rsx_subresource_layout> get_subresources_layout(const rsx::fragment_texture &texture);
 std::vector<rsx_subresource_layout> get_subresources_layout(const rsx::vertex_texture &texture);

-void upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, bool vtc_support, size_t dst_row_pitch_multiple_of);
+texture_memory_info upload_texture_subresource(gsl::span<gsl::byte> dst_buffer, const rsx_subresource_layout &src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps);

 u8 get_format_block_size_in_bytes(int format);
 u8 get_format_block_size_in_texel(int format);
--- a/rpcs3/Emu/RSX/D3D12/D3D12Texture.cpp
+++ b/rpcs3/Emu/RSX/D3D12/D3D12Texture.cpp
@@ -115,7 +115,8 @@ namespace {
 		size_t offset_in_buffer = 0;
 		for (const rsx_subresource_layout &layout : input_layouts)
 		{
-			upload_texture_subresource(mapped_buffer.subspan(offset_in_buffer), layout, format, is_swizzled, false, 256);
+			texture_uploader_capabilities caps{ false, false, 256 };
+			upload_texture_subresource(mapped_buffer.subspan(offset_in_buffer), layout, format, is_swizzled, caps);
 			UINT row_pitch = align(layout.width_in_block * block_size_in_bytes, 256);
 			command_list->CopyTextureRegion(&CD3DX12_TEXTURE_COPY_LOCATION(existing_texture, (UINT)mip_level), 0, 0, 0,
 				&CD3DX12_TEXTURE_COPY_LOCATION(texture_buffer_heap.get_heap(),
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@@ -63,7 +63,7 @@ namespace gl
 		case CELL_GCM_TEXTURE_A1R5G5B5: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV);
 		case CELL_GCM_TEXTURE_A4R4G4B4: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4);
 		case CELL_GCM_TEXTURE_R5G6B5: return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5);
-		case CELL_GCM_TEXTURE_A8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_BYTE);
+		case CELL_GCM_TEXTURE_A8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8);
 		case CELL_GCM_TEXTURE_G8B8: return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE);
 		case CELL_GCM_TEXTURE_R6G5B5: return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5);
 		case CELL_GCM_TEXTURE_DEPTH24_D8: return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8);
@@ -370,7 +370,7 @@ namespace gl
 		case CELL_GCM_TEXTURE_R5G5B5A1:
 		case CELL_GCM_TEXTURE_R6G5B5:
 		case CELL_GCM_TEXTURE_R5G6B5:
-		case CELL_GCM_TEXTURE_A8R8G8B8: // TODO
+		case CELL_GCM_TEXTURE_A8R8G8B8:
 		case CELL_GCM_TEXTURE_COMPRESSED_DXT1:
 		case CELL_GCM_TEXTURE_COMPRESSED_DXT23:
 		case CELL_GCM_TEXTURE_COMPRESSED_DXT45:
@@ -458,107 +458,122 @@ namespace gl
 			const std::vector<rsx_subresource_layout> &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector<gsl::byte>& staging_buffer)
 	{
 		int mip_level = 0;
-		bool vtc_support = gl::get_driver_caps().vendor_NVIDIA;
+		texture_uploader_capabilities caps{ true, false, 4 };

-		if (is_compressed_format(format))
+		pixel_unpack_settings unpack_settings;
+		unpack_settings.row_length(0).alignment(4);
+
+		if (LIKELY(is_compressed_format(format)))
 		{
 			//Compressed formats have a 4-byte alignment
 			//TODO: Verify that samplers are not affected by the padding
 			width = align(width, 4);
 			height = align(height, 4);
-		}

-		if (dim == rsx::texture_dimension_extended::texture_dimension_1d)
-		{
-			if (!is_compressed_format(format))
+			caps.supports_vtc_decoding = gl::get_driver_caps().vendor_NVIDIA;
+			unpack_settings.apply();
+
+			for (const rsx_subresource_layout& layout : input_layouts)
 			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
+
+				switch (dim)
 				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, gl_format, gl_type, staging_buffer.data());
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_1d:
 				{
 					u32 size = layout.width_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block * 4, gl_format, size, staging_buffer.data());
+					break;
 				}
-			}
-			return;
-		}
-
-		if (dim == rsx::texture_dimension_extended::texture_dimension_2d)
-		{
-			if (!is_compressed_format(format))
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
-				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_2d:
 				{
 					u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data());
+					break;
 				}
-			}
-			return;
-		}
-
-		if (dim == rsx::texture_dimension_extended::texture_dimension_cubemap)
-		{
-			// Note : input_layouts size is get_exact_mipmap_count() for non cubemap texture, and 6 * get_exact_mipmap_count() for cubemap
-			// Thus for non cubemap texture, mip_level / mipmap_per_layer will always be rounded to 0.
-			// mip_level % mipmap_per_layer will always be equal to mip_level
-			if (!is_compressed_format(format))
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
-				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
-					mip_level++;
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_cubemap:
 				{
+					// Note : input_layouts size is get_exact_mipmap_count() for non cubemap texture, and 6 * get_exact_mipmap_count() for cubemap
+					// Thus for non-cubemap textures, mip_level / mipmap_count always rounds down to 0
+					// and mip_level % mipmap_count is always equal to mip_level.
 					u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data());
 					mip_level++;
+					break;
 				}
-			}
-			return;
-		}
-
-		if (dim == rsx::texture_dimension_extended::texture_dimension_3d)
-		{
-			if (!is_compressed_format(format))
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
-				{
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
-					glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, gl_format, gl_type, staging_buffer.data());
-				}
-			}
-			else
-			{
-				for (const rsx_subresource_layout &layout : input_layouts)
+				case rsx::texture_dimension_extended::texture_dimension_3d:
 				{
 					u32 size = layout.width_in_block * layout.height_in_block * layout.depth * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
-					upload_texture_subresource(staging_buffer, layout, format, is_swizzled, vtc_support, 4);
 					glCompressedTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, layout.depth, gl_format, size, staging_buffer.data());
+					break;
+				}
+				default:
+				{
+					ASSUME(0);
+					fmt::throw_exception("Unreachable" HERE);
+				}
+				}
+			}
+		}
+		else
+		{
+			bool apply_settings = true;
+			switch (gl_type)
+			{
+			case GL_UNSIGNED_INT_8_8_8_8:
+				// NOTE: GL_UNSIGNED_INT_8_8_8_8 is already a swapped type
+				// TODO: Remove reliance on format and type checks when compute acceleration is implemented
+				apply_settings = false;
+				break;
+			case GL_BYTE:
+			case GL_UNSIGNED_BYTE:
+				// Multi-channel format uploaded one byte at a time. This is due to poor driver support for formats like GL_UNSIGNED_SHORT_8_8
+				// Do byteswapping in software for now until compute acceleration is available
+				apply_settings = (gl_format == GL_RED);
+				caps.supports_byteswap = apply_settings;
+				break;
+			default:
+				break;
+			}
+
+			if (!apply_settings)
+			{
+				unpack_settings.apply();
+			}
+
+			for (const rsx_subresource_layout& layout : input_layouts)
+			{
+				auto op = upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
+				if (apply_settings)
+				{
+					unpack_settings.swap_bytes(op.require_swap);
+					unpack_settings.apply();
+					apply_settings = false;
+				}
+
+				switch (dim)
+				{
+				case rsx::texture_dimension_extended::texture_dimension_1d:
+					glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, gl_format, gl_type, staging_buffer.data());
+					break;
+				case rsx::texture_dimension_extended::texture_dimension_2d:
+					glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
+					break;
+				case rsx::texture_dimension_extended::texture_dimension_cubemap:
+					// Note : input_layouts size is get_exact_mipmap_count() for non cubemap texture, and 6 * get_exact_mipmap_count() for cubemap
+					// Thus for non-cubemap textures, mip_level / mipmap_count always rounds down to 0
+					// and mip_level % mipmap_count is always equal to mip_level.
+					glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
+					mip_level++;
+					break;
+				case rsx::texture_dimension_extended::texture_dimension_3d:
+					glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, gl_format, gl_type, staging_buffer.data());
+					break;
+				default:
+					ASSUME(0);
+					fmt::throw_exception("Unreachable" HERE);
 				}
 			}
-			return;
 		}
 	}

@@ -615,9 +630,6 @@ namespace gl
 		}

 		glBindTexture(target, id);
-		glPixelStorei(GL_UNPACK_ALIGNMENT, 4);
-		glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
-		glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_FALSE);
 		glTexParameteri(target, GL_TEXTURE_BASE_LEVEL, 0);
 		glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, mipmaps - 1);
 		// The rest of sampler state is now handled by sampler state objects
@@ -627,6 +639,7 @@ namespace gl
 		size_t texture_data_sz = depth * height * aligned_pitch;
 		std::vector<gsl::byte> data_upload_buf(texture_data_sz);

+		// TODO: GL drivers support byteswapping and this should be used instead of doing so manually
 		const auto format_type = get_format_type(gcm_format);
 		const GLenum gl_format = std::get<0>(format_type);
 		const GLenum gl_type = std::get<1>(format_type);
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -516,19 +516,26 @@ namespace vk
 		u32 block_in_pixel = get_format_block_size_in_texel(format);
 		u8  block_size_in_bytes = get_format_block_size_in_bytes(format);

+		texture_uploader_capabilities caps{ true, false, heap_align };
+		vk::buffer* scratch_buf = nullptr;
+		u32 scratch_offset = 0;
+
 		for (const rsx_subresource_layout &layout : subresource_layout)
 		{
 			u32 row_pitch = (((layout.width_in_block * block_size_in_bytes) + heap_align - 1) / heap_align) * heap_align;
 			if (heap_align != 256) verify(HERE), row_pitch == heap_align;
 			u32 image_linear_size = row_pitch * layout.height_in_block * layout.depth;

-			//Map with extra padding bytes in case of realignment
+			// Map with extra padding bytes in case of realignment
 			size_t offset_in_buffer = upload_heap.alloc<512>(image_linear_size + 8);
 			void *mapped_buffer = upload_heap.map(offset_in_buffer, image_linear_size + 8);
 			VkBuffer buffer_handle = upload_heap.heap->value;

+			// Only do GPU-side conversion if occupancy is good
+			caps.supports_byteswap = (image_linear_size >= 1024);
+
 			gsl::span<gsl::byte> mapped{ (gsl::byte*)mapped_buffer, ::narrow<int>(image_linear_size) };
-			upload_texture_subresource(mapped, layout, format, is_swizzled, false, heap_align);
+			auto opt = upload_texture_subresource(mapped, layout, format, is_swizzled, caps);
 			upload_heap.unmap();

 			VkBufferImageCopy copy_info = {};
@@ -542,25 +549,61 @@ namespace vk
 			copy_info.imageSubresource.mipLevel = mipmap_level % mipmap_count;
 			copy_info.bufferRowLength = block_in_pixel * row_pitch / block_size_in_bytes;

-			if (dst_image->info.format == VK_FORMAT_D24_UNORM_S8_UINT ||
-				dst_image->info.format == VK_FORMAT_D32_SFLOAT_S8_UINT)
+			if (opt.require_swap || dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
 			{
-				// Executing GPU tasks on host_visible RAM is awful, copy to device-local buffer instead
-				auto scratch_buf = vk::get_scratch_buffer();
+				if (!scratch_buf)
+				{
+					scratch_buf = vk::get_scratch_buffer();
+				}
+				else if ((scratch_offset + image_linear_size) > scratch_buf->size())
+				{
+					scratch_offset = 0;
+					insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_buf->size(), VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+						VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
+				}

 				VkBufferCopy copy = {};
 				copy.srcOffset = offset_in_buffer;
-				copy.dstOffset = 0;
+				copy.dstOffset = scratch_offset;
 				copy.size = image_linear_size;

 				vkCmdCopyBuffer(cmd, buffer_handle, scratch_buf->value, 1, &copy);

-				insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-					VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+				insert_buffer_memory_barrier(cmd, scratch_buf->value, scratch_offset, image_linear_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+					VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
+			}

-				copy_info.bufferOffset = 0;
+			if (opt.require_swap)
+			{
+				if (opt.element_size == 4)
+				{
+					vk::get_compute_task<vk::cs_shuffle_32>()->run(cmd, scratch_buf, image_linear_size, scratch_offset);
+				}
+				else if (opt.element_size == 2)
+				{
+					vk::get_compute_task<vk::cs_shuffle_16>()->run(cmd, scratch_buf, image_linear_size, scratch_offset);
+				}
+				else
+				{
+					fmt::throw_exception("Unreachable" HERE);
+				}
+			}
+
+			if (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT)
+			{
+				copy_info.bufferOffset = scratch_offset;
+				scratch_offset = align(scratch_offset + image_linear_size, 512);
 				vk::copy_buffer_to_image(cmd, scratch_buf, dst_image, copy_info);
 			}
+			else if (opt.require_swap)
+			{
+				insert_buffer_memory_barrier(cmd, scratch_buf->value, scratch_offset, image_linear_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+					VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+
+				copy_info.bufferOffset = scratch_offset;
+				scratch_offset = align(scratch_offset + image_linear_size, 512);
+				vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);
+			}
 			else
 			{
 				vkCmdCopyBufferToImage(cmd, buffer_handle, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy_info);