Add DXT1-5 decompression on ARM
Some checks are pending
Build RPCS3 / Linux_Build (/rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.0, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / Linux_Build (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.0, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / Linux_Build (/rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.0, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / Windows_Build (push) Waiting to run

This commit is contained in:
Ivan Podogov 2025-02-10 18:32:01 +00:00 committed by kd-11
parent 1e01511ca0
commit e72cb6801a
14 changed files with 312 additions and 44 deletions

170
3rdparty/bcdec/bcdec.hpp vendored Normal file
View file

@ -0,0 +1,170 @@
// Based on https://github.com/iOrange/bcdec/blob/963c5e56b7a335e066cff7d16a3de75f4e8ad366/bcdec.h
// provides functions to decompress blocks of BC compressed images
//
// ------------------------------------------------------------------------------
//
// MIT LICENSE
// ===========
// Copyright (c) 2022 Sergii Kudlai
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is furnished to do
// so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// ------------------------------------------------------------------------------
#pragma once

#include <cstring>
#include <util/types.hpp>
// Decodes the 8-byte color portion of a BC1/BC3 block into a 4x4 grid of
// 0xAARRGGBB pixels (B,G,R,A byte order in little-endian memory).
//
// compressedBlock  - the 8-byte color block: two u16 RGB565 endpoints followed
//                    by a u32 of 2-bit palette indices
// dstColors        - first destination pixel
// destinationPitch - distance in BYTES between consecutive destination pixel rows
// onlyOpaqueMode   - true for the BC2/BC3 color block, which never selects the
//                    punch-through (BC1A) palette regardless of endpoint order
static void bcdec__color_block(const u8* compressedBlock, u8* dstColors, int destinationPitch, bool onlyOpaqueMode) {
	u16 c0, c1;
	u32 refColors[4]; /* 0xAARRGGBB */
	u32 colorIndices;
	u32 r0, g0, b0, r1, g1, b1, r, g, b;

	// Load endpoints with memcpy: compressedBlock is only byte-aligned, so a
	// reinterpret_cast'ed u16 read is UB and can fault on strict-alignment ARM.
	std::memcpy(&c0, compressedBlock, sizeof(c0));
	std::memcpy(&c1, compressedBlock + 2, sizeof(c1));
	/* Unpack 565 ref colors */
	r0 = (c0 >> 11) & 0x1F;
	g0 = (c0 >> 5) & 0x3F;
	b0 = c0 & 0x1F;
	r1 = (c1 >> 11) & 0x1F;
	g1 = (c1 >> 5) & 0x3F;
	b1 = c1 & 0x1F;
	/* Expand 565 ref colors to 888 (rounded fixed-point, maps 31->255 / 63->255) */
	r = (r0 * 527 + 23) >> 6;
	g = (g0 * 259 + 33) >> 6;
	b = (b0 * 527 + 23) >> 6;
	refColors[0] = 0xFF000000 | (r << 16) | (g << 8) | b;
	r = (r1 * 527 + 23) >> 6;
	g = (g1 * 259 + 33) >> 6;
	b = (b1 * 527 + 23) >> 6;
	refColors[1] = 0xFF000000 | (r << 16) | (g << 8) | b;
	if (c0 > c1 || onlyOpaqueMode)
	{ /* Standard BC1 mode (also BC3 color block uses ONLY this mode) */
		/* color_2 = 2/3*color_0 + 1/3*color_1
		   color_3 = 1/3*color_0 + 2/3*color_1 */
		r = ((2 * r0 + r1) * 351 + 61) >> 7;
		g = ((2 * g0 + g1) * 2763 + 1039) >> 11;
		b = ((2 * b0 + b1) * 351 + 61) >> 7;
		refColors[2] = 0xFF000000 | (r << 16) | (g << 8) | b;
		r = ((r0 + r1 * 2) * 351 + 61) >> 7;
		g = ((g0 + g1 * 2) * 2763 + 1039) >> 11;
		b = ((b0 + b1 * 2) * 351 + 61) >> 7;
		refColors[3] = 0xFF000000 | (r << 16) | (g << 8) | b;
	}
	else
	{ /* Quite rare BC1A mode */
		/* color_2 = 1/2*color_0 + 1/2*color_1;
		   color_3 = 0 (transparent black, the punch-through texel) */
		r = ((r0 + r1) * 1053 + 125) >> 8;
		g = ((g0 + g1) * 4145 + 1019) >> 11;
		b = ((b0 + b1) * 1053 + 125) >> 8;
		refColors[2] = 0xFF000000 | (r << 16) | (g << 8) | b;
		refColors[3] = 0x00000000;
	}

	// Same unaligned-safe load for the 32-bit index word.
	std::memcpy(&colorIndices, compressedBlock + 4, sizeof(colorIndices));
	/* Fill out the decompressed color block: 2 index bits per texel, row-major */
	for (int i = 0; i < 4; ++i)
	{
		for (int j = 0; j < 4; ++j)
		{
			int idx = colorIndices & 0x03;
			*reinterpret_cast<u32*>(dstColors + j * 4) = refColors[idx];
			colorIndices >>= 2;
		}
		dstColors += destinationPitch;
	}
}
// Expands the explicit 4-bit alpha values of a BC2 block.
// Each u16 in 'alpha' holds one row of four nibbles (low nibble first);
// a nibble n widens to 8 bits as n * 17 (== n * 255 / 15). One byte is
// written per texel at a stride of 4 bytes, i.e. into the A channel of
// the interleaved 0xAARRGGBB output.
static void bcdec__sharp_alpha_block(const u16* alpha, u8* decompressed, int destinationPitch) {
	u8* rowOut = decompressed;
	for (int row = 0; row < 4; ++row)
	{
		u16 nibbles = alpha[row];
		for (int col = 0; col < 4; ++col)
		{
			rowOut[col * 4] = static_cast<u8>((nibbles & 0x0F) * 17);
			nibbles >>= 4;
		}
		rowOut += destinationPitch;
	}
}
// Decodes the interpolated 8-byte alpha block of a BC3 (DXT5) texture.
// Two 8-bit alpha endpoints are followed by 16 3-bit palette indices;
// endpoint ordering selects between the 8-value and 6-value palettes.
// One byte is written per texel at a stride of 4 bytes (the A channel
// of the interleaved 0xAARRGGBB output); destinationPitch is in bytes.
static void bcdec__smooth_alpha_block(const u8* compressedBlock, u8* decompressed, int destinationPitch) {
	u8 alpha[8];
	u64 block;
	u64 indices;

	// memcpy instead of reinterpret_cast<const u64*>: the source is only
	// byte-aligned, and an unaligned u64 load is UB (faults on some ARM).
	std::memcpy(&block, compressedBlock, sizeof(block));
	alpha[0] = block & 0xFF;
	alpha[1] = (block >> 8) & 0xFF;
	if (alpha[0] > alpha[1])
	{
		/* 6 interpolated alpha values. */
		alpha[2] = (6 * alpha[0] + alpha[1]) / 7;     /* 6/7*alpha_0 + 1/7*alpha_1 */
		alpha[3] = (5 * alpha[0] + 2 * alpha[1]) / 7; /* 5/7*alpha_0 + 2/7*alpha_1 */
		alpha[4] = (4 * alpha[0] + 3 * alpha[1]) / 7; /* 4/7*alpha_0 + 3/7*alpha_1 */
		alpha[5] = (3 * alpha[0] + 4 * alpha[1]) / 7; /* 3/7*alpha_0 + 4/7*alpha_1 */
		alpha[6] = (2 * alpha[0] + 5 * alpha[1]) / 7; /* 2/7*alpha_0 + 5/7*alpha_1 */
		alpha[7] = (    alpha[0] + 6 * alpha[1]) / 7; /* 1/7*alpha_0 + 6/7*alpha_1 */
	}
	else
	{
		/* 4 interpolated alpha values, plus explicit 0x00 and 0xFF. */
		alpha[2] = (4 * alpha[0] + alpha[1]) / 5;     /* 4/5*alpha_0 + 1/5*alpha_1 */
		alpha[3] = (3 * alpha[0] + 2 * alpha[1]) / 5; /* 3/5*alpha_0 + 2/5*alpha_1 */
		alpha[4] = (2 * alpha[0] + 3 * alpha[1]) / 5; /* 2/5*alpha_0 + 3/5*alpha_1 */
		alpha[5] = (    alpha[0] + 4 * alpha[1]) / 5; /* 1/5*alpha_0 + 4/5*alpha_1 */
		alpha[6] = 0x00;
		alpha[7] = 0xFF;
	}

	// 48 index bits start after the two endpoint bytes; 3 bits per texel, row-major.
	indices = block >> 16;
	for (int i = 0; i < 4; ++i)
	{
		for (int j = 0; j < 4; ++j)
		{
			decompressed[j * 4] = alpha[indices & 0x07];
			indices >>= 3;
		}
		decompressed += destinationPitch;
	}
}
// Decompresses a single 8-byte BC1 (DXT1) block into a 4x4 grid of
// 0xAARRGGBB pixels. destinationPitch is the distance in bytes between
// consecutive output pixel rows.
static inline void bcdec_bc1(const u8* compressedBlock, u8* decompressedBlock, int destinationPitch) {
	// false: allow the punch-through (BC1A) palette when c0 <= c1.
	bcdec__color_block(compressedBlock, decompressedBlock, destinationPitch, false);
}
// Decompresses a single 16-byte BC2 (DXT2/3) block into a 4x4 grid of
// 0xAARRGGBB pixels. Layout: bytes 0-7 hold the explicit 4-bit alpha
// values, bytes 8-15 the BC1-style color block. destinationPitch is in bytes.
static inline void bcdec_bc2(const u8* compressedBlock, u8* decompressedBlock, int destinationPitch) {
	// true: BC2 color blocks never use the punch-through palette.
	bcdec__color_block(compressedBlock + 8, decompressedBlock, destinationPitch, true);
	// Alpha lands at byte offset 3 of each pixel (the A channel).
	// NOTE(review): the u16 cast assumes compressedBlock is 2-byte aligned -- confirm for strict-alignment targets.
	bcdec__sharp_alpha_block(reinterpret_cast<const u16*>(compressedBlock), decompressedBlock + 3, destinationPitch);
}
// Decompresses a single 16-byte BC3 (DXT4/5) block into a 4x4 grid of
// 0xAARRGGBB pixels. Layout: bytes 0-7 hold the interpolated alpha block,
// bytes 8-15 the BC1-style color block. destinationPitch is in bytes.
static inline void bcdec_bc3(const u8* compressedBlock, u8* decompressedBlock, int destinationPitch) {
	// true: BC3 color blocks never use the punch-through palette.
	bcdec__color_block(compressedBlock + 8, decompressedBlock, destinationPitch, true);
	// Alpha lands at byte offset 3 of each pixel (the A channel).
	bcdec__smooth_alpha_block(compressedBlock, decompressedBlock + 3, destinationPitch);
}

View file

@ -3,6 +3,7 @@
#include "TextureUtils.h"
#include "../RSXThread.h"
#include "../rsx_utils.h"
#include "3rdparty/bcdec/bcdec.hpp"
#include "util/asm.hpp"
@ -497,6 +498,63 @@ struct copy_rgb655_block_swizzled
}
};
struct copy_decoded_bc1_block
{
static void copy_mipmap_level(std::span<u32> dst, std::span<const u64> src, u16 width_in_block, u16 row_count, u16 depth, u32 dst_pitch_in_block, u32 src_pitch_in_block)
{
u32 src_offset = 0, dst_offset = 0, destinationPitch = dst_pitch_in_block * 4;
for (u32 row = 0; row < row_count * depth; row++)
{
for (u32 col = 0; col < width_in_block; col++) {
const u8* compressedBlock = reinterpret_cast<const u8*>(&src[src_offset + col]);
u8* decompressedBlock = reinterpret_cast<u8*>(&dst[dst_offset + col * 4]);
bcdec_bc1(compressedBlock, decompressedBlock, destinationPitch);
}
src_offset += src_pitch_in_block;
dst_offset += destinationPitch;
}
}
};
struct copy_decoded_bc2_block
{
static void copy_mipmap_level(std::span<u32> dst, std::span<const u128> src, u16 width_in_block, u16 row_count, u16 depth, u32 dst_pitch_in_block, u32 src_pitch_in_block)
{
u32 src_offset = 0, dst_offset = 0, destinationPitch = dst_pitch_in_block * 4;
for (u32 row = 0; row < row_count * depth; row++)
{
for (u32 col = 0; col < width_in_block; col++) {
const u8* compressedBlock = reinterpret_cast<const u8*>(&src[src_offset + col]);
u8* decompressedBlock = reinterpret_cast<u8*>(&dst[dst_offset + col * 4]);
bcdec_bc2(compressedBlock, decompressedBlock, destinationPitch);
}
src_offset += src_pitch_in_block;
dst_offset += destinationPitch;
}
}
};
struct copy_decoded_bc3_block
{
static void copy_mipmap_level(std::span<u32> dst, std::span<const u128> src, u16 width_in_block, u16 row_count, u16 depth, u32 dst_pitch_in_block, u32 src_pitch_in_block)
{
u32 src_offset = 0, dst_offset = 0, destinationPitch = dst_pitch_in_block * 4;
for (u32 row = 0; row < row_count * depth; row++)
{
for (u32 col = 0; col < width_in_block; col++) {
const u8* compressedBlock = reinterpret_cast<const u8*>(&src[src_offset + col]);
u8* decompressedBlock = reinterpret_cast<u8*>(&dst[dst_offset + col * 4]);
bcdec_bc3(compressedBlock, decompressedBlock, destinationPitch);
}
src_offset += src_pitch_in_block;
dst_offset += destinationPitch;
}
}
};
namespace
{
/**
@ -952,6 +1010,12 @@ namespace rsx
case CELL_GCM_TEXTURE_COMPRESSED_DXT1:
{
if (!caps.supports_dxt)
{
copy_decoded_bc1_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u64>(), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
break;
}
const bool is_3d = depth > 1;
const bool is_po2 = utils::is_power_of_2(src_layout.width_in_texel) && utils::is_power_of_2(src_layout.height_in_texel);
@ -981,8 +1045,22 @@ namespace rsx
}
case CELL_GCM_TEXTURE_COMPRESSED_DXT23:
{
if (!caps.supports_dxt)
{
copy_decoded_bc2_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u128>(), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
break;
}
[[fallthrough]];
}
case CELL_GCM_TEXTURE_COMPRESSED_DXT45:
{
if (!caps.supports_dxt)
{
copy_decoded_bc3_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u128>(), w, h, depth, get_row_pitch_in_block<u32>(w, caps.alignment), src_layout.pitch_in_block);
break;
}
const bool is_3d = depth > 1;
const bool is_po2 = utils::is_power_of_2(src_layout.width_in_texel) && utils::is_power_of_2(src_layout.height_in_texel);
@ -1094,7 +1172,7 @@ namespace rsx
return result;
}
bool is_compressed_host_format(u32 texture_format)
bool is_compressed_host_format(const texture_uploader_capabilities& caps, u32 texture_format)
{
switch (texture_format)
{
@ -1129,7 +1207,7 @@ namespace rsx
case CELL_GCM_TEXTURE_COMPRESSED_DXT1:
case CELL_GCM_TEXTURE_COMPRESSED_DXT23:
case CELL_GCM_TEXTURE_COMPRESSED_DXT45:
return true;
return caps.supports_dxt;
}
fmt::throw_exception("Unknown format 0x%x", texture_format);
}

View file

@ -227,6 +227,7 @@ namespace rsx
bool supports_vtc_decoding;
bool supports_hw_deswizzle;
bool supports_zero_copy;
bool supports_dxt;
usz alignment;
};
@ -252,7 +253,7 @@ namespace rsx
u8 get_format_block_size_in_bytes(rsx::surface_color_format format);
u8 get_format_block_size_in_bytes(rsx::surface_depth_format2 format);
bool is_compressed_host_format(u32 format); // Returns true for host-compressed formats (DXT)
bool is_compressed_host_format(const texture_uploader_capabilities& caps, u32 format); // Returns true for host-compressed formats (DXT)
u8 get_format_sample_count(rsx::surface_antialiasing antialias);
u32 get_max_depth_value(rsx::surface_depth_format2 format);
bool is_depth_stencil_format(rsx::surface_depth_format2 format);

View file

@ -70,6 +70,7 @@ namespace gl
GLenum get_sized_internal_format(u32 texture_format)
{
const bool supports_dxt = get_driver_caps().EXT_texture_compression_s3tc_supported;
switch (texture_format)
{
case CELL_GCM_TEXTURE_B8: return GL_R8;
@ -92,9 +93,9 @@ namespace gl
case CELL_GCM_TEXTURE_D1R5G5B5: return GL_BGR5_A1;
case CELL_GCM_TEXTURE_D8R8G8B8: return GL_BGRA8;
case CELL_GCM_TEXTURE_Y16_X16_FLOAT: return GL_RG16F;
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return GL_COMPRESSED_RGBA_S3TC_DXT1_EXT;
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return GL_COMPRESSED_RGBA_S3TC_DXT3_EXT;
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return GL_COMPRESSED_RGBA_S3TC_DXT5_EXT;
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT1_EXT : GL_BGRA8;
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT3_EXT : GL_BGRA8;
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT5_EXT : GL_BGRA8;
case CELL_GCM_TEXTURE_COMPRESSED_HILO8: return GL_RG8;
case CELL_GCM_TEXTURE_COMPRESSED_HILO_S8: return GL_RG8_SNORM;
case CELL_GCM_TEXTURE_COMPRESSED_B8R8_G8R8: return GL_BGRA8;
@ -105,6 +106,7 @@ namespace gl
std::tuple<GLenum, GLenum> get_format_type(u32 texture_format)
{
const bool supports_dxt = get_driver_caps().EXT_texture_compression_s3tc_supported;
switch (texture_format)
{
case CELL_GCM_TEXTURE_B8: return std::make_tuple(GL_RED, GL_UNSIGNED_BYTE);
@ -127,9 +129,9 @@ namespace gl
case CELL_GCM_TEXTURE_D1R5G5B5: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV);
case CELL_GCM_TEXTURE_D8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV);
case CELL_GCM_TEXTURE_Y16_X16_FLOAT: return std::make_tuple(GL_RG, GL_HALF_FLOAT);
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return std::make_tuple(supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT1_EXT : GL_BGRA, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return std::make_tuple(supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT3_EXT : GL_BGRA, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return std::make_tuple(supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT5_EXT : GL_BGRA, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_HILO8: return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_HILO_S8: return std::make_tuple(GL_RG, GL_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_B8R8_G8R8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_BYTE);
@ -587,6 +589,7 @@ namespace gl
.supports_vtc_decoding = false,
.supports_hw_deswizzle = driver_caps.ARB_compute_shader_supported,
.supports_zero_copy = false,
.supports_dxt = driver_caps.EXT_texture_compression_s3tc_supported,
.alignment = 4
};
@ -596,7 +599,7 @@ namespace gl
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
if (rsx::is_compressed_host_format(format)) [[likely]]
if (rsx::is_compressed_host_format(caps, format)) [[likely]]
{
caps.supports_vtc_decoding = driver_caps.vendor_NVIDIA;
unpack_settings.apply();
@ -687,13 +690,13 @@ namespace gl
if (driver_caps.ARB_compute_shader_supported)
{
u64 row_pitch = rsx::align2<u64, u64>(layout.width_in_block * block_size_in_bytes, caps.alignment);
if (!rsx::is_compressed_host_format(format))
{
// Handle emulated compressed formats with host unpack (R8G8 compressed)
row_pitch = std::max<u64>(row_pitch, dst->pitch());
}
image_linear_size = row_pitch * layout.height_in_block * layout.depth;
// We're in the "else" branch, so "is_compressed_host_format()" is always false.
// Handle emulated compressed formats with host unpack (R8G8 compressed)
row_pitch = std::max<u64>(row_pitch, dst->pitch());
// FIXME: Double-check this logic; it seems like we should always use texels both here and for row_pitch.
image_linear_size = row_pitch * layout.height_in_texel * layout.depth;
compute_scratch_mem = { nullptr, g_compute_decode_buffer.alloc(static_cast<u32>(image_linear_size), 256) };
compute_scratch_mem.first = reinterpret_cast<void*>(static_cast<uintptr_t>(compute_scratch_mem.second));
@ -815,7 +818,8 @@ namespace gl
// Calculate staging buffer size
rsx::simple_array<std::byte> data_upload_buf;
if (rsx::is_compressed_host_format(gcm_format))
rsx::texture_uploader_capabilities caps { .supports_dxt = gl::get_driver_caps().EXT_texture_compression_s3tc_supported };
if (rsx::is_compressed_host_format(caps, gcm_format))
{
const auto& desc = subresources_layout[0];
const u32 texture_data_sz = desc.width_in_block * desc.height_in_block * desc.depth * rsx::get_format_block_size_in_bytes(gcm_format);

View file

@ -33,7 +33,7 @@ namespace gl
void capabilities::initialize()
{
int find_count = 18;
int find_count = 19;
int ext_count = 0;
glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count);
@ -178,6 +178,13 @@ namespace gl
find_count--;
continue;
}
if (check(ext_name, "GL_EXT_texture_compression_s3tc"))
{
EXT_texture_compression_s3tc_supported = true;
find_count--;
continue;
}
}
// Set GLSL version

View file

@ -41,6 +41,7 @@ namespace gl
bool NV_depth_buffer_float_supported = false;
bool NV_fragment_shader_barycentric_supported = false;
bool ARB_shader_texture_image_samples = false;
bool EXT_texture_compression_s3tc_supported = false;
bool vendor_INTEL = false; // has broken GLSL compiler
bool vendor_AMD = false; // has broken ARB_multidraw

View file

@ -80,6 +80,7 @@ namespace vk
case vk::driver_vendor::LAVAPIPE:
case vk::driver_vendor::V3DV:
case vk::driver_vendor::PANVK:
case vk::driver_vendor::ARM_MALI:
// TODO: Actually bench this. Using 32 for now to match other common configurations.
case vk::driver_vendor::DOZEN:
// Actual optimal size depends on the D3D device. Use 32 since it should work well on both AMD and NVIDIA

View file

@ -1,5 +1,6 @@
#include "stdafx.h"
#include "VKFormats.h"
#include "VKHelpers.h"
#include "vkutils/device.h"
#include "vkutils/image.h"
@ -193,6 +194,7 @@ namespace vk
VkFormat get_compatible_sampler_format(const gpu_formats_support& support, u32 format)
{
const bool supports_dxt = vk::get_current_renderer()->get_texture_compression_bc_support();
switch (format)
{
#ifndef __APPLE__
@ -213,9 +215,9 @@ namespace vk
#endif
case CELL_GCM_TEXTURE_B8: return VK_FORMAT_R8_UNORM;
case CELL_GCM_TEXTURE_A8R8G8B8: return VK_FORMAT_B8G8R8A8_UNORM;
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return VK_FORMAT_BC1_RGBA_UNORM_BLOCK;
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return VK_FORMAT_BC2_UNORM_BLOCK;
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return VK_FORMAT_BC3_UNORM_BLOCK;
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return supports_dxt ? VK_FORMAT_BC1_RGBA_UNORM_BLOCK : VK_FORMAT_B8G8R8A8_UNORM;
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return supports_dxt ? VK_FORMAT_BC2_UNORM_BLOCK : VK_FORMAT_B8G8R8A8_UNORM;
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return supports_dxt ? VK_FORMAT_BC3_UNORM_BLOCK : VK_FORMAT_B8G8R8A8_UNORM;
case CELL_GCM_TEXTURE_G8B8: return VK_FORMAT_R8G8_UNORM;
case CELL_GCM_TEXTURE_DEPTH24_D8: return support.d24_unorm_s8? VK_FORMAT_D24_UNORM_S8_UINT : VK_FORMAT_D32_SFLOAT_S8_UINT;
case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: return VK_FORMAT_D32_SFLOAT_S8_UINT;

View file

@ -145,6 +145,9 @@ namespace vk
case driver_vendor::PANVK:
// Needs more testing
break;
case driver_vendor::ARM_MALI:
// Needs more testing
break;
default:
rsx_log.warning("Unsupported device: %s", gpu_name);
}

View file

@ -188,6 +188,7 @@ namespace vk
case driver_vendor::V3DV:
case driver_vendor::HONEYKRISP:
case driver_vendor::PANVK:
case driver_vendor::ARM_MALI:
break;
}

View file

@ -929,7 +929,7 @@ namespace vk
return *pcmd;
}
static const std::pair<u32, u32> calculate_upload_pitch(int format, u32 heap_align, vk::image* dst_image, const rsx::subresource_layout& layout)
static const std::pair<u32, u32> calculate_upload_pitch(int format, u32 heap_align, vk::image* dst_image, const rsx::subresource_layout& layout, const rsx::texture_uploader_capabilities& caps)
{
u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
@ -950,7 +950,7 @@ namespace vk
// We have row_pitch in source coordinates. But some formats have a software decode step which can affect this packing!
// For such formats, the packed pitch on src does not match packed pitch on dst
if (!rsx::is_compressed_host_format(format))
if (!rsx::is_compressed_host_format(caps, format))
{
const auto host_texel_width = vk::get_format_texel_width(dst_image->format());
const auto host_packed_pitch = host_texel_width * layout.width_in_texel;
@ -977,7 +977,8 @@ namespace vk
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align, rsx::flags32_t image_setup_flags)
{
const bool requires_depth_processing = (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || (format == CELL_GCM_TEXTURE_DEPTH16_FLOAT);
rsx::texture_uploader_capabilities caps{ .alignment = heap_align };
auto pdev = vk::get_current_renderer();
rsx::texture_uploader_capabilities caps{ .supports_dxt = pdev->get_texture_compression_bc_support(), .alignment = heap_align };
rsx::texture_memory_info opt{};
bool check_caps = true;
@ -997,11 +998,11 @@ namespace vk
for (const rsx::subresource_layout &layout : subresource_layout)
{
const auto [row_pitch, upload_pitch_in_texel] = calculate_upload_pitch(format, heap_align, dst_image, layout);
const auto [row_pitch, upload_pitch_in_texel] = calculate_upload_pitch(format, heap_align, dst_image, layout, caps);
caps.alignment = row_pitch;
// Calculate estimated memory utilization for this subresource
image_linear_size = row_pitch * layout.height_in_block * layout.depth;
image_linear_size = row_pitch * layout.depth * (rsx::is_compressed_host_format(caps, format) ? layout.height_in_block : layout.height_in_texel);
// Only do GPU-side conversion if occupancy is good
if (check_caps)

View file

@ -55,7 +55,8 @@ namespace vk
NVK,
V3DV,
HONEYKRISP,
PANVK
PANVK,
ARM_MALI
};
driver_vendor get_driver_vendor();

View file

@ -134,6 +134,10 @@ namespace vk
// So far only AMD is known to remap image view and border color together. Mark as not required.
custom_border_color_support.require_border_color_remap = get_driver_vendor() != driver_vendor::AMD;
}
// v3dv and PanVK support BC1-BC3 which is all we require, support is reported as false since not all formats are supported
optional_features_support.texture_compression_bc = features.textureCompressionBC
|| get_driver_vendor() == driver_vendor::V3DV || get_driver_vendor() == driver_vendor::PANVK;
}
void physical_device::get_physical_device_properties(bool allow_extensions)
@ -303,9 +307,13 @@ namespace vk
}
if (gpu_name.find("Panfrost") != umax)
{
{ // e.g. "Mali-G610 (Panfrost)"
return driver_vendor::PANVK;
}
else if (gpu_name.find("Mali") != umax)
{ // e.g. "Mali-G610", hence "else"
return driver_vendor::ARM_MALI;
}
return driver_vendor::unknown;
}
@ -336,6 +344,8 @@ namespace vk
return driver_vendor::HONEYKRISP;
case VK_DRIVER_ID_MESA_PANVK:
return driver_vendor::PANVK;
case VK_DRIVER_ID_ARM_PROPRIETARY:
return driver_vendor::ARM_MALI;
default:
// Mobile?
return driver_vendor::unknown;
@ -471,8 +481,7 @@ namespace vk
// Enable hardware features manually
// Currently we require:
// 1. Anisotropic sampling
// 2. DXT support
// 3. Indexable storage buffers
// 2. Indexable storage buffers
VkPhysicalDeviceFeatures enabled_features{};
if (pgpu->shader_types_support.allow_float16)
{
@ -566,7 +575,7 @@ namespace vk
// enabled_features.shaderCullDistance = VK_TRUE; // Alt notation of clip distance
enabled_features.samplerAnisotropy = VK_TRUE;
enabled_features.textureCompressionBC = VK_TRUE;
enabled_features.textureCompressionBC = pgpu->optional_features_support.texture_compression_bc;
enabled_features.shaderStorageBufferArrayDynamicIndexing = VK_TRUE;
// Optionally disable unsupported stuff
@ -659,19 +668,6 @@ namespace vk
enabled_features.logicOp = VK_FALSE;
}
if (!pgpu->features.textureCompressionBC && pgpu->get_driver_vendor() == driver_vendor::V3DV)
{
// v3dv supports BC1-BC3 which is all we require, support is reported as false since not all formats are supported
rsx_log.error("Your GPU running on the V3DV driver does not support full texture block compression. Graphics may not render correctly.");
enabled_features.textureCompressionBC = VK_FALSE;
}
if (!pgpu->features.textureCompressionBC && pgpu->get_driver_vendor() == driver_vendor::PANVK)
{
rsx_log.error("Your GPU running on the PANVK driver does not support full texture block compression. Graphics may not render correctly.");
enabled_features.textureCompressionBC = VK_FALSE;
}
VkDeviceCreateInfo device = {};
device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
device.pNext = nullptr;

View file

@ -92,6 +92,7 @@ namespace vk
bool synchronization_2 = false;
bool unrestricted_depth_range = false;
bool extended_device_fault = false;
bool texture_compression_bc = false;
} optional_features_support;
friend class render_device;
@ -190,6 +191,7 @@ namespace vk
bool get_barycoords_support() const { return pgpu->optional_features_support.barycentric_coords; }
bool get_synchronization2_support() const { return pgpu->optional_features_support.synchronization_2; }
bool get_extended_device_fault_support() const { return pgpu->optional_features_support.extended_device_fault; }
bool get_texture_compression_bc_support() const { return pgpu->optional_features_support.texture_compression_bc; }
u64 get_descriptor_update_after_bind_support() const { return pgpu->descriptor_indexing_support.update_after_bind_mask; }
u32 get_descriptor_max_draw_calls() const { return pgpu->descriptor_max_draw_calls; }