From e72cb6801a4e083360f20700116f80a780185a1a Mon Sep 17 00:00:00 2001 From: Ivan Podogov Date: Mon, 10 Feb 2025 18:32:01 +0000 Subject: [PATCH] Add DXT1-5 decompression on ARM --- 3rdparty/bcdec/bcdec.hpp | 170 ++++++++++++++++++++++ rpcs3/Emu/RSX/Common/TextureUtils.cpp | 82 ++++++++++- rpcs3/Emu/RSX/Common/TextureUtils.h | 3 +- rpcs3/Emu/RSX/GL/GLTexture.cpp | 32 ++-- rpcs3/Emu/RSX/GL/glutils/capabilities.cpp | 9 +- rpcs3/Emu/RSX/GL/glutils/capabilities.h | 1 + rpcs3/Emu/RSX/VK/VKCompute.cpp | 1 + rpcs3/Emu/RSX/VK/VKFormats.cpp | 8 +- rpcs3/Emu/RSX/VK/VKHelpers.cpp | 3 + rpcs3/Emu/RSX/VK/VKRenderTargets.h | 1 + rpcs3/Emu/RSX/VK/VKTexture.cpp | 11 +- rpcs3/Emu/RSX/VK/vkutils/chip_class.h | 3 +- rpcs3/Emu/RSX/VK/vkutils/device.cpp | 30 ++-- rpcs3/Emu/RSX/VK/vkutils/device.h | 2 + 14 files changed, 312 insertions(+), 44 deletions(-) create mode 100644 3rdparty/bcdec/bcdec.hpp diff --git a/3rdparty/bcdec/bcdec.hpp b/3rdparty/bcdec/bcdec.hpp new file mode 100644 index 0000000000..9fac43d0fb --- /dev/null +++ b/3rdparty/bcdec/bcdec.hpp @@ -0,0 +1,170 @@ +// Based on https://github.com/iOrange/bcdec/blob/963c5e56b7a335e066cff7d16a3de75f4e8ad366/bcdec.h +// provides functions to decompress blocks of BC compressed images +// +// ------------------------------------------------------------------------------ +// +// MIT LICENSE +// =========== +// Copyright (c) 2022 Sergii Kudlai +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// ------------------------------------------------------------------------------ + +#pragma once + +#include + +static void bcdec__color_block(const u8* compressedBlock, u8* dstColors, int destinationPitch, bool onlyOpaqueMode) { + u16 c0, c1; + u32 refColors[4]; /* 0xAARRGGBB */ + u32 colorIndices; + u32 r0, g0, b0, r1, g1, b1, r, g, b; + + c0 = *reinterpret_cast(compressedBlock); + c1 = *(reinterpret_cast(compressedBlock) + 1); + + /* Unpack 565 ref colors */ + r0 = (c0 >> 11) & 0x1F; + g0 = (c0 >> 5) & 0x3F; + b0 = c0 & 0x1F; + + r1 = (c1 >> 11) & 0x1F; + g1 = (c1 >> 5) & 0x3F; + b1 = c1 & 0x1F; + + /* Expand 565 ref colors to 888 */ + r = (r0 * 527 + 23) >> 6; + g = (g0 * 259 + 33) >> 6; + b = (b0 * 527 + 23) >> 6; + refColors[0] = 0xFF000000 | (r << 16) | (g << 8) | b; + + r = (r1 * 527 + 23) >> 6; + g = (g1 * 259 + 33) >> 6; + b = (b1 * 527 + 23) >> 6; + refColors[1] = 0xFF000000 | (r << 16) | (g << 8) | b; + + if (c0 > c1 || onlyOpaqueMode) + { /* Standard BC1 mode (also BC3 color block uses ONLY this mode) */ + /* color_2 = 2/3*color_0 + 1/3*color_1 + color_3 = 1/3*color_0 + 2/3*color_1 */ + r = ((2 * r0 + r1) * 351 + 61) >> 7; + g = ((2 * g0 + g1) * 2763 + 1039) >> 11; + b = ((2 * b0 + b1) * 351 + 61) >> 7; + refColors[2] = 0xFF000000 | (r << 16) | (g << 8) | b; + + r = ((r0 + r1 * 2) * 351 + 61) >> 7; + g = ((g0 + g1 * 2) * 2763 + 1039) >> 11; + b = ((b0 + b1 * 2) * 351 + 61) >> 7; + refColors[3] = 0xFF000000 | (r << 16) | (g << 8) | b; + } + else + { /* Quite rare
BC1A mode */ + /* color_2 = 1/2*color_0 + 1/2*color_1; + color_3 = 0; */ + r = ((r0 + r1) * 1053 + 125) >> 8; + g = ((g0 + g1) * 4145 + 1019) >> 11; + b = ((b0 + b1) * 1053 + 125) >> 8; + refColors[2] = 0xFF000000 | (r << 16) | (g << 8) | b; + + refColors[3] = 0x00000000; + } + + colorIndices = *reinterpret_cast(compressedBlock + 4); + + /* Fill out the decompressed color block */ + for (int i = 0; i < 4; ++i) + { + for (int j = 0; j < 4; ++j) + { + int idx = colorIndices & 0x03; + *reinterpret_cast(dstColors + j * 4) = refColors[idx]; + colorIndices >>= 2; + } + + dstColors += destinationPitch; + } +} + +static void bcdec__sharp_alpha_block(const u16* alpha, u8* decompressed, int destinationPitch) { + for (int i = 0; i < 4; ++i) + { + for (int j = 0; j < 4; ++j) + { + decompressed[j * 4] = ((alpha[i] >> (4 * j)) & 0x0F) * 17; + } + decompressed += destinationPitch; + } +} + +static void bcdec__smooth_alpha_block(const u8* compressedBlock, u8* decompressed, int destinationPitch) { + u8 alpha[8]; + u64 block = *reinterpret_cast(compressedBlock); + u64 indices; + + alpha[0] = block & 0xFF; + alpha[1] = (block >> 8) & 0xFF; + + if (alpha[0] > alpha[1]) + { + /* 6 interpolated alpha values. */ + alpha[2] = (6 * alpha[0] + alpha[1]) / 7; /* 6/7*alpha_0 + 1/7*alpha_1 */ + alpha[3] = (5 * alpha[0] + 2 * alpha[1]) / 7; /* 5/7*alpha_0 + 2/7*alpha_1 */ + alpha[4] = (4 * alpha[0] + 3 * alpha[1]) / 7; /* 4/7*alpha_0 + 3/7*alpha_1 */ + alpha[5] = (3 * alpha[0] + 4 * alpha[1]) / 7; /* 3/7*alpha_0 + 4/7*alpha_1 */ + alpha[6] = (2 * alpha[0] + 5 * alpha[1]) / 7; /* 2/7*alpha_0 + 5/7*alpha_1 */ + alpha[7] = ( alpha[0] + 6 * alpha[1]) / 7; /* 1/7*alpha_0 + 6/7*alpha_1 */ + } + else + { + /* 4 interpolated alpha values. 
*/ + alpha[2] = (4 * alpha[0] + alpha[1]) / 5; /* 4/5*alpha_0 + 1/5*alpha_1 */ + alpha[3] = (3 * alpha[0] + 2 * alpha[1]) / 5; /* 3/5*alpha_0 + 2/5*alpha_1 */ + alpha[4] = (2 * alpha[0] + 3 * alpha[1]) / 5; /* 2/5*alpha_0 + 3/5*alpha_1 */ + alpha[5] = ( alpha[0] + 4 * alpha[1]) / 5; /* 1/5*alpha_0 + 4/5*alpha_1 */ + alpha[6] = 0x00; + alpha[7] = 0xFF; + } + + indices = block >> 16; + for (int i = 0; i < 4; ++i) + { + for (int j = 0; j < 4; ++j) + { + decompressed[j * 4] = alpha[indices & 0x07]; + indices >>= 3; + } + decompressed += destinationPitch; + } +} + +static inline void bcdec_bc1(const u8* compressedBlock, u8* decompressedBlock, int destinationPitch) { + bcdec__color_block(compressedBlock, decompressedBlock, destinationPitch, false); +} + +static inline void bcdec_bc2(const u8* compressedBlock, u8* decompressedBlock, int destinationPitch) { + bcdec__color_block(compressedBlock + 8, decompressedBlock, destinationPitch, true); + bcdec__sharp_alpha_block(reinterpret_cast(compressedBlock), decompressedBlock + 3, destinationPitch); +} + +static inline void bcdec_bc3(const u8* compressedBlock, u8* decompressedBlock, int destinationPitch) { + bcdec__color_block(compressedBlock + 8, decompressedBlock, destinationPitch, true); + bcdec__smooth_alpha_block(compressedBlock, decompressedBlock + 3, destinationPitch); +} + diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp index 05146eb4a3..f2bc571e5f 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp +++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp @@ -3,6 +3,7 @@ #include "TextureUtils.h" #include "../RSXThread.h" #include "../rsx_utils.h" +#include "3rdparty/bcdec/bcdec.hpp" #include "util/asm.hpp" @@ -497,6 +498,63 @@ struct copy_rgb655_block_swizzled } }; +struct copy_decoded_bc1_block +{ + static void copy_mipmap_level(std::span dst, std::span src, u16 width_in_block, u16 row_count, u16 depth, u32 dst_pitch_in_block, u32 src_pitch_in_block) + { + u32 src_offset = 0, dst_offset = 
0, destinationPitch = dst_pitch_in_block * 4; + for (u32 row = 0; row < row_count * depth; row++) + { + for (u32 col = 0; col < width_in_block; col++) { + const u8* compressedBlock = reinterpret_cast(&src[src_offset + col]); + u8* decompressedBlock = reinterpret_cast(&dst[dst_offset + col * 4]); + bcdec_bc1(compressedBlock, decompressedBlock, destinationPitch); + } + + src_offset += src_pitch_in_block; + dst_offset += destinationPitch; + } + } +}; + +struct copy_decoded_bc2_block +{ + static void copy_mipmap_level(std::span dst, std::span src, u16 width_in_block, u16 row_count, u16 depth, u32 dst_pitch_in_block, u32 src_pitch_in_block) + { + u32 src_offset = 0, dst_offset = 0, destinationPitch = dst_pitch_in_block * 4; + for (u32 row = 0; row < row_count * depth; row++) + { + for (u32 col = 0; col < width_in_block; col++) { + const u8* compressedBlock = reinterpret_cast(&src[src_offset + col]); + u8* decompressedBlock = reinterpret_cast(&dst[dst_offset + col * 4]); + bcdec_bc2(compressedBlock, decompressedBlock, destinationPitch); + } + + src_offset += src_pitch_in_block; + dst_offset += destinationPitch; + } + } +}; + +struct copy_decoded_bc3_block +{ + static void copy_mipmap_level(std::span dst, std::span src, u16 width_in_block, u16 row_count, u16 depth, u32 dst_pitch_in_block, u32 src_pitch_in_block) + { + u32 src_offset = 0, dst_offset = 0, destinationPitch = dst_pitch_in_block * 4; + for (u32 row = 0; row < row_count * depth; row++) + { + for (u32 col = 0; col < width_in_block; col++) { + const u8* compressedBlock = reinterpret_cast(&src[src_offset + col]); + u8* decompressedBlock = reinterpret_cast(&dst[dst_offset + col * 4]); + bcdec_bc3(compressedBlock, decompressedBlock, destinationPitch); + } + + src_offset += src_pitch_in_block; + dst_offset += destinationPitch; + } + } +}; + namespace { /** @@ -952,6 +1010,12 @@ namespace rsx case CELL_GCM_TEXTURE_COMPRESSED_DXT1: { + if (!caps.supports_dxt) + { + 
copy_decoded_bc1_block::copy_mipmap_level(dst_buffer.as_span(), src_layout.data.as_span(), w, h, depth, get_row_pitch_in_block(w, caps.alignment), src_layout.pitch_in_block); + break; + } + const bool is_3d = depth > 1; const bool is_po2 = utils::is_power_of_2(src_layout.width_in_texel) && utils::is_power_of_2(src_layout.height_in_texel); @@ -981,8 +1045,22 @@ namespace rsx } case CELL_GCM_TEXTURE_COMPRESSED_DXT23: + { + if (!caps.supports_dxt) + { + copy_decoded_bc2_block::copy_mipmap_level(dst_buffer.as_span(), src_layout.data.as_span(), w, h, depth, get_row_pitch_in_block(w, caps.alignment), src_layout.pitch_in_block); + break; + } + [[fallthrough]]; + } case CELL_GCM_TEXTURE_COMPRESSED_DXT45: { + if (!caps.supports_dxt) + { + copy_decoded_bc3_block::copy_mipmap_level(dst_buffer.as_span(), src_layout.data.as_span(), w, h, depth, get_row_pitch_in_block(w, caps.alignment), src_layout.pitch_in_block); + break; + } + const bool is_3d = depth > 1; const bool is_po2 = utils::is_power_of_2(src_layout.width_in_texel) && utils::is_power_of_2(src_layout.height_in_texel); @@ -1094,7 +1172,7 @@ namespace rsx return result; } - bool is_compressed_host_format(u32 texture_format) + bool is_compressed_host_format(const texture_uploader_capabilities& caps, u32 texture_format) { switch (texture_format) { @@ -1129,7 +1207,7 @@ namespace rsx case CELL_GCM_TEXTURE_COMPRESSED_DXT1: case CELL_GCM_TEXTURE_COMPRESSED_DXT23: case CELL_GCM_TEXTURE_COMPRESSED_DXT45: - return true; + return caps.supports_dxt; } fmt::throw_exception("Unknown format 0x%x", texture_format); } diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h index f03667042c..2b9552bc7a 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.h +++ b/rpcs3/Emu/RSX/Common/TextureUtils.h @@ -227,6 +227,7 @@ namespace rsx bool supports_vtc_decoding; bool supports_hw_deswizzle; bool supports_zero_copy; + bool supports_dxt; usz alignment; }; @@ -252,7 +253,7 @@ namespace rsx u8 
get_format_block_size_in_bytes(rsx::surface_color_format format); u8 get_format_block_size_in_bytes(rsx::surface_depth_format2 format); - bool is_compressed_host_format(u32 format); // Returns true for host-compressed formats (DXT) + bool is_compressed_host_format(const texture_uploader_capabilities& caps, u32 format); // Returns true for host-compressed formats (DXT) u8 get_format_sample_count(rsx::surface_antialiasing antialias); u32 get_max_depth_value(rsx::surface_depth_format2 format); bool is_depth_stencil_format(rsx::surface_depth_format2 format); diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 74ffd7bf7b..36b4ffafa6 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -70,6 +70,7 @@ namespace gl GLenum get_sized_internal_format(u32 texture_format) { + const bool supports_dxt = get_driver_caps().EXT_texture_compression_s3tc_supported; switch (texture_format) { case CELL_GCM_TEXTURE_B8: return GL_R8; @@ -92,9 +93,9 @@ namespace gl case CELL_GCM_TEXTURE_D1R5G5B5: return GL_BGR5_A1; case CELL_GCM_TEXTURE_D8R8G8B8: return GL_BGRA8; case CELL_GCM_TEXTURE_Y16_X16_FLOAT: return GL_RG16F; - case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return GL_COMPRESSED_RGBA_S3TC_DXT1_EXT; - case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return GL_COMPRESSED_RGBA_S3TC_DXT3_EXT; - case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return GL_COMPRESSED_RGBA_S3TC_DXT5_EXT; + case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT1_EXT : GL_BGRA8; + case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT3_EXT : GL_BGRA8; + case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return supports_dxt ? 
GL_COMPRESSED_RGBA_S3TC_DXT5_EXT : GL_BGRA8; case CELL_GCM_TEXTURE_COMPRESSED_HILO8: return GL_RG8; case CELL_GCM_TEXTURE_COMPRESSED_HILO_S8: return GL_RG8_SNORM; case CELL_GCM_TEXTURE_COMPRESSED_B8R8_G8R8: return GL_BGRA8; @@ -105,6 +106,7 @@ namespace gl std::tuple get_format_type(u32 texture_format) { + const bool supports_dxt = get_driver_caps().EXT_texture_compression_s3tc_supported; switch (texture_format) { case CELL_GCM_TEXTURE_B8: return std::make_tuple(GL_RED, GL_UNSIGNED_BYTE); @@ -127,9 +129,9 @@ namespace gl case CELL_GCM_TEXTURE_D1R5G5B5: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV); case CELL_GCM_TEXTURE_D8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV); case CELL_GCM_TEXTURE_Y16_X16_FLOAT: return std::make_tuple(GL_RG, GL_HALF_FLOAT); - case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, GL_UNSIGNED_BYTE); - case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_UNSIGNED_BYTE); - case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_UNSIGNED_BYTE); + case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return std::make_tuple(supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT1_EXT : GL_BGRA, GL_UNSIGNED_BYTE); + case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return std::make_tuple(supports_dxt ? GL_COMPRESSED_RGBA_S3TC_DXT3_EXT : GL_BGRA, GL_UNSIGNED_BYTE); + case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return std::make_tuple(supports_dxt ? 
GL_COMPRESSED_RGBA_S3TC_DXT5_EXT : GL_BGRA, GL_UNSIGNED_BYTE); case CELL_GCM_TEXTURE_COMPRESSED_HILO8: return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE); case CELL_GCM_TEXTURE_COMPRESSED_HILO_S8: return std::make_tuple(GL_RG, GL_BYTE); case CELL_GCM_TEXTURE_COMPRESSED_B8R8_G8R8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_BYTE); @@ -587,6 +589,7 @@ namespace gl .supports_vtc_decoding = false, .supports_hw_deswizzle = driver_caps.ARB_compute_shader_supported, .supports_zero_copy = false, + .supports_dxt = driver_caps.EXT_texture_compression_s3tc_supported, .alignment = 4 }; @@ -596,7 +599,7 @@ namespace gl glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - if (rsx::is_compressed_host_format(format)) [[likely]] + if (rsx::is_compressed_host_format(caps, format)) [[likely]] { caps.supports_vtc_decoding = driver_caps.vendor_NVIDIA; unpack_settings.apply(); @@ -687,13 +690,13 @@ namespace gl if (driver_caps.ARB_compute_shader_supported) { u64 row_pitch = rsx::align2(layout.width_in_block * block_size_in_bytes, caps.alignment); - if (!rsx::is_compressed_host_format(format)) - { - // Handle emulated compressed formats with host unpack (R8G8 compressed) - row_pitch = std::max(row_pitch, dst->pitch()); - } - image_linear_size = row_pitch * layout.height_in_block * layout.depth; + // We're in the "else" branch, so "is_compressed_host_format()" is always false. + // Handle emulated compressed formats with host unpack (R8G8 compressed) + row_pitch = std::max(row_pitch, dst->pitch()); + + // FIXME: Double-check this logic; it seems like we should always use texels both here and for row_pitch. 
+ image_linear_size = row_pitch * layout.height_in_texel * layout.depth; compute_scratch_mem = { nullptr, g_compute_decode_buffer.alloc(static_cast(image_linear_size), 256) }; compute_scratch_mem.first = reinterpret_cast(static_cast(compute_scratch_mem.second)); @@ -815,7 +818,8 @@ namespace gl // Calculate staging buffer size rsx::simple_array data_upload_buf; - if (rsx::is_compressed_host_format(gcm_format)) + rsx::texture_uploader_capabilities caps { .supports_dxt = gl::get_driver_caps().EXT_texture_compression_s3tc_supported }; + if (rsx::is_compressed_host_format(caps, gcm_format)) { const auto& desc = subresources_layout[0]; const u32 texture_data_sz = desc.width_in_block * desc.height_in_block * desc.depth * rsx::get_format_block_size_in_bytes(gcm_format); diff --git a/rpcs3/Emu/RSX/GL/glutils/capabilities.cpp b/rpcs3/Emu/RSX/GL/glutils/capabilities.cpp index 0c8de71786..9a5cbf9c12 100644 --- a/rpcs3/Emu/RSX/GL/glutils/capabilities.cpp +++ b/rpcs3/Emu/RSX/GL/glutils/capabilities.cpp @@ -33,7 +33,7 @@ namespace gl void capabilities::initialize() { - int find_count = 18; + int find_count = 19; int ext_count = 0; glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count); @@ -178,6 +178,13 @@ namespace gl find_count--; continue; } + + if (check(ext_name, "GL_EXT_texture_compression_s3tc")) + { + EXT_texture_compression_s3tc_supported = true; + find_count--; + continue; + } } // Set GLSL version diff --git a/rpcs3/Emu/RSX/GL/glutils/capabilities.h b/rpcs3/Emu/RSX/GL/glutils/capabilities.h index 801a426d80..70ac5e9ae2 100644 --- a/rpcs3/Emu/RSX/GL/glutils/capabilities.h +++ b/rpcs3/Emu/RSX/GL/glutils/capabilities.h @@ -41,6 +41,7 @@ namespace gl bool NV_depth_buffer_float_supported = false; bool NV_fragment_shader_barycentric_supported = false; bool ARB_shader_texture_image_samples = false; + bool EXT_texture_compression_s3tc_supported = false; bool vendor_INTEL = false; // has broken GLSL compiler bool vendor_AMD = false; // has broken ARB_multidraw diff --git 
a/rpcs3/Emu/RSX/VK/VKCompute.cpp b/rpcs3/Emu/RSX/VK/VKCompute.cpp index 7cda0a8ff9..637642d8bf 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.cpp +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -80,6 +80,7 @@ namespace vk case vk::driver_vendor::LAVAPIPE: case vk::driver_vendor::V3DV: case vk::driver_vendor::PANVK: + case vk::driver_vendor::ARM_MALI: // TODO: Actually bench this. Using 32 for now to match other common configurations. case vk::driver_vendor::DOZEN: // Actual optimal size depends on the D3D device. Use 32 since it should work well on both AMD and NVIDIA diff --git a/rpcs3/Emu/RSX/VK/VKFormats.cpp b/rpcs3/Emu/RSX/VK/VKFormats.cpp index 4e270cc72d..f766a70e79 100644 --- a/rpcs3/Emu/RSX/VK/VKFormats.cpp +++ b/rpcs3/Emu/RSX/VK/VKFormats.cpp @@ -1,5 +1,6 @@ #include "stdafx.h" #include "VKFormats.h" +#include "VKHelpers.h" #include "vkutils/device.h" #include "vkutils/image.h" @@ -193,6 +194,7 @@ namespace vk VkFormat get_compatible_sampler_format(const gpu_formats_support& support, u32 format) { + const bool supports_dxt = vk::get_current_renderer()->get_texture_compression_bc_support(); switch (format) { #ifndef __APPLE__ @@ -213,9 +215,9 @@ namespace vk #endif case CELL_GCM_TEXTURE_B8: return VK_FORMAT_R8_UNORM; case CELL_GCM_TEXTURE_A8R8G8B8: return VK_FORMAT_B8G8R8A8_UNORM; - case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return VK_FORMAT_BC1_RGBA_UNORM_BLOCK; - case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return VK_FORMAT_BC2_UNORM_BLOCK; - case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return VK_FORMAT_BC3_UNORM_BLOCK; + case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return supports_dxt ? VK_FORMAT_BC1_RGBA_UNORM_BLOCK : VK_FORMAT_B8G8R8A8_UNORM; + case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return supports_dxt ? VK_FORMAT_BC2_UNORM_BLOCK : VK_FORMAT_B8G8R8A8_UNORM; + case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return supports_dxt ? 
VK_FORMAT_BC3_UNORM_BLOCK : VK_FORMAT_B8G8R8A8_UNORM; case CELL_GCM_TEXTURE_G8B8: return VK_FORMAT_R8G8_UNORM; case CELL_GCM_TEXTURE_DEPTH24_D8: return support.d24_unorm_s8? VK_FORMAT_D24_UNORM_S8_UINT : VK_FORMAT_D32_SFLOAT_S8_UINT; case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: return VK_FORMAT_D32_SFLOAT_S8_UINT; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index a0d0b643d1..ffd020979c 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -145,6 +145,9 @@ namespace vk case driver_vendor::PANVK: // Needs more testing break; + case driver_vendor::ARM_MALI: + // Needs more testing + break; default: rsx_log.warning("Unsupported device: %s", gpu_name); } diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h index e7df9325c1..64ff78533a 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h @@ -188,6 +188,7 @@ namespace vk case driver_vendor::V3DV: case driver_vendor::HONEYKRISP: case driver_vendor::PANVK: + case driver_vendor::ARM_MALI: break; } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 1f6be63c78..d8d06420da 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -929,7 +929,7 @@ namespace vk return *pcmd; } - static const std::pair calculate_upload_pitch(int format, u32 heap_align, vk::image* dst_image, const rsx::subresource_layout& layout) + static const std::pair calculate_upload_pitch(int format, u32 heap_align, vk::image* dst_image, const rsx::subresource_layout& layout, const rsx::texture_uploader_capabilities& caps) { u32 block_in_pixel = rsx::get_format_block_size_in_texel(format); u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format); @@ -950,7 +950,7 @@ namespace vk // We have row_pitch in source coordinates. But some formats have a software decode step which can affect this packing! 
// For such formats, the packed pitch on src does not match packed pitch on dst - if (!rsx::is_compressed_host_format(format)) + if (!rsx::is_compressed_host_format(caps, format)) { const auto host_texel_width = vk::get_format_texel_width(dst_image->format()); const auto host_packed_pitch = host_texel_width * layout.width_in_texel; @@ -977,7 +977,8 @@ namespace vk VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align, rsx::flags32_t image_setup_flags) { const bool requires_depth_processing = (dst_image->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || (format == CELL_GCM_TEXTURE_DEPTH16_FLOAT); - rsx::texture_uploader_capabilities caps{ .alignment = heap_align }; + auto pdev = vk::get_current_renderer(); + rsx::texture_uploader_capabilities caps{ .supports_dxt = pdev->get_texture_compression_bc_support(), .alignment = heap_align }; rsx::texture_memory_info opt{}; bool check_caps = true; @@ -997,11 +998,11 @@ namespace vk for (const rsx::subresource_layout &layout : subresource_layout) { - const auto [row_pitch, upload_pitch_in_texel] = calculate_upload_pitch(format, heap_align, dst_image, layout); + const auto [row_pitch, upload_pitch_in_texel] = calculate_upload_pitch(format, heap_align, dst_image, layout, caps); caps.alignment = row_pitch; // Calculate estimated memory utilization for this subresource - image_linear_size = row_pitch * layout.height_in_block * layout.depth; + image_linear_size = row_pitch * layout.depth * (rsx::is_compressed_host_format(caps, format) ? 
layout.height_in_block : layout.height_in_texel); // Only do GPU-side conversion if occupancy is good if (check_caps) diff --git a/rpcs3/Emu/RSX/VK/vkutils/chip_class.h b/rpcs3/Emu/RSX/VK/vkutils/chip_class.h index a905b7cc07..07fbb9a1c9 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/chip_class.h +++ b/rpcs3/Emu/RSX/VK/vkutils/chip_class.h @@ -55,7 +55,8 @@ namespace vk NVK, V3DV, HONEYKRISP, - PANVK + PANVK, + ARM_MALI }; driver_vendor get_driver_vendor(); diff --git a/rpcs3/Emu/RSX/VK/vkutils/device.cpp b/rpcs3/Emu/RSX/VK/vkutils/device.cpp index 9a4471b784..207c1155c3 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/device.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/device.cpp @@ -134,6 +134,10 @@ namespace vk // So far only AMD is known to remap image view and border color together. Mark as not required. custom_border_color_support.require_border_color_remap = get_driver_vendor() != driver_vendor::AMD; } + + // v3dv and PanVK support BC1-BC3 which is all we require, support is reported as false since not all formats are supported + optional_features_support.texture_compression_bc = features.textureCompressionBC + || get_driver_vendor() == driver_vendor::V3DV || get_driver_vendor() == driver_vendor::PANVK; } void physical_device::get_physical_device_properties(bool allow_extensions) @@ -303,9 +307,13 @@ namespace vk } if (gpu_name.find("Panfrost") != umax) - { + { // e.g. "Mali-G610 (Panfrost)" return driver_vendor::PANVK; } + else if (gpu_name.find("Mali") != umax) + { // e.g. "Mali-G610", hence "else" + return driver_vendor::ARM_MALI; + } return driver_vendor::unknown; } @@ -336,6 +344,8 @@ namespace vk return driver_vendor::HONEYKRISP; case VK_DRIVER_ID_MESA_PANVK: return driver_vendor::PANVK; + case VK_DRIVER_ID_ARM_PROPRIETARY: + return driver_vendor::ARM_MALI; default: // Mobile? return driver_vendor::unknown; @@ -471,8 +481,7 @@ namespace vk // Enable hardware features manually // Currently we require: // 1. Anisotropic sampling - // 2. DXT support - // 3. 
Indexable storage buffers + // 2. Indexable storage buffers VkPhysicalDeviceFeatures enabled_features{}; if (pgpu->shader_types_support.allow_float16) { @@ -566,7 +575,7 @@ namespace vk // enabled_features.shaderCullDistance = VK_TRUE; // Alt notation of clip distance enabled_features.samplerAnisotropy = VK_TRUE; - enabled_features.textureCompressionBC = VK_TRUE; + enabled_features.textureCompressionBC = pgpu->optional_features_support.texture_compression_bc; enabled_features.shaderStorageBufferArrayDynamicIndexing = VK_TRUE; // Optionally disable unsupported stuff @@ -659,19 +668,6 @@ namespace vk enabled_features.logicOp = VK_FALSE; } - if (!pgpu->features.textureCompressionBC && pgpu->get_driver_vendor() == driver_vendor::V3DV) - { - // v3dv supports BC1-BC3 which is all we require, support is reported as false since not all formats are supported - rsx_log.error("Your GPU running on the V3DV driver does not support full texture block compression. Graphics may not render correctly."); - enabled_features.textureCompressionBC = VK_FALSE; - } - - if (!pgpu->features.textureCompressionBC && pgpu->get_driver_vendor() == driver_vendor::PANVK) - { - rsx_log.error("Your GPU running on the PANVK driver does not support full texture block compression. 
Graphics may not render correctly."); - enabled_features.textureCompressionBC = VK_FALSE; - } - VkDeviceCreateInfo device = {}; device.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; device.pNext = nullptr; diff --git a/rpcs3/Emu/RSX/VK/vkutils/device.h b/rpcs3/Emu/RSX/VK/vkutils/device.h index 98f97e6e19..febd132d7e 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/device.h +++ b/rpcs3/Emu/RSX/VK/vkutils/device.h @@ -92,6 +92,7 @@ namespace vk bool synchronization_2 = false; bool unrestricted_depth_range = false; bool extended_device_fault = false; + bool texture_compression_bc = false; } optional_features_support; friend class render_device; @@ -190,6 +191,7 @@ namespace vk bool get_barycoords_support() const { return pgpu->optional_features_support.barycentric_coords; } bool get_synchronization2_support() const { return pgpu->optional_features_support.synchronization_2; } bool get_extended_device_fault_support() const { return pgpu->optional_features_support.extended_device_fault; } + bool get_texture_compression_bc_support() const { return pgpu->optional_features_support.texture_compression_bc; } u64 get_descriptor_update_after_bind_support() const { return pgpu->descriptor_indexing_support.update_after_bind_mask; } u32 get_descriptor_max_draw_calls() const { return pgpu->descriptor_max_draw_calls; }