diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXMemoryTiling.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXMemoryTiling.glsl
new file mode 100644
index 0000000000..9b3dd62f73
--- /dev/null
+++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXMemoryTiling.glsl
@@ -0,0 +1,349 @@
+R"(
+#version 450
+layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
+
+#define SSBO_LOCATION(x) (x + %loc)
+
+#define MEMORY_OP %op
+#define MEMORY_OP_DETILE 0
+#define MEMORY_OP_TILE 1
+
+#if (MEMORY_OP == MEMORY_OP_TILE)
+	#define TILED_DATA_MODIFIER
+	#define LINEAR_DATA_MODIFIER readonly
+#else
+	#define TILED_DATA_MODIFIER readonly
+	#define LINEAR_DATA_MODIFIER
+#endif
+
+layout(%set, binding=SSBO_LOCATION(0), std430) TILED_DATA_MODIFIER restrict buffer TiledDataBlock
+{
+	uint tiled_data[];
+};
+
+layout(%set, binding=SSBO_LOCATION(1), std430) LINEAR_DATA_MODIFIER restrict buffer LinearDataBlock
+{
+	uint linear_data[];
+};
+
+#ifdef VULKAN
+layout(%push_block) uniform Configuration
+{
+	uint prime;
+	uint factor;
+	uint num_tiles_per_row;
+	uint tile_base_address;
+	uint tile_size;
+	uint tile_offset;
+	uint tile_pitch;
+	uint tile_bank;
+	uint image_width;
+	uint image_height;
+	uint image_bpp;
+};
+#else
+	uniform uint prime;
+	uniform uint factor;
+	uniform uint num_tiles_per_row;
+	uniform uint tile_base_address;
+	uniform uint tile_size;
+	uniform uint tile_offset;
+	uniform uint tile_pitch;
+	uniform uint tile_bank;
+	uniform uint image_width;
+	uniform uint image_height;
+	uniform uint image_bpp;
+#endif
+
+// Constants (tile geometry: the width divides byte addresses, the height divides tile rows)
+#define RSX_TILE_WIDTH 256
+#define RSX_TILE_HEIGHT 64
+
+#if (MEMORY_OP == MEMORY_OP_TILE)
+
+uvec4 read_linear(const in uint offset) // Loads the texel at texel index 'offset' from linear_data; unused components are 0
+{
+	switch (image_bpp)
+	{
+	case 16:
+	{
+		return uvec4(
+			linear_data[offset * 4],
+			linear_data[offset * 4 + 1],
+			linear_data[offset * 4 + 2],
+			linear_data[offset * 4 + 3]);
+	}
+	case 8:
+	{
+		return uvec4(
+			linear_data[offset * 2],
+			linear_data[offset * 2 + 1],
+			0,
+			0);
+	}
+	case 4:
+	{
+		return uvec4(linear_data[offset], 0, 0, 0);
+	}
+	case 2:
+	{
+		const uint word = linear_data[offset >> 1];
+		const int shift = int(offset & 1) << 4; // Bit position of the 16-bit texel inside its word
+		return uvec4(bitfieldExtract(word, shift, 16), 0, 0, 0);
+	}
+	case 1:
+	{
+		const uint word = linear_data[offset >> 2];
+		const int shift = int(offset & 3) << 3; // Bit position of the 8-bit texel inside its word
+		return uvec4(bitfieldExtract(word, shift, 8), 0, 0, 0);
+	}
+	default:
+		return uvec4(0);
+	}
+}
+
+void write_tiled(const in uint offset, const in uvec4 value) // Stores one texel at texel index 'offset' into tiled_data
+{
+	switch (image_bpp)
+	{
+	case 16:
+	{
+		tiled_data[offset * 4] = value.x;
+		tiled_data[offset * 4 + 1] = value.y;
+		tiled_data[offset * 4 + 2] = value.z;
+		tiled_data[offset * 4 + 3] = value.w;
+		break;
+	}
+	case 8:
+	{
+		tiled_data[offset * 2] = value.x;
+		tiled_data[offset * 2 + 1] = value.y;
+		break;
+	}
+	case 4:
+	{
+		tiled_data[offset] = value.x;
+		break;
+	}
+	case 2:
+	{
+		const uint word_offset = offset >> 1;
+		const uint word = tiled_data[word_offset]; // Non-atomic read-modify-write; main() keeps a word owned by one invocation
+		const int shift = int(offset & 1) << 4;
+		tiled_data[word_offset] = bitfieldInsert(word, value.x, shift, 16);
+		break;
+	}
+	case 1:
+	{
+		const uint word_offset = offset >> 2;
+		const uint word = tiled_data[word_offset]; // Non-atomic read-modify-write; main() keeps a word owned by one invocation
+		const int shift = int(offset & 3) << 3;
+		tiled_data[word_offset] = bitfieldInsert(word, value.x, shift, 8);
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+#else
+
+uvec4 read_tiled(const in uint offset) // Loads the texel at texel index 'offset' from tiled_data; unused components are 0
+{
+	switch (image_bpp)
+	{
+	case 16:
+	{
+		return uvec4(
+			tiled_data[offset * 4],
+			tiled_data[offset * 4 + 1],
+			tiled_data[offset * 4 + 2],
+			tiled_data[offset * 4 + 3]);
+	}
+	case 8:
+	{
+		return uvec4(
+			tiled_data[offset * 2],
+			tiled_data[offset * 2 + 1],
+			0,
+			0);
+	}
+	case 4:
+	{
+		return uvec4(tiled_data[offset], 0, 0, 0);
+	}
+	case 2:
+	{
+		const uint word = tiled_data[offset >> 1];
+		const int shift = int(offset & 1) << 4; // Bit position of the 16-bit texel inside its word
+		return uvec4(bitfieldExtract(word, shift, 16), 0, 0, 0);
+	}
+	case 1:
+	{
+		const uint word = tiled_data[offset >> 2];
+		const int shift = int(offset & 3) << 3; // Bit position of the 8-bit texel inside its word
+		return uvec4(bitfieldExtract(word, shift, 8), 0, 0, 0);
+	}
+	default:
+		return uvec4(0);
+	}
+}
+
+void write_linear(const in uint offset, const in uvec4 value) // Stores one texel at texel index 'offset' into linear_data
+{
+	switch (image_bpp)
+	{
+	case 16:
+	{
+		linear_data[offset * 4] = value.x;
+		linear_data[offset * 4 + 1] = value.y;
+		linear_data[offset * 4 + 2] = value.z;
+		linear_data[offset * 4 + 3] = value.w;
+		break;
+	}
+	case 8:
+	{
+		linear_data[offset * 2] = value.x;
+		linear_data[offset * 2 + 1] = value.y;
+		break;
+	}
+	case 4:
+	{
+		linear_data[offset] = value.x;
+		break;
+	}
+	case 2:
+	{
+		const uint word_offset = offset >> 1;
+		const uint word = linear_data[word_offset]; // Non-atomic read-modify-write; main() keeps a word owned by one invocation
+		const int shift = int(offset & 1) << 4;
+		linear_data[word_offset] = bitfieldInsert(word, value.x, shift, 16);
+		break;
+	}
+	case 1:
+	{
+		const uint word_offset = offset >> 2;
+		const uint word = linear_data[word_offset]; // Non-atomic read-modify-write; main() keeps a word owned by one invocation
+		const int shift = int(offset & 3) << 3;
+		linear_data[word_offset] = bitfieldInsert(word, value.x, shift, 8);
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+#endif
+
+void do_memory_op(const in uint row, const in uint col) // Copies the texel at (row, col) between its linear and its tiled location
+{
+	const uint row_offset = (row * tile_pitch) + tile_base_address + tile_offset;
+	const uint this_address = row_offset + (col * image_bpp);
+
+	// 1. Calculate row_addr
+	const uint texel_offset = (this_address - tile_base_address) / RSX_TILE_WIDTH;
+	// Calculate coordinate of the tile grid we're supposed to be in
+	const uint tile_x = texel_offset % num_tiles_per_row;
+	const uint tile_y = (texel_offset / num_tiles_per_row) / RSX_TILE_HEIGHT;
+	// Calculate the grid offset for the tile selected and add the base offset. It's supposed to affect the bank stuff in the next step
+	const uint tile_id = tile_y * num_tiles_per_row + tile_x;
+	const uint tile_selector = (tile_id + (tile_base_address >> 14)) & 0x3ffff;
+	// Calculate row address
+	const uint row_address = (tile_selector >> 2) & 0xffff;
+
+	// 2. Calculate bank selector
+	// There's a lot of weird math here, but it's just a variant of (tile_selector % 4) to pick a value between [0..3]
+	uint bank_selector = 0;
+	const uint bank_distribution_lookup[16] = { 0, 1, 2, 3, 2, 3, 0, 1, 1, 2, 3, 0, 3, 0, 1, 2 };
+
+	if (factor == 1)
+	{
+		bank_selector = (tile_selector & 3);
+	}
+	else if (factor == 2)
+	{
+		const uint idx = ((tile_selector + ((tile_y & 1) << 1)) & 3) * 4 + (tile_y & 3);
+		bank_selector = bank_distribution_lookup[idx];
+	}
+	else if (factor >= 4)
+	{
+		const uint idx = (tile_selector & 3) * 4 + (tile_y & 3);
+		bank_selector = bank_distribution_lookup[idx];
+	}
+	bank_selector = (bank_selector + tile_bank) % 4;
+
+	// 3. Calculate column selector
+	uint column_selector = 0;
+	const uint line_offset_in_tile = (texel_offset / num_tiles_per_row) % RSX_TILE_HEIGHT;
+	// Calculate column_selector by bit-twiddling line offset and the other calculated parameter bits:
+	// column_selector[9:7] = line_offset_in_tile[5:3]
+	// column_selector[6:4] = this_address[7:5]
+	// column_selector[3:2] = line_offset_in_tile[1:0]
+	// column_selector[1:0] = 0
+	column_selector |= ((line_offset_in_tile >> 3) & 0x7) << 7;
+	column_selector |= ((this_address >> 5) & 0x7) << 4;
+	column_selector |= ((line_offset_in_tile >> 0) & 0x3) << 2;
+
+	// 4. Calculate partition selector (0 or 1)
+	const uint partition_selector = (((line_offset_in_tile >> 2) & 1) + ((this_address >> 6) & 1)) & 1;
+
+	// 5. Build tiled address
+	uint tile_address = 0;
+	// tile_address[31:16] = row_adr[15:0]
+	// tile_address[15:14] = bank_sel[1:0]
+	// tile_address[13:8] = column_sel[9:4]
+	// tile_address[7:7] = partition_sel[0:0]
+	// tile_address[6:5] = column_sel[3:2]
+	// tile_address[4:0] = this_address[4:0]
+	tile_address |= ((row_address >> 0) & 0xFFFF) << 16;
+	tile_address |= ((bank_selector >> 0) & 0x3) << 14;
+	tile_address |= ((column_selector >> 4) & 0x3F) << 8;
+	tile_address |= ((partition_selector >> 0) & 0x1) << 7;
+	tile_address |= ((column_selector >> 2) & 0x3) << 5;
+	tile_address |= ((this_address >> 0) & 0x1F) << 0;
+	// Twiddle bits 9 and 10
+	tile_address ^= (((tile_address >> 12) ^ ((bank_selector ^ tile_selector) & 1) ^ (tile_address >> 14)) & 1) << 9;
+	tile_address ^= ((tile_address >> 11) & 1) << 10;
+
+	// Calculate relative addresses and sample
+	uint linear_image_offset = (row * tile_pitch) + (col * image_bpp);
+	uint tile_data_offset = tile_address - (tile_base_address + tile_offset);
+
+	if (tile_data_offset >= tile_size)
+	{
+		// Do not touch anything out of bounds
+		return;
+	}
+
+	// Convert to texel addresses for data access
+	linear_image_offset /= image_bpp;
+	tile_data_offset /= image_bpp;
+
+#if (MEMORY_OP == MEMORY_OP_DETILE)
+	// Write to linear from tiled
+	write_linear(linear_image_offset, read_tiled(tile_data_offset));
+#else
+	// Opposite. Write to tile from linear
+	write_tiled(tile_data_offset, read_linear(linear_image_offset));
+#endif
+}
+
+void main()
+{
+	// The 2D coordinates are retrieved from gl_GlobalInvocationID. Texels narrower than 4 bytes share a 32-bit word, so each invocation processes every texel of its word; this keeps the read-modify-write stores free of cross-invocation races
+	const uint num_iterations = (image_bpp < 4) ? (4 / image_bpp) : 1;
+	const uint row = gl_GlobalInvocationID.y;
+	const uint col0 = gl_GlobalInvocationID.x * num_iterations;
+
+	for (uint col = col0; col < (col0 + num_iterations); ++col)
+	{
+		if (row >= image_height || col >= image_width)
+		{
+			// Out of bounds
+			return;
+		}
+
+		do_memory_op(row, col);
+	}
+}
+)"
diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h
index d79f809c57..c48ab89d76 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@@ -502,6 +502,159 @@ namespace vk
 		void run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words);
 	};
 
+	enum RSX_detiler_op // Copy direction; the numeric value is substituted into the shader as MEMORY_OP
+	{
+		decode = 0, // Tiled -> linear (MEMORY_OP_DETILE)
+		encode = 1  // Linear -> tiled (MEMORY_OP_TILE)
+	};
+
+	struct RSX_detiler_config // Inputs for one cs_tile_memcpy::run() invocation
+	{
+		u32 tile_base_address;
+		u32 tile_base_offset; // Forwarded to the shader as tile_offset
+		u32 tile_size;
+		u32 tile_pitch;
+		u32 bank;
+
+		const vk::buffer* dst;
+		u32 dst_offset;
+		const vk::buffer* src;
+		u32 src_offset;
+
+		u16 image_width;
+		u16 image_height;
+		u32 image_pitch; // Bytes per linear row; run() derives image_bpp as image_pitch / image_width
+	};
+
+	template <RSX_detiler_op Op>
+	struct cs_tile_memcpy : compute_task // Compute job that runs RSXMemoryTiling.glsl in the direction selected by Op
+	{
+#pragma pack (push, 1)
+		struct
+		{
+			u32 prime;
+			u32 factor;
+			u32 num_tiles_per_row;
+			u32 tile_base_address;
+			u32 tile_size;
+			u32 tile_offset;
+			u32 tile_pitch;
+			u32 tile_bank;
+			u32 image_width;
+			u32 image_height;
+			u32 image_bpp;
+		} params; // Field order mirrors the shader's Configuration block
+#pragma pack (pop)
+
+		const vk::buffer* src_buffer = nullptr;
+		const vk::buffer* dst_buffer = nullptr;
+		u32 in_offset = 0;
+		u32 out_offset = 0;
+		u32 in_block_length = 0;
+		u32 out_block_length = 0;
+
+		cs_tile_memcpy()
+		{
+			ssbo_count = 2;
+			use_push_constants = true;
+			push_constants_size = 44; // 11 u32 fields in params
+
+			create();
+
+			m_src =
+			#include "../Program/GLSLSnippets/RSXMemoryTiling.glsl"
+			;
+
+			optimal_group_size = 1;
+			const std::pair<std::string_view, std::string> syntax_replace[] =
+			{
+				{ "%loc", "0" },
+				{ "%set", "set = 0" },
+				{ "%push_block", "push_constant" },
+				{ "%ws", std::to_string(optimal_group_size) },
+				{ "%op", std::to_string(Op) }
+			};
+
+			m_src = fmt::replace_all(m_src, syntax_replace);
+		}
+
+		void bind_resources() override
+		{
+			const auto op = static_cast<int>(Op); // Binding 0 is the tiled buffer and 1 the linear one, so encode swaps the src/dst slots
+			m_program->bind_buffer({ src_buffer->value, in_offset, in_block_length }, 0 ^ op, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
+			m_program->bind_buffer({ dst_buffer->value, out_offset, out_block_length }, 1 ^ op, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
+		}
+
+		void set_parameters(const vk::command_buffer& cmd)
+		{
+			vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constants_size, &params);
+		}
+
+		void run(const vk::command_buffer& cmd, const RSX_detiler_config& config) // Records one dispatch with one invocation per (column, row) of the image
+		{
+			dst_buffer = config.dst;
+			src_buffer = config.src;
+
+			this->in_offset = config.src_offset;
+			this->out_offset = config.dst_offset;
+
+			const auto tiled_height = std::min<u32>( // Image height rounded up to 64 rows, clamped to the rows left in the tile region
+				utils::align(config.image_height, 64),
+				utils::aligned_div(config.tile_size - config.tile_base_offset, config.tile_pitch)
+			);
+
+			if constexpr (Op == RSX_detiler_op::decode)
+			{
+				this->in_block_length = tiled_height * config.tile_pitch;
+				this->out_block_length = config.image_height * config.image_pitch;
+			}
+			else
+			{
+				this->in_block_length = config.image_height * config.image_pitch;
+				this->out_block_length = tiled_height * config.tile_pitch;
+			}
+
+			auto get_prime_factor = [](u32 pitch) -> std::pair<u32, u32> // Splits (pitch / 256) into { prime, factor }; { 0, 0 } if no match
+			{
+				const u32 base = (pitch >> 8);
+				if ((pitch & (pitch - 1)) == 0)
+				{
+					return { 1u, base };
+				}
+
+				for (const auto prime : { 3, 5, 7, 11, 13 })
+				{
+					if ((base % prime) == 0)
+					{
+						return { prime, base / prime };
+					}
+				}
+
+				rsx_log.error("Unexpected pitch value 0x%x", pitch);
+				return {};
+			};
+
+			const auto [prime, factor] = get_prime_factor(config.tile_pitch);
+			const u32 tiles_per_row = prime * factor;
+
+			params.prime = prime;
+			params.factor = factor;
+			params.num_tiles_per_row = tiles_per_row;
+			params.tile_base_address = config.tile_base_address;
+			params.tile_size = config.tile_size;
+			params.tile_offset = config.tile_base_offset;
+			params.tile_pitch = config.tile_pitch;
+			params.tile_bank = config.bank;
+			params.image_width = config.image_width;
+			params.image_height = config.image_height;
+			params.image_bpp = config.image_pitch / config.image_width;
+			set_parameters(cmd);
+
+			const u32 invocations_x = utils::aligned_div(config.image_width, optimal_group_size);
+			compute_task::run(cmd, invocations_x, config.image_height, 1);
+		}
+	};
+
 	// TODO: Replace with a proper manager
 	extern std::unordered_map> g_compute_tasks;
 
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
index dd65f9470a..212c5852af 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
@@ -92,16 +92,24 @@ namespace vk
 		rsx_pitch = pitch;
 
 		const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
+		const auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(valid_range);
+		const bool require_tiling = !!tiled_region;
+		const bool require_gpu_transform = require_format_conversion || pack_unpack_swap_bytes || require_tiling;
 		auto dma_mapping = vk::map_dma(valid_range.start, valid_range.length());
 
-		if (require_format_conversion || pack_unpack_swap_bytes)
+		if (require_gpu_transform)
 		{
 			const auto section_length = valid_range.length();
 			const auto transfer_pitch = real_pitch;
 			const auto task_length = transfer_pitch * src_area.height();
-			const auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
+			auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
+
+			if (require_tiling) {
+				working_buffer_length += tiled_region.tile->size; // Extra room for the tiled output written at offset task_length
+			}
 
 			auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length);
+			u32 result_offset = 0;
 
 			VkBufferImageCopy region = {};
 			region.imageSubresource = { src->aspect(), 0, 0, 1 };
@@ -142,17 +150,56 @@ namespace vk
 
 					shuffle_kernel->run(cmd, working_buffer, task_length);
 
-					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
-						VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
+					if (!require_tiling)
+					{
+						vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
+							VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+							VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
 
-					require_rw_barrier = false;
+						require_rw_barrier = false;
+					}
 				}
 			}
 
+			if (require_tiling)
+			{
+#if !DEBUG_DMA_TILING
+				// Transfer/Compute -> Compute barrier. The linear data may come from the image copy or from an earlier compute pass
+				vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
+					VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+					VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+				// Prepare payload
+				const RSX_detiler_config config =
+				{
+					.tile_base_address = tiled_region.base_address,
+					.tile_base_offset = valid_range.start - tiled_region.base_address,
+					.tile_size = tiled_region.tile->size,
+					.tile_pitch = tiled_region.tile->pitch,
+					.bank = tiled_region.tile->bank,
+
+					.dst = working_buffer,
+					.dst_offset = task_length,
+					.src = working_buffer,
+					.src_offset = 0,
+
+					.image_width = width,
+					.image_height = height,
+					.image_pitch = real_pitch
+				};
+
+				// Execute
+				const auto job = vk::get_compute_task<vk::cs_tile_memcpy<RSX_detiler_op::encode>>();
+				job->run(cmd, config);
+
+				result_offset = task_length;
+				require_rw_barrier = true;
+#endif
+			}
+
 			if (require_rw_barrier)
 			{
-				vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, working_buffer_length,
+				vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, working_buffer_length - result_offset,
 					VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 					VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
 			}
@@ -160,6 +207,7 @@ namespace vk
 			if (rsx_pitch == real_pitch) [[likely]]
 			{
 				VkBufferCopy copy = {};
+				copy.srcOffset = result_offset;
 				copy.dstOffset = dma_mapping.first;
 				copy.size = section_length;
 				vkCmdCopyBuffer(cmd, working_buffer->value, dma_mapping.second->value, 1, &copy);
@@ -178,7 +226,7 @@ namespace vk
 				copy.reserve(transfer_height);
 
 				u32 dst_offset = dma_mapping.first;
-				u32 src_offset = 0;
+				u32 src_offset = result_offset;
 
 				for (unsigned row = 0; row < transfer_height; ++row)
 				{
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h
index f8f9fa0830..09df70d67d 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@@ -14,7 +14,7 @@
 #include
 #include
 
-#define DEBUG_DMA_TILING 1
+#define DEBUG_DMA_TILING 0
 
 #if DEBUG_DMA_TILING
 #include "../Common/tiled_dma_copy.hpp"