rsx/vk: Fix GPU tile encoding

- Decoding also needs some love, but currently WIP
This commit is contained in:
kd-11 2024-02-06 01:29:51 +03:00 committed by kd-11
parent b8ed3eb824
commit e5c831a800
8 changed files with 107 additions and 36 deletions

View file

@ -475,7 +475,7 @@ namespace rsx
rsx::texture_upload_context context, rsx::texture_dimension_extended type, bool swizzled, component_order swizzle_flags, rsx::flags32_t flags) = 0;
virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, const address_range &rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format, texture_upload_context context,
const std::vector<rsx::subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) = 0;
virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, const image_section_attributes_t& attrs, bool memory_load) = 0;
virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, const image_section_attributes_t& attrs, const GCM_tile_reference& tile, bool memory_load) = 0;
virtual void set_component_order(section_storage_type& section, u32 gcm_format, component_order expected) = 0;
virtual void insert_texture_barrier(commandbuffer_type&, image_storage_type* tex, bool strong_ordering = true) = 0;
virtual image_view_type generate_cubemap_from_images(commandbuffer_type&, u32 gcm_format, u16 size, const std::vector<copy_region_descriptor>& sources, const texture_channel_remap_t& remap_vector) = 0;
@ -2551,11 +2551,10 @@ namespace rsx
src_address += (src.width - src_w) * src_bpp;
}
const auto is_tiled_mem = [&](const utils::address_range& range)
const auto get_tiled_region = [&](const utils::address_range& range)
{
auto rsxthr = rsx::get_current_renderer();
auto region = rsxthr->get_tiled_memory_region(range);
return region.tile != nullptr;
return rsxthr->get_tiled_memory_region(range);
};
auto rtt_lookup = [&m_rtts, &cmd, &scale_x, &scale_y, this](u32 address, u32 width, u32 height, u32 pitch, u8 bpp, rsx::flags32_t access, bool allow_clipped) -> typename surface_store_type::surface_overlap_info
@ -2662,8 +2661,10 @@ namespace rsx
};
// Check tiled mem
const auto dst_is_tiled = is_tiled_mem(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
const auto src_is_tiled = is_tiled_mem(utils::address_range::start_length(src_address, src.pitch * src.height));
const auto dst_tile = get_tiled_region(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
const auto src_tile = get_tiled_region(utils::address_range::start_length(src_address, src.pitch * src.height));
const auto dst_is_tiled = !!dst_tile;
const auto src_is_tiled = !!src_tile;
// Check if src/dst are parts of render targets
typename surface_store_type::surface_overlap_info dst_subres;
@ -3219,9 +3220,10 @@ namespace rsx
{
.pitch = dst.pitch,
.width = static_cast<u16>(dst_dimensions.width),
.height = static_cast<u16>(dst_dimensions.height)
.height = static_cast<u16>(dst_dimensions.height),
.bpp = dst_bpp
};
cached_dest = create_nul_section(cmd, rsx_range, attrs, force_dma_load);
cached_dest = create_nul_section(cmd, rsx_range, attrs, dst_tile, force_dma_load);
}
else
{

View file

@ -712,6 +712,7 @@ namespace gl
gl::command_context& /*cmd*/,
const utils::address_range& rsx_range,
const rsx::image_section_attributes_t& attrs,
const rsx::GCM_tile_reference& /*tile*/,
bool /*memory_load*/) override
{
auto& cached = *find_cached_texture(rsx_range, { .gcm_format = RSX_GCM_FORMAT_IGNORED }, true, false, false);

View file

@ -512,6 +512,7 @@ namespace vk
{
u32 tile_base_address;
u32 tile_base_offset;
u32 tile_rw_offset;
u32 tile_size;
u32 tile_pitch;
u32 bank;
@ -643,8 +644,9 @@ namespace vk
params.factor = factor;
params.num_tiles_per_row = tiles_per_row;
params.tile_base_address = config.tile_base_address;
params.tile_rw_offset = config.tile_rw_offset;
params.tile_size = config.tile_size;
params.tile_offset = config.tile_base_offset;
params.tile_address_offset = config.tile_base_offset;
params.tile_pitch = config.tile_pitch;
params.tile_bank = config.bank;
params.image_width = config.image_width;

View file

@ -174,7 +174,7 @@ namespace vk
// NOTE: Do not unmap. This can be extremely slow on some platforms.
}
std::pair<u32, buffer*> dma_block::get(const utils::address_range& range)
dma_mapping_handle dma_block::get(const utils::address_range& range)
{
if (inheritance_info.parent)
{
@ -331,7 +331,7 @@ namespace vk
block->init(*g_render_device, base_address, expected_length);
}
std::pair<u32, vk::buffer*> map_dma(u32 local_address, u32 length)
dma_mapping_handle map_dma(u32 local_address, u32 length)
{
// Not much contention expected here, avoid searching twice
std::lock_guard lock(g_dma_mutex);

View file

@ -4,7 +4,9 @@
namespace vk
{
std::pair<u32, vk::buffer*> map_dma(u32 local_address, u32 length);
using dma_mapping_handle = std::pair<u32, vk::buffer*>;
dma_mapping_handle map_dma(u32 local_address, u32 length);
void load_dma(u32 local_address, u32 length);
void flush_dma(u32 local_address, u32 length);
void unmap_dma(u32 local_address, u32 length);

View file

@ -1274,6 +1274,7 @@ namespace vk
{
.tile_base_address = tiled_region.base_address,
.tile_base_offset = range.start - tiled_region.base_address,
.tile_rw_offset = range.start - tiled_region.base_address, // TODO
.tile_size = tiled_region.tile->size,
.tile_pitch = tiled_region.tile->pitch,
.bank = tiled_region.tile->bank,

View file

@ -95,7 +95,23 @@ namespace vk
const auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(valid_range);
const bool require_tiling = !!tiled_region;
const bool require_gpu_transform = require_format_conversion || pack_unpack_swap_bytes || require_tiling;
auto dma_mapping = vk::map_dma(valid_range.start, valid_range.length());
auto dma_sync_region = valid_range;
dma_mapping_handle dma_mapping = { 0, nullptr };
auto dma_sync = [&dma_sync_region, &dma_mapping](bool load, bool force = false)
{
if (dma_mapping.second && !force)
{
return;
}
dma_mapping = vk::map_dma(dma_sync_region.start, dma_sync_region.length());
if (load)
{
vk::load_dma(dma_sync_region.start, dma_sync_region.length());
}
};
if (require_gpu_transform)
{
@ -104,20 +120,16 @@ namespace vk
const auto task_length = transfer_pitch * src_area.height();
auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
#if !DEBUG_DMA_TILING
if (require_tiling)
{
// Safety padding
working_buffer_length += tiled_region.tile->size;
// Calculate actual section length
const auto available_tile_size = tiled_region.tile->size - (valid_range.start - tiled_region.base_address);
const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64);
section_length = std::min(max_content_size, available_tile_size);
if (section_length > valid_range.length()) [[ likely ]]
{
dma_mapping = vk::map_dma(valid_range.start, section_length);
}
// Calculate actual working section for the memory op
dma_sync_region = tiled_region.tile_align(dma_sync_region);
}
#endif
auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length);
u32 result_offset = 0;
@ -177,14 +189,43 @@ namespace vk
#if !DEBUG_DMA_TILING
// Compute -> Compute barrier
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
// We don't need to calibrate write if two conditions are met:
// 1. The start offset of our 2D region is a multiple of 64 lines
// 2. We use the whole pitch.
// If these conditions are not met, we need to upload the entire tile (or at least the affected tiles wholly)
if (valid_range.start != dma_sync_region.start || real_pitch != tiled_region.tile->pitch)
{
// Tile indices run to the end of the row (full pitch).
// Tiles address outside their 64x64 area too, so we need to actually load the whole thing and "fill in" missing blocks.
// Visualizing "hot" pixels when doing a partial copy is very revealing, there's lots of data from the padding areas to be filled in.
dma_sync(true);
ensure(dma_mapping.second);
// Upload memory to the working buffer
const auto dst_offset = task_length; // Append to the end of the input
VkBufferCopy mem_load{};
mem_load.srcOffset = dma_mapping.first;
mem_load.dstOffset = dst_offset;
mem_load.size = dma_sync_region.length();
vkCmdCopyBuffer(cmd, dma_mapping.second->value, working_buffer->value, 1, &mem_load);
// Transfer -> Compute barrier
// The preceding write into this range comes from vkCmdCopyBuffer, i.e. a transfer write.
// The source access mask must therefore be TRANSFER_WRITE: it has to name an access type that the
// source stage (TRANSFER) can actually perform, otherwise the copied data is not made available to the compute pass.
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, dst_offset, dma_sync_region.length(),
	VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
	VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT);
}
// Prepare payload
const RSX_detiler_config config =
{
.tile_base_address = tiled_region.base_address,
.tile_base_offset = valid_range.start - tiled_region.base_address,
.tile_rw_offset = dma_sync_region.start - tiled_region.base_address,
.tile_size = tiled_region.tile->size,
.tile_pitch = tiled_region.tile->pitch,
.bank = tiled_region.tile->bank,
@ -195,8 +236,8 @@ namespace vk
.src_offset = 0,
// TODO: Check interaction with anti-aliasing
.image_width = width,
.image_height = height,
.image_width = (u16)transfer_width,
.image_height = (u16)transfer_height,
.image_pitch = real_pitch,
.image_bpp = context == rsx::texture_upload_context::dma ? internal_bpp : rsx::get_format_block_size_in_bytes(gcm_format)
};
@ -207,8 +248,30 @@ namespace vk
// Update internal variables
result_offset = task_length;
real_pitch = tiled_region.tile->pitch;
real_pitch = tiled_region.tile->pitch; // We're always copying the full image. In case of partials we're "filling in" blocks, not doing partial 2D copies.
require_rw_barrier = true;
#if 0
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, working_buffer_length,
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
// Debug write
auto scratch_img = vk::get_typeless_helper(VK_FORMAT_B8G8R8A8_UNORM, RSX_FORMAT_CLASS_COLOR, tiled_region.tile->pitch / 4, 768);
scratch_img->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
VkBufferImageCopy dbg_copy{};
dbg_copy.bufferOffset = config.dst_offset;
dbg_copy.imageExtent.width = width;
dbg_copy.imageExtent.height = height;
dbg_copy.imageExtent.depth = 1;
dbg_copy.bufferRowLength = tiled_region.tile->pitch / 4;
dbg_copy.imageSubresource = { .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1 };
vk::copy_buffer_to_image(cmd, working_buffer, scratch_img, dbg_copy);
scratch_img->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
#endif
#endif
}
@ -221,6 +284,8 @@ namespace vk
if (rsx_pitch == real_pitch) [[likely]]
{
dma_sync(false);
VkBufferCopy copy = {};
copy.srcOffset = result_offset;
copy.dstOffset = dma_mapping.first;
@ -229,13 +294,7 @@ namespace vk
}
else
{
if (context != rsx::texture_upload_context::dma)
{
// Partial load for the bits outside the existing image
// NOTE: A true DMA section would have been prepped beforehand
// TODO: Partial range load/flush
vk::load_dma(valid_range.start, section_length);
}
dma_sync(true);
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
@ -255,6 +314,8 @@ namespace vk
}
else
{
dma_sync(false);
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { src->aspect(), 0, 0, 1 };
@ -1011,6 +1072,7 @@ namespace vk
vk::command_buffer& /*cmd*/,
const utils::address_range& rsx_range,
const rsx::image_section_attributes_t& attrs,
const rsx::GCM_tile_reference& tile,
bool memory_load)
{
auto& region = *find_cached_texture(rsx_range, { .gcm_format = RSX_GCM_FORMAT_IGNORED }, true, false, false);
@ -1022,7 +1084,7 @@ namespace vk
region.set_dirty(false);
region.set_unpack_swap_bytes(true);
if (memory_load)
if (memory_load && !tile) // Memory load on DMA tiles will always happen during the actual copy command
{
vk::map_dma(rsx_range.start, rsx_range.length());
vk::load_dma(rsx_range.start, rsx_range.length());

View file

@ -482,7 +482,8 @@ namespace vk
cached_texture_section* create_new_texture(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch,
u32 gcm_format, rsx::texture_upload_context context, rsx::texture_dimension_extended type, bool swizzled, rsx::component_order swizzle_flags, rsx::flags32_t flags) override;
cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, const rsx::image_section_attributes_t& attrs, bool memory_load) override;
cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, const rsx::image_section_attributes_t& attrs,
const rsx::GCM_tile_reference& tile, bool memory_load) override;
cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format,
rsx::texture_upload_context context, const std::vector<rsx::subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) override;