mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-21 12:05:23 +00:00
rsx/vk: Fix GPU tile encoding
- Decoding also needs some love, but currently WIP
This commit is contained in:
parent
b8ed3eb824
commit
e5c831a800
8 changed files with 107 additions and 36 deletions
|
@ -475,7 +475,7 @@ namespace rsx
|
|||
rsx::texture_upload_context context, rsx::texture_dimension_extended type, bool swizzled, component_order swizzle_flags, rsx::flags32_t flags) = 0;
|
||||
virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, const address_range &rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format, texture_upload_context context,
|
||||
const std::vector<rsx::subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) = 0;
|
||||
virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, const image_section_attributes_t& attrs, bool memory_load) = 0;
|
||||
virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, const image_section_attributes_t& attrs, const GCM_tile_reference& tile, bool memory_load) = 0;
|
||||
virtual void set_component_order(section_storage_type& section, u32 gcm_format, component_order expected) = 0;
|
||||
virtual void insert_texture_barrier(commandbuffer_type&, image_storage_type* tex, bool strong_ordering = true) = 0;
|
||||
virtual image_view_type generate_cubemap_from_images(commandbuffer_type&, u32 gcm_format, u16 size, const std::vector<copy_region_descriptor>& sources, const texture_channel_remap_t& remap_vector) = 0;
|
||||
|
@ -2551,11 +2551,10 @@ namespace rsx
|
|||
src_address += (src.width - src_w) * src_bpp;
|
||||
}
|
||||
|
||||
const auto is_tiled_mem = [&](const utils::address_range& range)
|
||||
const auto get_tiled_region = [&](const utils::address_range& range)
|
||||
{
|
||||
auto rsxthr = rsx::get_current_renderer();
|
||||
auto region = rsxthr->get_tiled_memory_region(range);
|
||||
return region.tile != nullptr;
|
||||
return rsxthr->get_tiled_memory_region(range);
|
||||
};
|
||||
|
||||
auto rtt_lookup = [&m_rtts, &cmd, &scale_x, &scale_y, this](u32 address, u32 width, u32 height, u32 pitch, u8 bpp, rsx::flags32_t access, bool allow_clipped) -> typename surface_store_type::surface_overlap_info
|
||||
|
@ -2662,8 +2661,10 @@ namespace rsx
|
|||
};
|
||||
|
||||
// Check tiled mem
|
||||
const auto dst_is_tiled = is_tiled_mem(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
|
||||
const auto src_is_tiled = is_tiled_mem(utils::address_range::start_length(src_address, src.pitch * src.height));
|
||||
const auto dst_tile = get_tiled_region(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
|
||||
const auto src_tile = get_tiled_region(utils::address_range::start_length(src_address, src.pitch * src.height));
|
||||
const auto dst_is_tiled = !!dst_tile;
|
||||
const auto src_is_tiled = !!src_tile;
|
||||
|
||||
// Check if src/dst are parts of render targets
|
||||
typename surface_store_type::surface_overlap_info dst_subres;
|
||||
|
@ -3219,9 +3220,10 @@ namespace rsx
|
|||
{
|
||||
.pitch = dst.pitch,
|
||||
.width = static_cast<u16>(dst_dimensions.width),
|
||||
.height = static_cast<u16>(dst_dimensions.height)
|
||||
.height = static_cast<u16>(dst_dimensions.height),
|
||||
.bpp = dst_bpp
|
||||
};
|
||||
cached_dest = create_nul_section(cmd, rsx_range, attrs, force_dma_load);
|
||||
cached_dest = create_nul_section(cmd, rsx_range, attrs, dst_tile, force_dma_load);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -712,6 +712,7 @@ namespace gl
|
|||
gl::command_context& /*cmd*/,
|
||||
const utils::address_range& rsx_range,
|
||||
const rsx::image_section_attributes_t& attrs,
|
||||
const rsx::GCM_tile_reference& /*tile*/,
|
||||
bool /*memory_load*/) override
|
||||
{
|
||||
auto& cached = *find_cached_texture(rsx_range, { .gcm_format = RSX_GCM_FORMAT_IGNORED }, true, false, false);
|
||||
|
|
|
@ -512,6 +512,7 @@ namespace vk
|
|||
{
|
||||
u32 tile_base_address;
|
||||
u32 tile_base_offset;
|
||||
u32 tile_rw_offset;
|
||||
u32 tile_size;
|
||||
u32 tile_pitch;
|
||||
u32 bank;
|
||||
|
@ -643,8 +644,9 @@ namespace vk
|
|||
params.factor = factor;
|
||||
params.num_tiles_per_row = tiles_per_row;
|
||||
params.tile_base_address = config.tile_base_address;
|
||||
params.tile_rw_offset = config.tile_rw_offset;
|
||||
params.tile_size = config.tile_size;
|
||||
params.tile_offset = config.tile_base_offset;
|
||||
params.tile_address_offset = config.tile_base_offset;
|
||||
params.tile_pitch = config.tile_pitch;
|
||||
params.tile_bank = config.bank;
|
||||
params.image_width = config.image_width;
|
||||
|
|
|
@ -174,7 +174,7 @@ namespace vk
|
|||
// NOTE: Do not unmap. This can be extremely slow on some platforms.
|
||||
}
|
||||
|
||||
std::pair<u32, buffer*> dma_block::get(const utils::address_range& range)
|
||||
dma_mapping_handle dma_block::get(const utils::address_range& range)
|
||||
{
|
||||
if (inheritance_info.parent)
|
||||
{
|
||||
|
@ -331,7 +331,7 @@ namespace vk
|
|||
block->init(*g_render_device, base_address, expected_length);
|
||||
}
|
||||
|
||||
std::pair<u32, vk::buffer*> map_dma(u32 local_address, u32 length)
|
||||
dma_mapping_handle map_dma(u32 local_address, u32 length)
|
||||
{
|
||||
// Not much contention expected here, avoid searching twice
|
||||
std::lock_guard lock(g_dma_mutex);
|
||||
|
|
|
@ -4,7 +4,9 @@
|
|||
|
||||
namespace vk
|
||||
{
|
||||
std::pair<u32, vk::buffer*> map_dma(u32 local_address, u32 length);
|
||||
using dma_mapping_handle = std::pair<u32, vk::buffer*>;
|
||||
|
||||
dma_mapping_handle map_dma(u32 local_address, u32 length);
|
||||
void load_dma(u32 local_address, u32 length);
|
||||
void flush_dma(u32 local_address, u32 length);
|
||||
void unmap_dma(u32 local_address, u32 length);
|
||||
|
|
|
@ -1274,6 +1274,7 @@ namespace vk
|
|||
{
|
||||
.tile_base_address = tiled_region.base_address,
|
||||
.tile_base_offset = range.start - tiled_region.base_address,
|
||||
.tile_rw_offset = range.start - tiled_region.base_address, // TODO
|
||||
.tile_size = tiled_region.tile->size,
|
||||
.tile_pitch = tiled_region.tile->pitch,
|
||||
.bank = tiled_region.tile->bank,
|
||||
|
|
|
@ -95,7 +95,23 @@ namespace vk
|
|||
const auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(valid_range);
|
||||
const bool require_tiling = !!tiled_region;
|
||||
const bool require_gpu_transform = require_format_conversion || pack_unpack_swap_bytes || require_tiling;
|
||||
auto dma_mapping = vk::map_dma(valid_range.start, valid_range.length());
|
||||
|
||||
auto dma_sync_region = valid_range;
|
||||
dma_mapping_handle dma_mapping = { 0, nullptr };
|
||||
|
||||
auto dma_sync = [&dma_sync_region, &dma_mapping](bool load, bool force = false)
|
||||
{
|
||||
if (dma_mapping.second && !force)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
dma_mapping = vk::map_dma(dma_sync_region.start, dma_sync_region.length());
|
||||
if (load)
|
||||
{
|
||||
vk::load_dma(dma_sync_region.start, dma_sync_region.length());
|
||||
}
|
||||
};
|
||||
|
||||
if (require_gpu_transform)
|
||||
{
|
||||
|
@ -104,20 +120,16 @@ namespace vk
|
|||
const auto task_length = transfer_pitch * src_area.height();
|
||||
auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
|
||||
|
||||
#if !DEBUG_DMA_TILING
|
||||
if (require_tiling)
|
||||
{
|
||||
// Safety padding
|
||||
working_buffer_length += tiled_region.tile->size;
|
||||
|
||||
// Calculate actual section length
|
||||
const auto available_tile_size = tiled_region.tile->size - (valid_range.start - tiled_region.base_address);
|
||||
const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64);
|
||||
section_length = std::min(max_content_size, available_tile_size);
|
||||
|
||||
if (section_length > valid_range.length()) [[ likely ]]
|
||||
{
|
||||
dma_mapping = vk::map_dma(valid_range.start, section_length);
|
||||
}
|
||||
// Calculate actual working section for the memory op
|
||||
dma_sync_region = tiled_region.tile_align(dma_sync_region);
|
||||
}
|
||||
#endif
|
||||
|
||||
auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length);
|
||||
u32 result_offset = 0;
|
||||
|
@ -177,14 +189,43 @@ namespace vk
|
|||
#if !DEBUG_DMA_TILING
|
||||
// Compute -> Compute barrier
|
||||
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||
|
||||
// We don't need to calibrate write if two conditions are met:
|
||||
// 1. The start offset of our 2D region is a multiple of 64 lines
|
||||
// 2. We use the whole pitch.
|
||||
// If these conditions are not met, we need to upload the entire tile (or at least the affected tiles wholly)
|
||||
|
||||
if (valid_range.start != dma_sync_region.start || real_pitch != tiled_region.tile->pitch)
|
||||
{
|
||||
// Tile indices run to the end of the row (full pitch).
|
||||
// Tiles address outside their 64x64 area too, so we need to actually load the whole thing and "fill in" missing blocks.
|
||||
// Visualizing "hot" pixels when doing a partial copy is very revealing: there is a lot of data from the padding areas that needs to be filled in.
|
||||
|
||||
dma_sync(true);
|
||||
ensure(dma_mapping.second);
|
||||
|
||||
// Upload memory to the working buffer
|
||||
const auto dst_offset = task_length; // Append to the end of the input
|
||||
VkBufferCopy mem_load{};
|
||||
mem_load.srcOffset = dma_mapping.first;
|
||||
mem_load.dstOffset = dst_offset;
|
||||
mem_load.size = dma_sync_region.length();
|
||||
vkCmdCopyBuffer(cmd, dma_mapping.second->value, working_buffer->value, 1, &mem_load);
|
||||
|
||||
// Transfer -> Compute barrier
|
||||
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, dst_offset, dma_sync_region.length(),
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT);
|
||||
}
|
||||
|
||||
// Prepare payload
|
||||
const RSX_detiler_config config =
|
||||
{
|
||||
.tile_base_address = tiled_region.base_address,
|
||||
.tile_base_offset = valid_range.start - tiled_region.base_address,
|
||||
.tile_rw_offset = dma_sync_region.start - tiled_region.base_address,
|
||||
.tile_size = tiled_region.tile->size,
|
||||
.tile_pitch = tiled_region.tile->pitch,
|
||||
.bank = tiled_region.tile->bank,
|
||||
|
@ -195,8 +236,8 @@ namespace vk
|
|||
.src_offset = 0,
|
||||
|
||||
// TODO: Check interaction with anti-aliasing
|
||||
.image_width = width,
|
||||
.image_height = height,
|
||||
.image_width = (u16)transfer_width,
|
||||
.image_height = (u16)transfer_height,
|
||||
.image_pitch = real_pitch,
|
||||
.image_bpp = context == rsx::texture_upload_context::dma ? internal_bpp : rsx::get_format_block_size_in_bytes(gcm_format)
|
||||
};
|
||||
|
@ -207,8 +248,30 @@ namespace vk
|
|||
|
||||
// Update internal variables
|
||||
result_offset = task_length;
|
||||
real_pitch = tiled_region.tile->pitch;
|
||||
real_pitch = tiled_region.tile->pitch; // We're always copying the full image. In case of partials we're "filling in" blocks, not doing partial 2D copies.
|
||||
require_rw_barrier = true;
|
||||
|
||||
#if 0
|
||||
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, working_buffer_length,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
|
||||
|
||||
// Debug write
|
||||
auto scratch_img = vk::get_typeless_helper(VK_FORMAT_B8G8R8A8_UNORM, RSX_FORMAT_CLASS_COLOR, tiled_region.tile->pitch / 4, 768);
|
||||
scratch_img->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
|
||||
|
||||
VkBufferImageCopy dbg_copy{};
|
||||
dbg_copy.bufferOffset = config.dst_offset;
|
||||
dbg_copy.imageExtent.width = width;
|
||||
dbg_copy.imageExtent.height = height;
|
||||
dbg_copy.imageExtent.depth = 1;
|
||||
dbg_copy.bufferRowLength = tiled_region.tile->pitch / 4;
|
||||
dbg_copy.imageSubresource = { .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1 };
|
||||
vk::copy_buffer_to_image(cmd, working_buffer, scratch_img, dbg_copy);
|
||||
|
||||
scratch_img->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -221,6 +284,8 @@ namespace vk
|
|||
|
||||
if (rsx_pitch == real_pitch) [[likely]]
|
||||
{
|
||||
dma_sync(false);
|
||||
|
||||
VkBufferCopy copy = {};
|
||||
copy.srcOffset = result_offset;
|
||||
copy.dstOffset = dma_mapping.first;
|
||||
|
@ -229,13 +294,7 @@ namespace vk
|
|||
}
|
||||
else
|
||||
{
|
||||
if (context != rsx::texture_upload_context::dma)
|
||||
{
|
||||
// Partial load for the bits outside the existing image
|
||||
// NOTE: A true DMA section would have been prepped beforehand
|
||||
// TODO: Partial range load/flush
|
||||
vk::load_dma(valid_range.start, section_length);
|
||||
}
|
||||
dma_sync(true);
|
||||
|
||||
std::vector<VkBufferCopy> copy;
|
||||
copy.reserve(transfer_height);
|
||||
|
@ -255,6 +314,8 @@ namespace vk
|
|||
}
|
||||
else
|
||||
{
|
||||
dma_sync(false);
|
||||
|
||||
VkBufferImageCopy region = {};
|
||||
region.bufferRowLength = (rsx_pitch / internal_bpp);
|
||||
region.imageSubresource = { src->aspect(), 0, 0, 1 };
|
||||
|
@ -1011,6 +1072,7 @@ namespace vk
|
|||
vk::command_buffer& /*cmd*/,
|
||||
const utils::address_range& rsx_range,
|
||||
const rsx::image_section_attributes_t& attrs,
|
||||
const rsx::GCM_tile_reference& tile,
|
||||
bool memory_load)
|
||||
{
|
||||
auto& region = *find_cached_texture(rsx_range, { .gcm_format = RSX_GCM_FORMAT_IGNORED }, true, false, false);
|
||||
|
@ -1022,7 +1084,7 @@ namespace vk
|
|||
region.set_dirty(false);
|
||||
region.set_unpack_swap_bytes(true);
|
||||
|
||||
if (memory_load)
|
||||
if (memory_load && !tile) // Memory load on DMA tiles will always happen during the actual copy command
|
||||
{
|
||||
vk::map_dma(rsx_range.start, rsx_range.length());
|
||||
vk::load_dma(rsx_range.start, rsx_range.length());
|
||||
|
|
|
@ -482,7 +482,8 @@ namespace vk
|
|||
cached_texture_section* create_new_texture(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch,
|
||||
u32 gcm_format, rsx::texture_upload_context context, rsx::texture_dimension_extended type, bool swizzled, rsx::component_order swizzle_flags, rsx::flags32_t flags) override;
|
||||
|
||||
cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, const rsx::image_section_attributes_t& attrs, bool memory_load) override;
|
||||
cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, const rsx::image_section_attributes_t& attrs,
|
||||
const rsx::GCM_tile_reference& tile, bool memory_load) override;
|
||||
|
||||
cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format,
|
||||
rsx::texture_upload_context context, const std::vector<rsx::subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) override;
|
||||
|
|
Loading…
Add table
Reference in a new issue