From e95cff0bded90178f2a26157ef48748179caebec Mon Sep 17 00:00:00 2001
From: kd-11
Date: Mon, 2 Oct 2023 23:49:44 +0300
Subject: [PATCH] rsx: Detiler improvements

- Detile on blit src read
- Improve blit engine integration
---
 rpcs3/Emu/RSX/Common/texture_cache.h | 23 +++++-----
 rpcs3/Emu/RSX/VK/VKHelpers.h         |  9 ++++
 rpcs3/Emu/RSX/VK/VKRenderTargets.cpp | 55 +----------------------
 rpcs3/Emu/RSX/VK/VKTexture.cpp       | 65 ++++++++++++++++++++++++++++
 rpcs3/Emu/RSX/VK/VKTextureCache.cpp  | 33 +++++++++++++-
 5 files changed, 117 insertions(+), 68 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h
index 06aa4e388f..671cf8d2da 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache.h
@@ -2548,12 +2548,12 @@ namespace rsx
             src_address += (src.width - src_w) * src_bpp;
         }

-        const auto is_tiled = [&]()
+        const auto is_tiled_mem = [&](const utils::address_range& range)
         {
             auto rsxthr = rsx::get_current_renderer();
-            auto region = rsxthr->get_tiled_memory_region(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
+            auto region = rsxthr->get_tiled_memory_region(range);
             return region.tile != nullptr;
-        }();
+        };

         auto rtt_lookup = [&m_rtts, &cmd, &scale_x, &scale_y, this](u32 address, u32 width, u32 height, u32 pitch, u8 bpp, rsx::flags32_t access, bool allow_clipped) -> typename surface_store_type::surface_overlap_info
         {
@@ -2637,6 +2637,10 @@ namespace rsx
             return true;
         };

+        // Check tiled mem
+        const auto dst_is_tiled = is_tiled_mem(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
+        const auto src_is_tiled = is_tiled_mem(utils::address_range::start_length(src_address, src.pitch * src.height));
+
         // Check if src/dst are parts of render targets
         typename surface_store_type::surface_overlap_info dst_subres;
         bool use_null_region = false;
@@ -2646,7 +2650,6 @@ namespace rsx
         auto src_subres = rtt_lookup(src_address, src_w, src_h, src.pitch, src_bpp, surface_access::transfer_read, false);
         src_is_render_target = src_subres.surface != nullptr;

-
         if (get_location(dst_address) == CELL_GCM_LOCATION_LOCAL)
         {
             // TODO: HACK
@@ -2657,7 +2660,7 @@ namespace rsx
         else
         {
             // Surface exists in local memory.
-            use_null_region = (is_copy_op && !is_format_convert);
+            use_null_region = (is_copy_op && !is_format_convert && !src_is_tiled);

             // Invalidate surfaces in range. Sample tests should catch overlaps in theory.
             m_rtts.invalidate_range(utils::address_range::start_length(dst_address, dst.pitch* dst_h));
@@ -2693,7 +2696,7 @@ namespace rsx
         else
         {
             // Determine whether to perform this transfer on CPU or GPU (src data may not be graphical)
-            const bool is_trivial_copy = is_copy_op && !is_format_convert && !dst.swizzled && !is_tiled;
+            const bool is_trivial_copy = is_copy_op && !is_format_convert && !dst.swizzled && !dst_is_tiled && !src_is_tiled;
             const bool is_block_transfer = (dst_w == src_w && dst_h == src_h && (src.pitch == dst.pitch || src_h == 1));
             const bool is_mirror_op = (dst.scale_x < 0.f || dst.scale_y < 0.f);

@@ -2723,17 +2726,11 @@ namespace rsx
             skip_if_collision_exists = true;
         }

-        if (!g_cfg.video.use_gpu_texture_scaling)
+        if (!g_cfg.video.use_gpu_texture_scaling && !dst_is_tiled && !src_is_tiled)
         {
             if (dst.swizzled)
             {
                 // Swizzle operation requested. Use fallback
-                if (is_tiled)
-                {
-                    // Corner case
-                    // FIXME: We have had hw-accelerated swizzle support for some time now
-                    rsx_log.error("Swizzled write to tiled area.");
-                }

                 return false;
             }
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h
index 63973a785a..0db4c7153f 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.h
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.h
@@ -18,6 +18,11 @@

 #define OCCLUSION_MAX_POOL_SIZE DESCRIPTOR_MAX_DRAW_CALLS

+namespace rsx
+{
+    struct GCM_tile_reference;
+}
+
 namespace vk
 {
     // Forward declarations
@@ -86,6 +91,10 @@ namespace vk
         const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 layer_count, VkImageAspectFlags flags,
         vk::data_heap &upload_heap, u32 heap_align, rsx::flags32_t image_setup_flags);

+    std::pair<vk::buffer*, u32> detile_memory_block(
+        const vk::command_buffer& cmd, const rsx::GCM_tile_reference& tiled_region, const utils::address_range& range,
+        u16 width, u16 height, u8 bpp);
+
     // Other texture management helpers
     void copy_image_to_buffer(const vk::command_buffer& cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, const image_readback_options_t& options = {});
     void copy_buffer_to_image(const vk::command_buffer& cmd, const vk::buffer* src, const vk::image* dst, const VkBufferImageCopy& region);
diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp
index 2f87f175ab..a3fe493e95 100644
--- a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp
+++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp
@@ -709,62 +709,11 @@ namespace vk
             );
             subres.data = std::span(ext_data);
 #else
-            const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address);
-            const auto max_content_size = tiled_region.tile->pitch * utils::align(subres.height_in_block, 64);
-            const auto section_length = std::min(max_content_size, available_tile_size);
-
-            const auto dma_mapping = vk::map_dma(range.start, section_length);
-            vk::load_dma(range.start, section_length);
-            const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data
-            const auto tiled_data_scratch_offset = section_length * 2;
-            const auto linear_data_scratch_offset = 0;
-
-            // Schedule the job
-            const RSX_detiler_config config =
-            {
-                .tile_base_address = tiled_region.base_address,
-                .tile_base_offset = range.start - tiled_region.base_address,
-                .tile_size = tiled_region.tile->size,
-                .tile_pitch = tiled_region.tile->pitch,
-                .bank = tiled_region.tile->bank,
-
-                .dst = scratch_buf,
-                .dst_offset = linear_data_scratch_offset,
-                .src = scratch_buf,
-                .src_offset = section_length * 2,
-
-                .image_width = subres.width_in_block,
-                .image_height = subres.height_in_block,
-                .image_pitch = subres.width_in_block * static_cast<u32>(get_bpp()),
-                .image_bpp = get_bpp()
-            };
-
-            // Transfer
-            VkBufferCopy copy_rgn
-            {
-                .srcOffset = dma_mapping.first,
-                .dstOffset = tiled_data_scratch_offset,
-                .size = section_length
-            };
-            vkCmdCopyBuffer(cmd, dma_mapping.second->value, scratch_buf->value, 1, &copy_rgn);
-
-            // Barrier
-            vk::insert_buffer_memory_barrier(
-                cmd, scratch_buf->value, linear_data_scratch_offset, section_length,
-                VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
-
-            // Detile
-            vk::get_compute_task<vk::cs_tile_memcpy<RSX_detiler_op::decode>>()->run(cmd, config);
-
-            // Barrier
-            vk::insert_buffer_memory_barrier(
-                cmd, scratch_buf->value, linear_data_scratch_offset, subres.width_in_block * get_bpp() * subres.height_in_block,
-                VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT);
+            const auto [scratch_buf, linear_data_scratch_offset] = vk::detile_memory_block(cmd, tiled_region, range, subres.width_in_block, subres.height_in_block, get_bpp());

             // FIXME: !!EVIL!!
             subres.data = { scratch_buf, linear_data_scratch_offset };
+            subres.pitch_in_block = subres.width_in_block;
             upload_flags |= source_is_gpu_resident;
             heap_align = subres.width_in_block * get_bpp();
 #endif
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index 770381767d..d5d9e4df17 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -1252,6 +1252,71 @@ namespace vk
         }
     }

+    std::pair<vk::buffer*, u32> detile_memory_block(const vk::command_buffer& cmd, const rsx::GCM_tile_reference& tiled_region,
+        const utils::address_range& range, u16 width, u16 height, u8 bpp)
+    {
+        // Calculate the true length of the usable memory section
+        const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address);
+        const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64);
+        const auto section_length = std::min(max_content_size, available_tile_size);
+
+        // Sync the DMA layer
+        const auto dma_mapping = vk::map_dma(range.start, section_length);
+        vk::load_dma(range.start, section_length);
+
+        // Allocate scratch and prepare for the GPU job
+        const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data
+        const auto tiled_data_scratch_offset = section_length * 2;
+        const auto linear_data_scratch_offset = 0u;
+
+        // Schedule the job
+        const RSX_detiler_config config =
+        {
+            .tile_base_address = tiled_region.base_address,
+            .tile_base_offset = range.start - tiled_region.base_address,
+            .tile_size = tiled_region.tile->size,
+            .tile_pitch = tiled_region.tile->pitch,
+            .bank = tiled_region.tile->bank,
+
+            .dst = scratch_buf,
+            .dst_offset = linear_data_scratch_offset,
+            .src = scratch_buf,
+            .src_offset = section_length * 2,
+
+            .image_width = width,
+            .image_height = height,
+            .image_pitch = static_cast<u32>(width) * bpp,
+            .image_bpp = bpp
+        };
+
+        // Transfer
+        VkBufferCopy copy_rgn
+        {
+            .srcOffset = dma_mapping.first,
+            .dstOffset = tiled_data_scratch_offset,
+            .size = section_length
+        };
+        vkCmdCopyBuffer(cmd, dma_mapping.second->value, scratch_buf->value, 1, &copy_rgn);
+
+        // Barrier
+        vk::insert_buffer_memory_barrier(
+            cmd, scratch_buf->value, linear_data_scratch_offset, section_length,
+            VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+            VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+        // Detile
+        vk::get_compute_task<vk::cs_tile_memcpy<RSX_detiler_op::decode>>()->run(cmd, config);
+
+        // Barrier
+        vk::insert_buffer_memory_barrier(
+            cmd, scratch_buf->value, linear_data_scratch_offset, static_cast<u32>(width) * height * bpp,
+            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+            VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT);
+
+        // Return a descriptor pointing to the detiled data
+        return { scratch_buf, linear_data_scratch_offset };
+    }
+
     void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
     {
         vk::image* real_src = src;
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
index 2371aa9a8e..ea47e308fe 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
@@ -1050,9 +1050,38 @@ namespace vk
             upload_command_flags |= upload_contents_async;
         }

+        std::vector<rsx::subresource_layout> tmp;
+        auto p_subresource_layout = &subresource_layout;
+        u32 heap_align = upload_heap_align_default;
+
+        if (auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(rsx_range);
+            context == rsx::texture_upload_context::blit_engine_src && tiled_region)
+        {
+            if (mipmaps > 1)
+            {
+                // This really shouldn't happen on framebuffer tiled memory
+                rsx_log.error("Tiled decode of mipmapped textures is not supported.");
+            }
+            else
+            {
+                const auto bpp = rsx::get_format_block_size_in_bytes(gcm_format);
+                const auto [scratch_buf, linear_data_scratch_offset] = vk::detile_memory_block(cmd, tiled_region, rsx_range, width, height, bpp);
+
+                auto subres = subresource_layout.front();
+                // FIXME: !!EVIL!!
+                subres.data = { scratch_buf, linear_data_scratch_offset };
+                subres.pitch_in_block = width;
+                upload_command_flags |= source_is_gpu_resident;
+                heap_align = width * bpp;
+
+                tmp.push_back(subres);
+                p_subresource_layout = &tmp;
+            }
+        }
+
         const u16 layer_count = (type == rsx::texture_dimension_extended::texture_dimension_cubemap) ? 6 : 1;
-        vk::upload_image(cmd, image, subresource_layout, gcm_format, input_swizzled, layer_count, image->aspect(),
-            *m_texture_upload_heap, upload_heap_align_default, upload_command_flags);
+        vk::upload_image(cmd, image, *p_subresource_layout, gcm_format, input_swizzled, layer_count, image->aspect(),
+            *m_texture_upload_heap, heap_align, upload_command_flags);

         vk::leave_uninterruptible();
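
Note on the scratch layout used by the new vk::detile_memory_block() helper: it makes one scratch allocation of section_length * 3 bytes and splits it into three equal slots (linear output at offset 0, deswizzle padding in the middle, raw tiled input at section_length * 2), where section_length is clamped so the transfer never reads past the end of the tile. The snippet below is only a self-contained sketch of that arithmetic with plain integers; scratch_layout, plan_detile_scratch and align64 are illustrative names introduced here, not rpcs3 APIs.

    #include <algorithm>
    #include <cstdint>

    // Round a value up to a multiple of 'alignment' (stand-in for utils::align).
    static std::uint32_t align64(std::uint32_t value, std::uint32_t alignment)
    {
        return (value + alignment - 1) / alignment * alignment;
    }

    struct scratch_layout
    {
        std::uint32_t section_length;     // bytes actually read from the tiled region
        std::uint32_t linear_data_offset; // slot 0: detiled (linear) output
        std::uint32_t tiled_data_offset;  // slot 2: raw tiled input (slot 1 is deswizzle padding)
        std::uint32_t total_size;         // size passed to the scratch allocator
    };

    // Mirrors the sizing logic of detile_memory_block() in the patch above.
    static scratch_layout plan_detile_scratch(
        std::uint32_t tile_size, std::uint32_t tile_pitch,
        std::uint32_t start_offset_in_tile, std::uint16_t image_height)
    {
        // Usable bytes from the requested start offset to the end of the tile
        const std::uint32_t available_tile_size = tile_size - start_offset_in_tile;
        // Upper bound on the content: pitch times the height aligned up to 64 lines, as in the patch
        const std::uint32_t max_content_size = tile_pitch * align64(image_height, 64);
        const std::uint32_t section_length = std::min(max_content_size, available_tile_size);

        return scratch_layout{
            .section_length = section_length,
            .linear_data_offset = 0,                 // 0 = linear data
            .tiled_data_offset = section_length * 2, // 2 = tiled data
            .total_size = section_length * 3,
        };
    }

Clamping to available_tile_size matters because range.start can sit partway into the tile, so pitch multiplied by the aligned height could otherwise overshoot the bytes remaining in the tile.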