diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index 99a0772eed..74455557e8 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -358,6 +358,7 @@ target_sources(rpcs3_emu PRIVATE if(TARGET 3rdparty_vulkan) target_sources(rpcs3_emu PRIVATE RSX/VK/VKCommonDecompiler.cpp + RSX/VK/VKDMA.cpp RSX/VK/VKFormats.cpp RSX/VK/VKFragmentProgram.cpp RSX/VK/VKFramebuffer.cpp diff --git a/rpcs3/Emu/RSX/Common/texture_cache_utils.h b/rpcs3/Emu/RSX/Common/texture_cache_utils.h index 12afc893f8..b5b29910f7 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_utils.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_utils.h @@ -1401,7 +1401,7 @@ namespace rsx } } - void imp_flush() + virtual void imp_flush() { AUDIT(synchronized); diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp new file mode 100644 index 0000000000..717eb154fb --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp @@ -0,0 +1,372 @@ +#include "stdafx.h" +#include "VKHelpers.h" +#include "VKResourceManager.h" +#include "VKDMA.h" + +namespace vk +{ + static constexpr size_t s_dma_block_length = 0x01000000; + static constexpr u32 s_dma_block_mask = 0xFF000000; + static constexpr u32 s_dma_offset_mask = 0x00FFFFFF; + + static constexpr u32 s_page_size = 16384; + static constexpr u32 s_page_align = s_page_size - 1; + static constexpr u32 s_pages_per_entry = 32; + static constexpr u32 s_bits_per_page = 2; + static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry); + + std::unordered_map g_dma_pool; + + void* dma_block::map_range(const utils::address_range& range) + { + if (inheritance_info.parent) + { + return inheritance_info.parent->map_range(range); + } + + verify(HERE), range.start >= base_address; + u32 start = range.start; + start -= base_address; + return allocated_memory->map(start, range.length()); + } + + void dma_block::unmap() + { + if (inheritance_info.parent) + { + inheritance_info.parent->unmap(); + } + else + { + allocated_memory->unmap(); + } + } + + void dma_block::init(const render_device& dev, u32 addr, size_t size) + { + verify(HERE), size, !(size % s_dma_block_length); + base_address = addr; + + allocated_memory = std::make_unique(dev, size, + dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); + + page_info.resize(size / s_bytes_per_entry, ~0ull); + } + + void dma_block::init(dma_block* parent, u32 addr, size_t size) + { + base_address = addr; + inheritance_info.parent = parent; + inheritance_info.block_offset = (addr - parent->base_address); + } + + void dma_block::set_page_bit(u32 offset, u64 bits) + { + const auto entry = (offset / s_bytes_per_entry); + const auto word = entry / s_pages_per_entry; + const auto shift = (entry % s_pages_per_entry) * s_bits_per_page; + + page_info[word] &= ~(3 << shift); + page_info[word] |= (bits << shift); + } + + bool dma_block::test_page_bit(u32 offset, u64 bits) + { + const auto entry = (offset / s_bytes_per_entry); + const auto word = entry / s_pages_per_entry; + const auto shift = (entry % s_pages_per_entry) * s_bits_per_page; + + return !!(page_info[word] & (bits << shift)); + } + + void dma_block::mark_dirty(const utils::address_range& range) + { + if (!inheritance_info.parent) + { + const u32 start = align(range.start, s_page_size); + const u32 end = ((range.end + 1) & s_page_align); + + for (u32 page = start; page < end; page += s_page_size) + { + set_page_bit(page - base_address, page_bits::dirty); + } + + if (UNLIKELY(start > range.start)) + { + set_page_bit(start - s_page_size, page_bits::nocache); + } + + if (UNLIKELY(end < range.end)) + { + set_page_bit(end + s_page_size, page_bits::nocache); + } + } + else + { + inheritance_info.parent->mark_dirty(range); + } + } + + void dma_block::set_page_info(u32 page_offset, const std::vector& bits) + { + if (!inheritance_info.parent) + { + auto bit_offset = page_offset / s_bytes_per_entry; + verify(HERE), (bit_offset + bits.size()) <= page_info.size(); + std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size()); + } + else + { + inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits); + } + } + + void dma_block::flush(const utils::address_range& range) + { + auto src = map_range(range); + auto dst = vm::get_super_ptr(range.start); + std::memcpy(dst, src, range.length()); + + // TODO: Clear page bits + unmap(); + } + + void dma_block::load(const utils::address_range& range) + { + auto src = vm::get_super_ptr(range.start); + auto dst = map_range(range); + std::memcpy(dst, src, range.length()); + + // TODO: Clear page bits to sychronized + unmap(); + } + + std::pair dma_block::get(const utils::address_range& range) + { + if (inheritance_info.parent) + { + return inheritance_info.parent->get(range); + } + + verify(HERE), range.start >= base_address, range.end <= end(); + + // mark_dirty(range); + return { (range.start - base_address), allocated_memory.get() }; + } + + dma_block* dma_block::head() + { + if (!inheritance_info.parent) + return this; + + return inheritance_info.parent->head(); + } + + const dma_block* dma_block::head() const + { + if (!inheritance_info.parent) + return this; + + return inheritance_info.parent->head(); + } + + void dma_block::set_parent(command_buffer& cmd, dma_block* parent) + { + verify(HERE), parent; + if (inheritance_info.parent == parent) + { + // Nothing to do + return; + } + + inheritance_info.parent = parent; + inheritance_info.block_offset = (base_address - parent->base_address); + + if (allocated_memory) + { + VkBufferCopy copy{}; + copy.srcOffset = 0; + copy.dstOffset = inheritance_info.block_offset; + copy.size = allocated_memory->size(); + vkCmdCopyBuffer(cmd, allocated_memory->value, parent->allocated_memory->value, 1, ©); + + auto gc = vk::get_resource_manager(); + gc->dispose(allocated_memory); + + parent->set_page_info(inheritance_info.block_offset, page_info); + page_info.clear(); + } + } + + void dma_block::extend(command_buffer& cmd, const render_device &dev, size_t new_size) + { + verify(HERE), allocated_memory; + if (new_size <= allocated_memory->size()) + return; + + const auto required_entries = new_size / s_bytes_per_entry; + page_info.resize(required_entries, ~0ull); + + auto new_allocation = std::make_unique(dev, new_size, + dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); + + VkBufferCopy copy{}; + copy.size = allocated_memory->size(); + vkCmdCopyBuffer(cmd, allocated_memory->value, new_allocation->value, 1, ©); + + auto gc = vk::get_resource_manager(); + gc->dispose(allocated_memory); + allocated_memory = std::move(new_allocation); + } + + u32 dma_block::start() const + { + return base_address; + } + + u32 dma_block::end() const + { + auto source = head(); + return (source->base_address + source->allocated_memory->size() - 1); + } + + u32 dma_block::size() const + { + return (allocated_memory) ? allocated_memory->size() : 0; + } + + std::pair map_dma(command_buffer& cmd, u32 local_address, u32 length) + { + const auto map_range = utils::address_range::start_length(local_address, length); + const auto first_block = (local_address & s_dma_block_mask); + const auto limit = local_address + length - 1; + auto last_block = (limit & s_dma_block_mask); + + if (LIKELY(first_block == last_block)) + { + if (auto found = g_dma_pool.find(first_block); found != g_dma_pool.end()) + { + return found->second.get(map_range); + } + + auto &block_info = g_dma_pool[first_block]; + block_info.init(*vk::get_current_renderer(), first_block, s_dma_block_length); + return block_info.get(map_range); + } + + dma_block* block_head = nullptr; + auto block_end = align(limit, s_dma_block_length); + + // Reverse scan to try and find the minimum required length in case of other chaining + for (auto block = last_block; block != first_block; block -= s_dma_block_length) + { + if (auto found = g_dma_pool.find(block); found != g_dma_pool.end()) + { + const auto end = found->second.end(); + last_block = std::max(last_block, end & s_dma_block_mask); + block_end = std::max(block_end, end + 1); + break; + } + } + + for (auto block = first_block; block <= last_block; block += s_dma_block_length) + { + auto found = g_dma_pool.find(block); + const bool exists = (found != g_dma_pool.end()); + auto entry = exists ? &found->second : &g_dma_pool[block]; + + if (block == first_block) + { + block_head = entry->head(); + + if (exists) + { + if (entry->end() < limit) + { + auto new_length = block_end - block_head->start(); + block_head->extend(cmd, *vk::get_current_renderer(), new_length); + } + } + else + { + auto required_size = (block_end - block); + block_head->init(*vk::get_current_renderer(), block, required_size); + } + } + else + { + if (exists) + { + entry->set_parent(cmd, block_head); + } + else + { + entry->init(block_head, block, s_dma_block_length); + } + } + } + + verify(HERE), block_head; + return block_head->get(map_range); + } + + template + void sync_dma_impl(u32 local_address, u32 length) + { + const auto limit = local_address + length - 1; + while (length) + { + u32 block = (local_address & s_dma_block_mask); + if (auto found = g_dma_pool.find(block); found != g_dma_pool.end()) + { + const auto sync_end = std::min(limit, found->second.end()); + const auto range = utils::address_range::start_end(local_address, sync_end); + + if constexpr (load) + { + found->second.load(range); + } + else + { + found->second.flush(range); + } + + if (UNLIKELY(sync_end < limit)) + { + // Technically legal but assuming a map->flush usage, this shouldnot happen + // Optimizations could in theory batch together multiple transfers though + LOG_ERROR(RSX, "Sink request spans multiple allocated blocks!"); + const auto write_end = (sync_end + 1u); + const auto written = (write_end - local_address); + length -= written; + local_address = write_end; + continue; + } + + break; + } + else + { + LOG_ERROR(RSX, "Sync command on range not mapped!"); + return; + } + } + } + + void load_dma(u32 local_address, u32 length) + { + sync_dma_impl(local_address, length); + } + + void flush_dma(u32 local_address, u32 length) + { + sync_dma_impl(local_address, length); + } + + void clear_dma_resources() + { + g_dma_pool.clear(); + } +} diff --git a/rpcs3/Emu/RSX/VK/VKDMA.h b/rpcs3/Emu/RSX/VK/VKDMA.h new file mode 100644 index 0000000000..5f6a22e240 --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKDMA.h @@ -0,0 +1,57 @@ +#pragma once +#include "VKHelpers.h" + +namespace vk +{ + std::pair map_dma(command_buffer& cmd, u32 local_address, u32 length); + void load_dma(u32 local_address, u32 length); + void flush_dma(u32 local_address, u32 length); + + void clear_dma_resources(); + + class dma_block + { + enum page_bits + { + synchronized = 0, + dirty = 1, + nocache = 3 + }; + + struct + { + dma_block* parent = nullptr; + u32 block_offset = 0; + } + inheritance_info; + + u32 base_address = 0; + std::unique_ptr allocated_memory; + std::vector page_info; + + void* map_range(const utils::address_range& range); + void unmap(); + + void set_page_bit(u32 page, u64 bits); + bool test_page_bit(u32 page, u64 bits); + void mark_dirty(const utils::address_range& range); + void set_page_info(u32 page_offset, const std::vector& bits); + + public: + + void init(const render_device& dev, u32 addr, size_t size); + void init(dma_block* parent, u32 addr, size_t size); + void flush(const utils::address_range& range); + void load(const utils::address_range& range); + std::pair get(const utils::address_range& range); + + u32 start() const; + u32 end() const; + u32 size() const; + + dma_block* head(); + const dma_block* head() const; + void set_parent(command_buffer& cmd, dma_block* parent); + void extend(command_buffer& cmd, const render_device& dev, size_t new_size); + }; +} diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.cpp b/rpcs3/Emu/RSX/VK/VKHelpers.cpp index e10dc1ca79..3f6d90601a 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.cpp +++ b/rpcs3/Emu/RSX/VK/VKHelpers.cpp @@ -6,6 +6,7 @@ #include "VKFramebuffer.h" #include "VKResolveHelper.h" #include "VKResourceManager.h" +#include "VKDMA.h" #include "Utilities/mutex.h" namespace vk @@ -265,6 +266,7 @@ namespace vk vk::clear_renderpass_cache(dev); vk::clear_framebuffer_cache(); vk::clear_resolve_helpers(); + vk::clear_dma_resources(); vk::get_resource_manager()->destroy(); g_null_texture.reset(); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index b2db9cb4f8..b444870e40 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -4,6 +4,7 @@ #include "VKGSRender.h" #include "VKCompute.h" #include "VKResourceManager.h" +#include "VKDMA.h" #include "Emu/System.h" #include "../Common/TextureUtils.h" #include "Utilities/mutex.h" @@ -39,7 +40,6 @@ namespace vk VkEvent dma_fence = VK_NULL_HANDLE; vk::render_device* m_device = nullptr; vk::viewable_image *vram_texture = nullptr; - std::unique_ptr dma_buffer; public: using baseclass::cached_texture_section; @@ -73,7 +73,7 @@ namespace vk if (!flushed) { // Reset fence - verify(HERE), m_device, dma_buffer, dma_fence; + verify(HERE), m_device, dma_fence; vk::get_resource_manager()->dispose(dma_fence); } @@ -88,10 +88,9 @@ namespace vk void release_dma_resources() { - if (dma_buffer) + if (dma_fence) { auto gc = vk::get_resource_manager(); - gc->dispose(dma_buffer); gc->dispose(dma_fence); } } @@ -187,12 +186,6 @@ namespace vk vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence); } - if (!dma_buffer) - { - auto memory_type = m_device->get_memory_mapping().host_visible_coherent; - dma_buffer = std::make_unique(*m_device, align(get_section_size(), 256), memory_type, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0); - } - vk::image *locked_resource = vram_texture; u32 transfer_width = width; u32 transfer_height = height; @@ -230,21 +223,52 @@ namespace vk verify(HERE), target->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - // Handle any format conversions using compute tasks - vk::cs_shuffle_base *shuffle_kernel = nullptr; + // TODO: Read back stencil values (is this really necessary?) + const auto internal_bpp = vk::get_format_texel_width(vram_texture->format()); + const auto valid_range = get_confirmed_range(); + real_pitch = internal_bpp * transfer_width; - if (vram_texture->format() == VK_FORMAT_D24_UNORM_S8_UINT) + u32 transfer_x = 0, transfer_y = 0; + if (const auto section_range = get_section_range(); section_range != valid_range) { - shuffle_kernel = vk::get_compute_task(); + if (const auto offset = (valid_range.start - get_section_base())) + { + transfer_y = offset / rsx_pitch; + transfer_x = (offset % rsx_pitch) / internal_bpp; + + verify(HERE), transfer_width >= transfer_x, transfer_height >= transfer_y; + transfer_width -= transfer_x; + transfer_height -= transfer_y; + } + + if (const auto tail = (section_range.end - valid_range.end)) + { + const auto row_count = tail / rsx_pitch; + + verify(HERE), transfer_height >= row_count; + transfer_height -= row_count; + } } - else if (vram_texture->format() == VK_FORMAT_D32_SFLOAT_S8_UINT) - { - shuffle_kernel = vk::get_compute_task(); - } - else if (pack_unpack_swap_bytes) + + if ((vram_texture->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || + pack_unpack_swap_bytes) { + const auto section_length = valid_range.length(); + const auto transfer_pitch = transfer_width * internal_bpp; + const auto task_length = transfer_pitch * transfer_height; + + auto working_buffer = vk::get_scratch_buffer(); + auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length); + + VkBufferImageCopy region = {}; + region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 }; + region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 }; + region.imageExtent = { transfer_width, transfer_height, 1 }; + vk::copy_image_to_buffer(cmd, target, working_buffer, region); + const auto texel_layout = vk::get_format_element_size(vram_texture->format()); const auto elem_size = texel_layout.first; + vk::cs_shuffle_base *shuffle_kernel; if (elem_size == 2) { @@ -254,38 +278,60 @@ namespace vk { shuffle_kernel = vk::get_compute_task(); } - } + else + { + fmt::throw_exception("Unreachable" HERE); + } - // Do not run the compute task on host visible memory - vk::buffer* mem_target = shuffle_kernel ? vk::get_scratch_buffer() : dma_buffer.get(); - - // TODO: Read back stencil values (is this really necessary?) - VkBufferImageCopy region = {}; - region.imageSubresource = {vram_texture->aspect() & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1}; - region.imageExtent = {transfer_width, transfer_height, 1}; - vkCmdCopyImageToBuffer(cmd, target->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mem_target->value, 1, ®ion); - - locked_resource->pop_layout(cmd); - real_pitch = vk::get_format_texel_width(vram_texture->format()) * transfer_width; - - if (shuffle_kernel) - { - verify (HERE), mem_target->value != dma_buffer->value; - - vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, get_section_size(), + vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - shuffle_kernel->run(cmd, mem_target, get_section_size()); + shuffle_kernel->run(cmd, working_buffer, task_length); - vk::insert_buffer_memory_barrier(cmd, mem_target->value, 0, get_section_size(), + vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); - VkBufferCopy copy = {}; - copy.size = get_section_size(); - vkCmdCopyBuffer(cmd, mem_target->value, dma_buffer->value, 1, ©); + if (LIKELY(rsx_pitch == real_pitch)) + { + VkBufferCopy copy = {}; + copy.dstOffset = final_mapping.first; + copy.size = section_length; + vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, ©); + } + else + { + std::vector copy; + copy.reserve(transfer_height); + + u32 dst_offset = final_mapping.first; + u32 src_offset = 0; + + for (unsigned row = 0; row < transfer_height; ++row) + { + copy.push_back({src_offset, dst_offset, transfer_pitch}); + src_offset += real_pitch; + dst_offset += rsx_pitch; + } + + vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data()); + } } + else + { + VkBufferImageCopy region = {}; + region.bufferRowLength = (rsx_pitch / internal_bpp); + region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 }; + region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 }; + region.imageExtent = { transfer_width, transfer_height, 1 }; + + auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length()); + region.bufferOffset = mapping.first; + vkCmdCopyImageToBuffer(cmd, target->value, target->current_layout, mapping.second->value, 1, ®ion); + } + + locked_resource->pop_layout(cmd); if (UNLIKELY(synchronized)) { @@ -314,7 +360,7 @@ namespace vk /** * Flush */ - void* map_synchronized(u32 offset, u32 size) + void imp_flush() override { AUDIT(synchronized); @@ -322,12 +368,8 @@ namespace vk vk::wait_for_event(dma_fence, GENERAL_WAIT_TIMEOUT); vkResetEvent(*m_device, dma_fence); - return dma_buffer->map(offset, size); - } - - void finish_flush() - { - dma_buffer->unmap(); + const auto range = get_confirmed_range(); + vk::flush_dma(range.start, range.length()); if (context == rsx::texture_upload_context::framebuffer_storage) { @@ -336,6 +378,11 @@ namespace vk } } + void *map_synchronized(u32, u32) + { return nullptr; } + + void finish_flush() + {} /** * Misc diff --git a/rpcs3/Emu/RSX/rsx_utils.cpp b/rpcs3/Emu/RSX/rsx_utils.cpp index 9e2eec6f35..9ef064dab0 100644 --- a/rpcs3/Emu/RSX/rsx_utils.cpp +++ b/rpcs3/Emu/RSX/rsx_utils.cpp @@ -46,7 +46,7 @@ namespace rsx u8* buf = buffer; // Read the whole buffer from source - for (u32 y = 0; y < clip_h; ++y) + for (int y = 0; y < clip_h; ++y) { std::memcpy(buf, src, buffer_pitch); src += src_pitch; @@ -56,7 +56,7 @@ namespace rsx buf = buffer; // Write to destination - for (u32 y = 0; y < clip_h; ++y) + for (int y = 0; y < clip_h; ++y) { std::memcpy(dst, buf, buffer_pitch); dst += dst_pitch; diff --git a/rpcs3/VKGSRender.vcxproj b/rpcs3/VKGSRender.vcxproj index f23b98303a..08ee0447e1 100644 --- a/rpcs3/VKGSRender.vcxproj +++ b/rpcs3/VKGSRender.vcxproj @@ -25,6 +25,7 @@ + @@ -43,6 +44,7 @@ + diff --git a/rpcs3/VKGSRender.vcxproj.filters b/rpcs3/VKGSRender.vcxproj.filters index 340c233538..2409d4bd26 100644 --- a/rpcs3/VKGSRender.vcxproj.filters +++ b/rpcs3/VKGSRender.vcxproj.filters @@ -58,6 +58,9 @@ Source Files + + Source Files + @@ -105,5 +108,8 @@ Source Files + + Source Files + \ No newline at end of file