vk/dma: Allow interoperability between pass-through and write-back DMA caching types

kd-11 2021-01-19 00:40:56 +03:00 committed by kd-11
commit 67949bb5b7
4 changed files with 123 additions and 144 deletions

View file

@@ -12,16 +12,15 @@ namespace vk
 {
     static constexpr usz s_dma_block_length = 0x00010000;
     static constexpr u32 s_dma_block_mask = 0xFFFF0000;
-    //static constexpr u32 s_dma_offset_mask = 0x0000FFFF;
-    static constexpr u32 s_page_size = 65536;
-    static constexpr u32 s_page_align = s_page_size - 1;
-    static constexpr u32 s_pages_per_entry = 32;
-    static constexpr u32 s_bits_per_page = 2;
-    static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
 
     std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
 
+    dma_block::~dma_block()
+    {
+        // Use safe free (uses gc to clean up)
+        free();
+    }
+
     void* dma_block::map_range(const utils::address_range& range)
     {
         if (inheritance_info.parent)
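Note: the pool above is keyed by the 64 KiB block that owns an address; s_dma_block_mask clears the low 16 bits of a guest address to produce the key, exactly as map_dma does further down. A standalone sketch of that arithmetic (the helper names are illustrative, not part of the patch):

    #include <cstdint>

    // Mirrors s_dma_block_length / s_dma_block_mask above: one block covers 64 KiB
    constexpr std::uint32_t block_mask = 0xFFFF0000;

    // Pool key for the block owning `guest_address`
    inline std::uint32_t block_of(std::uint32_t guest_address)
    {
        return guest_address & block_mask;
    }

    // Byte offset of the address inside its block
    inline std::uint32_t offset_in_block(std::uint32_t guest_address)
    {
        return guest_address & ~block_mask;
    }

    // Last block touched by a transfer of `length` bytes starting at `addr`
    inline std::uint32_t last_block_of(std::uint32_t addr, std::uint32_t length)
    {
        return (addr + length - 1) & block_mask;
    }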
@@ -49,19 +48,24 @@ namespace vk
 
     void dma_block::allocate(const render_device& dev, usz size)
     {
-        if (allocated_memory)
-        {
-            // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-            // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-            auto gc = vk::get_resource_manager();
-            gc->dispose(allocated_memory);
-        }
+        // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+        // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+        free();
 
         allocated_memory = std::make_unique<vk::buffer>(dev, size,
             dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
             VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
     }
 
+    void dma_block::free()
+    {
+        if (allocated_memory)
+        {
+            auto gc = vk::get_resource_manager();
+            gc->dispose(allocated_memory);
+        }
+    }
+
     void dma_block::init(const render_device& dev, u32 addr, usz size)
     {
         ensure(size);
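Note: allocate() (and the dma_block_EXT overload further down) now retires any previous buffer through free(), which hands it to vk::get_resource_manager() instead of deleting it on the spot; the buffer may still be referenced by command buffers in flight. A minimal sketch of that deferred-disposal pattern (the container below is illustrative, not the real vk::resource_manager):

    #include <memory>
    #include <utility>
    #include <vector>

    // Illustrative collector; the real code hands buffers to vk::get_resource_manager()
    template <typename T>
    class deferred_disposer
    {
        std::vector<std::unique_ptr<T>> m_graveyard;

    public:
        // Takes ownership instead of deleting immediately (the GPU may still be using it)
        void dispose(std::unique_ptr<T>& object)
        {
            m_graveyard.emplace_back(std::move(object));
        }

        // Called once the associated submissions are known to be complete
        void collect()
        {
            m_graveyard.clear();
        }
    };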
@@ -69,7 +73,6 @@ namespace vk
 
         base_address = addr;
         allocate(dev, size);
-        page_info.resize(size / s_bytes_per_entry, ~0ull);
     }
 
     void dma_block::init(dma_block* parent, u32 addr, usz size)
@@ -79,67 +82,6 @@ namespace vk
         inheritance_info.block_offset = (addr - parent->base_address);
     }
 
-    void dma_block::set_page_bit(u32 offset, u64 bits)
-    {
-        const auto entry = (offset / s_bytes_per_entry);
-        const auto word = entry / s_pages_per_entry;
-        const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
-
-        page_info[word] &= ~(3 << shift);
-        page_info[word] |= (bits << shift);
-    }
-
-    bool dma_block::test_page_bit(u32 offset, u64 bits)
-    {
-        const auto entry = (offset / s_bytes_per_entry);
-        const auto word = entry / s_pages_per_entry;
-        const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
-
-        return !!(page_info[word] & (bits << shift));
-    }
-
-    void dma_block::mark_dirty(const utils::address_range& range)
-    {
-        if (!inheritance_info.parent)
-        {
-            const u32 start = utils::align(range.start, s_page_size);
-            const u32 end = ((range.end + 1) & s_page_align);
-
-            for (u32 page = start; page < end; page += s_page_size)
-            {
-                set_page_bit(page - base_address, page_bits::dirty);
-            }
-
-            if (start > range.start) [[unlikely]]
-            {
-                set_page_bit(start - s_page_size, page_bits::nocache);
-            }
-
-            if (end < range.end) [[unlikely]]
-            {
-                set_page_bit(end + s_page_size, page_bits::nocache);
-            }
-        }
-        else
-        {
-            inheritance_info.parent->mark_dirty(range);
-        }
-    }
-
-    void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
-    {
-        if (!inheritance_info.parent)
-        {
-            auto bit_offset = page_offset / s_bytes_per_entry;
-            ensure(bit_offset + bits.size() <= page_info.size());
-            std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size());
-        }
-        else
-        {
-            inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
-        }
-    }
-
     void dma_block::flush(const utils::address_range& range)
     {
         auto src = map_range(range);
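For context on what is being deleted: the old write-back path kept a per-page state bitmap, packing 2 bits of state (the synchronized/dirty/nocache values from the page_bits enum removed from the header below) for 32 pages into each u64 of page_info. The removed helpers index by s_bytes_per_entry; the standalone sketch below illustrates the packing the constants describe, indexing by page size for clarity, and is not the original code:

    #include <cstdint>
    #include <vector>

    // 2 bits of state per 64 KiB page, 32 pages packed into each u64
    constexpr std::uint32_t page_size       = 65536;
    constexpr std::uint32_t bits_per_page   = 2;
    constexpr std::uint32_t pages_per_entry = 32;

    void set_page_state(std::vector<std::uint64_t>& page_info, std::uint32_t offset, std::uint64_t state)
    {
        const auto page  = offset / page_size;                       // which page the offset falls in
        const auto word  = page / pages_per_entry;                   // which u64 holds its field
        const auto shift = (page % pages_per_entry) * bits_per_page; // bit position inside that u64

        page_info[word] &= ~(std::uint64_t{3} << shift); // clear the 2-bit field
        page_info[word] |= (state << shift);             // write the new state
    }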
@@ -206,11 +148,10 @@ namespace vk
         {
             // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
             // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-            auto gc = vk::get_resource_manager();
-            gc->dispose(allocated_memory);
+            free();
 
-            parent->set_page_info(inheritance_info.block_offset, page_info);
-            page_info.clear();
+            //parent->set_page_info(inheritance_info.block_offset, page_info);
+            //page_info.clear();
         }
     }
@@ -222,8 +163,8 @@ namespace vk
 
         allocate(dev, new_size);
 
-        const auto required_entries = new_size / s_bytes_per_entry;
-        page_info.resize(required_entries, ~0ull);
+        //const auto required_entries = new_size / s_bytes_per_entry;
+        //page_info.resize(required_entries, ~0ull);
     }
 
     u32 dma_block::start() const
@@ -244,13 +185,9 @@ namespace vk
 
     void dma_block_EXT::allocate(const render_device& dev, usz size)
     {
-        if (allocated_memory)
-        {
-            // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-            // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-            auto gc = vk::get_resource_manager();
-            gc->dispose(allocated_memory);
-        }
+        // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+        // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+        free();
 
         allocated_memory = std::make_unique<vk::buffer>(dev,
             VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
@@ -278,16 +215,53 @@ namespace vk
         // NOP
     }
 
-    void create_dma_block(std::unique_ptr<dma_block>& block)
+    bool test_host_pointer(u32 base_address, usz length)
     {
+#if 0 // Unusable due to vm locks
+        auto block = vm::get(vm::any, base_address);
+        ensure(block);
+
+        if ((block->addr + block->size) < (base_address + length))
+        {
+            return false;
+        }
+
+        if (block->flags & 0x120)
+        {
+            return true;
+        }
+
+        auto range_info = block->peek(base_address, u32(length));
+        return !!range_info.second;
+#endif
+
 #ifdef _WIN32
-        const bool allow_host_buffers = true;
+        MEMORY_BASIC_INFORMATION mem_info;
+        if (!::VirtualQuery(vm::get_super_ptr<const void>(base_address), &mem_info, sizeof(mem_info)))
+        {
+            rsx_log.error("VirtualQuery failed! LastError=0x%x", GetLastError());
+            return false;
+        }
+
+        return (mem_info.RegionSize >= length);
+#else
+        return true; // *nix behavior is unknown with NVIDIA drivers
+#endif
+    }
+
+    void create_dma_block(std::unique_ptr<dma_block>& block, u32 base_address, u32 expected_length)
+    {
+        const auto vendor = g_render_device->gpu().get_driver_vendor();
+#ifdef _WIN32
+        const bool allow_host_buffers = (vendor == driver_vendor::NVIDIA) ?
+            test_host_pointer(base_address, expected_length) :
+            true;
 #else
         // Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
-        const auto vendor = g_render_device->gpu().get_driver_vendor();
         const bool allow_host_buffers = (vendor != driver_vendor::AMD && vendor != driver_vendor::RADV);
 #endif
-        if (g_render_device->get_external_memory_host_support() && allow_host_buffers)
+
+        if (allow_host_buffers && g_render_device->get_external_memory_host_support())
         {
             block.reset(new dma_block_EXT());
         }
@@ -295,6 +269,8 @@ namespace vk
         {
             block.reset(new dma_block());
         }
+
+        block->init(*g_render_device, base_address, expected_length);
     }
 
     std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
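Note on the Windows path of test_host_pointer(): VirtualQuery() reports, in RegionSize, the size of the run of pages starting at the queried address that share the same allocation attributes, so RegionSize >= length is used as a cheap proxy for "the whole range sits inside one host allocation" before importing it as a Vulkan buffer. A standalone sketch of the same probe (illustrative wrapper, not the patch's function):

    #ifdef _WIN32
    #include <windows.h>
    #include <cstddef>

    // True if the whole [base, base + length) range falls inside one region of pages
    // with identical attributes, which is what test_host_pointer() checks above.
    bool range_in_single_region(const void* base, std::size_t length)
    {
        MEMORY_BASIC_INFORMATION info{};
        if (!::VirtualQuery(base, &info, sizeof(info)))
        {
            return false; // query failed; treat the range as unsafe to import
        }

        return info.RegionSize >= length;
    }
    #endif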
@@ -315,25 +291,28 @@ namespace vk
 
         if (first_block == last_block) [[likely]]
         {
             auto &block_info = g_dma_pool[first_block];
-            if (!block_info) create_dma_block(block_info);
+            ensure(!block_info);
+            create_dma_block(block_info, first_block, s_dma_block_length);
 
-            block_info->init(*g_render_device, first_block, s_dma_block_length);
             return block_info->get(map_range);
         }
 
         dma_block* block_head = nullptr;
         auto block_end = utils::align(limit, s_dma_block_length);
 
-        // Reverse scan to try and find the minimum required length in case of other chaining
-        for (auto block = last_block; block != first_block; block -= s_dma_block_length)
+        if (g_render_device->gpu().get_driver_vendor() != driver_vendor::NVIDIA ||
+            rsx::get_location(local_address) == CELL_GCM_LOCATION_LOCAL)
         {
-            if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
+            // Reverse scan to try and find the minimum required length in case of other chaining
+            for (auto block = last_block; block != first_block; block -= s_dma_block_length)
             {
-                const auto end = found->second->end();
-                last_block = std::max(last_block, end & s_dma_block_mask);
-                block_end = std::max(block_end, end + 1);
-
-                break;
+                if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
+                {
+                    const auto end = found->second->end();
+                    last_block = std::max(last_block, end & s_dma_block_mask);
+                    block_end = std::max(block_end, end + 1);
+                    break;
+                }
             }
         }
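The new guard keeps the reverse chaining scan everywhere except on NVIDIA drivers with addresses that do not resolve to local (VRAM) memory, presumably because those main-memory blocks are the ones that can be backed by pass-through host pointers. Expressed as a predicate (hypothetical helper, not in the patch):

    // True means "perform the reverse chaining scan above"
    enum class vendor_kind { nvidia, amd, radv, other };

    bool should_scan_for_chaining(vendor_kind vendor, bool address_is_local_memory)
    {
        // Skip only for NVIDIA when the address lives in main memory; local (VRAM)
        // addresses keep the old behaviour on every vendor.
        return vendor != vendor_kind::nvidia || address_is_local_memory;
    }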
@@ -342,37 +321,31 @@ namespace vk
             auto found = g_dma_pool.find(block);
             auto &entry = g_dma_pool[block];
 
-            const bool exists = !!entry;
-            if (!exists) create_dma_block(entry);
-
             if (block == first_block)
             {
-                block_head = entry->head();
-
-                if (exists)
-                {
-                    if (entry->end() < limit)
-                    {
-                        auto new_length = block_end - block_head->start();
-                        block_head->extend(cmd, *g_render_device, new_length);
-                    }
-                }
-                else
+                if (entry && entry->end() < limit)
                 {
+                    // Then the references to this object do not go to the end of the list as will be done with this new allocation.
+                    // A dumb release is therefore safe...
+                    entry.reset();
+                }
+
+                if (!entry)
+                {
                     auto required_size = (block_end - block);
-                    block_head->init(*g_render_device, block, required_size);
+                    create_dma_block(entry, block, required_size);
                 }
+
+                block_head = entry->head();
+            }
+            else if (entry)
+            {
+                entry->set_parent(cmd, block_head);
             }
             else
             {
-                if (exists)
-                {
-                    entry->set_parent(cmd, block_head);
-                }
-                else
-                {
-                    entry->init(block_head, block, s_dma_block_length);
-                }
+                entry.reset(new dma_block());
+                entry->init(block_head, block, s_dma_block_length);
             }
         }
 
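For reference, the consumer side of map_dma() looks like the texture-upload call site later in this commit: the returned pair is an offset into a buffer that backs the requested range, regardless of whether that buffer is a pass-through host import or a write-back staging allocation. A hedged usage sketch (vk::map_dma, vk::load_dma and vk::buffer::value are as they appear in this diff; the surrounding function and its parameters are made up for the example):

    // Illustrative caller, not part of the patch
    static void upload_range_via_dma(const vk::command_buffer& cmd, u32 guest_address, u32 length, vk::buffer* scratch)
    {
        // Resolve (or lazily create and chain) the blocks backing this range
        const auto [dma_offset, dma_buffer] = vk::map_dma(cmd, guest_address, length);

        // Mirror guest memory into the DMA buffer before the GPU reads it
        vk::load_dma(guest_address, length);

        // The window handed back must cover the whole request (see the ensure() added in
        // the texture upload path later in this commit), so a plain copy can source from it.
        VkBufferCopy copy{};
        copy.srcOffset = dma_offset;
        copy.dstOffset = 0;
        copy.size = length;
        vkCmdCopyBuffer(cmd, dma_buffer->value, scratch->value, 1, &copy);
    }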

View file

@@ -13,13 +13,6 @@ namespace vk
     class dma_block
     {
     protected:
-        enum page_bits
-        {
-            synchronized = 0,
-            dirty = 1,
-            nocache = 3
-        };
-
         struct
         {
             dma_block* parent = nullptr;
@@ -29,19 +22,17 @@ namespace vk
         u32 base_address = 0;
         std::unique_ptr<buffer> allocated_memory;
-        std::vector<u64> page_info;
 
         virtual void allocate(const render_device& dev, usz size);
+        virtual void free();
         virtual void* map_range(const utils::address_range& range);
         virtual void unmap();
 
-        void set_page_bit(u32 page, u64 bits);
-        bool test_page_bit(u32 page, u64 bits);
-        void mark_dirty(const utils::address_range& range);
-        void set_page_info(u32 page_offset, const std::vector<u64>& bits);
-
     public:
+        dma_block() = default;
+        virtual ~dma_block();
+
         virtual void init(const render_device& dev, u32 addr, usz size);
         virtual void init(dma_block* parent, u32 addr, usz size);
         virtual void flush(const utils::address_range& range);
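The slimmed-down interface keeps the parent/child chaining: a child created with init(parent, addr, size) records block_offset = addr - parent->base_address and redirects its accesses into the head block, which is what lets a plain write-back child hang off a pass-through head. A self-contained sketch of that offset bookkeeping (only the block_offset arithmetic is taken from the patch; the rest is illustrative):

    #include <cstdint>

    // Illustrative stand-in for the dma_block parent/child bookkeeping
    struct chained_block
    {
        chained_block* parent = nullptr;
        std::uint32_t base_address = 0;
        std::uint32_t block_offset = 0; // meaningful only when parent != nullptr

        void init_as_child(chained_block* head, std::uint32_t addr)
        {
            parent = head;
            base_address = addr;
            block_offset = addr - head->base_address;
        }

        // A child does not own storage; requests resolve into the head block's buffer
        std::uint32_t resolve(std::uint32_t guest_address) const
        {
            return parent ? parent->resolve(guest_address)
                          : (guest_address - base_address);
        }
    };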

View file

@@ -70,7 +70,6 @@ namespace vk
         vk::clear_resolve_helpers();
         vk::clear_dma_resources();
         vk::vmm_reset();
-        vk::get_resource_manager()->destroy();
 
         vk::clear_scratch_resources();
         vk::get_upload_heap()->destroy();
@@ -86,6 +85,9 @@ namespace vk
             p.second->destroy();
         }
         g_overlay_passes.clear();
+
+        // This must be the last item destroyed
+        vk::get_resource_manager()->destroy();
     }
 
     const vk::render_device *get_current_renderer()
@@ -263,8 +265,6 @@ namespace vk
         return (g_num_processed_frames > 0)? g_num_processed_frames - 1: 0;
     }
 
-
-
     void do_query_cleanup(vk::command_buffer& cmd)
     {
         auto renderer = dynamic_cast<VKGSRender*>(rsx::get_current_renderer());
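Moving vk::get_resource_manager()->destroy() to the end of shutdown matters because the steps above it (DMA, scratch and overlay teardown) may dispose() objects into the resource manager rather than deleting them outright. A standalone sketch of why the collector has to outlive everything that feeds it (illustrative types, not the real vkutils objects):

    #include <functional>
    #include <utility>
    #include <vector>

    // Illustrative collector; not the real vk::resource_manager
    struct disposal_queue
    {
        std::vector<std::function<void()>> pending;

        void dispose(std::function<void()> deleter) { pending.emplace_back(std::move(deleter)); }

        void destroy()
        {
            for (auto& del : pending) del(); // run the queued deletions
            pending.clear();
        }
    };

    void shutdown_sketch(disposal_queue& gc)
    {
        // Each teardown step may still queue deletions into the collector:
        // DMA pool, scratch resources, overlay passes, and so on.

        // ...so the collector itself is flushed only after all of them have run.
        gc.destroy();
    }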

View file

@@ -905,6 +905,8 @@ namespace vk
             }
 
             auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
+            ensure(dma_mapping.second->size() >= (dma_mapping.first + data_length));
+
             vk::load_dma(::narrow<u32>(src_address), data_length);
 
             upload_buffer = dma_mapping.second;
@@ -927,7 +929,7 @@ namespace vk
         }
 
         // Copy from upload heap to scratch mem
-        if (!opt.deferred_cmds.empty())
+        if (opt.require_upload)
         {
             for (const auto& copy_cmd : opt.deferred_cmds)
             {
@@ -953,7 +955,8 @@ namespace vk
                 scratch_offset += image_linear_size;
                 ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
             }
-            else if (opt.require_upload)
+
+            if (opt.require_upload)
             {
                 if (upload_commands.empty() || upload_buffer->value != upload_commands.back().first)
                 {
@@ -974,7 +977,19 @@ namespace vk
             {
                 ensure(scratch_buf);
 
-                vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+                if (upload_commands.size() > 1)
+                {
+                    auto range_ptr = buffer_copies.data();
+                    for (const auto& op : upload_commands)
+                    {
+                        vkCmdCopyBuffer(cmd, op.first, scratch_buf->value, op.second, range_ptr);
+                        range_ptr += op.second;
+                    }
+                }
+                else
+                {
+                    vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+                }
 
                 insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                     VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
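The split above exists because vkCmdCopyBuffer() takes a single source buffer per call: when the upload was staged from more than one source, the flat VkBufferCopy array has to be consumed in per-source runs, with each upload_commands entry recording a source buffer and the number of regions staged from it. A standalone sketch of that walk (illustrative names; the Vulkan calls are the real API):

    #include <vulkan/vulkan.h>

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Each entry mirrors upload_commands: {source buffer, number of regions staged from it}
    using upload_run = std::pair<VkBuffer, std::uint32_t>;

    void copy_in_runs(VkCommandBuffer cmd, VkBuffer dst,
                      const std::vector<upload_run>& runs,
                      const std::vector<VkBufferCopy>& regions)
    {
        const VkBufferCopy* range_ptr = regions.data();
        for (const auto& [src, count] : runs)
        {
            // One call per source buffer; the flat region array is consumed run by run
            vkCmdCopyBuffer(cmd, src, dst, count, range_ptr);
            range_ptr += count;
        }
    }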
@@ -1020,7 +1035,7 @@ namespace vk
                 vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
             }
-            else if (opt.require_upload)
+            else if (upload_commands.size() > 1)
             {
                 auto region_ptr = copy_regions.data();
                 for (const auto& op : upload_commands)