vk/dma: Allow interoperability between pass-through and write-back DMA caching types

kd-11 2021-01-19 00:40:56 +03:00 committed by kd-11
commit 67949bb5b7
4 changed files with 123 additions and 144 deletions

View file

@@ -12,16 +12,15 @@ namespace vk
 {
     static constexpr usz s_dma_block_length = 0x00010000;
     static constexpr u32 s_dma_block_mask = 0xFFFF0000;
-    //static constexpr u32 s_dma_offset_mask = 0x0000FFFF;
-    static constexpr u32 s_page_size = 65536;
-    static constexpr u32 s_page_align = s_page_size - 1;
-    static constexpr u32 s_pages_per_entry = 32;
-    static constexpr u32 s_bits_per_page = 2;
-    static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
 
     std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
 
+    dma_block::~dma_block()
+    {
+        // Use safe free (uses gc to clean up)
+        free();
+    }
+
     void* dma_block::map_range(const utils::address_range& range)
     {
         if (inheritance_info.parent)
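Note: the pool above is keyed by the 64 KiB block that owns an address; s_dma_block_mask clears the low 16 bits of a guest address to produce the key, exactly as map_dma does further down. A standalone sketch of that arithmetic (the helper names are illustrative, not part of the patch):

    #include <cstdint>

    // Mirrors s_dma_block_length / s_dma_block_mask above: one block covers 64 KiB
    constexpr std::uint32_t block_mask = 0xFFFF0000;

    // Pool key for the block owning `guest_address`
    inline std::uint32_t block_of(std::uint32_t guest_address)
    {
        return guest_address & block_mask;
    }

    // Byte offset of the address inside its block
    inline std::uint32_t offset_in_block(std::uint32_t guest_address)
    {
        return guest_address & ~block_mask;
    }

    // Last block touched by a transfer of `length` bytes starting at `addr`
    inline std::uint32_t last_block_of(std::uint32_t addr, std::uint32_t length)
    {
        return (addr + length - 1) & block_mask;
    }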
@@ -49,19 +48,24 @@ namespace vk
 
     void dma_block::allocate(const render_device& dev, usz size)
     {
-        if (allocated_memory)
-        {
-            // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-            // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-            auto gc = vk::get_resource_manager();
-            gc->dispose(allocated_memory);
-        }
+        // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+        // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+        free();
 
         allocated_memory = std::make_unique<vk::buffer>(dev, size,
             dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
             VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
     }
 
+    void dma_block::free()
+    {
+        if (allocated_memory)
+        {
+            auto gc = vk::get_resource_manager();
+            gc->dispose(allocated_memory);
+        }
+    }
+
     void dma_block::init(const render_device& dev, u32 addr, usz size)
     {
         ensure(size);
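Note: allocate() (and the dma_block_EXT overload further down) now retires any previous buffer through free(), which hands it to vk::get_resource_manager() instead of deleting it on the spot; the buffer may still be referenced by command buffers in flight. A minimal sketch of that deferred-disposal pattern (the container below is illustrative, not the real vk::resource_manager):

    #include <memory>
    #include <utility>
    #include <vector>

    // Illustrative collector; the real code hands buffers to vk::get_resource_manager()
    template <typename T>
    class deferred_disposer
    {
        std::vector<std::unique_ptr<T>> m_graveyard;

    public:
        // Takes ownership instead of deleting immediately (the GPU may still be using it)
        void dispose(std::unique_ptr<T>& object)
        {
            m_graveyard.emplace_back(std::move(object));
        }

        // Called once the associated submissions are known to be complete
        void collect()
        {
            m_graveyard.clear();
        }
    };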
@@ -69,7 +73,6 @@ namespace vk
 
         base_address = addr;
         allocate(dev, size);
-        page_info.resize(size / s_bytes_per_entry, ~0ull);
     }
 
     void dma_block::init(dma_block* parent, u32 addr, usz size)
@@ -79,67 +82,6 @@ namespace vk
         inheritance_info.block_offset = (addr - parent->base_address);
     }
 
-    void dma_block::set_page_bit(u32 offset, u64 bits)
-    {
-        const auto entry = (offset / s_bytes_per_entry);
-        const auto word = entry / s_pages_per_entry;
-        const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
-
-        page_info[word] &= ~(3 << shift);
-        page_info[word] |= (bits << shift);
-    }
-
-    bool dma_block::test_page_bit(u32 offset, u64 bits)
-    {
-        const auto entry = (offset / s_bytes_per_entry);
-        const auto word = entry / s_pages_per_entry;
-        const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
-
-        return !!(page_info[word] & (bits << shift));
-    }
-
-    void dma_block::mark_dirty(const utils::address_range& range)
-    {
-        if (!inheritance_info.parent)
-        {
-            const u32 start = utils::align(range.start, s_page_size);
-            const u32 end = ((range.end + 1) & s_page_align);
-
-            for (u32 page = start; page < end; page += s_page_size)
-            {
-                set_page_bit(page - base_address, page_bits::dirty);
-            }
-
-            if (start > range.start) [[unlikely]]
-            {
-                set_page_bit(start - s_page_size, page_bits::nocache);
-            }
-
-            if (end < range.end) [[unlikely]]
-            {
-                set_page_bit(end + s_page_size, page_bits::nocache);
-            }
-        }
-        else
-        {
-            inheritance_info.parent->mark_dirty(range);
-        }
-    }
-
-    void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
-    {
-        if (!inheritance_info.parent)
-        {
-            auto bit_offset = page_offset / s_bytes_per_entry;
-            ensure(bit_offset + bits.size() <= page_info.size());
-            std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size());
-        }
-        else
-        {
-            inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
-        }
-    }
-
     void dma_block::flush(const utils::address_range& range)
     {
         auto src = map_range(range);
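For context on what is being deleted: the old write-back path kept a per-page state bitmap, packing 2 bits of state (the synchronized/dirty/nocache values from the page_bits enum removed from the header below) for 32 pages into each u64 of page_info. The removed helpers index by s_bytes_per_entry; the standalone sketch below illustrates the packing the constants describe, indexing by page size for clarity, and is not the original code:

    #include <cstdint>
    #include <vector>

    // 2 bits of state per 64 KiB page, 32 pages packed into each u64
    constexpr std::uint32_t page_size       = 65536;
    constexpr std::uint32_t bits_per_page   = 2;
    constexpr std::uint32_t pages_per_entry = 32;

    void set_page_state(std::vector<std::uint64_t>& page_info, std::uint32_t offset, std::uint64_t state)
    {
        const auto page  = offset / page_size;                       // which page the offset falls in
        const auto word  = page / pages_per_entry;                   // which u64 holds its field
        const auto shift = (page % pages_per_entry) * bits_per_page; // bit position inside that u64

        page_info[word] &= ~(std::uint64_t{3} << shift); // clear the 2-bit field
        page_info[word] |= (state << shift);             // write the new state
    }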
@@ -206,11 +148,10 @@ namespace vk
         {
             // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
             // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-            auto gc = vk::get_resource_manager();
-            gc->dispose(allocated_memory);
+            free();
 
-            parent->set_page_info(inheritance_info.block_offset, page_info);
-            page_info.clear();
+            //parent->set_page_info(inheritance_info.block_offset, page_info);
+            //page_info.clear();
         }
     }
@@ -222,8 +163,8 @@ namespace vk
 
         allocate(dev, new_size);
 
-        const auto required_entries = new_size / s_bytes_per_entry;
-        page_info.resize(required_entries, ~0ull);
+        //const auto required_entries = new_size / s_bytes_per_entry;
+        //page_info.resize(required_entries, ~0ull);
     }
 
     u32 dma_block::start() const
@@ -244,13 +185,9 @@ namespace vk
 
     void dma_block_EXT::allocate(const render_device& dev, usz size)
     {
-        if (allocated_memory)
-        {
-            // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
-            // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
-            auto gc = vk::get_resource_manager();
-            gc->dispose(allocated_memory);
-        }
+        // Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
+        // buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
+        free();
 
         allocated_memory = std::make_unique<vk::buffer>(dev,
             VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
@@ -278,16 +215,53 @@ namespace vk
         // NOP
     }
 
-    void create_dma_block(std::unique_ptr<dma_block>& block)
+    bool test_host_pointer(u32 base_address, usz length)
     {
+#if 0 // Unusable due to vm locks
+        auto block = vm::get(vm::any, base_address);
+        ensure(block);
+
+        if ((block->addr + block->size) < (base_address + length))
+        {
+            return false;
+        }
+
+        if (block->flags & 0x120)
+        {
+            return true;
+        }
+
+        auto range_info = block->peek(base_address, u32(length));
+        return !!range_info.second;
+#endif
+
 #ifdef _WIN32
-        const bool allow_host_buffers = true;
+        MEMORY_BASIC_INFORMATION mem_info;
+        if (!::VirtualQuery(vm::get_super_ptr<const void>(base_address), &mem_info, sizeof(mem_info)))
+        {
+            rsx_log.error("VirtualQuery failed! LastError=0x%x", GetLastError());
+            return false;
+        }
+
+        return (mem_info.RegionSize >= length);
+#else
+        return true; // *nix behavior is unknown with NVIDIA drivers
+#endif
+    }
+
+    void create_dma_block(std::unique_ptr<dma_block>& block, u32 base_address, u32 expected_length)
+    {
+        const auto vendor = g_render_device->gpu().get_driver_vendor();
+#ifdef _WIN32
+        const bool allow_host_buffers = (vendor == driver_vendor::NVIDIA) ?
+            test_host_pointer(base_address, expected_length) :
+            true;
 #else
         // Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
-        const auto vendor = g_render_device->gpu().get_driver_vendor();
         const bool allow_host_buffers = (vendor != driver_vendor::AMD && vendor != driver_vendor::RADV);
 #endif
-        if (g_render_device->get_external_memory_host_support() && allow_host_buffers)
+
+        if (allow_host_buffers && g_render_device->get_external_memory_host_support())
         {
             block.reset(new dma_block_EXT());
         }
@@ -295,6 +269,8 @@ namespace vk
         {
             block.reset(new dma_block());
         }
+
+        block->init(*g_render_device, base_address, expected_length);
     }
 
     std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
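Note on the Windows path of test_host_pointer(): VirtualQuery() reports, in RegionSize, the size of the run of pages starting at the queried address that share the same allocation attributes, so RegionSize >= length is used as a cheap proxy for "the whole range sits inside one host allocation" before importing it as a Vulkan buffer. A standalone sketch of the same probe (illustrative wrapper, not the patch's function):

    #ifdef _WIN32
    #include <windows.h>
    #include <cstddef>

    // True if the whole [base, base + length) range falls inside one region of pages
    // with identical attributes, which is what test_host_pointer() checks above.
    bool range_in_single_region(const void* base, std::size_t length)
    {
        MEMORY_BASIC_INFORMATION info{};
        if (!::VirtualQuery(base, &info, sizeof(info)))
        {
            return false; // query failed; treat the range as unsafe to import
        }

        return info.RegionSize >= length;
    }
    #endif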
@@ -315,25 +291,28 @@ namespace vk
 
         if (first_block == last_block) [[likely]]
         {
             auto &block_info = g_dma_pool[first_block];
-            if (!block_info) create_dma_block(block_info);
+            ensure(!block_info);
+            create_dma_block(block_info, first_block, s_dma_block_length);
 
-            block_info->init(*g_render_device, first_block, s_dma_block_length);
             return block_info->get(map_range);
         }
 
         dma_block* block_head = nullptr;
         auto block_end = utils::align(limit, s_dma_block_length);
 
-        // Reverse scan to try and find the minimum required length in case of other chaining
-        for (auto block = last_block; block != first_block; block -= s_dma_block_length)
+        if (g_render_device->gpu().get_driver_vendor() != driver_vendor::NVIDIA ||
+            rsx::get_location(local_address) == CELL_GCM_LOCATION_LOCAL)
         {
-            if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
+            // Reverse scan to try and find the minimum required length in case of other chaining
+            for (auto block = last_block; block != first_block; block -= s_dma_block_length)
             {
-                const auto end = found->second->end();
-                last_block = std::max(last_block, end & s_dma_block_mask);
-                block_end = std::max(block_end, end + 1);
-
-                break;
+                if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
+                {
+                    const auto end = found->second->end();
+                    last_block = std::max(last_block, end & s_dma_block_mask);
+                    block_end = std::max(block_end, end + 1);
+                    break;
+                }
             }
         }
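The new guard keeps the reverse chaining scan everywhere except on NVIDIA drivers with addresses that do not resolve to local (VRAM) memory, presumably because those main-memory blocks are the ones that can be backed by pass-through host pointers. Expressed as a predicate (hypothetical helper, not in the patch):

    // True means "perform the reverse chaining scan above"
    enum class vendor_kind { nvidia, amd, radv, other };

    bool should_scan_for_chaining(vendor_kind vendor, bool address_is_local_memory)
    {
        // Skip only for NVIDIA when the address lives in main memory; local (VRAM)
        // addresses keep the old behaviour on every vendor.
        return vendor != vendor_kind::nvidia || address_is_local_memory;
    }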
@@ -342,37 +321,31 @@ namespace vk
             auto found = g_dma_pool.find(block);
             auto &entry = g_dma_pool[block];
 
-            const bool exists = !!entry;
-            if (!exists) create_dma_block(entry);
-
             if (block == first_block)
             {
-                block_head = entry->head();
-
-                if (exists)
-                {
-                    if (entry->end() < limit)
-                    {
-                        auto new_length = block_end - block_head->start();
-                        block_head->extend(cmd, *g_render_device, new_length);
-                    }
-                }
-                else
+                if (entry && entry->end() < limit)
                 {
+                    // Then the references to this object do not go to the end of the list as will be done with this new allocation.
+                    // A dumb release is therefore safe...
+                    entry.reset();
+                }
+
+                if (!entry)
+                {
                     auto required_size = (block_end - block);
-                    block_head->init(*g_render_device, block, required_size);
+                    create_dma_block(entry, block, required_size);
                 }
+
+                block_head = entry->head();
+            }
+            else if (entry)
+            {
+                entry->set_parent(cmd, block_head);
             }
             else
             {
-                if (exists)
-                {
-                    entry->set_parent(cmd, block_head);
-                }
-                else
-                {
-                    entry->init(block_head, block, s_dma_block_length);
-                }
+                entry.reset(new dma_block());
+                entry->init(block_head, block, s_dma_block_length);
             }
         }
 
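For reference, the consumer side of map_dma() looks like the texture-upload call site later in this commit: the returned pair is an offset into a buffer that backs the requested range, regardless of whether that buffer is a pass-through host import or a write-back staging allocation. A hedged usage sketch (vk::map_dma, vk::load_dma and vk::buffer::value are as they appear in this diff; the surrounding function and its parameters are made up for the example):

    // Illustrative caller, not part of the patch
    static void upload_range_via_dma(const vk::command_buffer& cmd, u32 guest_address, u32 length, vk::buffer* scratch)
    {
        // Resolve (or lazily create and chain) the blocks backing this range
        const auto [dma_offset, dma_buffer] = vk::map_dma(cmd, guest_address, length);

        // Mirror guest memory into the DMA buffer before the GPU reads it
        vk::load_dma(guest_address, length);

        // The window handed back must cover the whole request (see the ensure() added in
        // the texture upload path later in this commit), so a plain copy can source from it.
        VkBufferCopy copy{};
        copy.srcOffset = dma_offset;
        copy.dstOffset = 0;
        copy.size = length;
        vkCmdCopyBuffer(cmd, dma_buffer->value, scratch->value, 1, &copy);
    }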

View file

@@ -13,13 +13,6 @@ namespace vk
     class dma_block
     {
     protected:
-        enum page_bits
-        {
-            synchronized = 0,
-            dirty = 1,
-            nocache = 3
-        };
-
         struct
         {
             dma_block* parent = nullptr;
@@ -29,19 +22,17 @@ namespace vk
         u32 base_address = 0;
         std::unique_ptr<buffer> allocated_memory;
-        std::vector<u64> page_info;
 
         virtual void allocate(const render_device& dev, usz size);
+        virtual void free();
         virtual void* map_range(const utils::address_range& range);
         virtual void unmap();
 
-        void set_page_bit(u32 page, u64 bits);
-        bool test_page_bit(u32 page, u64 bits);
-        void mark_dirty(const utils::address_range& range);
-        void set_page_info(u32 page_offset, const std::vector<u64>& bits);
-
     public:
+        dma_block() = default;
+        virtual ~dma_block();
+
         virtual void init(const render_device& dev, u32 addr, usz size);
         virtual void init(dma_block* parent, u32 addr, usz size);
         virtual void flush(const utils::address_range& range);
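The slimmed-down interface keeps the parent/child chaining: a child created with init(parent, addr, size) records block_offset = addr - parent->base_address and redirects its accesses into the head block, which is what lets a plain write-back child hang off a pass-through head. A self-contained sketch of that offset bookkeeping (only the block_offset arithmetic is taken from the patch; the rest is illustrative):

    #include <cstdint>

    // Illustrative stand-in for the dma_block parent/child bookkeeping
    struct chained_block
    {
        chained_block* parent = nullptr;
        std::uint32_t base_address = 0;
        std::uint32_t block_offset = 0; // meaningful only when parent != nullptr

        void init_as_child(chained_block* head, std::uint32_t addr)
        {
            parent = head;
            base_address = addr;
            block_offset = addr - head->base_address;
        }

        // A child does not own storage; requests resolve into the head block's buffer
        std::uint32_t resolve(std::uint32_t guest_address) const
        {
            return parent ? parent->resolve(guest_address)
                          : (guest_address - base_address);
        }
    };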

View file

@@ -70,7 +70,6 @@ namespace vk
         vk::clear_resolve_helpers();
         vk::clear_dma_resources();
         vk::vmm_reset();
-        vk::get_resource_manager()->destroy();
 
         vk::clear_scratch_resources();
         vk::get_upload_heap()->destroy();
@@ -86,6 +85,9 @@ namespace vk
             p.second->destroy();
         }
         g_overlay_passes.clear();
+
+        // This must be the last item destroyed
+        vk::get_resource_manager()->destroy();
     }
 
     const vk::render_device *get_current_renderer()
@@ -263,8 +265,6 @@ namespace vk
         return (g_num_processed_frames > 0)? g_num_processed_frames - 1: 0;
     }
 
-
-
     void do_query_cleanup(vk::command_buffer& cmd)
     {
         auto renderer = dynamic_cast<VKGSRender*>(rsx::get_current_renderer());
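Moving vk::get_resource_manager()->destroy() to the end of shutdown matters because the steps above it (DMA, scratch and overlay teardown) may dispose() objects into the resource manager rather than deleting them outright. A standalone sketch of why the collector has to outlive everything that feeds it (illustrative types, not the real vkutils objects):

    #include <functional>
    #include <utility>
    #include <vector>

    // Illustrative collector; not the real vk::resource_manager
    struct disposal_queue
    {
        std::vector<std::function<void()>> pending;

        void dispose(std::function<void()> deleter) { pending.emplace_back(std::move(deleter)); }

        void destroy()
        {
            for (auto& del : pending) del(); // run the queued deletions
            pending.clear();
        }
    };

    void shutdown_sketch(disposal_queue& gc)
    {
        // Each teardown step may still queue deletions into the collector:
        // DMA pool, scratch resources, overlay passes, and so on.

        // ...so the collector itself is flushed only after all of them have run.
        gc.destroy();
    }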

View file

@@ -905,6 +905,8 @@ namespace vk
             }
 
             auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
+            ensure(dma_mapping.second->size() >= (dma_mapping.first + data_length));
+
             vk::load_dma(::narrow<u32>(src_address), data_length);
 
             upload_buffer = dma_mapping.second;
@@ -927,7 +929,7 @@ namespace vk
         }
 
         // Copy from upload heap to scratch mem
-        if (!opt.deferred_cmds.empty())
+        if (opt.require_upload)
         {
             for (const auto& copy_cmd : opt.deferred_cmds)
             {
@@ -953,7 +955,8 @@ namespace vk
                 scratch_offset += image_linear_size;
                 ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
             }
-            else if (opt.require_upload)
+
+            if (opt.require_upload)
             {
                 if (upload_commands.empty() || upload_buffer->value != upload_commands.back().first)
                 {
@@ -974,7 +977,19 @@ namespace vk
             {
                 ensure(scratch_buf);
 
-                vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+                if (upload_commands.size() > 1)
+                {
+                    auto range_ptr = buffer_copies.data();
+                    for (const auto& op : upload_commands)
+                    {
+                        vkCmdCopyBuffer(cmd, op.first, scratch_buf->value, op.second, range_ptr);
+                        range_ptr += op.second;
+                    }
+                }
+                else
+                {
+                    vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+                }
 
                 insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                     VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
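The split above exists because vkCmdCopyBuffer() takes a single source buffer per call: when the upload was staged from more than one source, the flat VkBufferCopy array has to be consumed in per-source runs, with each upload_commands entry recording a source buffer and the number of regions staged from it. A standalone sketch of that walk (illustrative names; the Vulkan calls are the real API):

    #include <vulkan/vulkan.h>

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Each entry mirrors upload_commands: {source buffer, number of regions staged from it}
    using upload_run = std::pair<VkBuffer, std::uint32_t>;

    void copy_in_runs(VkCommandBuffer cmd, VkBuffer dst,
                      const std::vector<upload_run>& runs,
                      const std::vector<VkBufferCopy>& regions)
    {
        const VkBufferCopy* range_ptr = regions.data();
        for (const auto& [src, count] : runs)
        {
            // One call per source buffer; the flat region array is consumed run by run
            vkCmdCopyBuffer(cmd, src, dst, count, range_ptr);
            range_ptr += count;
        }
    }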
@@ -1020,7 +1035,7 @@ namespace vk
                 vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
             }
-            else if (opt.require_upload)
+            else if (upload_commands.size() > 1)
             {
                 auto region_ptr = copy_regions.data();
                 for (const auto& op : upload_commands)