video_core: Improve invalidate heuristic

IndecisiveTurtle 2024-09-05 14:57:55 +03:00
parent 324180cd2e
commit 842a5dabd4
9 changed files with 110 additions and 78 deletions

View file

@@ -10,6 +10,7 @@
#include <arpa/inet.h>
#endif
#include <thread>
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/libraries/error_codes.h"
@@ -559,6 +560,7 @@ int PS4_SYSV_ABI sceNetEpollDestroy() {
}
int PS4_SYSV_ABI sceNetEpollWait() {
std::this_thread::sleep_for(std::chrono::microseconds(1));
LOG_TRACE(Lib_Net, "(STUBBED) called");
return ORBIS_OK;
}

View file

@@ -541,46 +541,61 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
}
bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
constexpr FindFlags flags = FindFlags::NoCreate | FindFlags::RelaxSize |
FindFlags::RelaxFmt | FindFlags::RelaxDim;
ImageInfo info{};
info.guest_address = device_addr;
info.guest_size_bytes = size;
info.type = vk::ImageType::e2D;
const ImageId image_id = texture_cache.FindImage(info, flags);
if (!image_id) {
boost::container::small_vector<ImageId, 8> image_ids;
size = std::min(size, MaxInvalidateDist);
texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) {
if (True(image.flags & ImageFlagBits::CpuModified) ||
False(image.flags & ImageFlagBits::GpuModified)) {
return;
}
if (image.cpu_addr < device_addr || image.cpu_addr > device_addr + size) {
return;
}
image_ids.push_back(image_id);
});
if (image_ids.empty()) {
return false;
}
Image& image = texture_cache.GetImage(image_id);
// Sort images by modification tick. If there are overlaps we want to
// copy from least to most recently modified.
std::ranges::sort(image_ids, [&](ImageId lhs_id, ImageId rhs_id) {
const Image& lhs = texture_cache.GetImage(lhs_id);
const Image& rhs = texture_cache.GetImage(rhs_id);
return lhs.tick_accessed_last < rhs.tick_accessed_last;
});
boost::container::small_vector<vk::BufferImageCopy, 8> copies;
u32 offset = buffer.Offset(device_addr);
const u32 num_layers = image.info.resources.layers;
for (u32 m = 0; m < image.info.resources.levels; m++) {
const u32 width = std::max(image.info.size.width >> m, 1u);
const u32 height = std::max(image.info.size.height >> m, 1u);
const u32 depth =
image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
copies.push_back({
.bufferOffset = offset,
.bufferRowLength = static_cast<u32>(mip_pitch),
.bufferImageHeight = static_cast<u32>(mip_height),
.imageSubresource{
.aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
.mipLevel = m,
.baseArrayLayer = 0,
.layerCount = num_layers,
},
.imageOffset = {0, 0, 0},
.imageExtent = {width, height, depth},
});
offset += mip_ofs * num_layers;
for (const ImageId image_id : image_ids) {
copies.clear();
Image& image = texture_cache.GetImage(image_id);
u32 offset = buffer.Offset(image.cpu_addr);
const u32 num_layers = image.info.resources.layers;
for (u32 m = 0; m < image.info.resources.levels; m++) {
const u32 width = std::max(image.info.size.width >> m, 1u);
const u32 height = std::max(image.info.size.height >> m, 1u);
const u32 depth =
image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
copies.push_back({
.bufferOffset = offset,
.bufferRowLength = static_cast<u32>(mip_pitch),
.bufferImageHeight = static_cast<u32>(mip_height),
.imageSubresource{
.aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
.mipLevel = m,
.baseArrayLayer = 0,
.layerCount = num_layers,
},
.imageOffset = {0, 0, 0},
.imageExtent = {width, height, depth},
});
offset += mip_ofs * num_layers;
}
scheduler.EndRendering();
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
copies);
}
scheduler.EndRendering();
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
copies);
return true;
}

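Note on the rewritten SynchronizeBufferFromImage above: instead of a single FindImage lookup, the cache now collects every image overlapping the buffer range (clamped to MaxInvalidateDist), keeps only those flagged GpuModified and not CpuModified, sorts the survivors by last-access tick, and records one image-to-buffer copy per candidate and mip, so overlapping images are copied from least to most recently touched. The following is a minimal standalone sketch of just that selection and ordering step; Candidate and PickOverlaps are hypothetical stand-ins for the real Image/ImageId machinery, not the emulator's code.

#include <algorithm>
#include <cstdint>
#include <vector>

using VAddr = std::uint64_t;
using u32 = std::uint32_t;
using u64 = std::uint64_t;

struct Candidate {
    VAddr cpu_addr;         // guest base address of the image
    bool cpu_modified;      // stands in for ImageFlagBits::CpuModified
    bool gpu_modified;      // stands in for ImageFlagBits::GpuModified
    u64 tick_accessed_last; // sort key used by the commit
};

std::vector<Candidate> PickOverlaps(const std::vector<Candidate>& in_region, VAddr addr, u32 size) {
    std::vector<Candidate> picked;
    for (const Candidate& c : in_region) {
        // Skip images with pending CPU writes or that the GPU never wrote to.
        if (c.cpu_modified || !c.gpu_modified) {
            continue;
        }
        // The image must start inside the (clamped) buffer range.
        if (c.cpu_addr < addr || c.cpu_addr > addr + size) {
            continue;
        }
        picked.push_back(c);
    }
    // Copy from least to most recently accessed so overlapping regions end up
    // holding the newest data.
    std::ranges::sort(picked, {}, &Candidate::tick_accessed_last);
    return picked;
}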
View file

@@ -167,9 +167,6 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
}
}
if (desc.is_written) {
texture_cache.InvalidateMemory(address, size);
}
const u32 alignment = instance.TexelBufferMinAlignment();
const auto [vk_buffer, offset] =
buffer_cache.ObtainBuffer(address, size, desc.is_written, true);
@@ -184,13 +181,15 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
}
buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written,
vsharp.GetDataFmt(), vsharp.GetNumberFmt());
if (auto barrier =
vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite
: vk::AccessFlagBits2::eShaderRead,
vk::PipelineStageFlagBits2::eComputeShader)) {
buffer_barriers.emplace_back(*barrier);
}
if (desc.is_written) {
texture_cache.InvalidateMemory(address, size);
}
}
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
@@ -252,10 +251,11 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
const auto cmdbuf = scheduler.CommandBuffer();
if (!buffer_barriers.empty()) {
auto dependencies = vk::DependencyInfo{
const auto dependencies = vk::DependencyInfo{
.bufferMemoryBarrierCount = u32(buffer_barriers.size()),
.pBufferMemoryBarriers = buffer_barriers.data(),
};
scheduler.EndRendering();
cmdbuf.pipelineBarrier2(dependencies);
}

View file

@@ -405,15 +405,15 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
});
}
for (const auto& tex_buffer : stage->texture_buffers) {
const auto vsharp = tex_buffer.GetSharp(*stage);
for (const auto& desc : stage->texture_buffers) {
const auto vsharp = desc.GetSharp(*stage);
vk::BufferView& buffer_view = buffer_views.emplace_back(VK_NULL_HANDLE);
const u32 size = vsharp.GetSize();
if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid && size != 0) {
const VAddr address = vsharp.base_address;
const u32 alignment = instance.TexelBufferMinAlignment();
const auto [vk_buffer, offset] =
buffer_cache.ObtainBuffer(address, size, tex_buffer.is_written, true);
buffer_cache.ObtainBuffer(address, size, desc.is_written, true);
const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3;
ASSERT_MSG(fmt_stride == vsharp.GetStride(),
"Texel buffer stride must match format stride");
@@ -423,22 +423,25 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
ASSERT(adjust % fmt_stride == 0);
push_data.AddOffset(binding, adjust / fmt_stride);
}
buffer_view = vk_buffer->View(offset_aligned, size + adjust, tex_buffer.is_written,
buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written,
vsharp.GetDataFmt(), vsharp.GetNumberFmt());
const auto dst_access = tex_buffer.is_written ? vk::AccessFlagBits2::eShaderWrite
: vk::AccessFlagBits2::eShaderRead;
const auto dst_access = desc.is_written ? vk::AccessFlagBits2::eShaderWrite
: vk::AccessFlagBits2::eShaderRead;
if (auto barrier = vk_buffer->GetBarrier(
dst_access, vk::PipelineStageFlagBits2::eVertexShader)) {
buffer_barriers.emplace_back(*barrier);
}
if (desc.is_written) {
texture_cache.InvalidateMemory(address, size);
}
}
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
: vk::DescriptorType::eUniformTexelBuffer,
.descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer
: vk::DescriptorType::eUniformTexelBuffer,
.pTexelBufferView = &buffer_view,
});
}
@@ -497,10 +500,11 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
const auto cmdbuf = scheduler.CommandBuffer();
if (!buffer_barriers.empty()) {
auto dependencies = vk::DependencyInfo{
const auto dependencies = vk::DependencyInfo{
.bufferMemoryBarrierCount = u32(buffer_barriers.size()),
.pBufferMemoryBarriers = buffer_barriers.data(),
};
scheduler.EndRendering();
cmdbuf.pipelineBarrier2(dependencies);
}

View file

@@ -43,6 +43,7 @@ static VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(
case 0x609a13b: // Vertex attribute at location not consumed by shader
case 0xc81ad50e:
case 0xb7c39078:
case 0x32868fde: // vkCreateBufferView(): pCreateInfo->range does not equal VK_WHOLE_SIZE
case 0x92d66fc1: // `pMultisampleState is NULL` for depth only passes (confirmed VL error)
return VK_FALSE;
default:

View file

@@ -166,8 +166,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
image.Create(image_ci);
Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {:#x}:{:#x}",
info.guest_address, info.guest_size_bytes);
Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {}x{}x{} {:#x}:{:#x}",
info.size.width, info.size.height, info.size.depth, info.guest_address,
info.guest_size_bytes);
}
void Image::Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> dst_mask,

View file

@@ -117,6 +117,7 @@ struct Image {
vk::ImageLayout layout = vk::ImageLayout::eUndefined;
boost::container::small_vector<u64, 14> mip_hashes;
u64 tick_accessed_last{0};
u64 modification_tick{0};
};
} // namespace VideoCore

View file

@@ -38,12 +38,11 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler&
TextureCache::~TextureCache() = default;
void TextureCache::InvalidateMemory(VAddr address, size_t size) {
static constexpr size_t MaxInvalidateDist = 128_MB;
std::unique_lock lock{mutex};
std::scoped_lock lock{mutex};
ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) {
const size_t image_dist =
image.cpu_addr > address ? image.cpu_addr - address : address - image.cpu_addr;
if (image_dist < MaxInvalidateDist && image.info.size.width > 16) {
if (image_dist < MaxInvalidateDist) {
// Ensure image is reuploaded when accessed again.
image.flags |= ImageFlagBits::CpuModified;
}
@@ -152,7 +151,6 @@ ImageId TextureCache::ResolveOverlap(const ImageInfo& image_info, ImageId cache_
}
ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) {
const auto new_image_id = slot_images.insert(instance, scheduler, info);
RegisterImage(new_image_id);
@@ -220,7 +218,9 @@ ImageId TextureCache::FindImage(const ImageInfo& info, FindFlags flags) {
RegisterImage(image_id);
}
slot_images[image_id].tick_accessed_last = scheduler.CurrentTick();
Image& image = slot_images[image_id];
image.tick_accessed_last = scheduler.CurrentTick();
image.modification_tick = ++modification_tick;
return image_id;
}
@@ -248,8 +248,11 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(info);
UpdateImage(image_id);
Image& image = slot_images[image_id];
if (view_info.is_storage) {
image.flags |= ImageFlagBits::GpuModified;
}
UpdateImage(image_id);
auto& usage = image.info.usage;
if (view_info.is_storage) {
@@ -405,7 +408,8 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
// hazard
if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
vk::PipelineStageFlagBits2::eTransfer)) {
auto dependencies = vk::DependencyInfo{
const auto dependencies = vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &barrier.value(),
};

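Note on the InvalidateMemory hunk above, which carries the heuristic change named in the commit title: the local 128_MB threshold and the width > 16 exception are dropped in favour of the shared MaxInvalidateDist constant (12_MB, declared in the header in the next file), so any image whose base address lies within that distance of the written range is simply flagged CpuModified, and FindImage now also stamps the image with a modification_tick drawn from a new cache-wide counter. Below is a small sketch of just the distance test, under the assumption that addresses are plain guest virtual addresses; names are illustrative, not the emulator's.

#include <cstdint>

constexpr std::uint64_t MaxInvalidateDistBytes = 12ull << 20; // mirrors the 12_MB constant below

// True when an image whose guest data starts at image_addr should be flagged
// CpuModified (re-uploaded on next use) after a write touching write_addr.
constexpr bool ShouldInvalidate(std::uint64_t image_addr, std::uint64_t write_addr) {
    const std::uint64_t dist =
        image_addr > write_addr ? image_addr - write_addr : write_addr - image_addr;
    return dist < MaxInvalidateDistBytes;
}

static_assert(ShouldInvalidate(0x1000, 0x2000));                  // nearby image: invalidated
static_assert(!ShouldInvalidate(0x1000, 0x1000 + (32ull << 20))); // 32 MiB away: left alone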
View file

@@ -31,6 +31,8 @@ enum class FindFlags {
};
DECLARE_ENUM_FLAG_OPERATORS(FindFlags)
static constexpr u32 MaxInvalidateDist = 12_MB;
class TextureCache {
struct Traits {
using Entry = boost::container::small_vector<ImageId, 16>;
@@ -114,25 +116,6 @@ public:
return false;
}
private:
ImageView& RegisterImageView(ImageId image_id, const ImageViewInfo& view_info);
/// Iterate over all page indices in a range
template <typename Func>
static void ForEachPage(PAddr addr, size_t size, Func&& func) {
static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result<Func, u64>, bool>;
const u64 page_end = (addr + size - 1) >> Traits::PageBits;
for (u64 page = addr >> Traits::PageBits; page <= page_end; ++page) {
if constexpr (RETURNS_BOOL) {
if (func(page)) {
break;
}
} else {
func(page);
}
}
}
template <typename Func>
void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func) {
using FuncReturn = typename std::invoke_result<Func, ImageId, Image&>::type;
@@ -174,6 +157,26 @@ private:
}
}
private:
/// Iterate over all page indices in a range
template <typename Func>
static void ForEachPage(PAddr addr, size_t size, Func&& func) {
static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result<Func, u64>, bool>;
const u64 page_end = (addr + size - 1) >> Traits::PageBits;
for (u64 page = addr >> Traits::PageBits; page <= page_end; ++page) {
if constexpr (RETURNS_BOOL) {
if (func(page)) {
break;
}
} else {
func(page);
}
}
}
/// Registers an image view for provided image
ImageView& RegisterImageView(ImageId image_id, const ImageViewInfo& view_info);
/// Create an image from the given parameters
[[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr);
@@ -209,6 +212,7 @@ private:
tsl::robin_map<u64, Sampler> samplers;
PageTable page_table;
std::mutex mutex;
u64 modification_tick{0};
struct MetaDataInfo {
enum class Type {