rsx: Optimize static heap allocations

kd-11 2025-03-31 19:55:05 +03:00
parent b42d2e3e34
commit d99a236f72
4 changed files with 123 additions and 91 deletions


@@ -404,57 +404,76 @@ namespace
 		}
 	};

 	template <typename T>
 	NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<to_be_t<const T>> src, std::span<T> dst, T restart_index)
 	{
 		T min_index = index_limit<T>();
 		T max_index = 0;
 		u32 written = 0;
 		u32 length = ::size32(src);

 		for (u32 i = written; i < length; ++i)
 		{
 			T index = src[i];
 			if (index != restart_index)
 			{
 				dst[written++] = min_max(min_index, max_index, index);
 			}
 		}

 		return std::make_tuple(min_index, max_index, written);
 	}

 	template<typename T>
 	std::tuple<T, T, u32> upload_untouched(std::span<to_be_t<const T>> src, std::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
 	{
 		if (!is_primitive_restart_enabled)
 		{
 			return untouched_impl::upload_untouched(src, dst);
 		}
 		else if constexpr (std::is_same_v<T, u16>)
 		{
 			if (primitive_restart_index > 0xffff)
 			{
 				return untouched_impl::upload_untouched(src, dst);
 			}
 			else if (is_primitive_disjointed(draw_mode))
 			{
 				return upload_untouched_skip_restart(src, dst, static_cast<u16>(primitive_restart_index));
 			}
 			else
 			{
 				return primitive_restart_impl::upload_untouched(src, dst, static_cast<u16>(primitive_restart_index));
 			}
 		}
 		else if (is_primitive_disjointed(draw_mode))
 		{
 			return upload_untouched_skip_restart(src, dst, primitive_restart_index);
 		}
 		else
 		{
 			return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index);
 		}
 	}

+	void iota16(u16* dst, u32 count)
+	{
+		unsigned i = 0;
+
+#if defined(ARCH_X64) || defined(ARCH_ARM64)
+		const unsigned step = 8;                                 // We do 8 entries per step
+		const __m128i vec_step = _mm_set1_epi16(8);              // Constant to increment the raw values
+		__m128i values = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
+
+		__m128i* vec_ptr = reinterpret_cast<__m128i*>(dst);
+		for (; (i + step) <= count; i += step, vec_ptr++)
+		{
+			_mm_stream_si128(vec_ptr, values);
+			values = _mm_add_epi16(values, vec_step); // Advance all eight lanes for the next batch
+		}
+#endif
+
+		// Scalar tail, also the fallback on other architectures
+		for (; i < count; ++i)
+			dst[i] = i;
+	}
+
 	template<typename T>
 	std::tuple<T, T, u32> expand_indexed_triangle_fan(std::span<to_be_t<const T>> src, std::span<T> dst, bool is_primitive_restart_enabled, u32 primitive_restart_index)
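
The SIMD path in iota16 above emits eight u16 indices (16 bytes) per _mm_stream_si128 store, which requires the destination to be 16-byte aligned. A minimal usage sketch, with the buffer size and alignment handling chosen purely for illustration (they are not part of the commit):

    alignas(16) u16 indices[64 + 1]; // the streaming stores need a 16-byte aligned destination
    iota16(indices, 64);             // indices[i] == i for i in [0, 64)
    indices[64] = 0;                 // close the loop back to the first vertex, as the line_loop case below does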
@@ -624,8 +643,7 @@ void write_index_array_for_non_indexed_non_native_primitive_to_buffer(char* dst,
 	switch (draw_mode)
 	{
 	case rsx::primitive_type::line_loop:
-		for (unsigned i = 0; i < count; ++i)
-			typedDst[i] = i;
+		iota16(typedDst, count);
 		typedDst[count] = 0;
 		return;
 	case rsx::primitive_type::triangle_fan:


@@ -20,29 +20,33 @@ protected:
 	template<int Alignment>
 	bool can_alloc(usz size) const
 	{
-		usz alloc_size = utils::align(size, Alignment);
-		usz aligned_put_pos = utils::align(m_put_pos, Alignment);
-		if (aligned_put_pos + alloc_size < m_size)
+		const usz alloc_size = utils::align(size, Alignment);
+		const usz aligned_put_pos = utils::align(m_put_pos, Alignment);
+		const usz alloc_end = aligned_put_pos + alloc_size;
+
+		if (alloc_end < m_size) [[ likely ]]
 		{
-			// range before get
-			if (aligned_put_pos + alloc_size < m_get_pos)
+			// Range before get
+			if (alloc_end < m_get_pos)
 				return true;
-			// range after get
+
+			// Range after get
 			if (aligned_put_pos > m_get_pos)
 				return true;
+
 			return false;
 		}
-		else
-		{
-			// ..]....[..get..
-			if (aligned_put_pos < m_get_pos)
-				return false;
-			// ..get..]...[...
-			// Actually all resources extending beyond heap space starts at 0
-			if (alloc_size > m_get_pos)
-				return false;
-			return true;
-		}
+
+		// ..]....[..get..
+		if (aligned_put_pos < m_get_pos)
+			return false;
+
+		// ..get..]...[...
+		// Actually all resources extending beyond heap space starts at 0
+		if (alloc_size > m_get_pos)
+			return false;
+
+		return true;
 	}

 	// Grow the buffer to hold at least size bytes
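
To make the wraparound rule in can_alloc above concrete, here is a rough standalone sketch that mirrors its branches with the alignment rounding left out (names and example numbers are invented for illustration):

    bool can_alloc_sketch(usz size, usz put, usz get, usz heap_size)
    {
        const usz end = put + size;
        if (end < heap_size)
            return end < get || put > get; // Fits before the GET pointer, or lies entirely past it
        // The request would run past the end of the heap, so it restarts at offset 0
        // and must then fit below the GET pointer.
        return put >= get && size <= get;
    }

    // Example: can_alloc_sketch(0x2000, 0xF000, 0x4000, 0x10000) == true; the block wraps and is placed at offset 0.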
@@ -53,10 +57,9 @@ protected:
 	}

 	usz m_size;
-	usz m_put_pos; // Start of free space
-	usz m_min_guard_size; //If an allocation touches the guard region, reset the heap to avoid going over budget
-	usz m_current_allocated_size;
-	usz m_largest_allocated_pool;
+	usz m_put_pos;        // Start of free space
+	usz m_get_pos;        // End of free space
+	usz m_min_guard_size; // If an allocation touches the guard region, reset the heap to avoid going over budget
 	char* m_name;

 public:
@ -65,8 +68,6 @@ public:
data_heap(const data_heap&) = delete;
data_heap(data_heap&&) = delete;
usz m_get_pos; // End of free space
void init(usz heap_size, const char* buffer_name = "unnamed", usz min_guard_size=0x10000)
{
m_name = const_cast<char*>(buffer_name);
@@ -75,10 +76,8 @@ public:
 		m_put_pos = 0;
 		m_get_pos = heap_size - 1;

-		//allocation stats
+		// Allocation stats
 		m_min_guard_size = min_guard_size;
-		m_current_allocated_size = 0;
-		m_largest_allocated_pool = 0;
 	}

 	template<int Alignment>
@@ -89,24 +88,45 @@ public:
 		if (!can_alloc<Alignment>(size) && !grow(alloc_size))
 		{
-			fmt::throw_exception("[%s] Working buffer not big enough, buffer_length=%d allocated=%d requested=%d guard=%d largest_pool=%d",
-				m_name, m_size, m_current_allocated_size, size, m_min_guard_size, m_largest_allocated_pool);
+			fmt::throw_exception("[%s] Working buffer not big enough, buffer_length=%d requested=%d guard=%d",
+				m_name, m_size, size, m_min_guard_size);
 		}

-		const usz block_length = (aligned_put_pos - m_put_pos) + alloc_size;
-		m_current_allocated_size += block_length;
-		m_largest_allocated_pool = std::max(m_largest_allocated_pool, block_length);
-
-		if (aligned_put_pos + alloc_size < m_size)
+		const usz alloc_end = aligned_put_pos + alloc_size;
+		if (alloc_end < m_size)
 		{
-			m_put_pos = aligned_put_pos + alloc_size;
+			m_put_pos = alloc_end;
 			return aligned_put_pos;
 		}
-		else
-		{
-			m_put_pos = alloc_size;
-			return 0;
-		}
+
+		m_put_pos = alloc_size;
+		return 0;
 	}

+	/*
+	 * For use in cases where we take a fixed amount each time
+	 */
+	template<int Alignment, usz Size = Alignment>
+	usz static_alloc()
+	{
+		static_assert((Size & (Alignment - 1)) == 0);
+		ensure((m_put_pos & (Alignment - 1)) == 0);
+
+		if (!can_alloc<Alignment>(Size) && !grow(Size))
+		{
+			fmt::throw_exception("[%s] Working buffer not big enough, buffer_length=%d requested=%d guard=%d",
+				m_name, m_size, Size, m_min_guard_size);
+		}
+
+		const usz alloc_end = m_put_pos + Size;
+		if (alloc_end < m_size)
+		{
+			const usz block_start = m_put_pos; // The block begins at the current put position
+			m_put_pos = alloc_end;
+			return block_start;
+		}
+
+		m_put_pos = Size;
+		return 0;
+	}
+
 	/**
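
static_alloc above replaces the per-call rounding in alloc() with compile-time constants: Size defaults to Alignment and must be a multiple of it (enforced by the static_assert), and the ensure() verifies that the put pointer is still aligned, so no runtime utils::align is needed. Illustrative call shapes, with the heap variable name made up:

    ring.static_alloc<256>();        // 256 bytes at 256-byte alignment (Size defaults to Alignment)
    ring.static_alloc<256, 768>();   // 768 bytes, still 256-aligned; 768 % 256 == 0 satisfies the static_assert
    // ring.static_alloc<256, 700>(); // would not compile: 700 is not a multiple of 256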
@@ -117,30 +137,25 @@ public:
 		return (m_put_pos > 0) ? m_put_pos - 1 : m_size - 1;
 	}

+	inline void set_get_pos(usz value)
+	{
+		m_get_pos = value;
+	}
+
 	virtual bool is_critical() const
 	{
-		const usz guard_length = std::max(m_min_guard_size, m_largest_allocated_pool);
-		return (m_current_allocated_size + guard_length) >= m_size;
+		return m_min_guard_size >= m_size;
 	}

 	void reset_allocation_stats()
 	{
-		m_current_allocated_size = 0;
-		m_largest_allocated_pool = 0;
 		m_get_pos = get_current_put_pos_minus_one();
 	}

-	// Updates the current_allocated_size metrics
-	void notify()
+	inline void notify()
 	{
-		if (m_get_pos == umax)
-			m_current_allocated_size = 0;
-		else if (m_get_pos < m_put_pos)
-			m_current_allocated_size = (m_put_pos - m_get_pos - 1);
-		else if (m_get_pos > m_put_pos)
-			m_current_allocated_size = (m_put_pos + (m_size - m_get_pos - 1));
-		else
-			fmt::throw_exception("m_put_pos == m_get_pos!");
+		// @unused
 	}

 	usz size() const


@@ -41,7 +41,7 @@ namespace vk::data_heap_manager
 				continue;
 			}

-			heap->m_get_pos = found->second;
+			heap->set_get_pos(found->second);
 			heap->notify();
 		}
 	}
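
With m_get_pos now protected, the restore path goes through the new accessor; and since notify() is an empty stub after this commit (see the header change above), the second call is effectively a no-op left in place so existing call sites need no further changes:

    heap->set_get_pos(found->second); // Restore the recorded GET position for this heap
    heap->notify();                   // Empty body after this commit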


@@ -1162,7 +1162,6 @@ void VKGSRender::check_heap_status(u32 flags)
 	{
 		heap_critical = false;
 		u32 test = 1u << std::countr_zero(flags);
-
 		do
 		{
 			switch (flags & test)
@@ -2046,7 +2045,7 @@ void VKGSRender::load_program_env()
 		check_heap_status(VK_HEAP_CHECK_VERTEX_ENV_STORAGE);

 		// Vertex state
-		const auto mem = m_vertex_env_ring_info.alloc<256>(256);
+		const auto mem = m_vertex_env_ring_info.static_alloc<256>();
 		auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 148));

 		m_draw_processor.fill_scale_offset_data(buf, false);
@@ -2134,7 +2133,7 @@
 	{
 		check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);

-		auto mem = m_fragment_env_ring_info.alloc<256>(256);
+		auto mem = m_fragment_env_ring_info.static_alloc<256>();
 		auto buf = m_fragment_env_ring_info.map(mem, 32);

 		m_draw_processor.fill_fragment_state_buffer(buf, current_fragment_program);
@@ -2146,7 +2145,7 @@
 	{
 		check_heap_status(VK_HEAP_CHECK_TEXTURE_ENV_STORAGE);

-		auto mem = m_fragment_texture_params_ring_info.alloc<256>(768);
+		auto mem = m_fragment_texture_params_ring_info.static_alloc<256, 768>();
 		auto buf = m_fragment_texture_params_ring_info.map(mem, 768);

 		current_fragment_program.texture_params.write_to(buf, current_fp_metadata.referenced_textures_mask);
@@ -2158,7 +2157,7 @@
 	{
 		check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);

-		auto mem = m_raster_env_ring_info.alloc<256>(256);
+		auto mem = m_raster_env_ring_info.static_alloc<256>();
 		auto buf = m_raster_env_ring_info.map(mem, 128);

 		std::memcpy(buf, rsx::method_registers.polygon_stipple_pattern(), 128);
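
All four call sites above request sizes (256 or 768 bytes) that are already multiples of the 256-byte alignment, so the new static_alloc calls reserve exactly the same ranges as the old alloc<256>(size) calls; the change only drops the per-call rounding and size bookkeeping. A quick sanity check with a generic helper (this is not the project's utils::align, just an illustrative equivalent):

    constexpr usz align_up(usz value, usz alignment) { return (value + alignment - 1) & ~(alignment - 1); }
    static_assert(align_up(256, 256) == 256 && align_up(768, 256) == 768); // rounding is a no-op for these sizes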