From e34bdb68413a8a0560a3708813949ddd4ba175c8 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 01:47:02 +0300 Subject: [PATCH] Add GL Stream Buffer from Duckstation --- CMakeLists.txt | 5 +- include/align.hpp | 99 +++++++ include/renderer_gl/renderer_gl.hpp | 8 +- src/core/PICA/draw_acceleration.cpp | 2 + src/core/renderer_gl/renderer_gl.cpp | 8 + third_party/duckstation/gl/stream_buffer.cpp | 288 +++++++++++++++++++ third_party/duckstation/gl/stream_buffer.h | 53 ++++ 7 files changed, 461 insertions(+), 2 deletions(-) create mode 100644 include/align.hpp create mode 100644 third_party/duckstation/gl/stream_buffer.cpp create mode 100644 third_party/duckstation/gl/stream_buffer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 89322af4..6a94047c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,7 @@ include_directories(${SDL2_INCLUDE_DIR}) include_directories(third_party/toml11) include_directories(third_party/glm) include_directories(third_party/renderdoc) +include_directories(third_party/duckstation) add_subdirectory(third_party/cmrc) @@ -302,6 +303,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp + include/align.hpp ) cmrc_add_resource_library( @@ -334,7 +336,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID) endif() if(ENABLE_QT_GUI) - include_directories(third_party/duckstation) set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp) if(APPLE) @@ -377,6 +378,8 @@ if(ENABLE_OPENGL) src/host_shaders/opengl_fragment_shader.frag ) + 
set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp) + set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES}) source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})
diff --git a/include/align.hpp b/include/align.hpp
new file mode 100644
index 00000000..6b79a656
--- /dev/null
+++ b/include/align.hpp
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
+
+namespace Common {
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {  // True when value is a multiple of alignment (alignment must be non-zero)
+		return (value % static_cast<T>(alignment)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {  // Rounds value up to the next multiple of alignment (alignment must be non-zero)
+		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {  // Rounds value down to a multiple of alignment (alignment must be non-zero)
+		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {  // Mask-based isAligned; assumes alignment is a power of two
+		return (value & static_cast<T>(alignment - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {  // Mask-based alignUp; assumes alignment is a power of two
+		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {  // Mask-based alignDown; assumes alignment is a power of two
+		return value & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr bool isPow2(T value) {  // True when at most one bit is set, so 0 is also reported as a power of two
+		return (value & (value - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T previousPow2(T value) {  // Largest power of two that is <= value; 0 maps to 0
+		if (value == static_cast<T>(0)) return 0;
+
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);   // sizeof() counts bytes, not bits: needed for 16-bit and wider T
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);  // needed for 32-bit and wider T
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);  // needed for 64-bit T
+		return value - (value >> 1);
+	}
+
+	template <typename T>
+	constexpr T nextPow2(T value) {  // Smallest power of two that is >= value; 0 maps to 0
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		value--;
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);   // sizeof() counts bytes, not bits: needed for 16-bit and wider T
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);  // needed for 32-bit and wider T
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);  // needed for 64-bit T
+		value++;
+		return value;
+	}
+
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {  // Returns nullptr on failure; release with alignedFree()
+#ifdef _MSC_VER
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	ALWAYS_INLINE static void alignedFree(void* ptr) {  // Counterpart of alignedMalloc()
+#ifdef _MSC_VER
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+} // namespace Common
\ No newline at end of file
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 397aaf53..63bbb474 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,11 +11,12 @@ #include "PICA/float_types.hpp" #include "PICA/pica_frag_config.hpp" -#include "PICA/pica_vert_config.hpp" #include "PICA/pica_hash.hpp" +#include "PICA/pica_vert_config.hpp" #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" #include "PICA/shader_gen.hpp" +#include "gl/stream_buffer.h" #include "gl_state.hpp" #include "helpers.hpp" #include "logger.hpp" @@ -83,6 +85,10 @@ class RendererGL final : public Renderer { // UBO for uploading the PICA uniforms when using hw shaders GLuint hwShaderUniformUBO; + using StreamBuffer = OpenGLStreamBuffer; + std::unique_ptr hwVertexBuffer; + std::unique_ptr hwIndexBuffer; + // Cached recompiled fragment shader struct CachedProgram {
OpenGL::Program program; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index e9546cf7..5fc21e48 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -82,6 +82,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Align attribute address up to a 4 byte boundary attributeOffset = (attributeOffset + 3) & -4; attributeOffset += (index - 11) << 2; + + attr.data = nullptr; continue; } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index f5728346..3b2d1d70 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -78,6 +78,14 @@ void RendererGL::initGraphicsContextInternal() { gl.useProgram(displayProgram); glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object + // Create stream buffers for vertex, index and uniform buffers + // TODO: Remove buffers from GL state tracking as the StreamBuffer implementation bypasses the state tracker. 
+ static constexpr usize hwIndexBufferSize = 2_MB; + static constexpr usize hwVertexBufferSize = 16_MB; + + hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize); + hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize); + // Allocate memory for the shadergen fragment uniform UBO glGenBuffers(1, &shadergenFragmentUBO); gl.bindUBO(shadergenFragmentUBO);
diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp
new file mode 100644
index 00000000..f4f8b54c
--- /dev/null
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gl/stream_buffer.h"
+
+#include <algorithm>
+#include <array>
+
+#include "align.hpp"
+
+OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : m_target(target), m_buffer_id(buffer_id), m_size(size) {}
+OpenGLStreamBuffer::~OpenGLStreamBuffer() { glDeleteBuffers(1, &m_buffer_id); }  // Deletes the GL buffer name held by this object
+
+void OpenGLStreamBuffer::Bind() { glBindBuffer(m_target, m_buffer_id); }  // Issues glBindBuffer directly
+void OpenGLStreamBuffer::Unbind() { glBindBuffer(m_target, 0); }
+
+void OpenGLStreamBuffer::SetDebugName(std::string_view name) {  // Labels the GL buffer; compiled to a no-op unless GPU_DEBUG_INFO is defined
+#ifdef GPU_DEBUG_INFO
+	if (glObjectLabel) {
+		glObjectLabel(GL_BUFFER, GetGLBufferId(), static_cast<GLsizei>(name.length()), static_cast<const GLchar*>(name.data()));
+	}
+#endif
+}
+
+namespace {
+	// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
+	class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {  // Writes go to a CPU staging buffer that Unmap() uploads with glBufferSubData()
+	  public:
+		~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }  // Hands out the whole staging buffer; min_size is not validated
+
+		u32 Unmap(u32 used_size) override {  // Uploads the first used_size bytes to offset 0 of the GL buffer; always returns 0
+			if (used_size == 0) return 0;
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {  // Returns null if GL reports an error while allocating the buffer
+			glGetError();  // Pop a pending error so the check below reflects the calls made here
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));  // 32-byte aligned staging storage, same size as the GL buffer
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
+	class BufferDataStreamBuffer final : public OpenGLStreamBuffer {  // Writes go to a CPU staging buffer that Unmap() uploads with glBufferData()
+	  public:
+		~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }  // Hands out the whole staging buffer; min_size is not validated
+
+		u32 Unmap(u32 used_size) override {  // Re-specifies the GL buffer store with the first used_size staged bytes; always returns 0
+			if (used_size == 0) return 0;
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {  // Returns null if GL reports an error while allocating the buffer
+			glGetError();  // Pop a pending error so the check below reflects the calls made here
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));  // 32-byte aligned staging storage, same size as the GL buffer
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Base class for implementations which require syncing.
+	class SyncingStreamBuffer : public OpenGLStreamBuffer {  // Splits the buffer into NUM_SYNC_POINTS blocks, each guarded by a GL fence
+	  public:
+		enum : u32 { NUM_SYNC_POINTS = 16 };
+
+		virtual ~SyncingStreamBuffer() override {
+			for (GLsync sync : m_sync_objects) {  // Release every outstanding fence, not only the [available, used] range
+				glDeleteSync(sync);  // Empty slots hold nullptr, which glDeleteSync silently ignores
+			}
+		}
+
+	  protected:
+		SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}
+
+		ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }  // Block index containing the given byte offset
+
+		ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {  // Inserts a fence for every block that lies entirely below offset
+			const u32 end = GetSyncIndexForOffset(offset);
+			for (; m_used_block_index < end; m_used_block_index++) {
+				if (m_sync_objects[m_used_block_index]) {
+					Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
+				}
+
+				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}
+		}
+
+		ALWAYS_INLINE void WaitForSync(GLsync& sync) {  // Blocks until the fence signals, then deletes it and clears the slot
+			glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+			glDeleteSync(sync);
+			sync = nullptr;
+		}
+
+		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {  // Waits on every block up to and including the one containing offset
+			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
+			for (; m_available_block_index < end; m_available_block_index++) {
+				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {  // Validate the slot we are about to wait on
+					Helpers::panic("GL stream buffer: Fence slot we're trying to wait on is not in use");
+				}
+
+				WaitForSync(m_sync_objects[m_available_block_index]);
+			}
+		}
+
+		void AllocateSpace(u32 size) {  // Makes size bytes writable at m_position, rewinding to offset 0 if they do not fit
+			// add sync objects for writes since the last allocation
+			AddSyncsForOffset(m_position);
+
+			// wait for sync objects for the space we want to use
+			EnsureSyncsWaitedForOffset(m_position + size);
+
+			// wrap-around?
+			if ((m_position + size) > m_size) {
+				// current position ... buffer end
+				AddSyncsForOffset(m_size);
+
+				// rewind, and try again
+				m_position = 0;
+
+				// wait for the sync at the start of the buffer
+				WaitForSync(m_sync_objects[0]);
+				m_available_block_index = 1;
+
+				// and however much more we need to satisfy the allocation
+				EnsureSyncsWaitedForOffset(size);
+				m_used_block_index = 0;
+			}
+		}
+
+		u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
+
+		u32 m_position = 0;                                 // Current write offset in bytes
+		u32 m_used_block_index = 0;                         // Next block that still needs a fence inserted
+		u32 m_available_block_index = NUM_SYNC_POINTS;      // Blocks below this index have been waited on (initially the whole buffer)
+		u32 m_bytes_per_block;                              // size / NUM_SYNC_POINTS, rounded up
+		std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};  // One fence slot per block; nullptr when empty
+	};
+
+	class BufferStorageStreamBuffer : public SyncingStreamBuffer {  // Persistently mapped buffer created with glBufferStorage()
+	  public:
+		~BufferStorageStreamBuffer() override {
+			glBindBuffer(m_target, m_buffer_id);
+			glUnmapBuffer(m_target);
+			glBindBuffer(m_target, 0);
+		}
+
+		MappingResult Map(u32 alignment, u32 min_size) override {  // Returns a write pointer at the aligned current position with at least min_size bytes available
+			if (m_position > 0) m_position = Common::alignUp(m_position, alignment);
+
+			AllocateSpace(min_size);
+			if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Map");
+			}
+
+			const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
+			return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
+		}
+
+		u32 Unmap(u32 used_size) override {  // Commits used_size bytes and returns the offset at which they start
+			if ((m_position + used_size) > m_size) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			if (!m_coherent) {  // Non-coherent mappings are created with GL_MAP_FLUSH_EXPLICIT_BIT, so flush the written range
+				if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
+					glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
+				} else {
+					Bind();
+					glFlushMappedBufferRange(m_target, m_position, used_size);
+				}
+			}
+
+			const u32 prev_position = m_position;
+			m_position += used_size;
+			return prev_position;
+		}
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {  // Returns null if GL reports an error while allocating storage
+			glGetError();  // Pop a pending error so the check below reflects the calls made here
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+
+			const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+			const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
+			if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
+				glBufferStorage(target, size, nullptr, flags);
+			else if (GLAD_GL_EXT_buffer_storage)
+				glBufferStorageEXT(target, size, nullptr, flags);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
+			AssertMsg(mapped_ptr, "Persistent buffer was mapped");
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
+		}
+
+	  private:
+		BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
+			: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}
+
+		u8* m_mapped_ptr;  // Base of the persistent mapping returned by glMapBufferRange()
+		bool m_coherent;   // When false, Unmap() flushes the written range explicitly
+	};
+
+} // namespace
+
+std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {  // Prefers a persistently mapped buffer, else falls back to the glBufferData() path
+	std::unique_ptr<OpenGLStreamBuffer> buf;
+	if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) {
+		buf = BufferStorageStreamBuffer::Create(target, size);
+		if (buf) return buf;
+	}
+
+	// BufferSubData is slower on all drivers except NVIDIA...
+#if 0
+	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
+		// Mali and Adreno drivers can't do sub-buffer tracking...
+		return BufferDataStreamBuffer::Create(target, size);
+	}
+
+	return BufferSubDataStreamBuffer::Create(target, size);
+#else
+	return BufferDataStreamBuffer::Create(target, size);
+#endif
+}
\ No newline at end of file
diff --git a/third_party/duckstation/gl/stream_buffer.h b/third_party/duckstation/gl/stream_buffer.h
new file mode 100644
index 00000000..6b3562e7
--- /dev/null
+++ b/third_party/duckstation/gl/stream_buffer.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <glad/gl.h>
+// Comment to avoid clang-format reordering the glad header
+
+#include <memory>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "duckstation_compat.h"
+#include "helpers.hpp"
+
+class OpenGLStreamBuffer {
+  public:
+	virtual ~OpenGLStreamBuffer();
+
+	ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+	ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+	ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+	void Bind();
+	void Unbind();
+
+	void SetDebugName(std::string_view name);
+
+	struct MappingResult {
+		void* pointer;
+		u32 buffer_offset;
+		u32 index_aligned;  // offset / alignment, suitable for base vertex
+		u32 space_aligned;  // remaining space / alignment
+	};
+
+	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;
+
+	/// Returns the position in the buffer *before* the start of used_size.
+	virtual u32 Unmap(u32 used_size) = 0;
+
+	/// Returns the minimum granularity of blocks which sync objects will be created around.
+	virtual u32 GetChunkSize() const = 0;
+
+	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);
+
+  protected:
+	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+	GLenum m_target;
+	GLuint m_buffer_id;
+	u32 m_size;
+};
\ No newline at end of file