mirror of
				https://github.com/dolphin-emu/dolphin.git
				synced 2025-10-25 01:19:19 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			394 lines
		
	
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			394 lines
		
	
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| // Copyright 2013 Dolphin Emulator Project
 | |
| // SPDX-License-Identifier: GPL-2.0-or-later
 | |
| 
 | |
| #include "VideoBackends/OGL/OGLStreamBuffer.h"
 | |
| 
 | |
| #include "Common/Align.h"
 | |
| #include "Common/GL/GLUtil.h"
 | |
| #include "Common/MathUtil.h"
 | |
| #include "Common/MemoryUtil.h"
 | |
| 
 | |
| #include "VideoBackends/OGL/OGLConfig.h"
 | |
| 
 | |
| #include "VideoCommon/DriverDetails.h"
 | |
| #include "VideoCommon/OnScreenDisplay.h"
 | |
| 
 | |
| namespace OGL
 | |
| {
 | |
| // moved out of constructor, so m_buffer is allowed to be const
 | |
| static u32 GenBuffer()
 | |
| {
 | |
|   u32 id;
 | |
|   glGenBuffers(1, &id);
 | |
|   return id;
 | |
| }
 | |
| 
 | |
| StreamBuffer::StreamBuffer(u32 type, u32 size)
 | |
|     : m_buffer(GenBuffer()), m_buffertype(type), m_size(MathUtil::NextPowerOf2(size)),
 | |
|       m_bit_per_slot(MathUtil::IntLog2(MathUtil::NextPowerOf2(size) / SYNC_POINTS))
 | |
| {
 | |
|   m_iterator = 0;
 | |
|   m_used_iterator = 0;
 | |
|   m_free_iterator = 0;
 | |
| }
 | |
| 
 | |
| StreamBuffer::~StreamBuffer()
 | |
| {
 | |
|   glDeleteBuffers(1, &m_buffer);
 | |
| }
 | |
| 
 | |
| /* Shared synchronization code for ring buffers
 | |
|  *
 | |
|  * The next three functions are to create/delete/use the OpenGL synchronization.
 | |
|  * ARB_sync (OpenGL 3.2) is used and required.
 | |
|  *
 | |
|  * To reduce overhead, the complete buffer is split up into SYNC_POINTS chunks.
 | |
|  * For each of these chunks, there is a fence which checks if this chunk is still in use.
 | |
|  *
 | |
|  * As our API allows to alloc more memory than it has to use, we have to catch how much is already
 | |
|  * written.
 | |
|  *
 | |
|  * m_iterator      - writing position
 | |
|  * m_free_iterator - last position checked if free
 | |
|  * m_used_iterator - last position known to be written
 | |
|  *
 | |
|  * So on alloc, we have to wait for all slots between m_free_iterator and m_iterator (and set
 | |
|  * m_free_iterator to m_iterator afterwards).
 | |
|  *
 | |
|  * We also assume that this buffer is accessed by the GPU between the Unmap and Map function,
 | |
|  * so we may create the fences on the start of mapping.
 | |
|  * Same here, new fences for the chunks between m_used_iterator and m_iterator (also update
 | |
|  * m_used_iterator).
 | |
|  *
 | |
|  * As ring buffers have an ugly behavior on rollover, have fun to read this code ;)
 | |
|  */
 | |
| 
 | |
| void StreamBuffer::CreateFences()
 | |
| {
 | |
|   for (int i = 0; i < SYNC_POINTS; i++)
 | |
|   {
 | |
|     m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 | |
|   }
 | |
| }
 | |
| void StreamBuffer::DeleteFences()
 | |
| {
 | |
|   for (int i = Slot(m_free_iterator) + 1; i < SYNC_POINTS; i++)
 | |
|   {
 | |
|     glDeleteSync(m_fences[i]);
 | |
|   }
 | |
|   for (int i = 0; i < Slot(m_iterator); i++)
 | |
|   {
 | |
|     glDeleteSync(m_fences[i]);
 | |
|   }
 | |
| }
 | |
| void StreamBuffer::AllocMemory(u32 size)
 | |
| {
 | |
|   // insert waiting slots for used memory
 | |
|   for (int i = Slot(m_used_iterator); i < Slot(m_iterator); i++)
 | |
|   {
 | |
|     m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 | |
|   }
 | |
|   m_used_iterator = m_iterator;
 | |
| 
 | |
|   // wait for new slots to end of buffer
 | |
|   for (int i = Slot(m_free_iterator) + 1; i <= Slot(m_iterator + size) && i < SYNC_POINTS; i++)
 | |
|   {
 | |
|     glClientWaitSync(m_fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
 | |
|     glDeleteSync(m_fences[i]);
 | |
|   }
 | |
| 
 | |
|   // If we allocate a large amount of memory (A), commit a smaller amount, then allocate memory
 | |
|   // smaller than allocation A, we will have already waited for these fences in A, but not used
 | |
|   // the space. In this case, don't set m_free_iterator to a position before that which we know
 | |
|   // is safe to use, which would result in waiting on the same fence(s) next time.
 | |
|   if ((m_iterator + size) > m_free_iterator)
 | |
|     m_free_iterator = m_iterator + size;
 | |
| 
 | |
|   // if buffer is full
 | |
|   if (m_iterator + size >= m_size)
 | |
|   {
 | |
|     // insert waiting slots in unused space at the end of the buffer
 | |
|     for (int i = Slot(m_used_iterator); i < SYNC_POINTS; i++)
 | |
|     {
 | |
|       m_fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 | |
|     }
 | |
| 
 | |
|     // move to the start
 | |
|     m_used_iterator = m_iterator = 0;  // offset 0 is always aligned
 | |
| 
 | |
|     // wait for space at the start
 | |
|     for (int i = 0; i <= Slot(m_iterator + size); i++)
 | |
|     {
 | |
|       glClientWaitSync(m_fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
 | |
|       glDeleteSync(m_fences[i]);
 | |
|     }
 | |
|     m_free_iterator = m_iterator + size;
 | |
|   }
 | |
| }
 | |
| 
 | |
| /* The usual way to stream data to the GPU.
 | |
|  * Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping
 | |
|  * Just do unsync appends until the buffer is full.
 | |
|  * When it's full, orphan (alloc a new buffer and free the old one)
 | |
|  *
 | |
|  * As reallocation is an overhead, this method isn't as fast as it is known to be.
 | |
|  */
 | |
| class MapAndOrphan : public StreamBuffer
 | |
| {
 | |
| public:
 | |
|   MapAndOrphan(u32 type, u32 size) : StreamBuffer(type, size)
 | |
|   {
 | |
|     glBindBuffer(m_buffertype, m_buffer);
 | |
|     glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
 | |
|   }
 | |
| 
 | |
|   ~MapAndOrphan() override {}
 | |
|   std::pair<u8*, u32> Map(u32 size) override
 | |
|   {
 | |
|     if (m_iterator + size >= m_size)
 | |
|     {
 | |
|       glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
 | |
|       m_iterator = 0;
 | |
|     }
 | |
|     u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
 | |
|                                         GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
 | |
|                                             GL_MAP_UNSYNCHRONIZED_BIT);
 | |
|     return std::make_pair(pointer, m_iterator);
 | |
|   }
 | |
| 
 | |
|   void Unmap(u32 used_size) override
 | |
|   {
 | |
|     glFlushMappedBufferRange(m_buffertype, 0, used_size);
 | |
|     glUnmapBuffer(m_buffertype);
 | |
|     m_iterator += used_size;
 | |
|   }
 | |
| };
 | |
| 
 | |
| /* A modified streaming way without reallocation
 | |
|  * This one fixes the reallocation overhead of the MapAndOrphan one.
 | |
|  * So it alloc a ring buffer on initialization.
 | |
|  * But with this limited resource, we have to care about the CPU-GPU distance.
 | |
|  * Else this fifo may overflow.
 | |
|  * So we had traded orphan vs syncing.
 | |
|  */
 | |
| class MapAndSync : public StreamBuffer
 | |
| {
 | |
| public:
 | |
|   MapAndSync(u32 type, u32 size) : StreamBuffer(type, size)
 | |
|   {
 | |
|     CreateFences();
 | |
|     glBindBuffer(m_buffertype, m_buffer);
 | |
|     glBufferData(m_buffertype, m_size, nullptr, GL_STREAM_DRAW);
 | |
|   }
 | |
| 
 | |
|   ~MapAndSync() override { DeleteFences(); }
 | |
|   std::pair<u8*, u32> Map(u32 size) override
 | |
|   {
 | |
|     AllocMemory(size);
 | |
|     u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
 | |
|                                         GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
 | |
|                                             GL_MAP_UNSYNCHRONIZED_BIT);
 | |
|     return std::make_pair(pointer, m_iterator);
 | |
|   }
 | |
| 
 | |
|   void Unmap(u32 used_size) override
 | |
|   {
 | |
|     glFlushMappedBufferRange(m_buffertype, 0, used_size);
 | |
|     glUnmapBuffer(m_buffertype);
 | |
|     m_iterator += used_size;
 | |
|   }
 | |
| };
 | |
| 
 | |
| /* Streaming fifo without mapping overhead.
 | |
|  * This one usually requires ARB_buffer_storage (OpenGL 4.4).
 | |
|  * And is usually not available on OpenGL3 GPUs.
 | |
|  *
 | |
|  * ARB_buffer_storage allows us to render from a mapped buffer.
 | |
|  * So we map it persistently in the initialization.
 | |
|  *
 | |
|  * Unsync mapping sounds like an easy task, but it isn't for threaded drivers.
 | |
|  * So every mapping on current close-source driver _will_ end in
 | |
|  * at least a round trip time between two threads.
 | |
|  *
 | |
|  * As persistently mapped buffer can't use orphaning, we also have to sync.
 | |
|  */
 | |
| class BufferStorage : public StreamBuffer
 | |
| {
 | |
| public:
 | |
|   BufferStorage(u32 type, u32 size, bool _coherent = true)
 | |
|       : StreamBuffer(type, size), coherent(_coherent)
 | |
|   {
 | |
|     CreateFences();
 | |
|     glBindBuffer(m_buffertype, m_buffer);
 | |
| 
 | |
|     // PERSISTANT_BIT to make sure that the buffer can be used while mapped
 | |
|     // COHERENT_BIT is set so we don't have to use a MemoryBarrier on write
 | |
|     // CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side then
 | |
|     // server side
 | |
|     glBufferStorage(m_buffertype, m_size, nullptr,
 | |
|                     GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
 | |
|                         (coherent ? GL_MAP_COHERENT_BIT : 0));
 | |
|     m_pointer =
 | |
|         (u8*)glMapBufferRange(m_buffertype, 0, m_size,
 | |
|                               GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
 | |
|                                   (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT));
 | |
|   }
 | |
| 
 | |
|   ~BufferStorage() override
 | |
|   {
 | |
|     DeleteFences();
 | |
|     glUnmapBuffer(m_buffertype);
 | |
|     glBindBuffer(m_buffertype, 0);
 | |
|   }
 | |
| 
 | |
|   std::pair<u8*, u32> Map(u32 size) override
 | |
|   {
 | |
|     AllocMemory(size);
 | |
|     return std::make_pair(m_pointer + m_iterator, m_iterator);
 | |
|   }
 | |
| 
 | |
|   void Unmap(u32 used_size) override
 | |
|   {
 | |
|     if (!coherent)
 | |
|       glFlushMappedBufferRange(m_buffertype, m_iterator, used_size);
 | |
|     m_iterator += used_size;
 | |
|   }
 | |
| 
 | |
|   u8* m_pointer;
 | |
|   const bool coherent;
 | |
| };
 | |
| 
 | |
| /* --- AMD only ---
 | |
|  * Another streaming fifo without mapping overhead.
 | |
|  * As we can't orphan without mapping, we have to sync.
 | |
|  *
 | |
|  * This one uses AMD_pinned_memory which is available on all AMD GPUs.
 | |
|  * OpenGL 4.4 drivers should use BufferStorage.
 | |
|  */
 | |
| class PinnedMemory : public StreamBuffer
 | |
| {
 | |
| public:
 | |
|   PinnedMemory(u32 type, u32 size) : StreamBuffer(type, size)
 | |
|   {
 | |
|     CreateFences();
 | |
|     m_pointer = static_cast<u8*>(Common::AllocateAlignedMemory(
 | |
|         Common::AlignUp(m_size, ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY));
 | |
|     glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer);
 | |
|     glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD,
 | |
|                  Common::AlignUp(m_size, ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY);
 | |
|     glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
 | |
|     glBindBuffer(m_buffertype, m_buffer);
 | |
|   }
 | |
| 
 | |
|   ~PinnedMemory() override
 | |
|   {
 | |
|     DeleteFences();
 | |
|     glBindBuffer(m_buffertype, 0);
 | |
|     glFinish();  // ogl pipeline must be flushed, else this buffer can be in use
 | |
|     Common::FreeAlignedMemory(m_pointer);
 | |
|     m_pointer = nullptr;
 | |
|   }
 | |
| 
 | |
|   std::pair<u8*, u32> Map(u32 size) override
 | |
|   {
 | |
|     AllocMemory(size);
 | |
|     return std::make_pair(m_pointer + m_iterator, m_iterator);
 | |
|   }
 | |
| 
 | |
|   void Unmap(u32 used_size) override { m_iterator += used_size; }
 | |
|   u8* m_pointer;
 | |
|   static const u32 ALIGN_PINNED_MEMORY = 4096;
 | |
| };
 | |
| 
 | |
| /* Fifo based on the glBufferSubData call.
 | |
|  * As everything must be copied before glBufferSubData returns,
 | |
|  * an additional memcpy in the driver will be done.
 | |
|  * So this is a huge overhead, only use it if required.
 | |
|  */
 | |
| class BufferSubData : public StreamBuffer
 | |
| {
 | |
| public:
 | |
|   BufferSubData(u32 type, u32 size) : StreamBuffer(type, size)
 | |
|   {
 | |
|     glBindBuffer(m_buffertype, m_buffer);
 | |
|     glBufferData(m_buffertype, size, nullptr, GL_STATIC_DRAW);
 | |
|     m_pointer = new u8[m_size];
 | |
|   }
 | |
| 
 | |
|   ~BufferSubData() override { delete[] m_pointer; }
 | |
|   std::pair<u8*, u32> Map(u32 size) override { return std::make_pair(m_pointer, 0); }
 | |
|   void Unmap(u32 used_size) override { glBufferSubData(m_buffertype, 0, used_size, m_pointer); }
 | |
|   u8* m_pointer;
 | |
| };
 | |
| 
 | |
| /* Fifo based on the glBufferData call.
 | |
|  * Some trashy drivers stall in BufferSubData.
 | |
|  * So here we use glBufferData, which realloc this buffer every time.
 | |
|  * This may avoid stalls, but it is a bigger overhead than BufferSubData.
 | |
|  */
 | |
| class BufferData : public StreamBuffer
 | |
| {
 | |
| public:
 | |
|   BufferData(u32 type, u32 size) : StreamBuffer(type, size)
 | |
|   {
 | |
|     glBindBuffer(m_buffertype, m_buffer);
 | |
|     m_pointer = new u8[m_size];
 | |
|   }
 | |
| 
 | |
|   ~BufferData() override { delete[] m_pointer; }
 | |
|   std::pair<u8*, u32> Map(u32 size) override { return std::make_pair(m_pointer, 0); }
 | |
|   void Unmap(u32 used_size) override
 | |
|   {
 | |
|     glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW);
 | |
|   }
 | |
| 
 | |
|   u8* m_pointer;
 | |
| };
 | |
| 
 | |
| // Chooses the best streaming method based on the supported extensions and known issues
 | |
| std::unique_ptr<StreamBuffer> StreamBuffer::Create(u32 type, u32 size)
 | |
| {
 | |
|   // without basevertex support, only streaming methods whith uploads everything to zero works fine:
 | |
|   if (!g_ogl_config.bSupportsGLBaseVertex)
 | |
|   {
 | |
|     if (!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STREAM))
 | |
|       return std::make_unique<BufferSubData>(type, size);
 | |
| 
 | |
|     // BufferData is by far the worst way, only use it if needed
 | |
|     return std::make_unique<BufferData>(type, size);
 | |
|   }
 | |
| 
 | |
|   // Prefer the syncing buffers over the orphaning one
 | |
|   if (g_ogl_config.bSupportsGLSync)
 | |
|   {
 | |
|     // pinned memory is much faster than buffer storage on AMD cards
 | |
|     if (g_ogl_config.bSupportsGLPinnedMemory &&
 | |
|         !(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_PINNED_MEMORY)))
 | |
|       return std::make_unique<PinnedMemory>(type, size);
 | |
| 
 | |
|     // buffer storage works well in most situations
 | |
|     if (g_ogl_config.bSupportsGLBufferStorage &&
 | |
|         !(DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STORAGE) &&
 | |
|           type == GL_ARRAY_BUFFER) &&
 | |
|         !(DriverDetails::HasBug(DriverDetails::BUG_INTEL_BROKEN_BUFFER_STORAGE) &&
 | |
|           type == GL_ELEMENT_ARRAY_BUFFER))
 | |
|       return std::make_unique<BufferStorage>(type, size);
 | |
| 
 | |
|     // don't fall back to MapAnd* for Nvidia drivers
 | |
|     if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_UNSYNC_MAPPING))
 | |
|     {
 | |
|       if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_BUFFER_STREAM))
 | |
|         return std::make_unique<BufferData>(type, size);
 | |
|       else
 | |
|         return std::make_unique<BufferSubData>(type, size);
 | |
|     }
 | |
| 
 | |
|     // mapping fallback
 | |
|     if (g_ogl_config.bSupportsGLSync)
 | |
|       return std::make_unique<MapAndSync>(type, size);
 | |
|   }
 | |
| 
 | |
|   // default fallback, should work everywhere, but isn't the best way to do this job
 | |
|   return std::make_unique<MapAndOrphan>(type, size);
 | |
| }
 | |
| }  // namespace OGL
 |