diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp index 2de48c01..75c6bc55 100644 --- a/include/PICA/gpu.hpp +++ b/include/PICA/gpu.hpp @@ -28,6 +28,7 @@ class GPU { static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes static constexpr u32 vramSize = 6_MB; Registers regs; // GPU internal registers + std::array external_regs; // GPU external registers std::array currentAttributes; // Vertex attributes before being passed to the shader std::array immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission @@ -66,9 +67,9 @@ class GPU { std::array fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted // Command processor pointers for GPU command lists - u32* cmdBuffStart = nullptr; - u32* cmdBuffEnd = nullptr; - u32* cmdBuffCurr = nullptr; + std::span cmdBuffStart{}; + u32 cmdBuffEnd = 0; + u32 cmdBuffCurr = 0; Renderer renderer; PICA::Vertex getImmediateModeVertex(); @@ -100,6 +101,9 @@ class GPU { u32 readReg(u32 address); void writeReg(u32 address, u32 value); + u32 readExternalReg(u32 index); + void writeExternalReg(u32 index, u32 value); + // Used when processing GPU command lists u32 readInternalReg(u32 index); void writeInternalReg(u32 index, u32 value, u32 mask); @@ -116,6 +120,10 @@ class GPU { renderer.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags); } + void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalCopyBytes, u32 inputSize, u32 outputSize, u32 flags) { + renderer.textureCopy(inputAddr, outputAddr, totalCopyBytes, inputSize, outputSize, flags); + } + // Read a value of type T from physical address paddr // This is necessary because vertex attribute fetching uses physical addresses template @@ -130,19 +138,18 @@ class GPU { } } - // Get a pointer of type T* to the data starting from physical address paddr + // Get a span with the specified size of type T to the data starting from physical address paddr template - T* getPointerPhys(u32 paddr) { - if (paddr 
>= PhysicalAddrs::FCRAM && paddr <= PhysicalAddrs::FCRAMEnd) { + std::span getSpanPhys(u32 paddr, u32 size) { + if (paddr >= PhysicalAddrs::FCRAM && paddr + size <= PhysicalAddrs::FCRAMEnd) { u8* fcram = mem.getFCRAM(); u32 index = paddr - PhysicalAddrs::FCRAM; - - return (T*)&fcram[index]; - } else if (paddr >= PhysicalAddrs::VRAM && paddr <= PhysicalAddrs::VRAMEnd) { + return std::span{(T*)&fcram[index], size / sizeof(T)}; + } else if (paddr >= PhysicalAddrs::VRAM && paddr + size <= PhysicalAddrs::VRAMEnd) { u32 index = paddr - PhysicalAddrs::VRAM; - return (T*)&vram[index]; + return std::span{(T*)&vram[index], size / sizeof(T)}; } else [[unlikely]] { Helpers::panic("[GPU] Tried to access unknown physical address: %08X", paddr); } } -}; \ No newline at end of file +}; diff --git a/include/PICA/regs.hpp b/include/PICA/regs.hpp index e1c9a819..3b8976af 100644 --- a/include/PICA/regs.hpp +++ b/include/PICA/regs.hpp @@ -1,4 +1,5 @@ #pragma once +#include #include "helpers.hpp" namespace PICA { @@ -174,6 +175,54 @@ namespace PICA { }; } + namespace ExternalRegs { + enum : u32 { + MemFill1BufferStartPaddr = 0x3, + MemFill1BufferEndPAddr = 0x4, + MemFill1Value = 0x5, + MemFill1Control = 0x6, + MemFill2BufferStartPaddr = 0x7, + MemFill2BufferEndPAddr = 0x8, + MemFill2Value = 0x9, + MemFill2Control = 0xA, + VramBankControl = 0xB, + GPUBusy = 0xC, + BacklightControl = 0xBC, + // TODO: Framebuffer regs + Framebuffer0Size = 0x2F, + Framebuffer0AFirstAddr = 0x119, + Framebuffer0ASecondAddr = 0x11A, + Framebuffer0Config = 0x11B, + Framebuffer0Select = 0x11D, + Framebuffer0Stride = 0x123, + Framebuffer0BFirstAddr = 0x124, + Framebuffer0BSecondAddr = 0x125, + Framebuffer1Size = 0x156, + Framebuffer1AFirstAddr = 0x159, + Framebuffer1ASecondAddr = 0x15A, + Framebuffer1Config = 0x15B, + Framebuffer1Select = 0x15D, + Framebuffer1Stride = 0x163, + Framebuffer1BFirstAddr = 0x164, + Framebuffer1BSecondAddr = 0x165, + TransferInputPAddr = 0x2FF, + TransferOutputPAddr = 0x300, + 
DisplayTransferOutputDim = 0x301, + DisplayTransferInputDim = 0x302, + TransferFlags = 0x303, + TransferTrigger = 0x305, + TextureCopyTotalBytes = 0x307, + TextureCopyInputLineGap = 0x308, + TextureCopyOutputLineGap = 0x309, + }; + } + + enum class Scaling : u32 { + None = 0, + X = 1, + XY = 2, + }; + namespace Lights { enum : u32 { LUT_D0 = 0, @@ -235,7 +284,7 @@ namespace PICA { }; // Returns the string representation of a texture format - inline constexpr const char* textureFormatToString(TextureFmt fmt) { + constexpr std::string_view textureFormatToString(TextureFmt fmt) { switch (fmt) { case TextureFmt::RGBA8: return "RGBA8"; case TextureFmt::RGB8: return "RGB8"; @@ -255,16 +304,16 @@ namespace PICA { } } - inline constexpr const char* textureFormatToString(ColorFmt fmt) { + constexpr std::string_view textureFormatToString(ColorFmt fmt) { return textureFormatToString(static_cast(fmt)); } - inline constexpr bool hasStencil(DepthFmt format) { return format == PICA::DepthFmt::Depth24Stencil8; } + constexpr bool hasStencil(DepthFmt format) { return format == PICA::DepthFmt::Depth24Stencil8; } // Size occupied by each pixel in bytes // All formats are 16BPP except for RGBA8 (32BPP) and BGR8 (24BPP) - inline constexpr usize sizePerPixel(TextureFmt format) { + constexpr usize sizePerPixel(TextureFmt format) { switch (format) { case TextureFmt::RGB8: return 3; case TextureFmt::RGBA8: return 4; @@ -272,11 +321,11 @@ namespace PICA { } } - inline constexpr usize sizePerPixel(ColorFmt format) { + constexpr usize sizePerPixel(ColorFmt format) { return sizePerPixel(static_cast(format)); } - inline constexpr usize sizePerPixel(DepthFmt format) { + constexpr usize sizePerPixel(DepthFmt format) { switch (format) { case DepthFmt::Depth16: return 2; case DepthFmt::Depth24: return 3; @@ -292,4 +341,4 @@ namespace PICA { GeometryPrimitive = 3, }; -} // namespace PICA \ No newline at end of file +} // namespace PICA diff --git a/include/gl_state.hpp b/include/gl_state.hpp index 
82531c7a..3d219c20 100644 --- a/include/gl_state.hpp +++ b/include/gl_state.hpp @@ -137,4 +137,4 @@ struct GLStateManager { }; static_assert(std::is_trivially_constructible(), "OpenGL State Manager class is not trivially constructible!"); -static_assert(std::is_trivially_destructible(), "OpenGL State Manager class is not trivially destructible!"); \ No newline at end of file +static_assert(std::is_trivially_destructible(), "OpenGL State Manager class is not trivially destructible!"); diff --git a/include/helpers.hpp b/include/helpers.hpp index 9830cc88..480494a7 100644 --- a/include/helpers.hpp +++ b/include/helpers.hpp @@ -80,6 +80,13 @@ namespace Helpers { } } + /// Align an arbitrary-size value down to the closest multiple of size. + template + static constexpr T alignDown(T value, std::size_t size) { + static_assert(std::is_unsigned_v, "T must be an unsigned value."); + return static_cast(value - value % size); + } + /// Sign extend an arbitrary-size value to 32 bits static constexpr u32 inline signExtend32(u32 value, u32 startingSize) { auto temp = (s32)value; diff --git a/include/memory.hpp b/include/memory.hpp index 40349b46..3069acf1 100644 --- a/include/memory.hpp +++ b/include/memory.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "crypto/aes_engine.hpp" #include "helpers.hpp" #include "handles.hpp" @@ -248,4 +249,4 @@ public: void setVRAM(u8* pointer) { vram = pointer; } bool allocateMainThreadStack(u32 size); -}; \ No newline at end of file +}; diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 07f8a63c..61233525 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -44,8 +44,8 @@ class Renderer { float oldDepthOffset = 0.0; bool oldDepthmapEnable = false; - SurfaceCache depthBufferCache; - SurfaceCache colourBufferCache; + SurfaceCache depthBufferCache; + SurfaceCache colourBufferCache; SurfaceCache textureCache; OpenGL::uvec2 fbSize; // The
size of the framebuffer (ie both the colour and depth buffer)' @@ -87,6 +87,7 @@ class Renderer { void getGraphicsContext(); // Set up graphics context for rendering void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control); // Clear a GPU buffer in VRAM void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags); // Perform display transfer + void textureCopy(u32 inputAddr, u32 outputAddr, u32 copyBytes, u32 inputSize, u32 outputSize, u32 flags); // Perform texture copy void drawVertices(PICA::PrimType primType, std::span vertices); // Draw the given vertices // Take a screenshot of the screen and store it in a file @@ -97,6 +98,8 @@ class Renderer { fbSize.y() = height; } + ColourBuffer getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height); + void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; } void setDepthFormat(PICA::DepthFmt format) { if (format == PICA::DepthFmt::Unknown1) { @@ -109,4 +112,4 @@ class Renderer { void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; } static constexpr u32 vertexBufferSize = 0x10000; -}; \ No newline at end of file +}; diff --git a/include/renderer_gl/surfaces.hpp b/include/renderer_gl/surfaces.hpp index 1d46e28e..80f79e53 100644 --- a/include/renderer_gl/surfaces.hpp +++ b/include/renderer_gl/surfaces.hpp @@ -19,6 +19,10 @@ struct ColourBuffer { OpenGL::Texture texture; OpenGL::Framebuffer fbo; + GLenum internalFormat; + GLenum fmt; + GLenum type; + ColourBuffer() : valid(false) {} ColourBuffer(u32 loc, PICA::ColorFmt format, u32 x, u32 y, bool valid = true) @@ -29,17 +33,40 @@ struct ColourBuffer { range = Interval(loc, (u32)endLoc); } - void allocate() { + void allocate() { + // Internal formats for the texture based on format + static constexpr std::array internalFormats = { + GL_RGBA8, GL_RGB8, GL_RGB5_A1, GL_RGB565, GL_RGBA4 + }; + + // Format of the texture + static constexpr std::array formats = { + GL_RGBA, GL_BGR,
GL_RGBA, GL_RGB, GL_RGBA, + }; + + static constexpr std::array types = { + GL_UNSIGNED_INT_8_8_8_8, GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT_5_5_5_1, + GL_UNSIGNED_SHORT_5_6_5, GL_UNSIGNED_SHORT_4_4_4_4, + }; + + internalFormat = internalFormats[(int)format]; + fmt = formats[(int)format]; + type = types[(int)format]; + + // Create texture for the FBO, setting up filters and the like // Reading back the current texture is slow, but allocate calls should be few and far between. // If this becomes a bottleneck, we can fix it semi-easily auto prevTexture = OpenGL::getTex2D(); - texture.create(size.x(), size.y(), GL_RGBA8); + texture.create(size.x(), size.y(), internalFormat); texture.bind(); texture.setMinFilter(OpenGL::Linear); texture.setMagFilter(OpenGL::Linear); glBindTexture(GL_TEXTURE_2D, prevTexture); + OpenGL::setObjectLabel(GL_TEXTURE, texture.handle(), "Surface: %dx%d %s from %08X to %08X", size.x(), + size.y(), textureFormatToString(format).data(), + range.lower(), range.upper()); //Helpers::panic("Creating FBO: %d, %d\n", size.x(), size.y()); fbo.createWithDrawTexture(texture); @@ -144,4 +171,4 @@ struct DepthBuffer { size_t sizeInBytes() { return (size_t)size.x() * (size_t)size.y() * PICA::sizePerPixel(format); } -}; \ No newline at end of file +}; diff --git a/include/renderer_gl/textures.hpp b/include/renderer_gl/textures.hpp index 5469a59f..8533758c 100644 --- a/include/renderer_gl/textures.hpp +++ b/include/renderer_gl/textures.hpp @@ -40,7 +40,7 @@ struct Texture { void allocate(); void setNewConfig(u32 newConfig); - void decodeTexture(const void* data); + void decodeTexture(std::span data); void free(); u64 sizeInBytes(); @@ -53,7 +53,7 @@ struct Texture { static u32 getSwizzledOffset_4bpp(u32 u, u32 v, u32 width); // Returns the format of this texture as a string - std::string formatToString() { + std::string_view formatToString() { return PICA::textureFormatToString(format); } @@ -61,4 +61,4 @@ struct Texture { // TODO: Make hasAlpha a template 
parameter u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data); u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData); -}; \ No newline at end of file +}; diff --git a/include/services/gsp_gpu.hpp b/include/services/gsp_gpu.hpp index f687532e..d34562fd 100644 --- a/include/services/gsp_gpu.hpp +++ b/include/services/gsp_gpu.hpp @@ -18,6 +18,24 @@ enum class GPUInterrupt : u8 { DMA = 6 }; +struct FramebufferInfo { + u32 activeFb; + u32 leftFramebufferVaddr; + u32 rightFramebufferVaddr; + u32 stride; + u32 format; + u32 displayFb; + u32 attribute; +}; + +struct FrameBufferUpdate { + u8 index; + u8 dirtyFlag; + u16 pad0; + std::array framebufferInfo; + u32 pad1; +}; + // More circular dependencies class Kernel; @@ -42,6 +60,7 @@ class GPUService { void flushDataCache(u32 messagePointer); void registerInterruptRelayQueue(u32 messagePointer); void setAxiConfigQoSMode(u32 messagePointer); + void setBufferSwap(u32 messagePointer); void setInternalPriorities(u32 messagePointer); void setLCDForceBlack(u32 messagePointer); void storeDataCache(u32 messagePointer); @@ -57,6 +76,8 @@ class GPUService { void triggerTextureCopy(u32* cmd); void flushCacheRegions(u32* cmd); + void setBufferSwapImpl(u32 screen_id, const FramebufferInfo& info); + public: GPUService(Memory& mem, GPU& gpu, Kernel& kernel, u32& currentPID) : mem(mem), gpu(gpu), kernel(kernel), currentPID(currentPID) {} @@ -69,4 +90,4 @@ public: std::memset(ptr, 0, 0x1000); } } -}; \ No newline at end of file +}; diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 37b67a50..f011292d 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -8,6 +8,12 @@ #include "PICA/float_types.hpp" #include "PICA/regs.hpp" +constexpr u32 top_screen_width = 240; +constexpr u32 top_screen_height = 400; + +constexpr u32 bottom_screen_width = 240; +constexpr u32 bottom_screen_height = 300; + using namespace Floats; // Note: For when we have multiple backends, the GL state manager can stay here 
and have the constructor for the Vulkan-or-whatever renderer ignore it @@ -41,6 +47,27 @@ void GPU::reset() { e.config2 = 0; } + // Initialize the framebuffer registers. Values taken from Citra. + + using namespace PICA::ExternalRegs; + // Top screen addresses and dimensions. + external_regs[Framebuffer0AFirstAddr] = 0x181E6000; + external_regs[Framebuffer0ASecondAddr] = 0x1822C800; + external_regs[Framebuffer0BFirstAddr] = 0x18273000; + external_regs[Framebuffer0BSecondAddr] = 0x182B9800; + external_regs[Framebuffer0Size] = (top_screen_height << 16) | top_screen_width; + external_regs[Framebuffer0Stride] = 720; + external_regs[Framebuffer0Config] = static_cast(PICA::ColorFmt::RGB8); + external_regs[Framebuffer0Select] = 0; + + // Bottom screen addresses and dimensions. + external_regs[Framebuffer1AFirstAddr] = 0x1848F000; + external_regs[Framebuffer1ASecondAddr] = 0x184C7800; + external_regs[Framebuffer1Size] = (bottom_screen_height << 16) | bottom_screen_width; + external_regs[Framebuffer1Stride] = 720; + external_regs[Framebuffer1Config] = static_cast(PICA::ColorFmt::RGB8); + external_regs[Framebuffer1Select] = 0; + renderer.reset(); } @@ -123,12 +150,12 @@ void GPU::drawArrays() { vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg]; } else { if (shortIndex) { - auto ptr = getPointerPhys(indexBufferPointer); - vertexIndex = *ptr; // TODO: This is very unsafe + auto ptr = getSpanPhys(indexBufferPointer, sizeof(u16)); + vertexIndex = ptr[0]; // TODO: This is very unsafe indexBufferPointer += 2; } else { - auto ptr = getPointerPhys(indexBufferPointer); - vertexIndex = *ptr; // TODO: This is also very unsafe + auto ptr = getSpanPhys(indexBufferPointer, sizeof(u8)); + vertexIndex = ptr[0]; // TODO: This is also very unsafe indexBufferPointer += 1; } } @@ -188,42 +215,46 @@ void GPU::drawArrays() { switch (attribType) { case 0: { // Signed byte - s8* ptr = getPointerPhys(attrAddress); + const u32 attr_bytes = size * sizeof(s8); + const auto ptr =
getSpanPhys(attrAddress, attr_bytes); for (component = 0; component < size; component++) { - float val = static_cast(*ptr++); + float val = static_cast(ptr[component]); attribute[component] = f24::fromFloat32(val); } - attrAddress += size * sizeof(s8); + attrAddress += attr_bytes; break; } case 1: { // Unsigned byte - u8* ptr = getPointerPhys(attrAddress); + const u32 attr_bytes = size * sizeof(u8); + const auto ptr = getSpanPhys(attrAddress, attr_bytes); for (component = 0; component < size; component++) { - float val = static_cast(*ptr++); + float val = static_cast(ptr[component]); attribute[component] = f24::fromFloat32(val); } - attrAddress += size * sizeof(u8); + attrAddress += attr_bytes; break; } case 2: { // Short - s16* ptr = getPointerPhys(attrAddress); + const u32 attr_bytes = size * sizeof(s16); + const auto ptr = getSpanPhys(attrAddress, attr_bytes); for (component = 0; component < size; component++) { - float val = static_cast(*ptr++); + float val = static_cast(ptr[component]); attribute[component] = f24::fromFloat32(val); } - attrAddress += size * sizeof(s16); + attrAddress += attr_bytes; break; } case 3: { // Float - float* ptr = getPointerPhys(attrAddress); + const u32 attr_bytes = size * sizeof(float); + const auto ptr = getSpanPhys(attrAddress, attr_bytes); for (component = 0; component < size; component++) { - float val = *ptr++; + float val = ptr[component]; attribute[component] = f24::fromFloat32(val); } - attrAddress += size * sizeof(float); + attrAddress += attr_bytes; break; } diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index f62040dd..9f5f7892 100644 --- a/src/core/PICA/regs.cpp +++ b/src/core/PICA/regs.cpp @@ -18,11 +18,36 @@ void GPU::writeReg(u32 address, u32 value) { if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers const u32 index = (address - 0x1EF01000) / sizeof(u32); writeInternalReg(index, value, 0xffffffff); + } else if (address >= 0x1EF00004 && address < 0x1EF01000) { + const u32 index = 
(address - 0x1EF00004) / sizeof(u32); + writeExternalReg(index, value); } else { - log("Ignoring write to external GPU register %08X. Value: %08X\n", address, value); + log("Ignoring write to unknown GPU register %08X. Value: %08X\n", address, value); } } +u32 GPU::readExternalReg(u32 index) { + using namespace PICA::ExternalRegs; + + if (index > 0x1000) [[unlikely]] { + Helpers::panic("Tried to read invalid external GPU register. Index: %X\n", index); + return -1; + } + + return external_regs[index]; +} + +void GPU::writeExternalReg(u32 index, u32 value) { + using namespace PICA::ExternalRegs; + + if (index > 0x1000) [[unlikely]] { + Helpers::panic("Tried to write to invalid external GPU register. Index: %X, value: %08X\n", index, value); + return; + } + + external_regs[index] = value; +} + u32 GPU::readInternalReg(u32 index) { using namespace PICA::InternalRegs; @@ -54,7 +79,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { using namespace PICA::InternalRegs; if (index > regNum) [[unlikely]] { - Helpers::panic("Tried to write to invalid GPU register. Index: %X, value: %08X\n", index, value); + Helpers::panic("Tried to write to invalid internal GPU register. 
Index: %X, value: %08X\n", index, value); return; } @@ -275,9 +300,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { u32 size = (regs[CmdBufSize0 + bufferIndex] & 0xfffff) << 3; // Set command buffer state to execute the new buffer - cmdBuffStart = getPointerPhys(addr); - cmdBuffCurr = cmdBuffStart; - cmdBuffEnd = cmdBuffStart + (size / sizeof(u32)); + cmdBuffStart = getSpanPhys(addr, size); + cmdBuffCurr = 0; + cmdBuffEnd = (size / sizeof(u32)); } break; } @@ -308,12 +333,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } void GPU::startCommandList(u32 addr, u32 size) { - cmdBuffStart = static_cast(mem.getReadPointer(addr)); - if (!cmdBuffStart) Helpers::panic("Couldn't get buffer for command list"); + cmdBuffStart = getSpanPhys(addr, size); + if (cmdBuffStart.empty()) { + Helpers::panic("Couldn't get buffer for command list"); + return; + } // TODO: This is very memory unsafe. We get a pointer to FCRAM and just keep writing without checking if we're gonna go OoB - cmdBuffCurr = cmdBuffStart; - cmdBuffEnd = cmdBuffStart + (size / sizeof(u32)); + cmdBuffCurr = 0; + cmdBuffEnd = (size / sizeof(u32)); // LUT for converting the parameter mask to an actual 32-bit mask // The parameter mask is 4 bits long, each bit corresponding to one byte of the mask @@ -329,13 +357,13 @@ void GPU::startCommandList(u32 addr, u32 size) { // The curr pointer starts out doubleword-aligned and is increased by 4 bytes each time // So to check if it is aligned, we get the number of words it's been incremented by // If that number is an odd value then the buffer is not aligned, otherwise it is - if ((cmdBuffCurr - cmdBuffStart) % 2 != 0) { + if (cmdBuffCurr % 2 != 0) { cmdBuffCurr++; } // The first word of a command is the command parameter and the second one is the header - u32 param1 = *cmdBuffCurr++; - u32 header = *cmdBuffCurr++; + u32 param1 = cmdBuffStart[cmdBuffCurr++]; + u32 header = cmdBuffStart[cmdBuffCurr++]; u32 id = header & 0xffff; u32 
paramMaskIndex = getBits<16, 4>(header); @@ -352,8 +380,8 @@ void GPU::startCommandList(u32 addr, u32 size) { writeInternalReg(id, param1, mask); for (u32 i = 0; i < paramCount; i++) { id += idIncrement; - u32 param = *cmdBuffCurr++; + u32 param = cmdBuffStart[cmdBuffCurr++]; writeInternalReg(id, param, mask); } } -} \ No newline at end of file +} diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 00f28eba..2bf17153 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -453,4 +453,4 @@ u64 Memory::timeSince3DSEpoch() { constexpr u64 offset = 2208988800ull; milliseconds ms = duration_cast(seconds(rawTime + timezoneDifference + offset)); return ms.count(); -} \ No newline at end of file +} diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 73fb0919..1551a6ba 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -576,6 +576,12 @@ const char* displayFragmentShader = R"( } )"; +static void APIENTRY debugHandler(GLenum source, GLenum type, GLuint id, GLenum severity, + GLsizei length, const GLchar* message, const void* userParam) { + Helpers::warn("%d: %s\n", id, message); +} + + void Renderer::reset() { depthBufferCache.reset(); colourBufferCache.reset(); @@ -695,6 +701,11 @@ void Renderer::initGraphicsContext() { OpenGL::clearColor(); OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]); +#if defined(OPENGL_DEBUG_INFO) + glEnable(GL_DEBUG_OUTPUT); + glDebugMessageCallback(debugHandler, nullptr); +#endif + reset(); } @@ -919,6 +930,39 @@ constexpr u32 bottomScreenBuffer = 0x1f05dc00; void Renderer::display() { gl.disableScissor(); + gl.disableBlend(); + gl.disableDepth(); + gl.disableScissor(); + gl.setColourMask(true, true, true, true); + gl.useProgram(displayProgram); + gl.bindVAO(dummyVAO); + + OpenGL::disableClipPlane(0); + OpenGL::disableClipPlane(1); + + using namespace PICA::ExternalRegs; + const u32 topScreenAddr = 
gpu.readExternalReg(Framebuffer0AFirstAddr); + const u32 bottomScreenAddr = gpu.readExternalReg(Framebuffer1AFirstAddr); + + auto topScreen = colourBufferCache.findFromAddress(topScreenAddr); + auto bottomScreen = colourBufferCache.findFromAddress(bottomScreenAddr); + Helpers::warn("Top screen addr %08X\n", topScreenAddr); + + screenFramebuffer.bind(OpenGL::DrawFramebuffer); + + // Each screen is drawn only if a colour buffer is cached at the address held in its framebuffer register + // The top screen uses the 400x240 viewport and the bottom screen uses the 320x240 viewport + if (topScreen) { + topScreen->get().texture.bind(); + OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport + OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen + } + + if (bottomScreen) { + bottomScreen->get().texture.bind(); + OpenGL::setViewport(40, 0, 320, 240); + OpenGL::draw(OpenGL::TriangleStrip, 4); + } glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); screenFramebuffer.bind(OpenGL::ReadFramebuffer); @@ -986,7 +1030,7 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) { if (buffer.has_value()) { return buffer.value().get().texture; } else { - const void* textureData = gpu.getPointerPhys(tex.location); // Get pointer to the texture data in 3DS memory + const std::span textureData = gpu.getSpanPhys(tex.location, tex.sizeInBytes()); // Get a span over the texture data in 3DS memory Texture& newTex = textureCache.add(tex); newTex.decodeTexture(textureData); @@ -994,40 +1038,86 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) { } } +// NOTE: The GPU format has RGBA5551 and RGB565 swapped compared to internal regs format +PICA::ColorFmt ToColorFmt(u32 format) { + switch (format) { + case 2: return PICA::ColorFmt::RGB565; + case 3: return PICA::ColorFmt::RGBA5551; + default: return static_cast(format); + } +} + void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) { const u32 inputWidth =
inputSize & 0xffff; - const u32 inputGap = inputSize >> 16; + const u32 inputHeight = inputSize >> 16; + const auto inputFormat = ToColorFmt(Helpers::getBits<8, 3>(flags)); + const auto outputFormat = ToColorFmt(Helpers::getBits<12, 3>(flags)); + const PICA::Scaling scaling = static_cast(Helpers::getBits<24, 2>(flags)); - const u32 outputWidth = outputSize & 0xffff; - const u32 outputGap = outputSize >> 16; - - auto framebuffer = colourBufferCache.findFromAddress(inputAddr); - // If there's a framebuffer at this address, use it. Otherwise go back to our old hack and display framebuffer 0 - // Displays are hard I really don't want to try implementing them because getting a fast solution is terrible - OpenGL::Texture& tex = framebuffer.has_value() ? framebuffer.value().get().texture : colourBufferCache[0].texture; - - tex.bind(); - screenFramebuffer.bind(OpenGL::DrawFramebuffer); - - gl.disableBlend(); - gl.disableDepth(); - gl.disableScissor(); - gl.setColourMask(true, true, true, true); - gl.useProgram(displayProgram); - gl.bindVAO(dummyVAO); - - OpenGL::disableClipPlane(0); - OpenGL::disableClipPlane(1); - - // Hack: Detect whether we are writing to the top or bottom screen by checking output gap and drawing to the proper part of the output texture - // We consider output gap == 320 to mean bottom, and anything else to mean top - if (outputGap == 320) { - OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport - } else { - OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport + u32 outputWidth = outputSize & 0xffff; + if (scaling == PICA::Scaling::X || scaling == PICA::Scaling::XY) { + outputWidth >>= 1; + } + u32 outputHeight = outputSize >> 16; + if (scaling == PICA::Scaling::XY) { + outputHeight >>= 1; } - OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen + // If there's a framebuffer at this address, use it. 
Otherwise go back to our old hack and display framebuffer 0 + // Displays are hard I really don't want to try implementing them because getting a fast solution is terrible + auto srcFramebuffer = getColourBuffer(inputAddr, inputFormat, inputWidth, inputHeight); + auto dstFramebuffer = getColourBuffer(outputAddr, outputFormat, outputWidth, outputHeight); + + Helpers::warn("Display transfer with outputAddr %08X\n", outputAddr); + + // Blit the framebuffers + srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer); + dstFramebuffer.fbo.bind(OpenGL::DrawFramebuffer); + glBlitFramebuffer(0, 0, inputWidth, inputHeight, 0, 0, outputWidth, outputHeight, GL_COLOR_BUFFER_BIT, GL_LINEAR); +} + +void Renderer::textureCopy(u32 inputAddr, u32 outputAddr, u32 copyBytes, u32 inputSize, u32 outputSize, u32 flags) { + copyBytes = Helpers::alignDown(copyBytes, 16); + if (copyBytes == 0) [[unlikely]] { + return; + } + + const u32 inputWidth = (inputSize & 0xffff) * 16; + const u32 inputGap = (inputSize >> 16) * 16; + const u32 outputWidth = (outputSize & 0xffff) * 16; + const u32 outputGap = (outputSize >> 16) * 16; + + if (inputGap != 0 || inputWidth != outputWidth) { + Helpers::warn("Texture copy with non zero input gap or mismatching widths, cannot be accelerated"); + return; + } + + // If the texture is tiled, apps set inputWidth to the scanline size which is width * 8. + // HACK: We don't know if the src texture is tiled or not yet, assume it is for now, because it's the most common case. + // Citra handles this by letting the width/stride be set as bytes and interpreting it differently + // depending on the candidate surface. 
+ auto srcFramebuffer = getColourBuffer(inputAddr, PICA::ColorFmt::RGBA8, inputWidth / 8, copyBytes / inputWidth); // HACK: Assume RGBA8 format + auto dstFramebuffer = getColourBuffer(outputAddr, srcFramebuffer.format, outputWidth / 8, copyBytes / outputWidth); + + // Blit the framebuffers + srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer); + dstFramebuffer.fbo.bind(OpenGL::DrawFramebuffer); + glBlitFramebuffer(0, 0, srcFramebuffer.size.x(), srcFramebuffer.size.y(), + 0, 0, dstFramebuffer.size.x(), dstFramebuffer.size.y(), GL_COLOR_BUFFER_BIT, GL_LINEAR); +} + +ColourBuffer Renderer::getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height) { + // Try to find an already existing buffer that contains the provided address + // This is a more relaxed check compared to getColourFBO as display transfer/texcopy may refer to + // subrect of a surface and in case of texcopy we don't know the format of the surface. + auto buffer = colourBufferCache.findFromAddress(addr); + if (buffer.has_value()) { + return buffer.value().get(); + } + + // Otherwise create and cache a new buffer. 
+ ColourBuffer sampleBuffer(addr, format, width, height); + return colourBufferCache.add(sampleBuffer); } void Renderer::screenshot(const std::string& name) { @@ -1053,4 +1143,4 @@ void Renderer::screenshot(const std::string& name) { } stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0); -} \ No newline at end of file +} diff --git a/src/core/renderer_gl/textures.cpp b/src/core/renderer_gl/textures.cpp index e4df36a0..69280afa 100644 --- a/src/core/renderer_gl/textures.cpp +++ b/src/core/renderer_gl/textures.cpp @@ -258,18 +258,18 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) { } } -void Texture::decodeTexture(const void* data) { +void Texture::decodeTexture(std::span data) { std::vector decoded; decoded.reserve(u64(size.u()) * u64(size.v())); // Decode texels line by line for (u32 v = 0; v < size.v(); v++) { for (u32 u = 0; u < size.u(); u++) { - u32 colour = decodeTexel(u, v, format, data); + u32 colour = decodeTexel(u, v, format, data.data()); decoded.push_back(colour); } } texture.bind(); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, size.u(), size.v(), GL_RGBA, GL_UNSIGNED_BYTE, decoded.data()); -} \ No newline at end of file +} diff --git a/src/core/services/gsp_gpu.cpp b/src/core/services/gsp_gpu.cpp index f27688a2..399ffc01 100644 --- a/src/core/services/gsp_gpu.cpp +++ b/src/core/services/gsp_gpu.cpp @@ -1,4 +1,5 @@ #include "services/gsp_gpu.hpp" +#include "PICA/regs.hpp" #include "ipc.hpp" #include "kernel.hpp" @@ -10,6 +11,7 @@ namespace ServiceCommands { RegisterInterruptRelayQueue = 0x00130042, WriteHwRegs = 0x00010082, WriteHwRegsWithMask = 0x00020084, + SetBufferSwap = 0x00050200, FlushDataCache = 0x00080082, SetLCDForceBlack = 0x000B0040, TriggerCmdReqQueue = 0x000C0000, @@ -19,15 +21,35 @@ namespace ServiceCommands { } // Commands written to shared memory and processed by TriggerCmdReqQueue -namespace GXCommands { - enum : u32 { - TriggerDMARequest = 0, - ProcessCommandList = 1, - MemoryFill = 2, - 
TriggerDisplayTransfer = 3, - TriggerTextureCopy = 4, - FlushCacheRegions = 5 - }; +enum class GXCommands : u32 { + TriggerDMARequest = 0, + ProcessCommandList = 1, + MemoryFill = 2, + TriggerDisplayTransfer = 3, + TriggerTextureCopy = 4, + FlushCacheRegions = 5 +}; + +static u32 VaddrToPaddr(u32 addr) { + if (addr >= VirtualAddrs::VramStart && addr < (VirtualAddrs::VramStart + VirtualAddrs::VramSize)) [[likely]] { + return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM; + } + + else if (addr >= VirtualAddrs::LinearHeapStartOld && addr < VirtualAddrs::LinearHeapEndOld) { + return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM; + } + + else if (addr >= VirtualAddrs::LinearHeapStartNew && addr < VirtualAddrs::LinearHeapEndNew) { + return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM; + } + + else if (addr == 0) { + return 0; + } + + Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr); + // Obviously garbage address + return 0xF3310932; } void GPUService::reset() { @@ -43,13 +65,14 @@ void GPUService::handleSyncRequest(u32 messagePointer) { case ServiceCommands::FlushDataCache: flushDataCache(messagePointer); break; case ServiceCommands::RegisterInterruptRelayQueue: registerInterruptRelayQueue(messagePointer); break; case ServiceCommands::SetAxiConfigQoSMode: setAxiConfigQoSMode(messagePointer); break; + case ServiceCommands::SetBufferSwap: setBufferSwap(messagePointer); break; case ServiceCommands::SetInternalPriorities: setInternalPriorities(messagePointer); break; case ServiceCommands::SetLCDForceBlack: setLCDForceBlack(messagePointer); break; case ServiceCommands::StoreDataCache: storeDataCache(messagePointer); break; case ServiceCommands::TriggerCmdReqQueue: [[likely]] triggerCmdReqQueue(messagePointer); break; case ServiceCommands::WriteHwRegs: writeHwRegs(messagePointer); break; case ServiceCommands::WriteHwRegsWithMask: writeHwRegsWithMask(messagePointer); break; -; default: Helpers::panic("GPU 
service requested. Command: %08X\n", command); + default: Helpers::panic("GPU service requested. Command: %08X\n", command); } } @@ -122,15 +145,12 @@ void GPUService::requestInterrupt(GPUInterrupt type) { // Not emulating this causes Yoshi's Wooly World, Captain Toad, Metroid 2 et al to hang if (type == GPUInterrupt::VBlank0 || type == GPUInterrupt::VBlank1) { int screen = static_cast(type) - static_cast(GPUInterrupt::VBlank0); // 0 for top screen, 1 for bottom - - constexpr u32 FBInfoSize = 0x40; // TODO: Offset depends on GSP thread being triggered - u8* info = &sharedMem[0x200 + screen * FBInfoSize]; - u8& dirtyFlag = info[1]; + FrameBufferUpdate* update = reinterpret_cast(&sharedMem[0x200 + screen * sizeof(FrameBufferUpdate)]); - if (dirtyFlag & 1) { - // TODO: Submit buffer info here - dirtyFlag &= ~1; + if (update->dirtyFlag & 1) { + setBufferSwapImpl(screen, update->framebufferInfo[update->index]); + update->dirtyFlag &= ~1; } } @@ -259,6 +279,18 @@ void GPUService::setAxiConfigQoSMode(u32 messagePointer) { mem.write32(messagePointer + 4, Result::Success); } +void GPUService::setBufferSwap(u32 messagePointer) { + FramebufferInfo info{}; + const u32 screenId = mem.read32(messagePointer + 4); // Selects either PDC0 or PDC1 + info.activeFb = mem.read32(messagePointer + 8); + info.leftFramebufferVaddr = mem.read32(messagePointer + 12); + info.rightFramebufferVaddr = mem.read32(messagePointer + 16); + info.stride = mem.read32(messagePointer + 20); + info.format = mem.read32(messagePointer + 24); + info.displayFb = mem.read32(messagePointer + 28); // Selects either framebuffer A or B + setBufferSwapImpl(screenId, info); +} + // Seems to also be completely undocumented void GPUService::setInternalPriorities(u32 messagePointer) { log("GSP::GPU::SetInternalPriorities\n"); @@ -281,7 +313,7 @@ void GPUService::processCommandBuffer() { log("Processing %d GPU commands\n", commandsLeft); while (commandsLeft != 0) { - u32 cmdID = cmd[0] & 0xff; + const GXCommands cmdID = 
static_cast(cmd[0] & 0xff); switch (cmdID) { case GXCommands::ProcessCommandList: processCommandList(cmd); break; case GXCommands::MemoryFill: memoryFill(cmd); break; @@ -324,28 +356,6 @@ void GPUService::memoryFill(u32* cmd) { } } -static u32 VaddrToPaddr(u32 addr) { - if (addr >= VirtualAddrs::VramStart && addr < (VirtualAddrs::VramStart + VirtualAddrs::VramSize)) [[likely]] { - return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM; - } - - else if (addr >= VirtualAddrs::LinearHeapStartOld && addr < VirtualAddrs::LinearHeapEndOld) { - return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM; - } - - else if (addr >= VirtualAddrs::LinearHeapStartNew && addr < VirtualAddrs::LinearHeapEndNew) { - return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM; - } - - else if (addr == 0) { - return 0; - } - - Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr); - // Obviously garbage address - return 0xF3310932; -} - void GPUService::triggerDisplayTransfer(u32* cmd) { const u32 inputAddr = VaddrToPaddr(cmd[1]); const u32 outputAddr = VaddrToPaddr(cmd[2]); @@ -373,23 +383,74 @@ void GPUService::flushCacheRegions(u32* cmd) { log("GSP::GPU::FlushCacheRegions (Stubbed)\n"); } +void GPUService::setBufferSwapImpl(u32 screenId, const FramebufferInfo& info) { + using namespace PICA::ExternalRegs; + + constexpr static std::array fb_addresses = { + Framebuffer0AFirstAddr, + Framebuffer0ASecondAddr, + Framebuffer0BFirstAddr, + Framebuffer0BSecondAddr, + Framebuffer1AFirstAddr, + Framebuffer1ASecondAddr, + Framebuffer1BFirstAddr, + Framebuffer1BSecondAddr, + }; + + const u32 fb_index = screenId * 4 + info.activeFb * 2; + gpu.writeExternalReg(fb_addresses[fb_index], VaddrToPaddr(info.leftFramebufferVaddr)); + gpu.writeExternalReg(fb_addresses[fb_index + 1], VaddrToPaddr(info.rightFramebufferVaddr)); + + constexpr static std::array config_addresses = { + Framebuffer0Config, + Framebuffer0Select, + Framebuffer0Stride, + 
Framebuffer1Config, + Framebuffer1Select, + Framebuffer1Stride, + }; + + const u32 config_index = screenId * 3; + gpu.writeExternalReg(config_addresses[config_index], info.format); + gpu.writeExternalReg(config_addresses[config_index + 1], info.displayFb); + gpu.writeExternalReg(config_addresses[config_index + 2], info.stride); +} + // Actually send command list (aka display list) to GPU void GPUService::processCommandList(u32* cmd) { const u32 address = cmd[1] & ~7; // Buffer address const u32 size = cmd[2] & ~3; // Buffer size in bytes - const bool updateGas = cmd[3] == 1; // Update gas additive blend results (0 = don't update, 1 = update) - const bool flushBuffer = cmd[7] == 1; // Flush buffer (0 = don't flush, 1 = flush) + [[maybe_unused]] const bool updateGas = cmd[3] == 1; // Update gas additive blend results (0 = don't update, 1 = update) + [[maybe_unused]] const bool flushBuffer = cmd[7] == 1; // Flush buffer (0 = don't flush, 1 = flush) log("GPU::GSP::processCommandList. Address: %08X, size in bytes: %08X\n", address, size); - gpu.startCommandList(address, size); + gpu.startCommandList(VaddrToPaddr(address), size); requestInterrupt(GPUInterrupt::P3D); // Send an IRQ when command list processing is over } // TODO: Emulate the transfer engine & its registers // Then this can be emulated by just writing the appropriate values there void GPUService::triggerTextureCopy(u32* cmd) { - Helpers::warn("GSP::GPU::TriggerTextureCopy (unimplemented)\n"); + const u32 inputBufferAddr = VaddrToPaddr(cmd[1]); + const u32 outputBufferAddr = VaddrToPaddr(cmd[2]); + const u32 totalCopyBytes = cmd[3]; + const u32 inputWidthGap = cmd[4]; + const u32 outputWidthGap = cmd[5]; + const u32 flags = cmd[6]; + + // Write the trigger register + using namespace PICA::ExternalRegs; + gpu.writeExternalReg(TransferTrigger, gpu.readExternalReg(TransferTrigger) | 1); + + // Perform the texture copy + gpu.textureCopy(inputBufferAddr, outputBufferAddr, totalCopyBytes, inputWidthGap, 
outputWidthGap, flags); + // This uses the transfer engine and thus needs to fire a PPF interrupt. // NSMB2 relies on this requestInterrupt(GPUInterrupt::PPF); -} \ No newline at end of file + + // Writing to trigger will perform the texture copy. + // Reset the bit here to signal completion. + gpu.writeExternalReg(TransferTrigger, gpu.readExternalReg(TransferTrigger) & ~1); +} +