code: Better screen support

2025-07-15 21:41:32 +00:00 · 2023-07-10 23:59:44 +03:00 · 2023-07-10 23:59:44 +03:00 · 603074dc90
commit 603074dc90
parent c8f4d41b47
15 changed files with 465 additions and 140 deletions
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@ -28,6 +28,7 @@ class GPU {
 	static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes
 	static constexpr u32 vramSize = 6_MB;
 	Registers regs; // GPU internal registers
+	std::array<u32, 0x1000> external_regs; // GPU external registers
 	std::array<vec4f, 16> currentAttributes; // Vertex attributes before being passed to the shader

 	std::array<vec4f, 16> immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission
@ -66,9 +67,9 @@ class GPU {
 	std::array<u32, 3> fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted

 	// Command processor pointers for GPU command lists
-	u32* cmdBuffStart = nullptr;
-	u32* cmdBuffEnd = nullptr;
-	u32* cmdBuffCurr = nullptr;
+	std::span<u32> cmdBuffStart{};
+	u32 cmdBuffEnd = 0;
+	u32 cmdBuffCurr = 0;

 	Renderer renderer;
 	PICA::Vertex getImmediateModeVertex();
@ -100,6 +101,9 @@ class GPU {
 	u32 readReg(u32 address);
 	void writeReg(u32 address, u32 value);

+	u32 readExternalReg(u32 index);
+	void writeExternalReg(u32 index, u32 value);
+
 	// Used when processing GPU command lists
 	u32 readInternalReg(u32 index);
 	void writeInternalReg(u32 index, u32 value, u32 mask);
@ -116,6 +120,10 @@ class GPU {
 		renderer.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
 	}

+	void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalCopyBytes, u32 inputSize, u32 outputSize, u32 flags) {  // Forwards a texture copy request to the renderer with its arguments unchanged
+		renderer.textureCopy(inputAddr, outputAddr, totalCopyBytes, inputSize, outputSize, flags);
+	}
+
 	// Read a value of type T from physical address paddr
 	// This is necessary because vertex attribute fetching uses physical addresses
 	template <typename T>
@ -130,19 +138,18 @@ class GPU {
 		}
 	}

-	// Get a pointer of type T* to the data starting from physical address paddr
+	// Get a span with the specified size of type T to the data starting from physical address paddr
 	template <typename T>
-	T* getPointerPhys(u32 paddr) {
-		if (paddr >= PhysicalAddrs::FCRAM && paddr <= PhysicalAddrs::FCRAMEnd) {
+	std::span<T> getSpanPhys(u32 paddr, u32 size) {
+		if (paddr >= PhysicalAddrs::FCRAM && paddr + size <= PhysicalAddrs::FCRAMEnd) {
 			u8* fcram = mem.getFCRAM();
 			u32 index = paddr - PhysicalAddrs::FCRAM;
-
-			return (T*)&fcram[index];
-		} else if (paddr >= PhysicalAddrs::VRAM && paddr <= PhysicalAddrs::VRAMEnd) {
+			return std::span{(T*)&fcram[index], size / sizeof(T)};
+		} else if (paddr >= PhysicalAddrs::VRAM && paddr + size <= PhysicalAddrs::VRAMEnd) {
 			u32 index = paddr - PhysicalAddrs::VRAM;
-			return (T*)&vram[index];
+			return std::span{(T*)&vram[index], size / sizeof(T)};
 		} else [[unlikely]] {
 			Helpers::panic("[GPU] Tried to access unknown physical address: %08X", paddr);
 		}
 	}
-};
+};
--- a/include/PICA/regs.hpp
+++ b/include/PICA/regs.hpp
@ -1,4 +1,5 @@
 #pragma once
+#include <string_view>
 #include "helpers.hpp"

 namespace PICA {
@ -174,6 +175,54 @@ namespace PICA {
 		};
 	}

+	// Indices of the external GPU registers, used to index GPU::external_regs.
+	// GPU::writeReg derives an index from an MMIO address as (address - 0x1EF00004) / sizeof(u32).
+	// Every Framebuffer1* register sits 0x40 words after its Framebuffer0* counterpart.
+	namespace ExternalRegs {
+		enum : u32 {
+			MemFill1BufferStartPaddr = 0x3,
+			MemFill1BufferEndPAddr = 0x4,
+			MemFill1Value = 0x5,
+			MemFill1Control = 0x6,
+			MemFill2BufferStartPaddr = 0x7,
+			MemFill2BufferEndPAddr = 0x8,
+			MemFill2Value = 0x9,
+			MemFill2Control = 0xA,
+			VramBankControl = 0xB,
+			GPUBusy = 0xC,
+			BacklightControl = 0xBC,
+			// TODO: Framebuffer regs
+			// Framebuffer0Size mirrors Framebuffer1Size (0x156) minus the 0x40-word offset between the two screens
+			Framebuffer0Size = 0x116,
+			Framebuffer0AFirstAddr = 0x119,
+			Framebuffer0ASecondAddr = 0x11A,
+			Framebuffer0Config = 0x11B,
+			Framebuffer0Select = 0x11D,
+			Framebuffer0Stride = 0x123,
+			Framebuffer0BFirstAddr = 0x124,
+			Framebuffer0BSecondAddr = 0x125,
+			Framebuffer1Size = 0x156,
+			Framebuffer1AFirstAddr = 0x159,
+			Framebuffer1ASecondAddr = 0x15A,
+			Framebuffer1Config = 0x15B,
+			Framebuffer1Select = 0x15D,
+			Framebuffer1Stride = 0x163,
+			Framebuffer1BFirstAddr = 0x164,
+			Framebuffer1BSecondAddr = 0x165,
+			TransferInputPAddr = 0x2FF,
+			TransferOutputPAddr = 0x300,
+			DisplayTransferOutputDim = 0x301,
+			DisplayTransferInputDim = 0x302,
+			TransferFlags = 0x303,
+			TransferTrigger = 0x305,
+			TextureCopyTotalBytes = 0x307,
+			TextureCopyInputLineGap = 0x308,
+			TextureCopyOutputLineGap = 0x309,
+		};
+	}
+
+	enum class Scaling : u32 {  // Scaling mode; Renderer::displayTransfer decodes it with Helpers::getBits<24, 2>(flags)
+		None = 0,  // Output dimensions are used as-is
+		X = 1,  // displayTransfer halves the output width
+		XY = 2,  // displayTransfer halves both the output width and the output height
+	};
+
 	namespace Lights {
 		enum : u32 {
 			LUT_D0 = 0,
@ -235,7 +284,7 @@ namespace PICA {
 	};

 	// Returns the string representation of a texture format
-	inline constexpr const char* textureFormatToString(TextureFmt fmt) {
+	constexpr std::string_view textureFormatToString(TextureFmt fmt) {
 		switch (fmt) {
 			case TextureFmt::RGBA8: return "RGBA8";
 			case TextureFmt::RGB8: return "RGB8";
@ -255,16 +304,16 @@ namespace PICA {
 		}
 	}

-	inline constexpr const char* textureFormatToString(ColorFmt fmt) {
+	constexpr std::string_view textureFormatToString(ColorFmt fmt) {
 		return textureFormatToString(static_cast<TextureFmt>(fmt));
 	}

-	inline constexpr bool hasStencil(DepthFmt format) { return format == PICA::DepthFmt::Depth24Stencil8; }
+	constexpr bool hasStencil(DepthFmt format) { return format == PICA::DepthFmt::Depth24Stencil8; }

 	// Size occupied by each pixel in bytes

 	// All formats are 16BPP except for RGBA8 (32BPP) and BGR8 (24BPP)
-	inline constexpr usize sizePerPixel(TextureFmt format) {
+	constexpr usize sizePerPixel(TextureFmt format) {
 		switch (format) {
 			case TextureFmt::RGB8: return 3;
 			case TextureFmt::RGBA8: return 4;
@ -272,11 +321,11 @@ namespace PICA {
 		}
 	}

-	inline constexpr usize sizePerPixel(ColorFmt format) {
+	constexpr usize sizePerPixel(ColorFmt format) {
 		return sizePerPixel(static_cast<TextureFmt>(format));
 	}

-	inline constexpr usize sizePerPixel(DepthFmt format) {
+	constexpr usize sizePerPixel(DepthFmt format) {
 		switch (format) {
 			case DepthFmt::Depth16: return 2;
 			case DepthFmt::Depth24: return 3;
@ -292,4 +341,4 @@ namespace PICA {
 		GeometryPrimitive = 3,
 	};

-}  // namespace PICA
+}  // namespace PICA
--- a/include/gl_state.hpp
+++ b/include/gl_state.hpp
@ -137,4 +137,4 @@ struct GLStateManager {
 };

 static_assert(std::is_trivially_constructible<GLStateManager>(), "OpenGL State Manager class is not trivially constructible!");
-static_assert(std::is_trivially_destructible<GLStateManager>(), "OpenGL State Manager class is not trivially destructible!");
+static_assert(std::is_trivially_destructible<GLStateManager>(), "OpenGL State Manager class is not trivially destructible!");
--- a/include/helpers.hpp
+++ b/include/helpers.hpp
@ -80,6 +80,13 @@ namespace Helpers {
 		}
 	}

+	/// Align down an unsigned value to the closest multiple of size that does not exceed it. size must be non-zero, since value % size is evaluated.
+	template <typename T>
+	static constexpr T alignDown(T value, std::size_t size) {
+		static_assert(std::is_unsigned_v<T>, "T must be an unsigned value.");
+		return static_cast<T>(value - value % size);
+	}
+
 	/// Sign extend an arbitrary-size value to 32 bits
 	static constexpr u32 inline signExtend32(u32 value, u32 startingSize) {
 		auto temp = (s32)value;
--- a/include/memory.hpp
+++ b/include/memory.hpp
@ -5,6 +5,7 @@
 #include <fstream>
 #include <optional>
 #include <vector>
+#include <span>
 #include "crypto/aes_engine.hpp"
 #include "helpers.hpp"
 #include "handles.hpp"
@ -248,4 +249,4 @@ public:

 	void setVRAM(u8* pointer) { vram = pointer; }
 	bool allocateMainThreadStack(u32 size);
-};
+};
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -44,8 +44,8 @@ class Renderer {
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;

-	SurfaceCache<DepthBuffer, 10, true> depthBufferCache;
-	SurfaceCache<ColourBuffer, 10, true> colourBufferCache;
+	SurfaceCache<DepthBuffer, 64, true> depthBufferCache;
+	SurfaceCache<ColourBuffer, 64, true> colourBufferCache;
 	SurfaceCache<Texture, 256, true> textureCache;

 	OpenGL::uvec2 fbSize;  // The size of the framebuffer (ie both the colour and depth buffer)'
@ -87,6 +87,7 @@ class Renderer {
 	void getGraphicsContext();                                                                      // Set up graphics context for rendering
 	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control);                     // Clear a GPU buffer in VRAM
 	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags);  // Perform display transfer
+	void textureCopy(u32 inputAddr, u32 outputAddr, u32 copyBytes, u32 inputSize, u32 outputSize, u32 flags);  // Perform texture copy
 	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices);             // Draw the given vertices

 	// Take a screenshot of the screen and store it in a file
@ -97,6 +98,8 @@ class Renderer {
 		fbSize.y() = height;
 	}

+	ColourBuffer getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height);
+
 	void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; }
 	void setDepthFormat(PICA::DepthFmt format) {
 		if (format == PICA::DepthFmt::Unknown1) {
@ -109,4 +112,4 @@ class Renderer {
 	void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; }

 	static constexpr u32 vertexBufferSize = 0x10000;
-};
+};
--- a/include/renderer_gl/surfaces.hpp
+++ b/include/renderer_gl/surfaces.hpp
@ -19,6 +19,10 @@ struct ColourBuffer {
    OpenGL::Texture texture;
    OpenGL::Framebuffer fbo;

+	GLenum internalFormat;
+	GLenum fmt;
+	GLenum type;
+
    ColourBuffer() : valid(false) {}

    ColourBuffer(u32 loc, PICA::ColorFmt format, u32 x, u32 y, bool valid = true)
@ -29,17 +33,40 @@ struct ColourBuffer {
        range = Interval<u32>(loc, (u32)endLoc);
    }

-    void allocate() {
+	void allocate() {
+		// Internal formats for the texture based on format
+		static constexpr std::array<GLenum, 5> internalFormats = {
+			GL_RGBA8, GL_RGB8, GL_RGB5_A1, GL_RGB565, GL_RGBA4
+		};
+
+		// Format of the texture
+		static constexpr std::array<GLenum, 5> formats = {
+			GL_RGBA, GL_BGR, GL_RGBA, GL_RGB, GL_RGBA,
+		};
+
+		static constexpr std::array<GLenum, 5> types = {
+			GL_UNSIGNED_INT_8_8_8_8, GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT_5_5_5_1,
+			GL_UNSIGNED_SHORT_5_6_5, GL_UNSIGNED_SHORT_4_4_4_4,
+		};
+
+		internalFormat = internalFormats[(int)format];
+		fmt = formats[(int)format];
+		type = types[(int)format];
+
+
        // Create texture for the FBO, setting up filters and the like
        // Reading back the current texture is slow, but allocate calls should be few and far between.
        // If this becomes a bottleneck, we can fix it semi-easily
        auto prevTexture = OpenGL::getTex2D();
-        texture.create(size.x(), size.y(), GL_RGBA8);
+        texture.create(size.x(), size.y(), internalFormat);
        texture.bind();
        texture.setMinFilter(OpenGL::Linear);
        texture.setMagFilter(OpenGL::Linear);
        glBindTexture(GL_TEXTURE_2D, prevTexture);

+		OpenGL::setObjectLabel(GL_TEXTURE, texture.handle(), "Surface: %dx%d %s from %08X to %08X", size.x(),
+							   size.y(), textureFormatToString(format).data(),
+							   range.lower(), range.upper());
        //Helpers::panic("Creating FBO: %d, %d\n", size.x(), size.y());

        fbo.createWithDrawTexture(texture);
@ -144,4 +171,4 @@ struct DepthBuffer {
    size_t sizeInBytes() {
        return (size_t)size.x() * (size_t)size.y() * PICA::sizePerPixel(format);
    }
-};
+};
--- a/include/renderer_gl/textures.hpp
+++ b/include/renderer_gl/textures.hpp
@ -40,7 +40,7 @@ struct Texture {

    void allocate();
    void setNewConfig(u32 newConfig);
-    void decodeTexture(const void* data);
+    void decodeTexture(std::span<const u8> data);
    void free();
    u64 sizeInBytes();

@ -53,7 +53,7 @@ struct Texture {
    static u32 getSwizzledOffset_4bpp(u32 u, u32 v, u32 width);

    // Returns the format of this texture as a string
-    std::string formatToString() {
+    std::string_view formatToString() {
        return PICA::textureFormatToString(format);
    }

@ -61,4 +61,4 @@ struct Texture {
    // TODO: Make hasAlpha a template parameter
    u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data);
    u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData);
-};
+};
--- a/include/services/gsp_gpu.hpp
+++ b/include/services/gsp_gpu.hpp
@ -18,6 +18,24 @@ enum class GPUInterrupt : u8 {
 	DMA = 6
 };

+struct FramebufferInfo {  // Framebuffer description for one screen; passed to GPUService::setBufferSwapImpl
+	u32 activeFb;
+	u32 leftFramebufferVaddr;  // NOTE(review): presumably a virtual address, going by the name - confirm against setBufferSwapImpl
+	u32 rightFramebufferVaddr;  // NOTE(review): presumably a virtual address, going by the name - confirm against setBufferSwapImpl
+	u32 stride;
+	u32 format;
+	u32 displayFb;
+	u32 attribute;
+};
+
+struct FrameBufferUpdate {  // Small header followed by two FramebufferInfo entries. NOTE(review): presumably mirrors a guest-visible memory layout - confirm
+	u8 index;
+	u8 dirtyFlag;
+	u16 pad0;  // Padding
+	std::array<FramebufferInfo, 2> framebufferInfo;
+	u32 pad1;  // Padding
+};
+
 // More circular dependencies
 class Kernel;

@ -42,6 +60,7 @@ class GPUService {
 	void flushDataCache(u32 messagePointer);
 	void registerInterruptRelayQueue(u32 messagePointer);
 	void setAxiConfigQoSMode(u32 messagePointer);
+	void setBufferSwap(u32 messagePointer);
 	void setInternalPriorities(u32 messagePointer);
 	void setLCDForceBlack(u32 messagePointer);
 	void storeDataCache(u32 messagePointer);
@ -57,6 +76,8 @@ class GPUService {
 	void triggerTextureCopy(u32* cmd);
 	void flushCacheRegions(u32* cmd);

+	void setBufferSwapImpl(u32 screen_id, const FramebufferInfo& info);
+
 public:
 	GPUService(Memory& mem, GPU& gpu, Kernel& kernel, u32& currentPID) : mem(mem), gpu(gpu),
 		kernel(kernel), currentPID(currentPID) {}
@ -69,4 +90,4 @@ public:
 			std::memset(ptr, 0, 0x1000);
 		}
 	}
-};
+};
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -8,6 +8,12 @@
 #include "PICA/float_types.hpp"
 #include "PICA/regs.hpp"

+// Screen dimensions as stored in the framebuffer size registers by GPU::reset.
+// The "height" here is the long side of each screen: it matches the 400 and 320 pixel-wide
+// viewports that Renderer::display uses for the top and bottom screen respectively.
+constexpr u32 top_screen_width = 240;
+constexpr u32 top_screen_height = 400;
+
+constexpr u32 bottom_screen_width = 240;
+constexpr u32 bottom_screen_height = 320;
+
 using namespace Floats;

 // Note: For when we have multiple backends, the GL state manager can stay here and have the constructor for the Vulkan-or-whatever renderer ignore it
@ -41,6 +47,27 @@ void GPU::reset() {
 		e.config2 = 0;
 	}

+	// Initialize the framebuffer registers. Values taken from Citra.
+
+	using namespace PICA::ExternalRegs;
+	// Top screen addresses and dimensions.
+	external_regs[Framebuffer0AFirstAddr] = 0x181E6000;
+	external_regs[Framebuffer0ASecondAddr] = 0x1822C800;
+	external_regs[Framebuffer0BFirstAddr] = 0x18273000;
+	external_regs[Framebuffer0BSecondAddr] = 0x182B9800;
+	external_regs[Framebuffer0Size] = (top_screen_height << 16) | top_screen_width;
+	external_regs[Framebuffer0Stride] = 720;
+	external_regs[Framebuffer0Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
+	external_regs[Framebuffer0Select] = 0;
+
+	// Bottom screen addresses and dimensions.
+	external_regs[Framebuffer1AFirstAddr] = 0x1848F000;
+	external_regs[Framebuffer1ASecondAddr] = 0x184C7800;
+	external_regs[Framebuffer1Size] = (bottom_screen_height << 16) | bottom_screen_width;
+	external_regs[Framebuffer1Stride] = 720;
+	external_regs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
+	external_regs[Framebuffer1Select] = 0;
+
 	renderer.reset();
 }

@ -123,12 +150,12 @@ void GPU::drawArrays() {
 			vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg];
 		} else {
 			if (shortIndex) {
-				auto ptr = getPointerPhys<u16>(indexBufferPointer);
-				vertexIndex = *ptr; // TODO: This is very unsafe
+				auto ptr = getSpanPhys<u16>(indexBufferPointer, sizeof(u16));
+				vertexIndex = ptr[0]; // TODO: This is very unsafe
 				indexBufferPointer += 2;
 			} else {
-				auto ptr = getPointerPhys<u8>(indexBufferPointer);
-				vertexIndex = *ptr; // TODO: This is also very unsafe
+				auto ptr = getSpanPhys<u8>(indexBufferPointer, sizeof(u8));
+				vertexIndex = ptr[0]; // TODO: This is also very unsafe
 				indexBufferPointer += 1;
 			}
 		}
@ -188,42 +215,46 @@ void GPU::drawArrays() {

 					switch (attribType) {
 						case 0: { // Signed byte
-							s8* ptr = getPointerPhys<s8>(attrAddress);
+							const u32 attr_bytes = size * sizeof(s8);
+							const auto ptr = getSpanPhys<s8>(attrAddress, attr_bytes);
 							for (component = 0; component < size; component++) {
-								float val = static_cast<float>(*ptr++);
+								float val = static_cast<float>(ptr[component]);
 								attribute[component] = f24::fromFloat32(val);
 							}
-							attrAddress += size * sizeof(s8);
+							attrAddress += attr_bytes;
 							break;
 						}

 						case 1: { // Unsigned byte
-							u8* ptr = getPointerPhys<u8>(attrAddress);
+							const u32 attr_bytes = size * sizeof(u8);
+							const auto ptr = getSpanPhys<u8>(attrAddress, attr_bytes);
 							for (component = 0; component < size; component++) {
-								float val = static_cast<float>(*ptr++);
+								float val = static_cast<float>(ptr[component]);
 								attribute[component] = f24::fromFloat32(val);
 							}
-							attrAddress += size * sizeof(u8);
+							attrAddress += attr_bytes;
 							break;
 						}

 						case 2: { // Short
-							s16* ptr = getPointerPhys<s16>(attrAddress);
+							const u32 attr_bytes = size * sizeof(s16);
+							const auto ptr = getSpanPhys<s16>(attrAddress, attr_bytes);
 							for (component = 0; component < size; component++) {
-								float val = static_cast<float>(*ptr++);
+								float val = static_cast<float>(ptr[component]);
 								attribute[component] = f24::fromFloat32(val);
 							}
-							attrAddress += size * sizeof(s16);
+							attrAddress += attr_bytes;
 							break;
 						}

 						case 3: { // Float
-							float* ptr = getPointerPhys<float>(attrAddress);
+							const u32 attr_bytes = size * sizeof(float);
+							const auto ptr = getSpanPhys<float>(attrAddress, attr_bytes);
 							for (component = 0; component < size; component++) {
-								float val = *ptr++;
+								float val = ptr[component];
 								attribute[component] = f24::fromFloat32(val);
 							}
-							attrAddress += size * sizeof(float);
+							attrAddress += attr_bytes;
 							break;
 						}

--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@ -18,11 +18,36 @@ void GPU::writeReg(u32 address, u32 value) {
 	if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers
 		const u32 index = (address - 0x1EF01000) / sizeof(u32);
 		writeInternalReg(index, value, 0xffffffff);
+	} else if (address >= 0x1EF00004 && address < 0x1EF01000) {
+		const u32 index = (address - 0x1EF00004) / sizeof(u32);
+		writeExternalReg(index, value);
 	} else {
-		log("Ignoring write to external GPU register %08X. Value: %08X\n", address, value);
+		log("Ignoring write to unknown GPU register %08X. Value: %08X\n", address, value);
 	}
 }

+// Read the external GPU register stored at the given index of external_regs.
+// An out-of-range index is reported through Helpers::panic; -1 is returned should that call return.
+u32 GPU::readExternalReg(u32 index) {
+	// Valid indices are 0 .. size - 1, so an index equal to the array size is already out of bounds
+	if (index >= external_regs.size()) [[unlikely]] {
+		Helpers::panic("Tried to read invalid external GPU register. Index: %X\n", index);
+		return -1;
+	}
+
+	return external_regs[index];
+}
+
+// Store value in the external GPU register at the given index of external_regs.
+// An out-of-range index is reported through Helpers::panic and the write is dropped.
+void GPU::writeExternalReg(u32 index, u32 value) {
+	// Valid indices are 0 .. size - 1, so an index equal to the array size is already out of bounds
+	if (index >= external_regs.size()) [[unlikely]] {
+		Helpers::panic("Tried to write to invalid external GPU register. Index: %X, value: %08X\n", index, value);
+		return;
+	}
+
+	external_regs[index] = value;
+}
+
 u32 GPU::readInternalReg(u32 index) {
 	using namespace PICA::InternalRegs;

@ -54,7 +79,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 	using namespace PICA::InternalRegs;

 	if (index > regNum) [[unlikely]] {
-		Helpers::panic("Tried to write to invalid GPU register. Index: %X, value: %08X\n", index, value);
+		Helpers::panic("Tried to write to invalid internal GPU register. Index: %X, value: %08X\n", index, value);
 		return;
 	}

@ -275,9 +300,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 				u32 size = (regs[CmdBufSize0 + bufferIndex] & 0xfffff) << 3;

 				// Set command buffer state to execute the new buffer
-				cmdBuffStart = getPointerPhys<u32>(addr);
-				cmdBuffCurr = cmdBuffStart;
-				cmdBuffEnd = cmdBuffStart + (size / sizeof(u32));
+				cmdBuffStart = getSpanPhys<u32>(addr, size);
+				cmdBuffCurr = 0;
+				cmdBuffEnd = (size / sizeof(u32));
 			}
 			break;
 		}
@ -308,12 +333,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 }

 void GPU::startCommandList(u32 addr, u32 size) {
-	cmdBuffStart = static_cast<u32*>(mem.getReadPointer(addr));
-	if (!cmdBuffStart) Helpers::panic("Couldn't get buffer for command list");
+	cmdBuffStart = getSpanPhys<u32>(addr, size);
+	if (cmdBuffStart.empty()) {
+		Helpers::panic("Couldn't get buffer for command list");
+		return;
+	}
 	// TODO: This is very memory unsafe. We get a pointer to FCRAM and just keep writing without checking if we're gonna go OoB

-	cmdBuffCurr = cmdBuffStart;
-	cmdBuffEnd = cmdBuffStart + (size / sizeof(u32));
+	cmdBuffCurr = 0;
+	cmdBuffEnd = (size / sizeof(u32));

 	// LUT for converting the parameter mask to an actual 32-bit mask
 	// The parameter mask is 4 bits long, each bit corresponding to one byte of the mask
@ -329,13 +357,13 @@ void GPU::startCommandList(u32 addr, u32 size) {
 		// The curr pointer starts out doubleword-aligned and is increased by 4 bytes each time
 		// So to check if it is aligned, we get the number of words it's been incremented by
 		// If that number is an odd value then the buffer is not aligned, otherwise it is
-		if ((cmdBuffCurr - cmdBuffStart) % 2 != 0) {
+		if (cmdBuffCurr % 2 != 0) {
 			cmdBuffCurr++;
 		}

 		// The first word of a command is the command parameter and the second one is the header
-		u32 param1 = *cmdBuffCurr++;
-		u32 header = *cmdBuffCurr++;
+		u32 param1 = cmdBuffStart[cmdBuffCurr++];
+		u32 header = cmdBuffStart[cmdBuffCurr++];

 		u32 id = header & 0xffff;
 		u32 paramMaskIndex = getBits<16, 4>(header);
@ -352,8 +380,8 @@ void GPU::startCommandList(u32 addr, u32 size) {
 		writeInternalReg(id, param1, mask);
 		for (u32 i = 0; i < paramCount; i++) {
 			id += idIncrement;
-			u32 param = *cmdBuffCurr++;
+			u32 param = cmdBuffStart[cmdBuffCurr++];
 			writeInternalReg(id, param, mask);
 		}
 	}
-}
+}
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -453,4 +453,4 @@ u64 Memory::timeSince3DSEpoch() {
 	constexpr u64 offset = 2208988800ull;
 	milliseconds ms = duration_cast<milliseconds>(seconds(rawTime + timezoneDifference + offset));
 	return ms.count();
-}
+}
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -576,6 +576,12 @@ const char* displayFragmentShader = R"(
    }
 )";

+static void APIENTRY debugHandler(GLenum source, GLenum type, GLuint id, GLenum severity,
+								  GLsizei length, const GLchar* message, const void* userParam) {  // Registered via glDebugMessageCallback in initGraphicsContext when OPENGL_DEBUG_INFO is defined
+	Helpers::warn("%d: %s\n", id, message);  // Only the message id and text are reported; the other callback arguments are ignored
+}
+
+
 void Renderer::reset() {
 	depthBufferCache.reset();
 	colourBufferCache.reset();
@ -695,6 +701,11 @@ void Renderer::initGraphicsContext() {
 	OpenGL::clearColor();
 	OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);

+#if defined(OPENGL_DEBUG_INFO)
+	glEnable(GL_DEBUG_OUTPUT);
+	glDebugMessageCallback(debugHandler, nullptr);
+#endif
+
 	reset();
 }

@ -919,6 +930,39 @@ constexpr u32 bottomScreenBuffer = 0x1f05dc00;

 void Renderer::display() {
 	gl.disableScissor();
+	gl.disableBlend();
+	gl.disableDepth();
+	gl.disableScissor();
+	gl.setColourMask(true, true, true, true);
+	gl.useProgram(displayProgram);
+	gl.bindVAO(dummyVAO);
+
+	OpenGL::disableClipPlane(0);
+	OpenGL::disableClipPlane(1);
+
+	using namespace PICA::ExternalRegs;
+	const u32 topScreenAddr = gpu.readExternalReg(Framebuffer0AFirstAddr);
+	const u32 bottomScreenAddr = gpu.readExternalReg(Framebuffer1AFirstAddr);
+
+	auto topScreen = colourBufferCache.findFromAddress(topScreenAddr);
+	auto bottomScreen = colourBufferCache.findFromAddress(bottomScreenAddr);
+	Helpers::warn("Top screen addr %08X\n", topScreenAddr);
+
+	screenFramebuffer.bind(OpenGL::DrawFramebuffer);
+
+	// Draw each screen for which a cached colour buffer exists at its first framebuffer address (read from the external registers)
+	// The top screen goes to the upper part of the output texture and the bottom screen to the lower part
+	if (topScreen) {
+		topScreen->get().texture.bind();
+		OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport
+		OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen
+	}
+
+	if (bottomScreen) {
+		bottomScreen->get().texture.bind();
+		OpenGL::setViewport(40, 0, 320, 240);
+		OpenGL::draw(OpenGL::TriangleStrip, 4);
+	}

 	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
 	screenFramebuffer.bind(OpenGL::ReadFramebuffer);
@ -986,7 +1030,7 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) {
 	if (buffer.has_value()) {
 		return buffer.value().get().texture;
 	} else {
-		const void* textureData = gpu.getPointerPhys<void*>(tex.location); // Get pointer to the texture data in 3DS memory
+		const std::span textureData = gpu.getSpanPhys<u8>(tex.location, tex.sizeInBytes()); // Get pointer to the texture data in 3DS memory
 		Texture& newTex = textureCache.add(tex);
 		newTex.decodeTexture(textureData);

@ -994,40 +1038,86 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) {
 	}
 }

+// NOTE: The GPU format has RGBA5551 and RGB565 swapped compared to the internal regs format
+PICA::ColorFmt ToColorFmt(u32 format) {
+	switch (format) {
+		case 2: return PICA::ColorFmt::RGB565;
+		case 3: return PICA::ColorFmt::RGBA5551;
+		default: return static_cast<PICA::ColorFmt>(format);  // Every other value is cast to PICA::ColorFmt unchanged
+	}
+}
+
 void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
 	const u32 inputWidth = inputSize & 0xffff;
-	const u32 inputGap = inputSize >> 16;
+	const u32 inputHeight = inputSize >> 16;
+	const auto inputFormat = ToColorFmt(Helpers::getBits<8, 3>(flags));
+	const auto outputFormat = ToColorFmt(Helpers::getBits<12, 3>(flags));
+	const PICA::Scaling scaling = static_cast<PICA::Scaling>(Helpers::getBits<24, 2>(flags));

-	const u32 outputWidth = outputSize & 0xffff;
-	const u32 outputGap = outputSize >> 16;
-
-	auto framebuffer = colourBufferCache.findFromAddress(inputAddr);
-	// If there's a framebuffer at this address, use it. Otherwise go back to our old hack and display framebuffer 0
-	// Displays are hard I really don't want to try implementing them because getting a fast solution is terrible
-	OpenGL::Texture& tex = framebuffer.has_value() ? framebuffer.value().get().texture : colourBufferCache[0].texture;
-
-	tex.bind();
-	screenFramebuffer.bind(OpenGL::DrawFramebuffer);
-
-	gl.disableBlend();
-	gl.disableDepth();
-	gl.disableScissor();
-	gl.setColourMask(true, true, true, true);
-	gl.useProgram(displayProgram);
-	gl.bindVAO(dummyVAO);
-
-	OpenGL::disableClipPlane(0);
-	OpenGL::disableClipPlane(1);
-
-	// Hack: Detect whether we are writing to the top or bottom screen by checking output gap and drawing to the proper part of the output texture
-	// We consider output gap == 320 to mean bottom, and anything else to mean top
-	if (outputGap == 320) {
-		OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport
-	} else {
-		OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport
+	u32 outputWidth = outputSize & 0xffff;
+	if (scaling == PICA::Scaling::X || scaling == PICA::Scaling::XY) {
+		outputWidth >>= 1;
+	}
+	u32 outputHeight = outputSize >> 16;
+	if (scaling == PICA::Scaling::XY) {
+		outputHeight >>= 1;
 	}

-	OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen
+	// If there's a framebuffer at an address, use it. Otherwise getColourBuffer creates and caches a new one with the requested format and size
+	// Displays are hard I really don't want to try implementing them because getting a fast solution is terrible
+	auto srcFramebuffer = getColourBuffer(inputAddr, inputFormat, inputWidth, inputHeight);
+	auto dstFramebuffer = getColourBuffer(outputAddr, outputFormat, outputWidth, outputHeight);
+
+	Helpers::warn("Display transfer with outputAddr %08X\n", outputAddr);
+
+	// Blit the framebuffers
+	srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer);
+	dstFramebuffer.fbo.bind(OpenGL::DrawFramebuffer);
+	glBlitFramebuffer(0, 0, inputWidth, inputHeight, 0, 0, outputWidth, outputHeight, GL_COLOR_BUFFER_BIT, GL_LINEAR);
+}
+
+// Perform a texture copy from inputAddr to outputAddr by blitting between the colour buffers found (or created) at those addresses.
+// copyBytes is the total number of bytes to copy; it is rounded down to a multiple of 16 and a zero-byte copy is a no-op.
+// inputSize/outputSize pack a line width (low 16 bits) and a line gap (high 16 bits), each scaled by 16 below.
+// flags is currently unused.
+void Renderer::textureCopy(u32 inputAddr, u32 outputAddr, u32 copyBytes, u32 inputSize, u32 outputSize, u32 flags) {
+	copyBytes = Helpers::alignDown(copyBytes, 16);
+	if (copyBytes == 0) [[unlikely]] {
+		return;
+	}
+
+	const u32 inputWidth = (inputSize & 0xffff) * 16;
+	const u32 inputGap = (inputSize >> 16) * 16;
+	const u32 outputWidth = (outputSize & 0xffff) * 16;
+	// NOTE: The output gap (upper 16 bits of outputSize) is not taken into account yet
+
+	if (inputGap != 0 || inputWidth != outputWidth) {
+		Helpers::warn("Texture copy with non zero input gap or mismatching widths, cannot be accelerated");
+		return;
+	}
+
+	// The width is used as a divisor below, so a zero-width copy has to bail out here instead of dividing by zero.
+	// inputWidth == outputWidth at this point, so checking one of them covers both.
+	if (inputWidth == 0) [[unlikely]] {
+		Helpers::warn("Texture copy with zero width, ignoring\n");
+		return;
+	}
+
+	// If the texture is tiled, apps set inputWidth to the scanline size which is width * 8.
+	// HACK: We don't know if the src texture is tiled or not yet, assume it is for now, because it's the most common case.
+	// Citra handles this by letting the width/stride be set as bytes and interpreting it differently
+	// depending on the candidate surface.
+	auto srcFramebuffer = getColourBuffer(inputAddr, PICA::ColorFmt::RGBA8, inputWidth / 8, copyBytes / inputWidth);  // HACK: Assume RGBA8 format
+	auto dstFramebuffer = getColourBuffer(outputAddr, srcFramebuffer.format, outputWidth / 8, copyBytes / outputWidth);
+
+	// Blit the framebuffers
+	srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer);
+	dstFramebuffer.fbo.bind(OpenGL::DrawFramebuffer);
+	glBlitFramebuffer(0, 0, srcFramebuffer.size.x(), srcFramebuffer.size.y(),
+					  0, 0, dstFramebuffer.size.x(), dstFramebuffer.size.y(), GL_COLOR_BUFFER_BIT, GL_LINEAR);
+}
+
+ColourBuffer Renderer::getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height) {  // Returns, by value, the cached buffer containing addr, or a newly cached one
+	// Try to find an already existing buffer that contains the provided address
+	// This is a more relaxed check compared to getColourFBO as display transfer/texcopy may refer to
+	// subrect of a surface and in case of texcopy we don't know the format of the surface.
+	auto buffer = colourBufferCache.findFromAddress(addr);
+	if (buffer.has_value()) {
+		return buffer.value().get();  // format, width and height are ignored when a cached buffer is found
+	}
+
+	// Otherwise create and cache a new buffer.
+	ColourBuffer sampleBuffer(addr, format, width, height);
+	return colourBufferCache.add(sampleBuffer);
 }

 void Renderer::screenshot(const std::string& name) {
@ -1053,4 +1143,4 @@ void Renderer::screenshot(const std::string& name) {
 	}

 	stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
-}
+}
--- a/src/core/renderer_gl/textures.cpp
+++ b/src/core/renderer_gl/textures.cpp
@ -258,18 +258,18 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
    }
 }

-void Texture::decodeTexture(const void* data) {
+void Texture::decodeTexture(std::span<const u8> data) {
    std::vector<u32> decoded;
    decoded.reserve(u64(size.u()) * u64(size.v()));

    // Decode texels line by line
    for (u32 v = 0; v < size.v(); v++) {
        for (u32 u = 0; u < size.u(); u++) {
-            u32 colour = decodeTexel(u, v, format, data);
+            u32 colour = decodeTexel(u, v, format, data.data());
            decoded.push_back(colour);
        }
    }

    texture.bind();
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, size.u(), size.v(), GL_RGBA, GL_UNSIGNED_BYTE, decoded.data());
-}
+}
--- a/src/core/services/gsp_gpu.cpp
+++ b/src/core/services/gsp_gpu.cpp
@ -1,4 +1,5 @@
 #include "services/gsp_gpu.hpp"
+#include "PICA/regs.hpp"
 #include "ipc.hpp"
 #include "kernel.hpp"

@ -10,6 +11,7 @@ namespace ServiceCommands {
 		RegisterInterruptRelayQueue = 0x00130042,
 		WriteHwRegs = 0x00010082,
 		WriteHwRegsWithMask = 0x00020084,
+		SetBufferSwap = 0x00050200,
 		FlushDataCache = 0x00080082,
 		SetLCDForceBlack = 0x000B0040,
 		TriggerCmdReqQueue = 0x000C0000,
@ -19,15 +21,35 @@ namespace ServiceCommands {
 }

 // Commands written to shared memory and processed by TriggerCmdReqQueue
-namespace GXCommands {
-	enum : u32 {
-		TriggerDMARequest = 0,
-		ProcessCommandList = 1,
-		MemoryFill = 2,
-		TriggerDisplayTransfer = 3,
-		TriggerTextureCopy = 4,
-		FlushCacheRegions = 5
-	};
+enum class GXCommands : u32 {
+	TriggerDMARequest = 0,
+	ProcessCommandList = 1,
+	MemoryFill = 2,
+	TriggerDisplayTransfer = 3,
+	TriggerTextureCopy = 4,
+	FlushCacheRegions = 5
+};
+
+static u32 VaddrToPaddr(u32 addr) {
+	if (addr >= VirtualAddrs::VramStart && addr < (VirtualAddrs::VramStart + VirtualAddrs::VramSize)) [[likely]] {
+		return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM;
+	}
+
+	else if (addr >= VirtualAddrs::LinearHeapStartOld && addr < VirtualAddrs::LinearHeapEndOld) {
+		return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM;
+	}
+
+	else if (addr >= VirtualAddrs::LinearHeapStartNew && addr < VirtualAddrs::LinearHeapEndNew) {
+		return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM;
+	}
+
+	else if (addr == 0) {
+		return 0;
+	}
+
+	Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr);
+	// Obviously garbage address
+	return 0xF3310932;
 }

 void GPUService::reset() {
@ -43,13 +65,14 @@ void GPUService::handleSyncRequest(u32 messagePointer) {
 		case ServiceCommands::FlushDataCache: flushDataCache(messagePointer); break;
 		case ServiceCommands::RegisterInterruptRelayQueue: registerInterruptRelayQueue(messagePointer); break;
 		case ServiceCommands::SetAxiConfigQoSMode: setAxiConfigQoSMode(messagePointer); break;
+		case ServiceCommands::SetBufferSwap: setBufferSwap(messagePointer); break;
 		case ServiceCommands::SetInternalPriorities: setInternalPriorities(messagePointer); break;
 		case ServiceCommands::SetLCDForceBlack: setLCDForceBlack(messagePointer); break;
 		case ServiceCommands::StoreDataCache: storeDataCache(messagePointer); break;
 		case ServiceCommands::TriggerCmdReqQueue: [[likely]] triggerCmdReqQueue(messagePointer); break;
 		case ServiceCommands::WriteHwRegs: writeHwRegs(messagePointer); break;
 		case ServiceCommands::WriteHwRegsWithMask: writeHwRegsWithMask(messagePointer); break;
-;		default: Helpers::panic("GPU service requested. Command: %08X\n", command);
+		default: Helpers::panic("GPU service requested. Command: %08X\n", command);
 	}
 }

@ -122,15 +145,12 @@ void GPUService::requestInterrupt(GPUInterrupt type) {
 	// Not emulating this causes Yoshi's Wooly World, Captain Toad, Metroid 2 et al to hang
 	if (type == GPUInterrupt::VBlank0 || type == GPUInterrupt::VBlank1) {
 		int screen = static_cast<u32>(type) - static_cast<u32>(GPUInterrupt::VBlank0); // 0 for top screen, 1 for bottom
-
-		constexpr u32 FBInfoSize = 0x40;
 		// TODO: Offset depends on GSP thread being triggered
-		u8* info = &sharedMem[0x200 + screen * FBInfoSize];
-		u8& dirtyFlag = info[1];
+		FrameBufferUpdate* update = reinterpret_cast<FrameBufferUpdate*>(&sharedMem[0x200 + screen * sizeof(FrameBufferUpdate)]);

-		if (dirtyFlag & 1) {
-			// TODO: Submit buffer info here
-			dirtyFlag &= ~1;
+		if (update->dirtyFlag & 1) {
+			setBufferSwapImpl(screen, update->framebufferInfo[update->index]);
+			update->dirtyFlag &= ~1;
 		}
 	}

@ -259,6 +279,18 @@ void GPUService::setAxiConfigQoSMode(u32 messagePointer) {
 	mem.write32(messagePointer + 4, Result::Success);
 }

+void GPUService::setBufferSwap(u32 messagePointer) {
+	FramebufferInfo info{};
+	const u32 screenId = mem.read32(messagePointer + 4); // Selects either PDC0 or PDC1
+	info.activeFb = mem.read32(messagePointer + 8);
+	info.leftFramebufferVaddr = mem.read32(messagePointer + 12);
+	info.rightFramebufferVaddr = mem.read32(messagePointer + 16);
+	info.stride = mem.read32(messagePointer + 20);
+	info.format = mem.read32(messagePointer + 24);
+	info.displayFb = mem.read32(messagePointer + 28); // Selects either framebuffer A or B
+	setBufferSwapImpl(screenId, info);
+}
+
 // Seems to also be completely undocumented
 void GPUService::setInternalPriorities(u32 messagePointer) {
 	log("GSP::GPU::SetInternalPriorities\n");
@ -281,7 +313,7 @@ void GPUService::processCommandBuffer() {
 		log("Processing %d GPU commands\n", commandsLeft);

 		while (commandsLeft != 0) {
-			u32 cmdID = cmd[0] & 0xff;
+			const GXCommands cmdID = static_cast<GXCommands>(cmd[0] & 0xff);
 			switch (cmdID) {
 				case GXCommands::ProcessCommandList: processCommandList(cmd); break;
 				case GXCommands::MemoryFill: memoryFill(cmd); break;
@ -324,28 +356,6 @@ void GPUService::memoryFill(u32* cmd) {
 	}
 }

-static u32 VaddrToPaddr(u32 addr) {
-	if (addr >= VirtualAddrs::VramStart && addr < (VirtualAddrs::VramStart + VirtualAddrs::VramSize)) [[likely]] {
-		return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM;
-	}
-	
-	else if (addr >= VirtualAddrs::LinearHeapStartOld && addr < VirtualAddrs::LinearHeapEndOld) {
-		return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM;
-	}
-
-	else if (addr >= VirtualAddrs::LinearHeapStartNew && addr < VirtualAddrs::LinearHeapEndNew) {
-		return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM;
-	}
-
-	else if (addr == 0) {
-		return 0;
-	}
-
-	Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr);
-	// Obviously garbage address
-	return 0xF3310932;
-}
-
 void GPUService::triggerDisplayTransfer(u32* cmd) {
 	const u32 inputAddr = VaddrToPaddr(cmd[1]);
 	const u32 outputAddr = VaddrToPaddr(cmd[2]);
@ -373,23 +383,74 @@ void GPUService::flushCacheRegions(u32* cmd) {
 	log("GSP::GPU::FlushCacheRegions (Stubbed)\n");
 }

+void GPUService::setBufferSwapImpl(u32 screenId, const FramebufferInfo& info) {
+	using namespace PICA::ExternalRegs;
+
+	constexpr static std::array<u32, 8> fb_addresses = {
+		Framebuffer0AFirstAddr,
+		Framebuffer0ASecondAddr,
+		Framebuffer0BFirstAddr,
+		Framebuffer0BSecondAddr,
+		Framebuffer1AFirstAddr,
+		Framebuffer1ASecondAddr,
+		Framebuffer1BFirstAddr,
+		Framebuffer1BSecondAddr,
+	};
+
+	const u32 fb_index = screenId * 4 + info.activeFb * 2;
+	gpu.writeExternalReg(fb_addresses[fb_index], VaddrToPaddr(info.leftFramebufferVaddr));
+	gpu.writeExternalReg(fb_addresses[fb_index + 1], VaddrToPaddr(info.rightFramebufferVaddr));
+
+	constexpr static std::array<u32, 6> config_addresses = {
+		Framebuffer0Config,
+		Framebuffer0Select,
+		Framebuffer0Stride,
+		Framebuffer1Config,
+		Framebuffer1Select,
+		Framebuffer1Stride,
+	};
+
+	const u32 config_index = screenId * 3;
+	gpu.writeExternalReg(config_addresses[config_index], info.format);
+	gpu.writeExternalReg(config_addresses[config_index + 1], info.displayFb);
+	gpu.writeExternalReg(config_addresses[config_index + 2], info.stride);
+}
+
 // Actually send command list (aka display list) to GPU
 void GPUService::processCommandList(u32* cmd) {
 	const u32 address = cmd[1] & ~7; // Buffer address
 	const u32 size = cmd[2] & ~3; // Buffer size in bytes
-	const bool updateGas = cmd[3] == 1; // Update gas additive blend results (0 = don't update, 1 = update)
-	const bool flushBuffer = cmd[7] == 1; // Flush buffer (0 = don't flush, 1 = flush)
+	[[maybe_unused]] const bool updateGas = cmd[3] == 1; // Update gas additive blend results (0 = don't update, 1 = update)
+	[[maybe_unused]] const bool flushBuffer = cmd[7] == 1; // Flush buffer (0 = don't flush, 1 = flush)

 	log("GPU::GSP::processCommandList. Address: %08X, size in bytes: %08X\n", address, size);
-	gpu.startCommandList(address, size);
+	gpu.startCommandList(VaddrToPaddr(address), size);
 	requestInterrupt(GPUInterrupt::P3D); // Send an IRQ when command list processing is over
 }

 // TODO: Emulate the transfer engine & its registers
 // Then this can be emulated by just writing the appropriate values there
 void GPUService::triggerTextureCopy(u32* cmd) {
-	Helpers::warn("GSP::GPU::TriggerTextureCopy (unimplemented)\n");
+	const u32 inputBufferAddr = VaddrToPaddr(cmd[1]);
+	const u32 outputBufferAddr = VaddrToPaddr(cmd[2]);
+	const u32 totalCopyBytes = cmd[3];
+	const u32 inputWidthGap = cmd[4];
+	const u32 outputWidthGap = cmd[5];
+	const u32 flags = cmd[6];
+
+	// Write the trigger register
+	using namespace PICA::ExternalRegs;
+	gpu.writeExternalReg(TransferTrigger, gpu.readExternalReg(TransferTrigger) | 1);
+
+	// Perform the texture copy
+	gpu.textureCopy(inputBufferAddr, outputBufferAddr, totalCopyBytes, inputWidthGap, outputWidthGap, flags);
+
 	// This uses the transfer engine and thus needs to fire a PPF interrupt.
 	// NSMB2 relies on this
 	requestInterrupt(GPUInterrupt::PPF);
-}
+
+	// Writing to trigger will perform the texture copy.
+	// Reset the bit here to signal completion.
+	gpu.writeExternalReg(TransferTrigger, gpu.readExternalReg(TransferTrigger) & ~1);
+}
+