code: Better screen support

This commit is contained in:
GPUCode 2023-07-10 23:59:44 +03:00
parent c8f4d41b47
commit 603074dc90
15 changed files with 465 additions and 140 deletions

View file

@ -28,6 +28,7 @@ class GPU {
static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes
static constexpr u32 vramSize = 6_MB;
Registers regs; // GPU internal registers
std::array<u32, 0x1000> external_regs; // GPU external registers
std::array<vec4f, 16> currentAttributes; // Vertex attributes before being passed to the shader
std::array<vec4f, 16> immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission
@ -66,9 +67,9 @@ class GPU {
std::array<u32, 3> fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted
// Command processor pointers for GPU command lists
u32* cmdBuffStart = nullptr;
u32* cmdBuffEnd = nullptr;
u32* cmdBuffCurr = nullptr;
std::span<u32> cmdBuffStart{};
u32 cmdBuffEnd = 0;
u32 cmdBuffCurr = 0;
Renderer renderer;
PICA::Vertex getImmediateModeVertex();
@ -100,6 +101,9 @@ class GPU {
u32 readReg(u32 address);
void writeReg(u32 address, u32 value);
u32 readExternalReg(u32 index);
void writeExternalReg(u32 index, u32 value);
// Used when processing GPU command lists
u32 readInternalReg(u32 index);
void writeInternalReg(u32 index, u32 value, u32 mask);
@ -116,6 +120,10 @@ class GPU {
renderer.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
}
// Forward a texture copy request to the renderer. All arguments are passed through unchanged.
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalCopyBytes, u32 inputSize, u32 outputSize, u32 flags) {
renderer.textureCopy(inputAddr, outputAddr, totalCopyBytes, inputSize, outputSize, flags);
}
// Read a value of type T from physical address paddr
// This is necessary because vertex attribute fetching uses physical addresses
template <typename T>
@ -130,19 +138,18 @@ class GPU {
}
}
// Get a pointer of type T* to the data starting from physical address paddr
// Get a span with the specified size of type T to the data starting from physical address paddr
template <typename T>
T* getPointerPhys(u32 paddr) {
if (paddr >= PhysicalAddrs::FCRAM && paddr <= PhysicalAddrs::FCRAMEnd) {
std::span<T> getSpanPhys(u32 paddr, u32 size) {
if (paddr >= PhysicalAddrs::FCRAM && paddr + size <= PhysicalAddrs::FCRAMEnd) {
u8* fcram = mem.getFCRAM();
u32 index = paddr - PhysicalAddrs::FCRAM;
return (T*)&fcram[index];
} else if (paddr >= PhysicalAddrs::VRAM && paddr <= PhysicalAddrs::VRAMEnd) {
return std::span{(T*)&fcram[index], size / sizeof(T)};
} else if (paddr >= PhysicalAddrs::VRAM && paddr + size <= PhysicalAddrs::VRAMEnd) {
u32 index = paddr - PhysicalAddrs::VRAM;
return (T*)&vram[index];
return std::span{(T*)&vram[index], size / sizeof(T)};
} else [[unlikely]] {
Helpers::panic("[GPU] Tried to access unknown physical address: %08X", paddr);
}
}
};
};

View file

@ -1,4 +1,5 @@
#pragma once
#include <string_view>
#include "helpers.hpp"
namespace PICA {
@ -174,6 +175,54 @@ namespace PICA {
};
}
// Indices of the GPU's external (memory-mapped) registers.
// An index is a word offset relative to 0x1EF00004: GPU::writeReg derives it as (address - 0x1EF00004) / sizeof(u32).
// Every Framebuffer1* register sits exactly 0x40 words after its Framebuffer0* counterpart.
namespace ExternalRegs {
	enum : u32 {
		MemFill1BufferStartPaddr = 0x3,
		MemFill1BufferEndPAddr = 0x4,
		MemFill1Value = 0x5,
		MemFill1Control = 0x6,
		MemFill2BufferStartPaddr = 0x7,
		MemFill2BufferEndPAddr = 0x8,
		MemFill2Value = 0x9,
		MemFill2Control = 0xA,
		VramBankControl = 0xB,
		GPUBusy = 0xC,
		// Register at 0x1EF000C0. 0xBC was its byte offset from 0x1EF00004, not its word index
		BacklightControl = 0x2F,

		// TODO: Framebuffer regs
		// Top screen size register. Must equal Framebuffer1Size - 0x40 like the rest of the top screen registers
		Framebuffer0Size = 0x116,
		Framebuffer0AFirstAddr = 0x119,
		Framebuffer0ASecondAddr = 0x11A,
		Framebuffer0Config = 0x11B,
		Framebuffer0Select = 0x11D,
		Framebuffer0Stride = 0x123,
		Framebuffer0BFirstAddr = 0x124,
		Framebuffer0BSecondAddr = 0x125,
		Framebuffer1Size = 0x156,
		Framebuffer1AFirstAddr = 0x159,
		Framebuffer1ASecondAddr = 0x15A,
		Framebuffer1Config = 0x15B,
		Framebuffer1Select = 0x15D,
		Framebuffer1Stride = 0x163,
		Framebuffer1BFirstAddr = 0x164,
		Framebuffer1BSecondAddr = 0x165,

		TransferInputPAddr = 0x2FF,
		TransferOutputPAddr = 0x300,
		DisplayTransferOutputDim = 0x301,
		DisplayTransferInputDim = 0x302,
		TransferFlags = 0x303,
		TransferTrigger = 0x305,
		TextureCopyTotalBytes = 0x307,
		TextureCopyInputLineGap = 0x308,
		TextureCopyOutputLineGap = 0x309,
	};
}
// Downscaling mode of a display transfer. Renderer::displayTransfer decodes it from the transfer flags
// with Helpers::getBits<24, 2> and shrinks the output dimensions accordingly.
enum class Scaling : u32 {
None = 0, // Output dimensions are used as given
X = 1, // Output width is halved
XY = 2, // Output width and height are both halved
};
namespace Lights {
enum : u32 {
LUT_D0 = 0,
@ -235,7 +284,7 @@ namespace PICA {
};
// Returns the string representation of a texture format
inline constexpr const char* textureFormatToString(TextureFmt fmt) {
constexpr std::string_view textureFormatToString(TextureFmt fmt) {
switch (fmt) {
case TextureFmt::RGBA8: return "RGBA8";
case TextureFmt::RGB8: return "RGB8";
@ -255,16 +304,16 @@ namespace PICA {
}
}
inline constexpr const char* textureFormatToString(ColorFmt fmt) {
constexpr std::string_view textureFormatToString(ColorFmt fmt) {
return textureFormatToString(static_cast<TextureFmt>(fmt));
}
inline constexpr bool hasStencil(DepthFmt format) { return format == PICA::DepthFmt::Depth24Stencil8; }
constexpr bool hasStencil(DepthFmt format) { return format == PICA::DepthFmt::Depth24Stencil8; }
// Size occupied by each pixel in bytes
// All formats are 16BPP except for RGBA8 (32BPP) and BGR8 (24BPP)
inline constexpr usize sizePerPixel(TextureFmt format) {
constexpr usize sizePerPixel(TextureFmt format) {
switch (format) {
case TextureFmt::RGB8: return 3;
case TextureFmt::RGBA8: return 4;
@ -272,11 +321,11 @@ namespace PICA {
}
}
inline constexpr usize sizePerPixel(ColorFmt format) {
constexpr usize sizePerPixel(ColorFmt format) {
return sizePerPixel(static_cast<TextureFmt>(format));
}
inline constexpr usize sizePerPixel(DepthFmt format) {
constexpr usize sizePerPixel(DepthFmt format) {
switch (format) {
case DepthFmt::Depth16: return 2;
case DepthFmt::Depth24: return 3;
@ -292,4 +341,4 @@ namespace PICA {
GeometryPrimitive = 3,
};
} // namespace PICA
} // namespace PICA

View file

@ -137,4 +137,4 @@ struct GLStateManager {
};
static_assert(std::is_trivially_constructible<GLStateManager>(), "OpenGL State Manager class is not trivially constructible!");
static_assert(std::is_trivially_destructible<GLStateManager>(), "OpenGL State Manager class is not trivially destructible!");
static_assert(std::is_trivially_destructible<GLStateManager>(), "OpenGL State Manager class is not trivially destructible!");

View file

@ -80,6 +80,13 @@ namespace Helpers {
}
}
/// Round `value` down to the largest multiple of `size` that does not exceed it.
/// T must be an unsigned integer type, and `size` must be non-zero.
template <typename T>
static constexpr T alignDown(T value, std::size_t size) {
	static_assert(std::is_unsigned_v<T>, "T must be an unsigned value.");
	// Integer division drops the remainder, so multiplying back yields the aligned value
	const auto wholeBlocks = value / size;
	return static_cast<T>(wholeBlocks * size);
}
/// Sign extend an arbitrary-size value to 32 bits
static constexpr u32 inline signExtend32(u32 value, u32 startingSize) {
auto temp = (s32)value;

View file

@ -5,6 +5,7 @@
#include <fstream>
#include <optional>
#include <vector>
#include <span>
#include "crypto/aes_engine.hpp"
#include "helpers.hpp"
#include "handles.hpp"
@ -248,4 +249,4 @@ public:
void setVRAM(u8* pointer) { vram = pointer; }
bool allocateMainThreadStack(u32 size);
};
};

View file

@ -44,8 +44,8 @@ class Renderer {
float oldDepthOffset = 0.0;
bool oldDepthmapEnable = false;
SurfaceCache<DepthBuffer, 10, true> depthBufferCache;
SurfaceCache<ColourBuffer, 10, true> colourBufferCache;
SurfaceCache<DepthBuffer, 64, true> depthBufferCache;
SurfaceCache<ColourBuffer, 64, true> colourBufferCache;
SurfaceCache<Texture, 256, true> textureCache;
OpenGL::uvec2 fbSize; // The size of the framebuffer (ie both the colour and depth buffer)'
@ -87,6 +87,7 @@ class Renderer {
void getGraphicsContext(); // Set up graphics context for rendering
void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control); // Clear a GPU buffer in VRAM
void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags); // Perform display transfer
void textureCopy(u32 inputAddr, u32 outputAddr, u32 copyBytes, u32 inputSize, u32 outputSize, u32 flags); // Perform texture copy
void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices); // Draw the given vertices
// Take a screenshot of the screen and store it in a file
@ -97,6 +98,8 @@ class Renderer {
fbSize.y() = height;
}
ColourBuffer getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height);
void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; }
void setDepthFormat(PICA::DepthFmt format) {
if (format == PICA::DepthFmt::Unknown1) {
@ -109,4 +112,4 @@ class Renderer {
void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; }
static constexpr u32 vertexBufferSize = 0x10000;
};
};

View file

@ -19,6 +19,10 @@ struct ColourBuffer {
OpenGL::Texture texture;
OpenGL::Framebuffer fbo;
GLenum internalFormat;
GLenum fmt;
GLenum type;
ColourBuffer() : valid(false) {}
ColourBuffer(u32 loc, PICA::ColorFmt format, u32 x, u32 y, bool valid = true)
@ -29,17 +33,40 @@ struct ColourBuffer {
range = Interval<u32>(loc, (u32)endLoc);
}
void allocate() {
void allocate() {
// Internal formats for the texture based on format
static constexpr std::array<GLenum, 5> internalFormats = {
GL_RGBA8, GL_RGB8, GL_RGB5_A1, GL_RGB565, GL_RGBA4
};
// Format of the texture
static constexpr std::array<GLenum, 5> formats = {
GL_RGBA, GL_BGR, GL_RGBA, GL_RGB, GL_RGBA,
};
static constexpr std::array<GLenum, 5> types = {
GL_UNSIGNED_INT_8_8_8_8, GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT_5_5_5_1,
GL_UNSIGNED_SHORT_5_6_5, GL_UNSIGNED_SHORT_4_4_4_4,
};
internalFormat = internalFormats[(int)format];
fmt = formats[(int)format];
type = types[(int)format];
// Create texture for the FBO, setting up filters and the like
// Reading back the current texture is slow, but allocate calls should be few and far between.
// If this becomes a bottleneck, we can fix it semi-easily
auto prevTexture = OpenGL::getTex2D();
texture.create(size.x(), size.y(), GL_RGBA8);
texture.create(size.x(), size.y(), internalFormat);
texture.bind();
texture.setMinFilter(OpenGL::Linear);
texture.setMagFilter(OpenGL::Linear);
glBindTexture(GL_TEXTURE_2D, prevTexture);
OpenGL::setObjectLabel(GL_TEXTURE, texture.handle(), "Surface: %dx%d %s from %08X to %08X", size.x(),
size.y(), textureFormatToString(format).data(),
range.lower(), range.upper());
//Helpers::panic("Creating FBO: %d, %d\n", size.x(), size.y());
fbo.createWithDrawTexture(texture);
@ -144,4 +171,4 @@ struct DepthBuffer {
size_t sizeInBytes() {
return (size_t)size.x() * (size_t)size.y() * PICA::sizePerPixel(format);
}
};
};

View file

@ -40,7 +40,7 @@ struct Texture {
void allocate();
void setNewConfig(u32 newConfig);
void decodeTexture(const void* data);
void decodeTexture(std::span<const u8> data);
void free();
u64 sizeInBytes();
@ -53,7 +53,7 @@ struct Texture {
static u32 getSwizzledOffset_4bpp(u32 u, u32 v, u32 width);
// Returns the format of this texture as a string
std::string formatToString() {
std::string_view formatToString() {
return PICA::textureFormatToString(format);
}
@ -61,4 +61,4 @@ struct Texture {
// TODO: Make hasAlpha a template parameter
u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data);
u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData);
};
};

View file

@ -18,6 +18,24 @@ enum class GPUInterrupt : u8 {
DMA = 6
};
// Description of one screen's framebuffers, as handed to GPUService::setBufferSwapImpl.
// GSP::GPU::SetBufferSwap fills the first six fields, in declaration order, from consecutive words of the IPC message.
struct FramebufferInfo {
u32 activeFb;
u32 leftFramebufferVaddr; // presumably a virtual address, going by the name - TODO confirm
u32 rightFramebufferVaddr; // presumably a virtual address, going by the name - TODO confirm
u32 stride;
u32 format;
u32 displayFb; // Selects either framebuffer A or B
u32 attribute;
};
// Per-screen framebuffer update record that GPUService::requestInterrupt reads from GSP shared memory on VBlank.
// Records are laid out back to back starting at shared memory offset 0x200 and are indexed by screen.
struct FrameBufferUpdate {
u8 index; // Selects which framebufferInfo entry gets submitted
u8 dirtyFlag; // Bit 0 set means an update is pending; the bit is cleared once the update has been submitted
u16 pad0;
std::array<FramebufferInfo, 2> framebufferInfo;
u32 pad1;
};
// More circular dependencies
class Kernel;
@ -42,6 +60,7 @@ class GPUService {
void flushDataCache(u32 messagePointer);
void registerInterruptRelayQueue(u32 messagePointer);
void setAxiConfigQoSMode(u32 messagePointer);
void setBufferSwap(u32 messagePointer);
void setInternalPriorities(u32 messagePointer);
void setLCDForceBlack(u32 messagePointer);
void storeDataCache(u32 messagePointer);
@ -57,6 +76,8 @@ class GPUService {
void triggerTextureCopy(u32* cmd);
void flushCacheRegions(u32* cmd);
void setBufferSwapImpl(u32 screen_id, const FramebufferInfo& info);
public:
GPUService(Memory& mem, GPU& gpu, Kernel& kernel, u32& currentPID) : mem(mem), gpu(gpu),
kernel(kernel), currentPID(currentPID) {}
@ -69,4 +90,4 @@ public:
std::memset(ptr, 0, 0x1000);
}
}
};
};

View file

@ -8,6 +8,12 @@
#include "PICA/float_types.hpp"
#include "PICA/regs.hpp"
// Screen dimensions in pixels, as written to the framebuffer size registers by GPU::reset,
// which packs them as (height << 16) | width.
// The long axis of each screen is stored as its "height": 400 for the top screen and 320 for the bottom screen
// (Renderer::display draws the bottom screen into a 320x240 viewport).
constexpr u32 top_screen_width = 240;
constexpr u32 top_screen_height = 400;

constexpr u32 bottom_screen_width = 240;
constexpr u32 bottom_screen_height = 320;
using namespace Floats;
// Note: For when we have multiple backends, the GL state manager can stay here and have the constructor for the Vulkan-or-whatever renderer ignore it
@ -41,6 +47,27 @@ void GPU::reset() {
e.config2 = 0;
}
// Initialize the framebuffer registers. Values taken from Citra.
using namespace PICA::ExternalRegs;
// Top screen addresses and dimensions.
external_regs[Framebuffer0AFirstAddr] = 0x181E6000;
external_regs[Framebuffer0ASecondAddr] = 0x1822C800;
external_regs[Framebuffer0BFirstAddr] = 0x18273000;
external_regs[Framebuffer0BSecondAddr] = 0x182B9800;
external_regs[Framebuffer0Size] = (top_screen_height << 16) | top_screen_width;
external_regs[Framebuffer0Stride] = 720;
external_regs[Framebuffer0Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
external_regs[Framebuffer0Select] = 0;
// Bottom screen addresses and dimensions.
external_regs[Framebuffer1AFirstAddr] = 0x1848F000;
external_regs[Framebuffer1ASecondAddr] = 0x184C7800;
external_regs[Framebuffer1Size] = (bottom_screen_height << 16) | bottom_screen_width;
external_regs[Framebuffer1Stride] = 720;
external_regs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
external_regs[Framebuffer1Select] = 0;
renderer.reset();
}
@ -123,12 +150,12 @@ void GPU::drawArrays() {
vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg];
} else {
if (shortIndex) {
auto ptr = getPointerPhys<u16>(indexBufferPointer);
vertexIndex = *ptr; // TODO: This is very unsafe
auto ptr = getSpanPhys<u16>(indexBufferPointer, sizeof(u16));
vertexIndex = ptr[0]; // TODO: This is very unsafe
indexBufferPointer += 2;
} else {
auto ptr = getPointerPhys<u8>(indexBufferPointer);
vertexIndex = *ptr; // TODO: This is also very unsafe
auto ptr = getSpanPhys<u8>(indexBufferPointer, sizeof(u8));
vertexIndex = ptr[0]; // TODO: This is also very unsafe
indexBufferPointer += 1;
}
}
@ -188,42 +215,46 @@ void GPU::drawArrays() {
switch (attribType) {
case 0: { // Signed byte
s8* ptr = getPointerPhys<s8>(attrAddress);
const u32 attr_bytes = size * sizeof(s8);
const auto ptr = getSpanPhys<s8>(attrAddress, attr_bytes);
for (component = 0; component < size; component++) {
float val = static_cast<float>(*ptr++);
float val = static_cast<float>(ptr[component]);
attribute[component] = f24::fromFloat32(val);
}
attrAddress += size * sizeof(s8);
attrAddress += attr_bytes;
break;
}
case 1: { // Unsigned byte
u8* ptr = getPointerPhys<u8>(attrAddress);
const u32 attr_bytes = size * sizeof(u8);
const auto ptr = getSpanPhys<u8>(attrAddress, attr_bytes);
for (component = 0; component < size; component++) {
float val = static_cast<float>(*ptr++);
float val = static_cast<float>(ptr[component]);
attribute[component] = f24::fromFloat32(val);
}
attrAddress += size * sizeof(u8);
attrAddress += attr_bytes;
break;
}
case 2: { // Short
s16* ptr = getPointerPhys<s16>(attrAddress);
const u32 attr_bytes = size * sizeof(s16);
const auto ptr = getSpanPhys<s16>(attrAddress, attr_bytes);
for (component = 0; component < size; component++) {
float val = static_cast<float>(*ptr++);
float val = static_cast<float>(ptr[component]);
attribute[component] = f24::fromFloat32(val);
}
attrAddress += size * sizeof(s16);
attrAddress += attr_bytes;
break;
}
case 3: { // Float
float* ptr = getPointerPhys<float>(attrAddress);
const u32 attr_bytes = size * sizeof(float);
const auto ptr = getSpanPhys<float>(attrAddress, attr_bytes);
for (component = 0; component < size; component++) {
float val = *ptr++;
float val = ptr[component];
attribute[component] = f24::fromFloat32(val);
}
attrAddress += size * sizeof(float);
attrAddress += attr_bytes;
break;
}

View file

@ -18,11 +18,36 @@ void GPU::writeReg(u32 address, u32 value) {
if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers
const u32 index = (address - 0x1EF01000) / sizeof(u32);
writeInternalReg(index, value, 0xffffffff);
} else if (address >= 0x1EF00004 && address < 0x1EF01000) {
const u32 index = (address - 0x1EF00004) / sizeof(u32);
writeExternalReg(index, value);
} else {
log("Ignoring write to external GPU register %08X. Value: %08X\n", address, value);
log("Ignoring write to unknown GPU register %08X. Value: %08X\n", address, value);
}
}
// Read the external GPU register with the given index.
// Panics if the index lies outside the external register array.
u32 GPU::readExternalReg(u32 index) {
	// Valid indices are 0 .. size - 1, so an index equal to the array size is already out of bounds
	if (index >= external_regs.size()) [[unlikely]] {
		Helpers::panic("Tried to read invalid external GPU register. Index: %X\n", index);
		return -1;
	}

	return external_regs[index];
}
// Store value into the external GPU register with the given index.
// Panics if the index lies outside the external register array.
void GPU::writeExternalReg(u32 index, u32 value) {
	// Valid indices are 0 .. size - 1, so an index equal to the array size is already out of bounds
	if (index >= external_regs.size()) [[unlikely]] {
		Helpers::panic("Tried to write to invalid external GPU register. Index: %X, value: %08X\n", index, value);
		return;
	}

	external_regs[index] = value;
}
u32 GPU::readInternalReg(u32 index) {
using namespace PICA::InternalRegs;
@ -54,7 +79,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
using namespace PICA::InternalRegs;
if (index > regNum) [[unlikely]] {
Helpers::panic("Tried to write to invalid GPU register. Index: %X, value: %08X\n", index, value);
Helpers::panic("Tried to write to invalid internal GPU register. Index: %X, value: %08X\n", index, value);
return;
}
@ -275,9 +300,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
u32 size = (regs[CmdBufSize0 + bufferIndex] & 0xfffff) << 3;
// Set command buffer state to execute the new buffer
cmdBuffStart = getPointerPhys<u32>(addr);
cmdBuffCurr = cmdBuffStart;
cmdBuffEnd = cmdBuffStart + (size / sizeof(u32));
cmdBuffStart = getSpanPhys<u32>(addr, size);
cmdBuffCurr = 0;
cmdBuffEnd = (size / sizeof(u32));
}
break;
}
@ -308,12 +333,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
}
void GPU::startCommandList(u32 addr, u32 size) {
cmdBuffStart = static_cast<u32*>(mem.getReadPointer(addr));
if (!cmdBuffStart) Helpers::panic("Couldn't get buffer for command list");
cmdBuffStart = getSpanPhys<u32>(addr, size);
if (cmdBuffStart.empty()) {
Helpers::panic("Couldn't get buffer for command list");
return;
}
// TODO: This is very memory unsafe. We get a pointer to FCRAM and just keep writing without checking if we're gonna go OoB
cmdBuffCurr = cmdBuffStart;
cmdBuffEnd = cmdBuffStart + (size / sizeof(u32));
cmdBuffCurr = 0;
cmdBuffEnd = (size / sizeof(u32));
// LUT for converting the parameter mask to an actual 32-bit mask
// The parameter mask is 4 bits long, each bit corresponding to one byte of the mask
@ -329,13 +357,13 @@ void GPU::startCommandList(u32 addr, u32 size) {
// The curr pointer starts out doubleword-aligned and is increased by 4 bytes each time
// So to check if it is aligned, we get the number of words it's been incremented by
// If that number is an odd value then the buffer is not aligned, otherwise it is
if ((cmdBuffCurr - cmdBuffStart) % 2 != 0) {
if (cmdBuffCurr % 2 != 0) {
cmdBuffCurr++;
}
// The first word of a command is the command parameter and the second one is the header
u32 param1 = *cmdBuffCurr++;
u32 header = *cmdBuffCurr++;
u32 param1 = cmdBuffStart[cmdBuffCurr++];
u32 header = cmdBuffStart[cmdBuffCurr++];
u32 id = header & 0xffff;
u32 paramMaskIndex = getBits<16, 4>(header);
@ -352,8 +380,8 @@ void GPU::startCommandList(u32 addr, u32 size) {
writeInternalReg(id, param1, mask);
for (u32 i = 0; i < paramCount; i++) {
id += idIncrement;
u32 param = *cmdBuffCurr++;
u32 param = cmdBuffStart[cmdBuffCurr++];
writeInternalReg(id, param, mask);
}
}
}
}

View file

@ -453,4 +453,4 @@ u64 Memory::timeSince3DSEpoch() {
constexpr u64 offset = 2208988800ull;
milliseconds ms = duration_cast<milliseconds>(seconds(rawTime + timezoneDifference + offset));
return ms.count();
}
}

View file

@ -576,6 +576,12 @@ const char* displayFragmentShader = R"(
}
)";
// OpenGL debug message callback: reports the message id and text through Helpers::warn.
// source, type, severity, length and userParam are accepted to match the callback signature but are not used.
static void APIENTRY debugHandler(GLenum source, GLenum type, GLuint id, GLenum severity,
GLsizei length, const GLchar* message, const void* userParam) {
Helpers::warn("%d: %s\n", id, message);
}
void Renderer::reset() {
depthBufferCache.reset();
colourBufferCache.reset();
@ -695,6 +701,11 @@ void Renderer::initGraphicsContext() {
OpenGL::clearColor();
OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);
#if defined(OPENGL_DEBUG_INFO)
glEnable(GL_DEBUG_OUTPUT);
glDebugMessageCallback(debugHandler, nullptr);
#endif
reset();
}
@ -919,6 +930,39 @@ constexpr u32 bottomScreenBuffer = 0x1f05dc00;
void Renderer::display() {
gl.disableScissor();
gl.disableBlend();
gl.disableDepth();
gl.disableScissor();
gl.setColourMask(true, true, true, true);
gl.useProgram(displayProgram);
gl.bindVAO(dummyVAO);
OpenGL::disableClipPlane(0);
OpenGL::disableClipPlane(1);
using namespace PICA::ExternalRegs;
const u32 topScreenAddr = gpu.readExternalReg(Framebuffer0AFirstAddr);
const u32 bottomScreenAddr = gpu.readExternalReg(Framebuffer1AFirstAddr);
auto topScreen = colourBufferCache.findFromAddress(topScreenAddr);
auto bottomScreen = colourBufferCache.findFromAddress(bottomScreenAddr);
Helpers::warn("Top screen addr %08X\n", topScreenAddr);
screenFramebuffer.bind(OpenGL::DrawFramebuffer);
// Draw each screen whose framebuffer was found in the colour buffer cache
// into its own region of the output texture: the top screen above, the bottom screen below
if (topScreen) {
topScreen->get().texture.bind();
OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport
OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen
}
if (bottomScreen) {
bottomScreen->get().texture.bind();
OpenGL::setViewport(40, 0, 320, 240);
OpenGL::draw(OpenGL::TriangleStrip, 4);
}
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
screenFramebuffer.bind(OpenGL::ReadFramebuffer);
@ -986,7 +1030,7 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) {
if (buffer.has_value()) {
return buffer.value().get().texture;
} else {
const void* textureData = gpu.getPointerPhys<void*>(tex.location); // Get pointer to the texture data in 3DS memory
const std::span textureData = gpu.getSpanPhys<u8>(tex.location, tex.sizeInBytes()); // Get pointer to the texture data in 3DS memory
Texture& newTex = textureCache.add(tex);
newTex.decodeTexture(textureData);
@ -994,40 +1038,86 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) {
}
}
// Convert a colour format index taken from the transfer flags into a PICA::ColorFmt.
// NOTE: Indices 2 and 3 (RGB565 and RGBA5551) are swapped relative to the internal register encoding;
// every other index maps straight through.
PICA::ColorFmt ToColorFmt(u32 format) {
	if (format == 2) {
		return PICA::ColorFmt::RGB565;
	}
	if (format == 3) {
		return PICA::ColorFmt::RGBA5551;
	}

	return static_cast<PICA::ColorFmt>(format);
}
void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
const u32 inputWidth = inputSize & 0xffff;
const u32 inputGap = inputSize >> 16;
const u32 inputHeight = inputSize >> 16;
const auto inputFormat = ToColorFmt(Helpers::getBits<8, 3>(flags));
const auto outputFormat = ToColorFmt(Helpers::getBits<12, 3>(flags));
const PICA::Scaling scaling = static_cast<PICA::Scaling>(Helpers::getBits<24, 2>(flags));
const u32 outputWidth = outputSize & 0xffff;
const u32 outputGap = outputSize >> 16;
auto framebuffer = colourBufferCache.findFromAddress(inputAddr);
// If there's a framebuffer at this address, use it. Otherwise go back to our old hack and display framebuffer 0
// Displays are hard I really don't want to try implementing them because getting a fast solution is terrible
OpenGL::Texture& tex = framebuffer.has_value() ? framebuffer.value().get().texture : colourBufferCache[0].texture;
tex.bind();
screenFramebuffer.bind(OpenGL::DrawFramebuffer);
gl.disableBlend();
gl.disableDepth();
gl.disableScissor();
gl.setColourMask(true, true, true, true);
gl.useProgram(displayProgram);
gl.bindVAO(dummyVAO);
OpenGL::disableClipPlane(0);
OpenGL::disableClipPlane(1);
// Hack: Detect whether we are writing to the top or bottom screen by checking output gap and drawing to the proper part of the output texture
// We consider output gap == 320 to mean bottom, and anything else to mean top
if (outputGap == 320) {
OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport
} else {
OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport
u32 outputWidth = outputSize & 0xffff;
if (scaling == PICA::Scaling::X || scaling == PICA::Scaling::XY) {
outputWidth >>= 1;
}
u32 outputHeight = outputSize >> 16;
if (scaling == PICA::Scaling::XY) {
outputHeight >>= 1;
}
OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen
// If there's a framebuffer at this address, use it. Otherwise go back to our old hack and display framebuffer 0
// Displays are hard I really don't want to try implementing them because getting a fast solution is terrible
auto srcFramebuffer = getColourBuffer(inputAddr, inputFormat, inputWidth, inputHeight);
auto dstFramebuffer = getColourBuffer(outputAddr, outputFormat, outputWidth, outputHeight);
Helpers::warn("Display transfer with outputAddr %08X\n", outputAddr);
// Blit the framebuffers
srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer);
dstFramebuffer.fbo.bind(OpenGL::DrawFramebuffer);
glBlitFramebuffer(0, 0, inputWidth, inputHeight, 0, 0, outputWidth, outputHeight, GL_COLOR_BUFFER_BIT, GL_LINEAR);
}
// Perform a texture copy by blitting between two cached colour buffers.
// inputAddr/outputAddr: addresses used to look up (or create) the source and destination surfaces
// copyBytes: total bytes to copy, rounded down to a multiple of 16
// inputSize/outputSize: line width in the low 16 bits and line gap in the high 16 bits, both in units of 16 bytes
// flags: currently unused
// Only copies with no input gap and matching input/output widths are handled; anything else is logged and skipped.
void Renderer::textureCopy(u32 inputAddr, u32 outputAddr, u32 copyBytes, u32 inputSize, u32 outputSize, u32 flags) {
	copyBytes = Helpers::alignDown(copyBytes, 16);
	if (copyBytes == 0) [[unlikely]] {
		return;
	}

	const u32 inputWidth = (inputSize & 0xffff) * 16;
	const u32 inputGap = (inputSize >> 16) * 16;
	const u32 outputWidth = (outputSize & 0xffff) * 16;

	if (inputGap != 0 || inputWidth != outputWidth) {
		Helpers::warn("Texture copy with non zero input gap or mismatching widths, cannot be accelerated");
		return;
	}

	// The widths are equal at this point. A zero width would divide by zero when computing the surface heights below
	if (inputWidth == 0) [[unlikely]] {
		Helpers::warn("Texture copy with zero line width, ignoring\n");
		return;
	}

	// If the texture is tiled, apps set inputWidth to the scanline size which is width * 8.
	// HACK: We don't know if the src texture is tiled or not yet, assume it is for now, because it's the most common case.
	// Citra handles this by letting the width/stride be set as bytes and interpreting it differently
	// depending on the candidate surface.
	auto srcFramebuffer = getColourBuffer(inputAddr, PICA::ColorFmt::RGBA8, inputWidth / 8, copyBytes / inputWidth); // HACK: Assume RGBA8 format
	auto dstFramebuffer = getColourBuffer(outputAddr, srcFramebuffer.format, outputWidth / 8, copyBytes / outputWidth);

	// Blit the framebuffers
	srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer);
	dstFramebuffer.fbo.bind(OpenGL::DrawFramebuffer);
	glBlitFramebuffer(0, 0, srcFramebuffer.size.x(), srcFramebuffer.size.y(),
		0, 0, dstFramebuffer.size.x(), dstFramebuffer.size.y(), GL_COLOR_BUFFER_BIT, GL_LINEAR);
}
// Return the colour buffer associated with addr, creating and caching one when none exists yet.
// The lookup is by address only. This is a looser match than getColourFBO because display transfers and
// texture copies may refer to a sub-rectangle of a surface, and a texture copy does not know the surface format.
// format, width and height are only used when a new buffer has to be created.
ColourBuffer Renderer::getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height) {
	if (auto cached = colourBufferCache.findFromAddress(addr); cached.has_value()) {
		return cached.value().get();
	}

	// Nothing cached at this address: build a fresh buffer and hand back the cached copy
	ColourBuffer freshBuffer(addr, format, width, height);
	return colourBufferCache.add(freshBuffer);
}
void Renderer::screenshot(const std::string& name) {
@ -1053,4 +1143,4 @@ void Renderer::screenshot(const std::string& name) {
}
stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
}
}

View file

@ -258,18 +258,18 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
}
}
void Texture::decodeTexture(const void* data) {
void Texture::decodeTexture(std::span<const u8> data) {
std::vector<u32> decoded;
decoded.reserve(u64(size.u()) * u64(size.v()));
// Decode texels line by line
for (u32 v = 0; v < size.v(); v++) {
for (u32 u = 0; u < size.u(); u++) {
u32 colour = decodeTexel(u, v, format, data);
u32 colour = decodeTexel(u, v, format, data.data());
decoded.push_back(colour);
}
}
texture.bind();
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, size.u(), size.v(), GL_RGBA, GL_UNSIGNED_BYTE, decoded.data());
}
}

View file

@ -1,4 +1,5 @@
#include "services/gsp_gpu.hpp"
#include "PICA/regs.hpp"
#include "ipc.hpp"
#include "kernel.hpp"
@ -10,6 +11,7 @@ namespace ServiceCommands {
RegisterInterruptRelayQueue = 0x00130042,
WriteHwRegs = 0x00010082,
WriteHwRegsWithMask = 0x00020084,
SetBufferSwap = 0x00050200,
FlushDataCache = 0x00080082,
SetLCDForceBlack = 0x000B0040,
TriggerCmdReqQueue = 0x000C0000,
@ -19,15 +21,35 @@ namespace ServiceCommands {
}
// Commands written to shared memory and processed by TriggerCmdReqQueue
namespace GXCommands {
enum : u32 {
TriggerDMARequest = 0,
ProcessCommandList = 1,
MemoryFill = 2,
TriggerDisplayTransfer = 3,
TriggerTextureCopy = 4,
FlushCacheRegions = 5
};
// Command IDs taken from the low byte of the first word of each GX command (see GPUService::processCommandBuffer)
enum class GXCommands : u32 {
TriggerDMARequest = 0,
ProcessCommandList = 1,
MemoryFill = 2,
TriggerDisplayTransfer = 3,
TriggerTextureCopy = 4,
FlushCacheRegions = 5
};
// Translate a virtual address into a physical address.
// Handles the VRAM mapping and both the old and new linear heap mappings. A null address maps to null.
// Any other address is reported with a warning and mapped to a recognisably bogus value.
static u32 VaddrToPaddr(u32 addr) {
	const bool inVram = addr >= VirtualAddrs::VramStart && addr < (VirtualAddrs::VramStart + VirtualAddrs::VramSize);
	if (inVram) [[likely]] {
		return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM;
	}

	const bool inOldLinearHeap = addr >= VirtualAddrs::LinearHeapStartOld && addr < VirtualAddrs::LinearHeapEndOld;
	if (inOldLinearHeap) {
		return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM;
	}

	const bool inNewLinearHeap = addr >= VirtualAddrs::LinearHeapStartNew && addr < VirtualAddrs::LinearHeapEndNew;
	if (inNewLinearHeap) {
		return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM;
	}

	if (addr == 0) {
		return 0;
	}

	Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr);
	// Obviously garbage address
	return 0xF3310932;
}
void GPUService::reset() {
@ -43,13 +65,14 @@ void GPUService::handleSyncRequest(u32 messagePointer) {
case ServiceCommands::FlushDataCache: flushDataCache(messagePointer); break;
case ServiceCommands::RegisterInterruptRelayQueue: registerInterruptRelayQueue(messagePointer); break;
case ServiceCommands::SetAxiConfigQoSMode: setAxiConfigQoSMode(messagePointer); break;
case ServiceCommands::SetBufferSwap: setBufferSwap(messagePointer); break;
case ServiceCommands::SetInternalPriorities: setInternalPriorities(messagePointer); break;
case ServiceCommands::SetLCDForceBlack: setLCDForceBlack(messagePointer); break;
case ServiceCommands::StoreDataCache: storeDataCache(messagePointer); break;
case ServiceCommands::TriggerCmdReqQueue: [[likely]] triggerCmdReqQueue(messagePointer); break;
case ServiceCommands::WriteHwRegs: writeHwRegs(messagePointer); break;
case ServiceCommands::WriteHwRegsWithMask: writeHwRegsWithMask(messagePointer); break;
; default: Helpers::panic("GPU service requested. Command: %08X\n", command);
default: Helpers::panic("GPU service requested. Command: %08X\n", command);
}
}
@ -122,15 +145,12 @@ void GPUService::requestInterrupt(GPUInterrupt type) {
// Not emulating this causes Yoshi's Wooly World, Captain Toad, Metroid 2 et al to hang
if (type == GPUInterrupt::VBlank0 || type == GPUInterrupt::VBlank1) {
int screen = static_cast<u32>(type) - static_cast<u32>(GPUInterrupt::VBlank0); // 0 for top screen, 1 for bottom
constexpr u32 FBInfoSize = 0x40;
// TODO: Offset depends on GSP thread being triggered
u8* info = &sharedMem[0x200 + screen * FBInfoSize];
u8& dirtyFlag = info[1];
FrameBufferUpdate* update = reinterpret_cast<FrameBufferUpdate*>(&sharedMem[0x200 + screen * sizeof(FrameBufferUpdate)]);
if (dirtyFlag & 1) {
// TODO: Submit buffer info here
dirtyFlag &= ~1;
if (update->dirtyFlag & 1) {
setBufferSwapImpl(screen, update->framebufferInfo[update->index]);
update->dirtyFlag &= ~1;
}
}
@ -259,6 +279,18 @@ void GPUService::setAxiConfigQoSMode(u32 messagePointer) {
mem.write32(messagePointer + 4, Result::Success);
}
// GSP::GPU::SetBufferSwap IPC handler.
// Reads the target screen and a FramebufferInfo descriptor from the command buffer at messagePointer,
// applies it through setBufferSwapImpl, then writes the reply back into the same command buffer.
// Request words, as read below (byte offsets from messagePointer):
//   +4 screen id, +8 active framebuffer, +12 / +16 left / right framebuffer virtual address,
//   +20 stride, +24 format, +28 displayed framebuffer select
void GPUService::setBufferSwap(u32 messagePointer) {
	FramebufferInfo info{};
	const u32 screenId = mem.read32(messagePointer + 4);  // Selects either PDC0 or PDC1
	info.activeFb = mem.read32(messagePointer + 8);
	info.leftFramebufferVaddr = mem.read32(messagePointer + 12);
	info.rightFramebufferVaddr = mem.read32(messagePointer + 16);
	info.stride = mem.read32(messagePointer + 20);
	info.format = mem.read32(messagePointer + 24);
	info.displayFb = mem.read32(messagePointer + 28);  // Selects either framebuffer A or B

	setBufferSwapImpl(screenId, info);

	// Reply with a header for command 0x05 (1 normal word, 0 translate words) followed by the result code.
	// Without this, the caller would read the request's screen id back as the result.
	constexpr u32 responseHeader = (0x05u << 16) | (1u << 6) | 0u;
	mem.write32(messagePointer, responseHeader);
	mem.write32(messagePointer + 4, Result::Success);
}
// Seems to also be completely undocumented
void GPUService::setInternalPriorities(u32 messagePointer) {
log("GSP::GPU::SetInternalPriorities\n");
@ -281,7 +313,7 @@ void GPUService::processCommandBuffer() {
log("Processing %d GPU commands\n", commandsLeft);
while (commandsLeft != 0) {
u32 cmdID = cmd[0] & 0xff;
const GXCommands cmdID = static_cast<GXCommands>(cmd[0] & 0xff);
switch (cmdID) {
case GXCommands::ProcessCommandList: processCommandList(cmd); break;
case GXCommands::MemoryFill: memoryFill(cmd); break;
@ -324,28 +356,6 @@ void GPUService::memoryFill(u32* cmd) {
}
}
// Converts a virtual address passed in a GSP command into a physical address.
// Addresses inside the VRAM window are rebased onto PhysicalAddrs::VRAM, addresses inside either
// linear heap window (old or new) are rebased onto PhysicalAddrs::FCRAM, and a null address stays 0.
// Anything else produces a warning and the sentinel value 0xF3310932.
static u32 VaddrToPaddr(u32 addr) {
	// True when addr lies in the half-open range [start, end)
	const auto inRange = [addr](auto start, auto end) { return addr >= start && addr < end; };

	if (inRange(VirtualAddrs::VramStart, VirtualAddrs::VramStart + VirtualAddrs::VramSize)) [[likely]] {
		return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM;
	}

	if (inRange(VirtualAddrs::LinearHeapStartOld, VirtualAddrs::LinearHeapEndOld)) {
		return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM;
	}

	if (inRange(VirtualAddrs::LinearHeapStartNew, VirtualAddrs::LinearHeapEndNew)) {
		return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM;
	}

	// Null pointers are passed through untouched
	if (addr == 0) {
		return 0;
	}

	Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr);
	// Obviously garbage address, so misuse is easy to spot
	return 0xF3310932;
}
void GPUService::triggerDisplayTransfer(u32* cmd) {
const u32 inputAddr = VaddrToPaddr(cmd[1]);
const u32 outputAddr = VaddrToPaddr(cmd[2]);
@ -373,23 +383,74 @@ void GPUService::flushCacheRegions(u32* cmd) {
log("GSP::GPU::FlushCacheRegions (Stubbed)\n");
}
// Programs the external GPU framebuffer registers of one screen from a FramebufferInfo descriptor.
//   screenId: 0 or 1, selects the Framebuffer0* or Framebuffer1* register set
//   info:     framebuffer descriptor; activeFb (0 or 1) selects the A or B address register pair
// Both values come from guest-controlled data (the IPC request or GSP shared memory) and are used as
// table indices, so out-of-range values are rejected with a warning instead of indexing past the tables.
void GPUService::setBufferSwapImpl(u32 screenId, const FramebufferInfo& info) {
	using namespace PICA::ExternalRegs;

	constexpr u32 screenCount = 2;
	constexpr u32 buffersPerScreen = 2;  // Framebuffer A and B
	constexpr u32 addrsPerBuffer = 2;    // First (left) and second (right) address
	constexpr u32 configRegsPerScreen = 3;

	constexpr static std::array<u32, 8> fb_addresses = {
		Framebuffer0AFirstAddr,
		Framebuffer0ASecondAddr,
		Framebuffer0BFirstAddr,
		Framebuffer0BSecondAddr,
		Framebuffer1AFirstAddr,
		Framebuffer1ASecondAddr,
		Framebuffer1BFirstAddr,
		Framebuffer1BSecondAddr,
	};

	constexpr static std::array<u32, 6> config_addresses = {
		Framebuffer0Config,
		Framebuffer0Select,
		Framebuffer0Stride,
		Framebuffer1Config,
		Framebuffer1Select,
		Framebuffer1Stride,
	};

	static_assert(fb_addresses.size() == screenCount * buffersPerScreen * addrsPerBuffer);
	static_assert(config_addresses.size() == screenCount * configRegsPerScreen);

	// Validate before computing any index: the multiplications below must never see an unchecked value
	if (screenId >= screenCount || info.activeFb >= buffersPerScreen) {
		Helpers::warn("GSP::GPU::SetBufferSwap: Invalid screen (%u) or active framebuffer (%u)", screenId, info.activeFb);
		return;
	}

	// Left/right framebuffer addresses are given as virtual addresses and stored as physical ones
	const u32 fb_index = screenId * (buffersPerScreen * addrsPerBuffer) + info.activeFb * addrsPerBuffer;
	gpu.writeExternalReg(fb_addresses[fb_index], VaddrToPaddr(info.leftFramebufferVaddr));
	gpu.writeExternalReg(fb_addresses[fb_index + 1], VaddrToPaddr(info.rightFramebufferVaddr));

	// Per-screen config registers are laid out in the table as: config (format), select, stride
	const u32 config_index = screenId * configRegsPerScreen;
	gpu.writeExternalReg(config_addresses[config_index], info.format);
	gpu.writeExternalReg(config_addresses[config_index + 1], info.displayFb);
	gpu.writeExternalReg(config_addresses[config_index + 2], info.stride);
}
// Actually send command list (aka display list) to GPU
// cmd points to the words of one GX command: cmd[1] holds the buffer address, cmd[2] the buffer size
// in bytes, cmd[3] the gas update flag and cmd[7] the flush flag. After handing the list to the GPU,
// a P3D interrupt is requested.
// NOTE(review): this listing comes from a diff rendered without +/- markers, so some statements below
// appear in both their pre-change and post-change form. Confirm against the real file which one is live.
void GPUService::processCommandList(u32* cmd) {
// The low 3 bits of the address and the low 2 bits of the size are masked off
const u32 address = cmd[1] & ~7; // Buffer address
const u32 size = cmd[2] & ~3; // Buffer size in bytes
// NOTE(review): the next two declarations are repeated right after with [[maybe_unused]];
// presumably these are the pre-change lines, since both pairs cannot coexist in one scope
const bool updateGas = cmd[3] == 1; // Update gas additive blend results (0 = don't update, 1 = update)
const bool flushBuffer = cmd[7] == 1; // Flush buffer (0 = don't flush, 1 = flush)
// Neither flag is read anywhere else in this function, hence [[maybe_unused]]
[[maybe_unused]] const bool updateGas = cmd[3] == 1; // Update gas additive blend results (0 = don't update, 1 = update)
[[maybe_unused]] const bool flushBuffer = cmd[7] == 1; // Flush buffer (0 = don't flush, 1 = flush)
log("GPU::GSP::processCommandList. Address: %08X, size in bytes: %08X\n", address, size);
// NOTE(review): two startCommandList calls are shown; presumably the first (raw address) is the
// pre-change line and the second (address translated with VaddrToPaddr) is its replacement
gpu.startCommandList(address, size);
gpu.startCommandList(VaddrToPaddr(address), size);
requestInterrupt(GPUInterrupt::P3D); // Send an IRQ when command list processing is over
}
// TODO: Emulate the transfer engine & its registers
// Then this can be emulated by just writing the appropriate values there
// Handles the GX texture copy command: sets bit 0 of the TransferTrigger external register, performs
// the copy through gpu.textureCopy, requests a PPF interrupt, then clears bit 0 of TransferTrigger.
// cmd points to the words of the command: cmd[1] / cmd[2] are the input / output buffer virtual
// addresses, cmd[3] the total number of bytes to copy, cmd[4] / cmd[5] the input / output width-gap
// words and cmd[6] the flags.
// NOTE(review): this listing comes from a diff rendered without +/- markers. The warn call directly
// below and the lone closing brace after requestInterrupt look like pre-change lines; confirm against
// the real file.
void GPUService::triggerTextureCopy(u32* cmd) {
Helpers::warn("GSP::GPU::TriggerTextureCopy (unimplemented)\n");
// Addresses arrive as virtual addresses and are translated before being handed to the GPU
const u32 inputBufferAddr = VaddrToPaddr(cmd[1]);
const u32 outputBufferAddr = VaddrToPaddr(cmd[2]);
const u32 totalCopyBytes = cmd[3];
const u32 inputWidthGap = cmd[4];
const u32 outputWidthGap = cmd[5];
const u32 flags = cmd[6];
// Write the trigger register
using namespace PICA::ExternalRegs;
gpu.writeExternalReg(TransferTrigger, gpu.readExternalReg(TransferTrigger) | 1);
// Perform the texture copy
gpu.textureCopy(inputBufferAddr, outputBufferAddr, totalCopyBytes, inputWidthGap, outputWidthGap, flags);
// This uses the transfer engine and thus needs to fire a PPF interrupt.
// NSMB2 relies on this
requestInterrupt(GPUInterrupt::PPF);
}
// Writing to trigger will perform the texture copy.
// Reset the bit here to signal completion.
gpu.writeExternalReg(TransferTrigger, gpu.readExternalReg(TransferTrigger) & ~1);
}