From 072304404cbd7eab0a5d2737389c654c5f4f95f0 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sat, 10 Apr 2021 15:17:42 -0700
Subject: [PATCH 01/10] Correct indirect stage ref typos

YAGCD uses BI0/BC0/BI1/BC1/BI2/BC2/BI3/BC3, so I'm pretty sure the BI2/BC3/BI4/BC4 names are a typo that just was propagated.
---
 Source/Core/VideoCommon/BPMemory.h       | 12 ++++-----
 Source/Core/VideoCommon/PixelShaderGen.h | 32 ++++++------------------
 2 files changed, 14 insertions(+), 30 deletions(-)
diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index f22ceda2cd..61f3bbac50 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -600,13 +600,13 @@ struct fmt::formatter<TEXSCALE>
 union RAS1_IREF
 {
   BitField<0, 3, u32> bi0;  // Indirect tex stage 0 ntexmap
-  BitField<3, 3, u32> bc0;  // Indirect tex stage 0 ntexmap
+  BitField<3, 3, u32> bc0;  // Indirect tex stage 0 ntexcoord
   BitField<6, 3, u32> bi1;
   BitField<9, 3, u32> bc1;
   BitField<12, 3, u32> bi2;
-  BitField<15, 3, u32> bc3;  // Typo?
-  BitField<18, 3, u32> bi4;
-  BitField<21, 3, u32> bc4;
+  BitField<15, 3, u32> bc2;
+  BitField<18, 3, u32> bi3;
+  BitField<21, 3, u32> bc3;
   u32 hex;
 
   u32 getTexCoord(int i) const { return (hex >> (6 * i + 3)) & 7; }
@@ -625,8 +625,8 @@ struct fmt::formatter<RAS1_IREF>
                      "Stage 1 ntexmap: {}\nStage 1 ntexcoord: {}\n"
                      "Stage 2 ntexmap: {}\nStage 2 ntexcoord: {}\n"
                      "Stage 3 ntexmap: {}\nStage 3 ntexcoord: {}",
-                     indref.bi0, indref.bc0, indref.bi1, indref.bc1, indref.bi2, indref.bc3,
-                     indref.bi4, indref.bc4);
+                     indref.bi0, indref.bc0, indref.bi1, indref.bc1, indref.bi2, indref.bc2,
+                     indref.bi3, indref.bc3);
   }
 };
 
diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h
index a9ffa25498..08724274ee 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.h
+++ b/Source/Core/VideoCommon/PixelShaderGen.h
@@ -66,9 +66,9 @@ struct pixel_shader_uid_data
   u32 tevindref_bi1 : 3;
   u32 tevindref_bc1 : 3;
   u32 tevindref_bi2 : 3;
+  u32 tevindref_bc2 : 3;
+  u32 tevindref_bi3 : 3;
   u32 tevindref_bc3 : 3;
-  u32 tevindref_bi4 : 3;
-  u32 tevindref_bc4 : 3;
 
   void SetTevindrefValues(int index, u32 texcoord, u32 texmap)
   {
@@ -84,55 +84,39 @@ struct pixel_shader_uid_data
     }
     else if (index == 2)
     {
-      tevindref_bc3 = texcoord;
+      tevindref_bc2 = texcoord;
       tevindref_bi2 = texmap;
     }
     else if (index == 3)
     {
-      tevindref_bc4 = texcoord;
-      tevindref_bi4 = texmap;
+      tevindref_bc3 = texcoord;
+      tevindref_bi3 = texmap;
     }
   }
 
   u32 GetTevindirefCoord(int index) const
   {
     if (index == 0)
-    {
       return tevindref_bc0;
-    }
     else if (index == 1)
-    {
       return tevindref_bc1;
-    }
     else if (index == 2)
-    {
-      return tevindref_bc3;
-    }
+      return tevindref_bc2;
     else if (index == 3)
-    {
-      return tevindref_bc4;
-    }
+      return tevindref_bc3;
     return 0;
   }
 
   u32 GetTevindirefMap(int index) const
   {
     if (index == 0)
-    {
       return tevindref_bi0;
-    }
     else if (index == 1)
-    {
       return tevindref_bi1;
-    }
     else if (index == 2)
-    {
       return tevindref_bi2;
-    }
     else if (index == 3)
-    {
-      return tevindref_bi4;
-    }
+      return tevindref_bi3;
     return 0;
   }
 

From 1d628d087b96b95586d666b1431a42ab00ee22f3 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Tue, 13 Apr 2021 13:12:38 -0700
Subject: [PATCH 02/10] Add 1 when displaying the number of TEV stages

---
 Source/Core/VideoCommon/BPMemory.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index 61f3bbac50..36ec9ec0a9 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -911,6 +911,8 @@ union GenMode
   BitField<7, 1, u32> unused;              // 1 bit unused?
   BitField<8, 1, bool, u32> flat_shading;  // unconfirmed
   BitField<9, 1, bool, u32> multisampling;
+  // This value is 1 less than the actual number (0-15 map to 1-16).
+  // In other words there is always at least 1 tev stage
   BitField<10, 4, u32> numtevstages;
   BitField<14, 2, CullMode> cullmode;
   BitField<16, 3, u32> numindstages;
@@ -937,7 +939,7 @@ struct fmt::formatter<GenMode>
                      "ZFreeze: {}",
                      mode.numtexgens, mode.numcolchans, mode.unused,
                      mode.flat_shading ? "Yes" : "No", mode.multisampling ? "Yes" : "No",
-                     mode.numtevstages, mode.cullmode, mode.numindstages,
+                     mode.numtevstages + 1, mode.cullmode, mode.numindstages,
                      mode.zfreeze ? "Yes" : "No");
   }
 };

From c3668e179c28dbe769f8a128e780a2269044f962 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sat, 10 Apr 2021 17:41:06 -0700
Subject: [PATCH 03/10] Split TevStageIndirect::mid into matrix_index and
 matrix_id

---
 Source/Core/VideoBackends/Software/Tev.cpp    | 26 +++++-----
 Source/Core/VideoCommon/BPMemory.h            | 52 +++++++++++++++----
 Source/Core/VideoCommon/PixelShaderGen.cpp    | 26 ++++++----
 .../Core/VideoCommon/PixelShaderManager.cpp   |  4 +-
 Source/Core/VideoCommon/UberShaderPixel.cpp   | 11 ++--
 5 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/Source/Core/VideoBackends/Software/Tev.cpp b/Source/Core/VideoBackends/Software/Tev.cpp
index 65227339e4..30258f6cd3 100644
--- a/Source/Core/VideoBackends/Software/Tev.cpp
+++ b/Source/Core/VideoBackends/Software/Tev.cpp
@@ -485,20 +485,16 @@ void Tev::Indirect(unsigned int stageNum, s32 s, s32 t)
 
   // matrix multiply - results might overflow, but we don't care since we only use the lower 24 bits
   // of the result.
-  const int indmtxid = indirect.mid & 3;
-  if (indmtxid)
+  if (indirect.matrix_index != IndMtxIndex::Off)
   {
-    const IND_MTX& indmtx = bpmem.indmtx[indmtxid - 1];
-    const int scale =
-        ((u32)indmtx.col0.s0 << 0) | ((u32)indmtx.col1.s1 << 2) | ((u32)indmtx.col2.s2 << 4);
+    const IND_MTX& indmtx = bpmem.indmtx[static_cast<u32>(indirect.matrix_index.Value()) - 1];
 
-    int shift;
+    const int shift = 17 - indmtx.GetScale();
 
-    switch (indirect.mid & 12)
+    switch (indirect.matrix_id)
     {
-    case 0:
+    case IndMtxId::Indirect:
       // matrix values are S0.10, output format is S17.7, so divide by 8
-      shift = (17 - scale);
       indtevtrans[0] = (indmtx.col0.ma * indcoord[0] + indmtx.col1.mc * indcoord[1] +
                         indmtx.col2.me * indcoord[2]) >>
                        3;
@@ -506,25 +502,29 @@ void Tev::Indirect(unsigned int stageNum, s32 s, s32 t)
                         indmtx.col2.mf * indcoord[2]) >>
                        3;
       break;
-    case 4:  // s matrix
+    case IndMtxId::S:
       // s is S17.7, matrix elements are divided by 256, output is S17.7, so divide by 256. - TODO:
       // Maybe, since s is actually stored as S24, we should divide by 256*64?
-      shift = (17 - scale);
       indtevtrans[0] = s * indcoord[0] / 256;
       indtevtrans[1] = t * indcoord[0] / 256;
       break;
-    case 8:  // t matrix
-      shift = (17 - scale);
+    case IndMtxId::T:
       indtevtrans[0] = s * indcoord[1] / 256;
       indtevtrans[1] = t * indcoord[1] / 256;
       break;
     default:
+      PanicAlertFmt("Invalid indirect matrix ID {}", indirect.matrix_id);
       return;
     }
 
     indtevtrans[0] = shift >= 0 ? indtevtrans[0] >> shift : indtevtrans[0] << -shift;
     indtevtrans[1] = shift >= 0 ? indtevtrans[1] >> shift : indtevtrans[1] << -shift;
   }
+  else
+  {
+    // If matrix_index is Off (0), matrix_id should be Indirect (0)
+    ASSERT(indirect.matrix_id == IndMtxId::Indirect);
+  }
 
   if (indirect.fb_addprev)
   {
diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index 36ec9ec0a9..569b1747f3 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -300,6 +300,31 @@ struct fmt::formatter<IndTexBias> : EnumFormatter<IndTexBias::STU>
   formatter() : EnumFormatter({"None", "S", "T", "ST", "U", "SU", "TU", "STU"}) {}
 };
 
+enum class IndMtxIndex : u32
+{
+  Off = 0,
+  Matrix0 = 1,
+  Matrix1 = 2,
+  Matrix2 = 3,
+};
+template <>
+struct fmt::formatter<IndMtxIndex> : EnumFormatter<IndMtxIndex::Matrix2>
+{
+  formatter() : EnumFormatter({"Off", "Matrix 0", "Matrix 1", "Matrix 2"}) {}
+};
+
+enum class IndMtxId : u32
+{
+  Indirect = 0,
+  S = 1,
+  T = 2,
+};
+template <>
+struct fmt::formatter<IndMtxId> : EnumFormatter<IndMtxId::T>
+{
+  formatter() : EnumFormatter({"Indirect", "S", "T"}) {}
+};
+
 // Indirect texture bump alpha
 enum class IndTexBumpAlpha : u32
 {
@@ -335,7 +360,7 @@ union IND_MTXA
 {
   BitField<0, 11, s32> ma;
   BitField<11, 11, s32> mb;
-  BitField<22, 2, u32> s0;  // bits 0-1 of scale factor
+  BitField<22, 2, u8, u32> s0;  // bits 0-1 of scale factor
   u32 hex;
 };
 
@@ -343,7 +368,7 @@ union IND_MTXB
 {
   BitField<0, 11, s32> mc;
   BitField<11, 11, s32> md;
-  BitField<22, 2, u32> s1;  // bits 2-3 of scale factor
+  BitField<22, 2, u8, u32> s1;  // bits 2-3 of scale factor
   u32 hex;
 };
 
@@ -351,7 +376,7 @@ union IND_MTXC
 {
   BitField<0, 11, s32> me;
   BitField<11, 11, s32> mf;
-  BitField<22, 2, u32> s2;  // bits 4-5 of scale factor
+  BitField<22, 2, u8, u32> s2;  // bits 4-5 of scale factor
   u32 hex;
 };
 
@@ -360,6 +385,7 @@ struct IND_MTX
   IND_MTXA col0;
   IND_MTXB col1;
   IND_MTXC col2;
+  u8 GetScale() const { return (col0.s0 << 0) | (col1.s1 << 2) | (col2.s2 << 4); }
 };
 
 union IND_IMASK
@@ -475,8 +501,12 @@ union TevStageIndirect
   BitField<4, 1, bool, u32> bias_s;
   BitField<5, 1, bool, u32> bias_t;
   BitField<6, 1, bool, u32> bias_u;
-  BitField<7, 2, IndTexBumpAlpha> bs;     // Indicates which coordinate will become the 'bump alpha'
-  BitField<9, 4, u32> mid;                // Matrix ID to multiply offsets with
+  BitField<7, 2, IndTexBumpAlpha> bs;  // Indicates which coordinate will become the 'bump alpha'
+  // Indicates which indirect matrix is used when matrix_id is Indirect.
+  // Also always indicates which indirect matrix to use for the scale factor, even with S or T.
+  BitField<9, 2, IndMtxIndex> matrix_index;
+  // Should be set to Indirect (0) if matrix_index is Off (0)
+  BitField<11, 2, IndMtxId> matrix_id;
   BitField<13, 3, IndTexWrap> sw;         // Wrapping factor for S of regular coord
   BitField<16, 3, IndTexWrap> tw;         // Wrapping factor for T of regular coord
   BitField<19, 1, bool, u32> lb_utclod;   // Use modified or unmodified texture
@@ -492,9 +522,9 @@ union TevStageIndirect
 
   u32 fullhex;
 
-  // If bs and mid are zero, the result of the stage is independent of
+  // If bs and matrix are zero, the result of the stage is independent of
   // the texture sample data, so we can skip sampling the texture.
-  bool IsActive() const { return bs != IndTexBumpAlpha::Off || mid != 0; }
+  bool IsActive() const { return bs != IndTexBumpAlpha::Off || matrix_index != IndMtxIndex::Off; }
 };
 template <>
 struct fmt::formatter<TevStageIndirect>
@@ -508,13 +538,15 @@ struct fmt::formatter<TevStageIndirect>
                      "Format: {}\n"
                      "Bias: {}\n"
                      "Bump alpha: {}\n"
+                     "Offset matrix index: {}\n"
                      "Offset matrix ID: {}\n"
                      "Regular coord S wrapping factor: {}\n"
                      "Regular coord T wrapping factor: {}\n"
                      "Use modified texture coordinates for LOD computation: {}\n"
                      "Add texture coordinates from previous TEV stage: {}",
-                     tevind.bt, tevind.fmt, tevind.bias, tevind.bs, tevind.mid, tevind.sw,
-                     tevind.tw, tevind.lb_utclod ? "Yes" : "No", tevind.fb_addprev ? "Yes" : "No");
+                     tevind.bt, tevind.fmt, tevind.bias, tevind.bs, tevind.matrix_index,
+                     tevind.matrix_id, tevind.sw, tevind.tw, tevind.lb_utclod ? "Yes" : "No",
+                     tevind.fb_addprev ? "Yes" : "No");
   }
 };
 
@@ -1914,7 +1946,7 @@ struct BPMemory
   GenMode genMode;
   u32 display_copy_filter[4];  // 01-04
   u32 unknown;                 // 05
-  // indirect matrices (set by GXSetIndTexMtx, selected by TevStageIndirect::mid)
+  // indirect matrices (set by GXSetIndTexMtx, selected by TevStageIndirect::matrix_index)
   // abc form a 2x3 offset matrix, there's 3 such matrices
   // the 3 offset matrices can either be indirect type, S-type, or T-type
   // 6bit scale factor s is distributed across IND_MTXA/B/C.
diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index ab59bfd52a..866b7bacc8 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -991,7 +991,7 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
       // TODO: Should we reset alphabump to 0 here?
     }
 
-    if (tevind.mid != 0)
+    if (tevind.matrix_index != IndMtxIndex::Off)
     {
       // format
       static constexpr std::array<const char*, 4> tev_ind_fmt_mask{
@@ -1038,11 +1038,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
                   tev_ind_bias_add[u32(tevind.fmt.Value())]);
       }
 
+      // Multiplied by 2 because each matrix has two rows.
+      // Note also that the 4th column of the matrix contains the scale factor.
+      const u32 mtxidx = 2 * (static_cast<u32>(tevind.matrix_index.Value()) - 1);
+
       // multiply by offset matrix and scale - calculations are likely to overflow badly,
       // yet it works out since we only care about the lower 23 bits (+1 sign bit) of the result
-      if (tevind.mid <= 3)
+      if (tevind.matrix_id == IndMtxId::Indirect)
       {
-        const u32 mtxidx = 2 * (tevind.mid - 1);
         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
 
         out.Write("\tint2 indtevtrans{} = int2(idot(" I_INDTEXMTX
@@ -1064,10 +1067,9 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
           out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
         }
       }
-      else if (tevind.mid <= 7 && has_tex_coord)
-      {  // s matrix
-        ASSERT(tevind.mid >= 5);
-        const u32 mtxidx = 2 * (tevind.mid - 5);
+      else if (tevind.matrix_id == IndMtxId::S)
+      {
+        ASSERT(has_tex_coord);
         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
 
         out.Write("\tint2 indtevtrans{} = int2(fixpoint_uv{} * iindtevcrd{}.xx) >> 8;\n", n,
@@ -1086,10 +1088,9 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
           out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
         }
       }
-      else if (tevind.mid <= 11 && has_tex_coord)
-      {  // t matrix
-        ASSERT(tevind.mid >= 9);
-        const u32 mtxidx = 2 * (tevind.mid - 9);
+      else if (tevind.matrix_id == IndMtxId::T)
+      {
+        ASSERT(has_tex_coord);
         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
 
         out.Write("\tint2 indtevtrans{} = int2(fixpoint_uv{} * iindtevcrd{}.yy) >> 8;\n", n,
@@ -1112,11 +1113,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
       else
       {
         out.Write("\tint2 indtevtrans{} = int2(0, 0);\n", n);
+        ASSERT(false);  // Unknown value for matrix_id
       }
     }
     else
     {
       out.Write("\tint2 indtevtrans{} = int2(0, 0);\n", n);
+      // If matrix_index is Off (0), matrix_id should be Indirect (0)
+      ASSERT(tevind.matrix_id == IndMtxId::Indirect);
     }
 
     // ---------
diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp
index bae48623c7..6d52d8ffc1 100644
--- a/Source/Core/VideoCommon/PixelShaderManager.cpp
+++ b/Source/Core/VideoCommon/PixelShaderManager.cpp
@@ -336,9 +336,7 @@ void PixelShaderManager::SetIndTexScaleChanged(bool high)
 
 void PixelShaderManager::SetIndMatrixChanged(int matrixidx)
 {
-  int scale = ((u32)bpmem.indmtx[matrixidx].col0.s0 << 0) |
-              ((u32)bpmem.indmtx[matrixidx].col1.s1 << 2) |
-              ((u32)bpmem.indmtx[matrixidx].col2.s2 << 4);
+  const u8 scale = bpmem.indmtx[matrixidx].GetScale();
 
   // xyz - static matrix
   // w - dynamic matrix scale / 128
diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index c35ce2821d..996d3c1d55 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -806,7 +806,10 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
     out.Write("      uint fmt = {};\n", BitfieldExtract<&TevStageIndirect::fmt>("tevind"));
     out.Write("      uint bias = {};\n", BitfieldExtract<&TevStageIndirect::bias>("tevind"));
     out.Write("      uint bt = {};\n", BitfieldExtract<&TevStageIndirect::bt>("tevind"));
-    out.Write("      uint mid = {};\n", BitfieldExtract<&TevStageIndirect::mid>("tevind"));
+    out.Write("      uint matrix_index = {};\n",
+              BitfieldExtract<&TevStageIndirect::matrix_index>("tevind"));
+    out.Write("      uint matrix_id = {};\n",
+              BitfieldExtract<&TevStageIndirect::matrix_id>("tevind"));
     out.Write("\n");
     out.Write("      int3 indcoord;\n");
     LookupIndirectTexture("indcoord", "bt");
@@ -846,12 +849,12 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
               "\n"
               "      // Matrix multiply\n"
               "      int2 indtevtrans = int2(0, 0);\n"
-              "      if ((mid & 3u) != 0u)\n"
+              "      if (matrix_index != 0u)\n"
               "      {{\n"
-              "        uint mtxidx = 2u * ((mid & 3u) - 1u);\n"
+              "        uint mtxidx = 2u * (matrix_index - 1u);\n"
               "        int shift = " I_INDTEXMTX "[mtxidx].w;\n"
               "\n"
-              "        switch (mid >> 2)\n"
+              "        switch (matrix_id)\n"
               "        {{\n"
               "        case 0u: // 3x2 S0.10 matrix\n"
               "          indtevtrans = int2(idot(" I_INDTEXMTX

From 002ff4e4dd594c75898df9ba5ee4a14bc8fb7f77 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sat, 17 Apr 2021 09:57:16 -0700
Subject: [PATCH 04/10] PixelShaderGen: Remove unused num_texgens argument

It became unused in f039149198657c1891e1c6462ed30c31ed4b8486.
---
 Source/Core/VideoCommon/PixelShaderGen.cpp  | 5 ++---
 Source/Core/VideoCommon/PixelShaderGen.h    | 2 +-
 Source/Core/VideoCommon/UberShaderPixel.cpp | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index 866b7bacc8..6cda21b41b 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -361,7 +361,7 @@ void ClearUnusedPixelShaderUidBits(APIType api_type, const ShaderHostConfig& hos
   uid_data->bounding_box &= host_config.bounding_box & host_config.backend_bbox;
 }
 
-void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, u32 num_texgens,
+void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
                                   const ShaderHostConfig& host_config, bool bounding_box)
 {
   // dot product for integer vectors
@@ -546,8 +546,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
             uid_data->genMode_numtexgens, uid_data->genMode_numindstages);
 
   // Stuff that is shared between ubershaders and pixelgen.
-  WritePixelShaderCommonHeader(out, api_type, uid_data->genMode_numtexgens, host_config,
-                               uid_data->bounding_box);
+  WritePixelShaderCommonHeader(out, api_type, host_config, uid_data->bounding_box);
 
   if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ)
   {
diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h
index 08724274ee..6691e9c507 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.h
+++ b/Source/Core/VideoCommon/PixelShaderGen.h
@@ -158,7 +158,7 @@ using PixelShaderUid = ShaderUid<pixel_shader_uid_data>;
 
 ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& host_config,
                                    const pixel_shader_uid_data* uid_data);
-void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, u32 num_texgens,
+void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
                                   const ShaderHostConfig& host_config, bool bounding_box);
 void ClearUnusedPixelShaderUidBits(APIType api_type, const ShaderHostConfig& host_config,
                                    PixelShaderUid* uid);
diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index 996d3c1d55..f3a1fe2a4b 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -64,7 +64,7 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
 
   out.Write("// Pixel UberShader for {} texgens{}{}\n", numTexgen,
             early_depth ? ", early-depth" : "", per_pixel_depth ? ", per-pixel depth" : "");
-  WritePixelShaderCommonHeader(out, ApiType, numTexgen, host_config, bounding_box);
+  WritePixelShaderCommonHeader(out, ApiType, host_config, bounding_box);
   WriteUberShaderCommonHeader(out, ApiType, host_config);
   if (per_pixel_lighting)
     WriteLightingFunction(out);

From f6cf85a8bca13d3e8a075f2f823a142b7b9d115d Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Mon, 5 Aug 2019 02:18:37 +0100
Subject: [PATCH 05/10] PixelShaderGen: Fix OOB tex coord indices

Previously we set the texture coordinate to zero, now we set
the texture coordinate *index* to zero. This fixes the ripple
effect of the Mario painting in Luigi's Mansion.

Co-authored-by: Pokechu22 <Pokechu022@gmail.com>
---
 Source/Core/VideoCommon/PixelShaderGen.cpp | 56 +++++++++++-----------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index 6cda21b41b..b59cf04ebb 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -238,14 +238,9 @@ PixelShaderUid GetPixelShaderUid()
 
   for (unsigned int n = 0; n < numStages; n++)
   {
-    int texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
-    bool bHasTexCoord = (u32)texcoord < bpmem.genMode.numtexgens;
-    // HACK to handle cases where the tex gen is not enabled
-    if (!bHasTexCoord)
-      texcoord = bpmem.genMode.numtexgens;
+    uid_data->stagehash[n].tevorders_texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
 
     uid_data->stagehash[n].hasindstage = bpmem.tevind[n].bt < bpmem.genMode.numindstages;
-    uid_data->stagehash[n].tevorders_texcoord = texcoord;
     if (uid_data->stagehash[n].hasindstage)
       uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
 
@@ -774,9 +769,11 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
       out.Write("col1 = float4(0.0, 0.0, 0.0, 0.0);\n");
   }
 
-  // HACK to handle cases where the tex gen is not enabled
   if (uid_data->genMode_numtexgens == 0)
   {
+    // TODO: This is a hack to ensure that shaders still compile when setting out of bounds tex
+    // coord indices to 0.  Ideally, it shouldn't exist at all, but the exact behavior hasn't been
+    // tested.
     out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
   }
   else
@@ -795,19 +792,19 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
   {
     if ((uid_data->nIndirectStagesUsed & (1U << i)) != 0)
     {
-      const u32 texcoord = uid_data->GetTevindirefCoord(i);
+      u32 texcoord = uid_data->GetTevindirefCoord(i);
       const u32 texmap = uid_data->GetTevindirefMap(i);
 
-      if (texcoord < uid_data->genMode_numtexgens)
-      {
-        out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
-        out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2,
-                  (i & 1) != 0 ? "zw" : "xy");
-      }
-      else
-      {
-        out.Write("\ttempcoord = int2(0, 0);\n");
-      }
+      // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+      // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+      // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
+      // the number of tex gens to 2 (bug 11462).
+      if (texcoord >= uid_data->genMode_numtexgens)
+        texcoord = 0;
+
+      out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
+      out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2,
+                (i & 1) ? "zw" : "xy");
 
       out.Write("\tint3 iindtex{} = ", i);
       SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, api_type);
@@ -949,7 +946,8 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
   const auto& stage = uid_data->stagehash[n];
   out.Write("\n\t// TEV stage {}\n", n);
 
-  // HACK to handle cases where the tex gen is not enabled
+  // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does not
+  // exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
   u32 texcoord = stage.tevorders_texcoord;
   const bool has_tex_coord = texcoord < uid_data->genMode_numtexgens;
   if (!has_tex_coord)
@@ -1169,6 +1167,10 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     // Emulate s24 overflows
     out.Write("\ttevcoord.xy = (tevcoord.xy << 8) >> 8;\n");
   }
+  else
+  {
+    out.Write("\ttevcoord.xy = fixpoint_uv{};\n", texcoord);
+  }
 
   TevStageCombiner::ColorCombiner cc;
   TevStageCombiner::AlphaCombiner ac;
@@ -1194,7 +1196,7 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     out.Write("\trastemp = {}.{};\n", tev_ras_table[u32(stage.tevorders_colorchan)], rasswap);
   }
 
-  if (stage.tevorders_enable)
+  if (stage.tevorders_enable && uid_data->genMode_numtexgens > 0)
   {
     // Generate swizzle string to represent the texture color channel swapping
     const char texswap[5] = {
@@ -1205,17 +1207,15 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
         '\0',
     };
 
-    if (!stage.hasindstage)
-    {
-      // calc tevcord
-      if (has_tex_coord)
-        out.Write("\ttevcoord.xy = fixpoint_uv{};\n", texcoord);
-      else
-        out.Write("\ttevcoord.xy = int2(0, 0);\n");
-    }
     out.Write("\ttextemp = ");
     SampleTexture(out, "float2(tevcoord.xy)", texswap, stage.tevorders_texmap, stereo, api_type);
   }
+  else if (uid_data->genMode_numtexgens == 0)
+  {
+    // It seems like the result is always black when no tex coords are enabled, but further testing
+    // is needed.
+    out.Write("\ttextemp = int4(0, 0, 0, 0);\n");
+  }
   else
   {
     out.Write("\ttextemp = int4(255, 255, 255, 255);\n");

From 16c17ed9cead8f2cafd5d027d5f4957cd906246c Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Thu, 8 Apr 2021 18:10:13 -0700
Subject: [PATCH 06/10] Software: Fix OOB tex coord indices

Previously we set the texture coordinate to zero, now we set
the texture coordinate *index* to zero. This fixes the ripple
effect of the Mario painting in Luigi's Mansion.
---
 Source/Core/VideoBackends/Software/Tev.cpp | 32 ++++++++++++++++++----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/Source/Core/VideoBackends/Software/Tev.cpp b/Source/Core/VideoBackends/Software/Tev.cpp
index 30258f6cd3..be6f14d610 100644
--- a/Source/Core/VideoBackends/Software/Tev.cpp
+++ b/Source/Core/VideoBackends/Software/Tev.cpp
@@ -6,6 +6,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <cstring>
 
 #include "Common/ChunkFile.h"
 #include "Common/CommonTypes.h"
@@ -559,9 +560,16 @@ void Tev::Draw()
     const int stageNum2 = stageNum >> 1;
     const int stageOdd = stageNum & 1;
 
-    const u32 texcoordSel = bpmem.tevindref.getTexCoord(stageNum);
+    u32 texcoordSel = bpmem.tevindref.getTexCoord(stageNum);
     const u32 texmap = bpmem.tevindref.getTexMap(stageNum);
 
+    // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+    // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+    // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
+    // the number of tex gens to 2 (bug 11462).
+    if (texcoordSel >= bpmem.genMode.numtexgens)
+      texcoordSel = 0;
+
     const TEXSCALE& texscale = bpmem.texscale[stageNum2];
     const s32 scaleS = stageOdd ? texscale.ss1 : texscale.ss0;
     const s32 scaleT = stageOdd ? texscale.ts1 : texscale.ts0;
@@ -592,8 +600,13 @@ void Tev::Draw()
     const TevStageCombiner::ColorCombiner& cc = bpmem.combiners[stageNum].colorC;
     const TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[stageNum].alphaC;
 
-    const int texcoordSel = order.getTexCoord(stageOdd);
-    const int texmap = order.getTexMap(stageOdd);
+    u32 texcoordSel = order.getTexCoord(stageOdd);
+    const u32 texmap = order.getTexMap(stageOdd);
+
+    // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+    // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+    if (texcoordSel >= bpmem.genMode.numtexgens)
+      texcoordSel = 0;
 
     Indirect(stageNum, Uv[texcoordSel].s, Uv[texcoordSel].t);
 
@@ -603,8 +616,17 @@ void Tev::Draw()
       // RGBA
       u8 texel[4];
 
-      TextureSampler::Sample(TexCoord.s, TexCoord.t, TextureLod[stageNum], TextureLinear[stageNum],
-                             texmap, texel);
+      if (bpmem.genMode.numtexgens > 0)
+      {
+        TextureSampler::Sample(TexCoord.s, TexCoord.t, TextureLod[stageNum],
+                               TextureLinear[stageNum], texmap, texel);
+      }
+      else
+      {
+        // It seems like the result is always black when no tex coords are enabled, but further
+        // hardware testing is needed.
+        std::memset(texel, 0, 4);
+      }
 
 #if ALLOW_TEV_DUMPS
       if (g_ActiveConfig.bDumpTevTextureFetches)

From ed020349676f6bf465c1129123fafcdbfeaeef0b Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sat, 17 Apr 2021 14:15:34 -0700
Subject: [PATCH 07/10] UberShaderPixel: Return fixed-point values from
 selectTexCoord

This change should have no behavioral differences itself, but allows for changing the behavior of out of bounds tex coord indices more easily in the next commit.  Without this change, returning tex0 for out of bounds cases and then applying the fixed-point logic would use the wrong tex dimension info (tex0 with I_TEXDIMS[1] or such), which is inaccurate.
---
 Source/Core/VideoCommon/UberShaderPixel.cpp | 73 +++++++++------------
 1 file changed, 32 insertions(+), 41 deletions(-)

diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index f3a1fe2a4b..574b089a7c 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -150,17 +150,10 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   // Uniform index -> texture coordinates
   if (numTexgen > 0)
   {
-    if (ApiType != APIType::D3D)
-    {
-      out.Write("float3 selectTexCoord(uint index) {{\n");
-    }
-    else
-    {
-      out.Write("float3 selectTexCoord(uint index");
-      for (u32 i = 0; i < numTexgen; i++)
-        out.Write(", float3 tex{}", i);
-      out.Write(") {{\n");
-    }
+    out.Write("int2 selectTexCoord(uint index");
+    for (u32 i = 0; i < numTexgen; i++)
+      out.Write(", int2 fixpoint_uv{}", i);
+    out.Write(") {{\n");
 
     if (ApiType == APIType::D3D)
     {
@@ -168,11 +161,11 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
       for (u32 i = 0; i < numTexgen; i++)
       {
         out.Write("  case {}u:\n"
-                  "    return tex{};\n",
+                  "    return fixpoint_uv{};\n",
                   i, i);
       }
       out.Write("  default:\n"
-                "    return float3(0.0, 0.0, 0.0);\n"
+                "    return int2(0, 0);\n"
                 "  }}\n");
     }
     else
@@ -182,16 +175,16 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
       if (numTexgen > 2)
         out.Write("    if (index < 2u) {{\n");
       if (numTexgen > 1)
-        out.Write("      return (index == 0u) ? tex0 : tex1;\n");
+        out.Write("      return (index == 0u) ? fixpoint_uv0 : fixpoint_uv1;\n");
       else
-        out.Write("      return (index == 0u) ? tex0 : float3(0.0, 0.0, 0.0);\n");
+        out.Write("      return (index == 0u) ? fixpoint_uv0 : int2(0, 0);\n");
       if (numTexgen > 2)
       {
         out.Write("    }} else {{\n");  // >= 2
         if (numTexgen > 3)
-          out.Write("      return (index == 2u) ? tex2 : tex3;\n");
+          out.Write("      return (index == 2u) ? fixpoint_uv2 : fixpoint_uv3;\n");
         else
-          out.Write("      return (index == 2u) ? tex2 : float3(0.0, 0.0, 0.0);\n");
+          out.Write("      return (index == 2u) ? fixpoint_uv2 : int2(0, 0);\n");
         out.Write("    }}\n");
       }
       if (numTexgen > 4)
@@ -200,16 +193,16 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
         if (numTexgen > 6)
           out.Write("    if (index < 6u) {{\n");
         if (numTexgen > 5)
-          out.Write("      return (index == 4u) ? tex4 : tex5;\n");
+          out.Write("      return (index == 4u) ? fixpoint_uv4 : fixpoint_uv5;\n");
         else
-          out.Write("      return (index == 4u) ? tex4 : float3(0.0, 0.0, 0.0);\n");
+          out.Write("      return (index == 4u) ? fixpoint_uv4 : int2(0, 0);\n");
         if (numTexgen > 6)
         {
           out.Write("    }} else {{\n");  // >= 6 <= 8
           if (numTexgen > 7)
-            out.Write("      return (index == 6u) ? tex6 : tex7;\n");
+            out.Write("      return (index == 6u) ? fixpoint_uv6 : fixpoint_uv7;\n");
           else
-            out.Write("      return (index == 6u) ? tex6 : float3(0.0, 0.0, 0.0);\n");
+            out.Write("      return (index == 6u) ? fixpoint_uv6 : int2(0, 0);\n");
           out.Write("    }}\n");
         }
         out.Write("  }}\n");
@@ -293,9 +286,7 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
               "  {{\n"
               "    uint texcoord = bitfieldExtract(iref, 0, 3);\n"
               "    uint texmap = bitfieldExtract(iref, 8, 3);\n"
-              "    float3 uv = getTexCoord(texcoord);\n"
-              "    int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS
-              "[texcoord].zw);\n"
+              "    int2 fixedPoint_uv = getTexCoord(texcoord);\n"
               "\n"
               "    if (({} & 1u) == 0u)\n"
               "      fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[{} >> 1].xy;\n"
@@ -666,21 +657,14 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
         "\n");
   }
 
-  // Since the texture coodinate variables aren't global, we need to pass
-  // them to the select function in D3D.
+  // Since the fixed-point texture coodinate variables aren't global, we need to pass
+  // them to the select function.  This applies to all backends.
   if (numTexgen > 0)
   {
-    if (ApiType != APIType::D3D)
-    {
-      out.Write("#define getTexCoord(index) selectTexCoord((index))\n\n");
-    }
-    else
-    {
-      out.Write("#define getTexCoord(index) selectTexCoord((index)");
-      for (u32 i = 0; i < numTexgen; i++)
-        out.Write(", tex{}", i);
-      out.Write(")\n\n");
-    }
+    out.Write("#define getTexCoord(index) selectTexCoord((index)");
+    for (u32 i = 0; i < numTexgen; i++)
+      out.Write(", fixpoint_uv{}", i);
+    out.Write(")\n\n");
   }
 
   if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
@@ -788,11 +772,18 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   // Disable texturing when there are no texgens (for now)
   if (numTexgen != 0)
   {
-    out.Write("    uint tex_coord = {};\n",
+    for (u32 i = 0; i < numTexgen; i++)
+    {
+      out.Write("    int2 fixpoint_uv{} = int2(", i);
+      out.Write("(tex{}.z == 0.0 ? tex{}.xy : tex{}.xy / tex{}.z)", i, i, i, i);
+      out.Write(" * " I_TEXDIMS "[{}].zw);\n", i);
+      // TODO: S24 overflows here?
+    }
+
+    out.Write("\n"
+              "    uint tex_coord = {};\n",
               BitfieldExtract<&TwoTevStageOrders::texcoord0>("ss.order"));
-    out.Write("    float3 uv = getTexCoord(tex_coord);\n"
-              "    int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS
-              "[tex_coord].zw);\n"
+    out.Write("    int2 fixedPoint_uv = getTexCoord(tex_coord);\n"
               "\n"
               "    bool texture_enabled = (ss.order & {}u) != 0u;\n",
               1 << TwoTevStageOrders().enable0.StartBit());

From 5e3360c2cc84335b7089fb9bc5faf5453b602d13 Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sat, 17 Apr 2021 13:23:39 -0700
Subject: [PATCH 08/10] UberShaderPixel: Fix OOB tex coord indices

Previously we set the texture coordinate to zero, now we set
the texture coordinate *index* to zero. This fixes the ripple
effect of the Mario painting in Luigi's Mansion.
---
 Source/Core/VideoCommon/UberShaderPixel.cpp | 23 ++++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index 574b089a7c..a029f58812 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -148,6 +148,10 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   }
 
   // Uniform index -> texture coordinates
+  // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+  // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+  // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
+  // the number of tex gens to 2 (bug 11462).
   if (numTexgen > 0)
   {
     out.Write("int2 selectTexCoord(uint index");
@@ -165,11 +169,14 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
                   i, i);
       }
       out.Write("  default:\n"
-                "    return int2(0, 0);\n"
+                "    return fixpoint_uv0;\n"
                 "  }}\n");
     }
     else
     {
+      out.Write("  if (index >= {}u) {{\n", numTexgen);
+      out.Write("    return fixpoint_uv0;\n"
+                "  }}\n");
       if (numTexgen > 4)
         out.Write("  if (index < 4u) {{\n");
       if (numTexgen > 2)
@@ -177,32 +184,32 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
       if (numTexgen > 1)
         out.Write("      return (index == 0u) ? fixpoint_uv0 : fixpoint_uv1;\n");
       else
-        out.Write("      return (index == 0u) ? fixpoint_uv0 : int2(0, 0);\n");
+        out.Write("      return fixpoint_uv0;\n");
       if (numTexgen > 2)
       {
-        out.Write("    }} else {{\n");  // >= 2
+        out.Write("    }} else {{\n");  // >= 2 < min(4, numTexgen)
         if (numTexgen > 3)
           out.Write("      return (index == 2u) ? fixpoint_uv2 : fixpoint_uv3;\n");
         else
-          out.Write("      return (index == 2u) ? fixpoint_uv2 : int2(0, 0);\n");
+          out.Write("      return fixpoint_uv2;\n");
         out.Write("    }}\n");
       }
       if (numTexgen > 4)
       {
-        out.Write("  }} else {{\n");  // >= 4 <= 8
+        out.Write("  }} else {{\n");  // >= 4 < min(8, numTexgen)
         if (numTexgen > 6)
           out.Write("    if (index < 6u) {{\n");
         if (numTexgen > 5)
           out.Write("      return (index == 4u) ? fixpoint_uv4 : fixpoint_uv5;\n");
         else
-          out.Write("      return (index == 4u) ? fixpoint_uv4 : int2(0, 0);\n");
+          out.Write("      return fixpoint_uv4;\n");
         if (numTexgen > 6)
         {
-          out.Write("    }} else {{\n");  // >= 6 <= 8
+          out.Write("    }} else {{\n");  // >= 6 < min(8, numTexgen)
           if (numTexgen > 7)
             out.Write("      return (index == 6u) ? fixpoint_uv6 : fixpoint_uv7;\n");
           else
-            out.Write("      return (index == 6u) ? fixpoint_uv6 : int2(0, 0);\n");
+            out.Write("      return fixpoint_uv6;\n");
           out.Write("    }}\n");
         }
         out.Write("  }}\n");

From b5844ab195a38303e5fb82a7f6804a726fa8fb7a Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sun, 18 Apr 2021 16:12:21 -0700
Subject: [PATCH 09/10] PixelShaderGen: always run indirect stage logic

Hardware testing has confirmed that fb_addprev and wrapping both run even when the indirect stage is disabled.
---
 Source/Core/VideoCommon/GXPipelineTypes.h  |  1 +
 Source/Core/VideoCommon/PixelShaderGen.cpp | 48 +++++++++++-----------
 Source/Core/VideoCommon/PixelShaderGen.h   |  1 +
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/Source/Core/VideoCommon/GXPipelineTypes.h b/Source/Core/VideoCommon/GXPipelineTypes.h
index 1ddd853597..8892d386f5 100644
--- a/Source/Core/VideoCommon/GXPipelineTypes.h
+++ b/Source/Core/VideoCommon/GXPipelineTypes.h
@@ -20,6 +20,7 @@ namespace VideoCommon
 // As pipelines encompass both shader UIDs and render states, changes to either of these should
 // also increment the pipeline UID version. Incrementing the UID version will cause all UID
 // caches to be invalidated.
+// TODO: Remove PixelShaderUid hasindstage on the next UID version bump
 constexpr u32 GX_PIPELINE_UID_VERSION = 2;  // Last changed in PR 9122
 
 struct GXPipelineUid
diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index b59cf04ebb..636bf2be7f 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -220,13 +220,10 @@ PixelShaderUid GetPixelShaderUid()
 
   // indirect texture map lookup
   int nIndirectStagesUsed = 0;
-  if (uid_data->genMode_numindstages > 0)
+  for (unsigned int i = 0; i < numStages; ++i)
   {
-    for (unsigned int i = 0; i < numStages; ++i)
-    {
-      if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < uid_data->genMode_numindstages)
-        nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
-    }
+    if (bpmem.tevind[i].IsActive())
+      nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
   }
 
   uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
@@ -240,9 +237,12 @@ PixelShaderUid GetPixelShaderUid()
   {
     uid_data->stagehash[n].tevorders_texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
 
+    // hasindstage previously was used as a criterion to set tevind to 0, but there are variables in
+    // tevind that are used even if the indirect stage is disabled, so now it is only left in to
+    // avoid breaking existing UIDs (in most cases, games will have 0 in tevind anyways)
+    // TODO: Remove hasindstage on the next UID version bump
     uid_data->stagehash[n].hasindstage = bpmem.tevind[n].bt < bpmem.genMode.numindstages;
-    if (uid_data->stagehash[n].hasindstage)
-      uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
+    uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
 
     TevStageCombiner::ColorCombiner& cc = bpmem.combiners[n].colorC;
     TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[n].alphaC;
@@ -810,6 +810,16 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
       SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, api_type);
     }
   }
+  for (u32 i = uid_data->genMode_numindstages; i < 4; i++)
+  {
+    // Referencing a stage above the number of ind stages is undefined behavior,
+    // and on console produces a noise pattern (details unknown).
+    // TODO: This behavior is nowhere near that, but it ensures the shader still compiles.
+    if ((uid_data->nIndirectStagesUsed & (1U << i)) != 0)
+    {
+      out.Write("\tint3 iindtex{} = int3(0, 0, 0);  // Undefined behavior on console\n", i);
+    }
+  }
 
   for (u32 i = 0; i < numStages; i++)
   {
@@ -953,11 +963,8 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
   if (!has_tex_coord)
     texcoord = 0;
 
-  if (stage.hasindstage)
   {
-    TevStageIndirect tevind;
-    tevind.hex = stage.tevind;
-
+    const TevStageIndirect tevind{.hex = stage.tevind};
     out.Write("\t// indirect op\n");
 
     // Perform the indirect op on the incoming regular coordinates
@@ -1124,9 +1131,8 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     // Wrapping
     // ---------
 
-    // TODO: Should the last element be 1 or (1<<7)?
-    static constexpr std::array<const char*, 7> tev_ind_wrap_start{
-        "0", "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)", "1",
+    static constexpr std::array<const char*, 5> tev_ind_wrap_start{
+        "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)",
     };
 
     // wrap S
@@ -1134,14 +1140,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     {
       out.Write("\twrappedcoord.x = fixpoint_uv{}.x;\n", texcoord);
     }
-    else if (tevind.sw == IndTexWrap::ITW_0)
+    else if (tevind.sw >= IndTexWrap::ITW_0)  // 7 (Invalid) appears to behave the same as 6 (ITW_0)
     {
       out.Write("\twrappedcoord.x = 0;\n");
     }
     else
     {
       out.Write("\twrappedcoord.x = fixpoint_uv{}.x & ({} - 1);\n", texcoord,
-                tev_ind_wrap_start[u32(tevind.sw.Value())]);
+                tev_ind_wrap_start[u32(tevind.sw.Value()) - u32(IndTexWrap::ITW_256)]);
     }
 
     // wrap T
@@ -1149,14 +1155,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     {
       out.Write("\twrappedcoord.y = fixpoint_uv{}.y;\n", texcoord);
     }
-    else if (tevind.tw == IndTexWrap::ITW_0)
+    else if (tevind.tw >= IndTexWrap::ITW_0)  // 7 (Invalid) appears to behave the same as 6 (ITW_0)
     {
       out.Write("\twrappedcoord.y = 0;\n");
     }
     else
     {
       out.Write("\twrappedcoord.y = fixpoint_uv{}.y & ({} - 1);\n", texcoord,
-                tev_ind_wrap_start[u32(tevind.tw.Value())]);
+                tev_ind_wrap_start[u32(tevind.tw.Value()) - u32(IndTexWrap::ITW_256)]);
     }
 
     if (tevind.fb_addprev)  // add previous tevcoord
@@ -1167,10 +1173,6 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     // Emulate s24 overflows
     out.Write("\ttevcoord.xy = (tevcoord.xy << 8) >> 8;\n");
   }
-  else
-  {
-    out.Write("\ttevcoord.xy = fixpoint_uv{};\n", texcoord);
-  }
 
   TevStageCombiner::ColorCombiner cc;
   TevStageCombiner::AlphaCombiner ac;
diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h
index 6691e9c507..d75fac8eb7 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.h
+++ b/Source/Core/VideoCommon/PixelShaderGen.h
@@ -133,6 +133,7 @@ struct pixel_shader_uid_data
     u32 pad1 : 6;
 
     // TODO: Clean up the swapXY mess
+    // TODO: remove hasindstage, as it no longer does anything useful
     u32 hasindstage : 1;
     u32 tevind : 21;
     u32 tevksel_swap1a : 2;

From e1d45e9ba66d3ba7d6769e36c0fc82ceb5028ecb Mon Sep 17 00:00:00 2001
From: Pokechu22 <Pokechu022@gmail.com>
Date: Sun, 18 Apr 2021 19:51:02 -0700
Subject: [PATCH 10/10] UberShaderPixel: always run indirect stage logic

Hardware testing has confirmed that fb_addprev and wrapping both run even when the indirect stage is disabled.
---
 .../Core/VideoCommon/PixelShaderManager.cpp   | 33 +++++++------------
 Source/Core/VideoCommon/UberShaderPixel.cpp   |  6 ++++
 2 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp
index 6d52d8ffc1..0ae1ced1e4 100644
--- a/Source/Core/VideoCommon/PixelShaderManager.cpp
+++ b/Source/Core/VideoCommon/PixelShaderManager.cpp
@@ -148,29 +148,18 @@ void PixelShaderManager::SetConstants()
 
     for (u32 i = 0; i < (bpmem.genMode.numtevstages + 1); ++i)
     {
-      u32 stage = bpmem.tevind[i].bt;
-      if (stage < bpmem.genMode.numindstages)
-      {
-        // We set some extra bits so the ubershader can quickly check if these
-        // features are in use.
-        if (bpmem.tevind[i].IsActive())
-          constants.pack1[stage][3] =
-              bpmem.tevindref.getTexCoord(stage) | bpmem.tevindref.getTexMap(stage) << 8 | 1 << 16;
-        // Note: a tevind of zero just happens to be a passthrough, so no need
-        // to set an extra bit.
-        constants.pack1[i][2] = bpmem.tevind[i].hex;  // TODO: This match shadergen, but videosw
-                                                      // will always wrap.
+      // Note: a tevind of zero just happens to be a passthrough, so no need
+      // to set an extra bit.  Furthermore, wrap and add to previous apply even if there is no
+      // indirect stage.
+      constants.pack1[i][2] = bpmem.tevind[i].hex;
 
-        // The ubershader uses tevind != 0 as a condition whether to calculate texcoords,
-        // even when texture is disabled, instead of the stage < bpmem.genMode.numindstages.
-        // We set an unused bit here to indicate that the stage is active, even if it
-        // is just a pass-through.
-        constants.pack1[i][2] |= 0x80000000;
-      }
-      else
-      {
-        constants.pack1[i][2] = 0;
-      }
+      u32 stage = bpmem.tevind[i].bt;
+
+      // We use an extra bit (1 << 16) to provide a fast way of testing if this feature is in use.
+      // Note also that this is indexed by indirect stage, not by TEV stage.
+      if (bpmem.tevind[i].IsActive() && stage < bpmem.genMode.numindstages)
+        constants.pack1[stage][3] =
+            bpmem.tevindref.getTexCoord(stage) | bpmem.tevindref.getTexMap(stage) << 8 | 1 << 16;
     }
 
     dirty = true;
diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index a029f58812..642896a3dc 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -287,6 +287,8 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   // ======================
   const auto LookupIndirectTexture = [&out, stereo](std::string_view out_var_name,
                                                     std::string_view in_index_name) {
+    // in_index_name is the indirect stage, not the tev stage
+    // bpmem_iref is packed differently from RAS1_IREF
     out.Write("{{\n"
               "  uint iref = bpmem_iref({});\n"
               "  if ( iref != 0u)\n"
@@ -304,6 +306,10 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
               "[texmap].xy, {})).abg;\n",
               in_index_name, in_index_name, in_index_name, in_index_name, out_var_name,
               stereo ? "float(layer)" : "0.0");
+    // There is always a bit set in bpmem_iref if the data is valid (matrix is not off, and the
+    // indirect texture stage is enabled). If the matrix is off, the result doesn't matter; if the
+    // indirect texture stage is disabled, the result is undefined (and produces a glitchy pattern
+    // on hardware, different from this).
     out.Write("  }}\n"
               "  else\n"
               "  {{\n"