shader_recompiler: Proper support for inst-typed buffer format operations. (#2469)

2025-08-25 03:35:34 +00:00 · 2025-02-21 03:01:18 -08:00 · 2025-02-21 03:01:18 -08:00 · 9424047214
commit 9424047214
parent 6860bb7349
6 changed files with 167 additions and 207 deletions
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -277,10 +277,9 @@ public:

    // Buffer Memory
    // MUBUF / MTBUF
-    void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst);
-    void BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst);
-    void BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst);
-    void BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst);
+    void BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed, const GcnInst& inst);
+    void BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
+                      const GcnInst& inst);
    void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst);

    // Image Memory
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -11,59 +11,59 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {

        // Buffer load operations
    case Opcode::TBUFFER_LOAD_FORMAT_X:
-        return BUFFER_LOAD(1, true, inst);
+        return BUFFER_LOAD(1, true, false, inst);
    case Opcode::TBUFFER_LOAD_FORMAT_XY:
-        return BUFFER_LOAD(2, true, inst);
+        return BUFFER_LOAD(2, true, false, inst);
    case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
-        return BUFFER_LOAD(3, true, inst);
+        return BUFFER_LOAD(3, true, false, inst);
    case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
-        return BUFFER_LOAD(4, true, inst);
+        return BUFFER_LOAD(4, true, false, inst);

    case Opcode::BUFFER_LOAD_FORMAT_X:
-        return BUFFER_LOAD_FORMAT(1, inst);
+        return BUFFER_LOAD(1, false, true, inst);
    case Opcode::BUFFER_LOAD_FORMAT_XY:
-        return BUFFER_LOAD_FORMAT(2, inst);
+        return BUFFER_LOAD(2, false, true, inst);
    case Opcode::BUFFER_LOAD_FORMAT_XYZ:
-        return BUFFER_LOAD_FORMAT(3, inst);
+        return BUFFER_LOAD(3, false, true, inst);
    case Opcode::BUFFER_LOAD_FORMAT_XYZW:
-        return BUFFER_LOAD_FORMAT(4, inst);
+        return BUFFER_LOAD(4, false, true, inst);

    case Opcode::BUFFER_LOAD_DWORD:
-        return BUFFER_LOAD(1, false, inst);
+        return BUFFER_LOAD(1, false, false, inst);
    case Opcode::BUFFER_LOAD_DWORDX2:
-        return BUFFER_LOAD(2, false, inst);
+        return BUFFER_LOAD(2, false, false, inst);
    case Opcode::BUFFER_LOAD_DWORDX3:
-        return BUFFER_LOAD(3, false, inst);
+        return BUFFER_LOAD(3, false, false, inst);
    case Opcode::BUFFER_LOAD_DWORDX4:
-        return BUFFER_LOAD(4, false, inst);
+        return BUFFER_LOAD(4, false, false, inst);

        // Buffer store operations
    case Opcode::BUFFER_STORE_FORMAT_X:
-        return BUFFER_STORE_FORMAT(1, inst);
+        return BUFFER_STORE(1, false, true, inst);
    case Opcode::BUFFER_STORE_FORMAT_XY:
-        return BUFFER_STORE_FORMAT(2, inst);
+        return BUFFER_STORE(2, false, true, inst);
    case Opcode::BUFFER_STORE_FORMAT_XYZ:
-        return BUFFER_STORE_FORMAT(3, inst);
+        return BUFFER_STORE(3, false, true, inst);
    case Opcode::BUFFER_STORE_FORMAT_XYZW:
-        return BUFFER_STORE_FORMAT(4, inst);
+        return BUFFER_STORE(4, false, true, inst);

    case Opcode::TBUFFER_STORE_FORMAT_X:
-        return BUFFER_STORE(1, true, inst);
+        return BUFFER_STORE(1, true, false, inst);
    case Opcode::TBUFFER_STORE_FORMAT_XY:
-        return BUFFER_STORE(2, true, inst);
+        return BUFFER_STORE(2, true, false, inst);
    case Opcode::TBUFFER_STORE_FORMAT_XYZ:
-        return BUFFER_STORE(3, true, inst);
+        return BUFFER_STORE(3, true, false, inst);
    case Opcode::TBUFFER_STORE_FORMAT_XYZW:
-        return BUFFER_STORE(4, true, inst);
+        return BUFFER_STORE(4, true, false, inst);

    case Opcode::BUFFER_STORE_DWORD:
-        return BUFFER_STORE(1, false, inst);
+        return BUFFER_STORE(1, false, false, inst);
    case Opcode::BUFFER_STORE_DWORDX2:
-        return BUFFER_STORE(2, false, inst);
+        return BUFFER_STORE(2, false, false, inst);
    case Opcode::BUFFER_STORE_DWORDX3:
-        return BUFFER_STORE(3, false, inst);
+        return BUFFER_STORE(3, false, false, inst);
    case Opcode::BUFFER_STORE_DWORDX4:
-        return BUFFER_STORE(4, false, inst);
+        return BUFFER_STORE(4, false, false, inst);

        // Buffer atomic operations
    case Opcode::BUFFER_ATOMIC_ADD:
@@ -165,7 +165,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
    }
 }

-void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
+void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
+                             const GcnInst& inst) {
    const auto& mubuf = inst.control.mubuf;
    const bool is_ring = mubuf.glc && mubuf.slc;
    const IR::VectorReg vaddr{inst.src[0].code};
@@ -195,66 +196,38 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
    buffer_info.inst_offset.Assign(mubuf.offset);
    buffer_info.globally_coherent.Assign(mubuf.glc);
    buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(is_typed);
-    if (is_typed) {
+    buffer_info.typed.Assign(is_inst_typed || is_buffer_typed);
+    if (is_inst_typed) {
        const auto& mtbuf = inst.control.mtbuf;
-        const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
-        const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
-        ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
-               (dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
+        buffer_info.inst_data_fmt.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
+        buffer_info.inst_num_fmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
+    } else {
+        buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
    }

    const IR::Value handle =
        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
    const IR::VectorReg dst_reg{inst.src[1].code};
-    if (num_dwords == 1) {
-        ir.SetVectorReg(dst_reg, IR::U32{value});
-        return;
-    }
-    for (u32 i = 0; i < num_dwords; i++) {
-        ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
+    if (buffer_info.typed) {
+        const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
+        for (u32 i = 0; i < num_dwords; i++) {
+            ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
+        }
+    } else {
+        const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
+        if (num_dwords == 1) {
+            ir.SetVectorReg(dst_reg, IR::U32{value});
+            return;
+        }
+        for (u32 i = 0; i < num_dwords; i++) {
+            ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
+        }
    }
 }

-void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
-    const auto& mubuf = inst.control.mubuf;
-    const IR::VectorReg vaddr{inst.src[0].code};
-    const IR::ScalarReg sharp{inst.src[2].code * 4};
-    const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        return {};
-    }();
-    const IR::Value soffset{GetSrc(inst.src[3])};
-    ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
-
-    IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen);
-    buffer_info.inst_offset.Assign(mubuf.offset);
-    buffer_info.globally_coherent.Assign(mubuf.glc);
-    buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(true);
-
-    const IR::Value handle =
-        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
-                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
-    const IR::VectorReg dst_reg{inst.src[1].code};
-    for (u32 i = 0; i < num_dwords; i++) {
-        ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
-    }
-}
-
-void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
+void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
+                              const GcnInst& inst) {
    const auto& mubuf = inst.control.mubuf;
    const bool is_ring = mubuf.glc && mubuf.slc;
    const IR::VectorReg vaddr{inst.src[0].code};
@@ -285,80 +258,38 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
    buffer_info.inst_offset.Assign(mubuf.offset);
    buffer_info.globally_coherent.Assign(mubuf.glc);
    buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(is_typed);
-    if (is_typed) {
+    buffer_info.typed.Assign(is_inst_typed || is_buffer_typed);
+    if (is_inst_typed) {
        const auto& mtbuf = inst.control.mtbuf;
-        const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
-        const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
-        ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
-               (dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
+        buffer_info.inst_data_fmt.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
+        buffer_info.inst_num_fmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
+    } else {
+        buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
    }

-    IR::Value value{};
-    const IR::VectorReg src_reg{inst.src[1].code};
-    switch (num_dwords) {
-    case 1:
-        value = ir.GetVectorReg(src_reg);
-        break;
-    case 2:
-        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1));
-        break;
-    case 3:
-        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
-                                      ir.GetVectorReg(src_reg + 2));
-        break;
-    case 4:
-        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
-                                      ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3));
-        break;
-    }
    const IR::Value handle =
        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
-}
-
-void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
-    const auto& mubuf = inst.control.mubuf;
-    const IR::VectorReg vaddr{inst.src[0].code};
-    const IR::ScalarReg sharp{inst.src[2].code * 4};
-    const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        return {};
-    }();
-    const IR::Value soffset{GetSrc(inst.src[3])};
-    ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
-
-    IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen);
-    buffer_info.inst_offset.Assign(mubuf.offset);
-    buffer_info.globally_coherent.Assign(mubuf.glc);
-    buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(true);
-
    const IR::VectorReg src_reg{inst.src[1].code};

-    std::array<IR::F32, 4> comps{};
+    boost::container::static_vector<IR::Value, 4> comps;
    for (u32 i = 0; i < num_dwords; i++) {
-        comps[i] = ir.GetVectorReg<IR::F32>(src_reg + i);
+        const auto src_reg_i = src_reg + i;
+        if (buffer_info.typed) {
+            comps.push_back(ir.GetVectorReg<IR::F32>(src_reg_i));
+        } else {
+            comps.push_back(ir.GetVectorReg<IR::U32>(src_reg_i));
+        }
    }
-    for (u32 i = num_dwords; i < 4; i++) {
-        comps[i] = ir.Imm32(0.f);
+    if (buffer_info.typed) {
+        for (u32 i = num_dwords; i < 4; i++) {
+            comps.push_back(ir.Imm32(0.f));
+        }
+        ir.StoreBufferFormat(handle, address, ir.CompositeConstruct(comps), buffer_info);
+    } else {
+        const auto value = num_dwords == 1 ? comps[0] : ir.CompositeConstruct(comps);
+        ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
    }
-
-    const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]);
-    const IR::Value handle =
-        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
-                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    ir.StoreBufferFormat(handle, address, value, buffer_info);
 }

 void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -638,7 +638,8 @@ Value IREmitter::CompositeConstruct(std::span<const Value> elements) {
    case 4:
        return CompositeConstruct(elements[0], elements[1], elements[2], elements[3]);
    default:
-        UNREACHABLE_MSG("Composite construct with greater than 4 elements");
+        UNREACHABLE_MSG("Composite construct with {} elements, only 2-4 are supported",
+                        elements.size());
    }
 }

--- a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
+++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
@@ -10,6 +10,14 @@

 namespace Shader::Optimization {

+struct FormatInfo {
+    AmdGpu::DataFormat data_format;
+    AmdGpu::NumberFormat num_format;
+    AmdGpu::CompMapping swizzle;
+    AmdGpu::NumberConversion num_conversion;
+    int num_components;
+};
+
 static bool IsBufferFormatLoad(const IR::Inst& inst) {
    return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32;
 }
@@ -18,152 +26,151 @@ static bool IsBufferFormatStore(const IR::Inst& inst) {
    return inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
 }

-static IR::Value LoadBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
-                                  const IR::Value handle, const IR::U32 address,
-                                  const IR::BufferInstInfo info) {
-    const auto data_fmt = buffer.GetDataFmt();
-    const auto num_fmt = buffer.GetNumberFmt();
-    const auto num_conv = buffer.GetNumberConversion();
-    const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
-
+static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, const IR::U32 address,
+                                  const IR::BufferInstInfo info, const FormatInfo& format_info) {
    IR::Value interpreted;
-    switch (data_fmt) {
+    switch (format_info.data_format) {
    case AmdGpu::DataFormat::FormatInvalid:
        interpreted = ir.Imm32(0.f);
        break;
    case AmdGpu::DataFormat::Format8: {
-        const auto unpacked = ir.Unpack4x8(num_fmt, ir.LoadBufferU8(handle, address, info));
+        const auto unpacked =
+            ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info));
        interpreted = ir.CompositeExtract(unpacked, 0);
        break;
    }
    case AmdGpu::DataFormat::Format8_8: {
        const auto raw = ir.LoadBufferU16(handle, address, info);
-        const auto unpacked = ir.Unpack4x8(num_fmt, raw);
+        const auto unpacked = ir.Unpack4x8(format_info.num_format, raw);
        interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0),
                                            ir.CompositeExtract(unpacked, 1));
        break;
    }
    case AmdGpu::DataFormat::Format8_8_8_8:
-        interpreted = ir.Unpack4x8(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack4x8(format_info.num_format,
+                                   IR::U32{ir.LoadBufferU32(1, handle, address, info)});
        break;
    case AmdGpu::DataFormat::Format16: {
-        const auto unpacked = ir.Unpack2x16(num_fmt, ir.LoadBufferU16(handle, address, info));
+        const auto unpacked =
+            ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info));
        interpreted = ir.CompositeExtract(unpacked, 0);
        break;
    }
    case AmdGpu::DataFormat::Format16_16:
-        interpreted = ir.Unpack2x16(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack2x16(format_info.num_format,
+                                    IR::U32{ir.LoadBufferU32(1, handle, address, info)});
        break;
    case AmdGpu::DataFormat::Format10_11_11:
-        interpreted =
-            ir.Unpack10_11_11(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack10_11_11(format_info.num_format,
+                                        IR::U32{ir.LoadBufferU32(1, handle, address, info)});
        break;
    case AmdGpu::DataFormat::Format2_10_10_10:
-        interpreted =
-            ir.Unpack2_10_10_10(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack2_10_10_10(format_info.num_format,
+                                          IR::U32{ir.LoadBufferU32(1, handle, address, info)});
        break;
    case AmdGpu::DataFormat::Format16_16_16_16: {
        const auto raw = ir.LoadBufferU32(2, handle, address, info);
-        interpreted =
-            ir.CompositeConstruct(ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 0)}),
-                                  ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 1)}));
+        interpreted = ir.CompositeConstruct(
+            ir.Unpack2x16(format_info.num_format, IR::U32{ir.CompositeExtract(raw, 0)}),
+            ir.Unpack2x16(format_info.num_format, IR::U32{ir.CompositeExtract(raw, 1)}));
        break;
    }
    case AmdGpu::DataFormat::Format32:
    case AmdGpu::DataFormat::Format32_32:
    case AmdGpu::DataFormat::Format32_32_32:
    case AmdGpu::DataFormat::Format32_32_32_32: {
-        ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint ||
-               num_fmt == AmdGpu::NumberFormat::Float);
-        interpreted = ir.LoadBufferF32(num_components, handle, address, info);
+        ASSERT(format_info.num_format == AmdGpu::NumberFormat::Uint ||
+               format_info.num_format == AmdGpu::NumberFormat::Sint ||
+               format_info.num_format == AmdGpu::NumberFormat::Float);
+        interpreted = ir.LoadBufferF32(format_info.num_components, handle, address, info);
        break;
    }
    default:
-        UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt);
+        UNREACHABLE_MSG("Unsupported buffer data format: {}", format_info.data_format);
    }

    // Pad to 4 components and apply additional modifications.
    boost::container::static_vector<IR::Value, 4> components;
    for (u32 i = 0; i < 4; i++) {
-        if (i < num_components) {
+        if (i < format_info.num_components) {
            const auto component =
-                IR::F32{num_components == 1 ? interpreted : ir.CompositeExtract(interpreted, i)};
-            components.push_back(ApplyReadNumberConversion(ir, component, num_conv));
+                IR::F32{format_info.num_components == 1 ? interpreted
+                                                        : ir.CompositeExtract(interpreted, i)};
+            components.push_back(
+                ApplyReadNumberConversion(ir, component, format_info.num_conversion));
        } else {
            components.push_back(ir.Imm32(0.f));
        }
    }
-    const auto swizzled = ApplySwizzle(ir, ir.CompositeConstruct(components), buffer.DstSelect());
+    const auto swizzled = ApplySwizzle(ir, ir.CompositeConstruct(components), format_info.swizzle);
    return swizzled;
 }

-static void StoreBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
-                              const IR::Value handle, const IR::U32 address, const IR::Value& value,
-                              const IR::BufferInstInfo info) {
-    const auto data_fmt = buffer.GetDataFmt();
-    const auto num_fmt = buffer.GetNumberFmt();
-    const auto num_conv = buffer.GetNumberConversion();
-    const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
-
+static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const IR::U32 address,
+                              const IR::Value& value, const IR::BufferInstInfo info,
+                              const FormatInfo& format_info) {
    // Extract actual number of components and apply additional modifications.
-    const auto swizzled = ApplySwizzle(ir, value, buffer.DstSelect().Inverse());
+    const auto swizzled = ApplySwizzle(ir, value, format_info.swizzle.Inverse());
    boost::container::static_vector<IR::Value, 4> components;
-    for (u32 i = 0; i < num_components; i++) {
+    for (u32 i = 0; i < format_info.num_components; i++) {
        const auto component = IR::F32{ir.CompositeExtract(swizzled, i)};
-        components.push_back(ApplyWriteNumberConversion(ir, component, num_conv));
+        components.push_back(ApplyWriteNumberConversion(ir, component, format_info.num_conversion));
    }
    const auto real_value =
        components.size() == 1 ? components[0] : ir.CompositeConstruct(components);

-    switch (data_fmt) {
+    switch (format_info.data_format) {
    case AmdGpu::DataFormat::FormatInvalid:
        break;
    case AmdGpu::DataFormat::Format8: {
        const auto packed =
-            ir.Pack4x8(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f),
-                                                      ir.Imm32(0.f)));
+            ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f),
+                                                                     ir.Imm32(0.f), ir.Imm32(0.f)));
        ir.StoreBufferU8(handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format8_8: {
-        const auto packed =
-            ir.Pack4x8(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
-                                                      ir.CompositeExtract(real_value, 1),
-                                                      ir.Imm32(0.f), ir.Imm32(0.f)));
+        const auto packed = ir.Pack4x8(format_info.num_format,
+                                       ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
+                                                             ir.CompositeExtract(real_value, 1),
+                                                             ir.Imm32(0.f), ir.Imm32(0.f)));
        ir.StoreBufferU16(handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format8_8_8_8: {
-        auto packed = ir.Pack4x8(num_fmt, real_value);
+        auto packed = ir.Pack4x8(format_info.num_format, real_value);
        ir.StoreBufferU32(1, handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format16: {
-        const auto packed = ir.Pack2x16(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f)));
+        const auto packed =
+            ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f)));
        ir.StoreBufferU16(handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format16_16: {
-        const auto packed = ir.Pack2x16(num_fmt, real_value);
+        const auto packed = ir.Pack2x16(format_info.num_format, real_value);
        ir.StoreBufferU32(1, handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format10_11_11: {
-        const auto packed = ir.Pack10_11_11(num_fmt, real_value);
+        const auto packed = ir.Pack10_11_11(format_info.num_format, real_value);
        ir.StoreBufferU32(1, handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format2_10_10_10: {
-        const auto packed = ir.Pack2_10_10_10(num_fmt, real_value);
+        const auto packed = ir.Pack2_10_10_10(format_info.num_format, real_value);
        ir.StoreBufferU32(1, handle, address, packed, info);
        break;
    }
    case AmdGpu::DataFormat::Format16_16_16_16: {
        const auto packed = ir.CompositeConstruct(
-            ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
-                                                       ir.CompositeExtract(real_value, 1))),
-            ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 2),
-                                                       ir.CompositeExtract(real_value, 3))));
+            ir.Pack2x16(format_info.num_format,
+                        ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
+                                              ir.CompositeExtract(real_value, 1))),
+            ir.Pack2x16(format_info.num_format,
+                        ir.CompositeConstruct(ir.CompositeExtract(real_value, 2),
+                                              ir.CompositeExtract(real_value, 3))));
        ir.StoreBufferU32(2, handle, address, packed, info);
        break;
    }
@@ -171,28 +178,40 @@ static void StoreBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
    case AmdGpu::DataFormat::Format32_32:
    case AmdGpu::DataFormat::Format32_32_32:
    case AmdGpu::DataFormat::Format32_32_32_32: {
-        ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint ||
-               num_fmt == AmdGpu::NumberFormat::Float);
-        ir.StoreBufferF32(num_components, handle, address, real_value, info);
+        ASSERT(format_info.num_format == AmdGpu::NumberFormat::Uint ||
+               format_info.num_format == AmdGpu::NumberFormat::Sint ||
+               format_info.num_format == AmdGpu::NumberFormat::Float);
+        ir.StoreBufferF32(format_info.num_components, handle, address, real_value, info);
        break;
    }
    default:
-        UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt);
+        UNREACHABLE_MSG("Unsupported buffer data format: {}", format_info.data_format);
    }
 }

 static void LowerBufferFormatInst(IR::Block& block, IR::Inst& inst, Info& info) {
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    const auto flags = inst.Flags<IR::BufferInstInfo>();
    const auto desc{info.buffers[inst.Arg(0).U32()]};
    const auto buffer{desc.GetSharp(info)};

+    const auto is_inst_typed = flags.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
+    const auto data_format = is_inst_typed ? flags.inst_data_fmt.Value() : buffer.GetDataFmt();
+    const auto num_format = is_inst_typed ? flags.inst_num_fmt.Value() : buffer.GetNumberFmt();
+    const auto format_info = FormatInfo{
+        .data_format = data_format,
+        .num_format = num_format,
+        .swizzle = is_inst_typed ? AmdGpu::IdentityMapping : buffer.DstSelect(),
+        .num_conversion = AmdGpu::MapNumberConversion(num_format),
+        .num_components = AmdGpu::NumComponents(data_format),
+    };
+
    if (IsBufferFormatLoad(inst)) {
-        const auto interpreted = LoadBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)},
-                                                  inst.Flags<IR::BufferInstInfo>());
+        const auto interpreted =
+            LoadBufferFormat(ir, inst.Arg(0), IR::U32{inst.Arg(1)}, flags, format_info);
        inst.ReplaceUsesWithAndRemove(interpreted);
    } else if (IsBufferFormatStore(inst)) {
-        StoreBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)}, inst.Arg(2),
-                          inst.Flags<IR::BufferInstInfo>());
+        StoreBufferFormat(ir, inst.Arg(0), IR::U32{inst.Arg(1)}, inst.Arg(2), flags, format_info);
        inst.Invalidate();
    }
 }
--- a/src/shader_recompiler/ir/reg.h
+++ b/src/shader_recompiler/ir/reg.h
@@ -7,6 +7,7 @@
 #include "common/bit_field.h"
 #include "common/enum.h"
 #include "common/types.h"
+#include "video_core/amdgpu/types.h"

 namespace Shader::IR {

@@ -52,6 +53,8 @@ union BufferInstInfo {
    BitField<14, 1, u32> system_coherent;
    BitField<15, 1, u32> globally_coherent;
    BitField<16, 1, u32> typed;
+    BitField<17, 4, AmdGpu::DataFormat> inst_data_fmt;
+    BitField<21, 3, AmdGpu::NumberFormat> inst_num_fmt;
 };

 enum class ScalarReg : u32 {
--- a/src/video_core/amdgpu/types.h
+++ b/src/video_core/amdgpu/types.h
@@ -262,6 +262,13 @@ private:
    }
 };

+static constexpr CompMapping IdentityMapping = {
+    .r = CompSwizzle::Red,
+    .g = CompSwizzle::Green,
+    .b = CompSwizzle::Blue,
+    .a = CompSwizzle::Alpha,
+};
+
 inline DataFormat RemapDataFormat(const DataFormat format) {
    switch (format) {
    case DataFormat::Format11_11_10: