Mirror of https://github.com/shadps4-emu/shadPS4.git (synced 2025-04-20 03:24:49 +00:00)
shader_recompiler: Proper support for inst-typed buffer format operations. (#2469)
parent 6860bb7349
commit 9424047214

6 changed files with 167 additions and 207 deletions
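Background for the diff that follows: GCN has two flavors of formatted buffer access. MTBUF instructions (TBUFFER_LOAD/TBUFFER_STORE_FORMAT_*) carry the data format (dfmt) and number format (nfmt) in the instruction encoding itself, while MUBUF format instructions (BUFFER_LOAD/BUFFER_STORE_FORMAT_*) take the format from the V# buffer resource. Before this commit the translator asserted that instruction-encoded formats were plain 32-bit floats; now it records them in BufferInstInfo so the lowering pass can honor arbitrary instruction formats. A minimal sketch of the three-way classification the new flags express (the enum and function names here are illustrative, not from the diff):

// Sketch only: hypothetical names for the three buffer-access flavors.
enum class BufferOpKind {
    InstTyped,   // MTBUF: dfmt/nfmt encoded in the instruction word
    BufferTyped, // MUBUF *_FORMAT_*: format read from the V# sharp
    Raw,         // MUBUF *_DWORD*: untyped 32-bit access
};

constexpr BufferOpKind Classify(bool is_inst_typed, bool is_buffer_typed) {
    if (is_inst_typed) {
        return BufferOpKind::InstTyped;
    }
    return is_buffer_typed ? BufferOpKind::BufferTyped : BufferOpKind::Raw;
}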
@@ -277,10 +277,9 @@ public:
     // Buffer Memory
     // MUBUF / MTBUF
-    void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst);
-    void BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst);
-    void BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst);
-    void BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst);
+    void BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed, const GcnInst& inst);
+    void BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
+                      const GcnInst& inst);
     void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst);

     // Image Memory
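With the two flags on BUFFER_LOAD/BUFFER_STORE, the dedicated *_FORMAT entry points above are gone and every MUBUF/MTBUF memory opcode funnels into the same two handlers. The switch in the next hunk implements, in effect, this mapping (a hypothetical helper, shown only for orientation):

#include <utility>

// (is_inst_typed, is_buffer_typed) per opcode family; illustrative only.
constexpr std::pair<bool, bool> TypedFlagsFor(bool is_tbuffer_op, bool is_format_op) {
    if (is_tbuffer_op) {
        return {true, false}; // TBUFFER_*: format from the instruction
    }
    if (is_format_op) {
        return {false, true}; // BUFFER_*_FORMAT_*: format from the sharp
    }
    return {false, false};    // BUFFER_*_DWORD*: raw access
}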
@@ -11,59 +11,59 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
         // Buffer load operations
     case Opcode::TBUFFER_LOAD_FORMAT_X:
-        return BUFFER_LOAD(1, true, inst);
+        return BUFFER_LOAD(1, true, false, inst);
     case Opcode::TBUFFER_LOAD_FORMAT_XY:
-        return BUFFER_LOAD(2, true, inst);
+        return BUFFER_LOAD(2, true, false, inst);
     case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
-        return BUFFER_LOAD(3, true, inst);
+        return BUFFER_LOAD(3, true, false, inst);
     case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
-        return BUFFER_LOAD(4, true, inst);
+        return BUFFER_LOAD(4, true, false, inst);

     case Opcode::BUFFER_LOAD_FORMAT_X:
-        return BUFFER_LOAD_FORMAT(1, inst);
+        return BUFFER_LOAD(1, false, true, inst);
     case Opcode::BUFFER_LOAD_FORMAT_XY:
-        return BUFFER_LOAD_FORMAT(2, inst);
+        return BUFFER_LOAD(2, false, true, inst);
     case Opcode::BUFFER_LOAD_FORMAT_XYZ:
-        return BUFFER_LOAD_FORMAT(3, inst);
+        return BUFFER_LOAD(3, false, true, inst);
     case Opcode::BUFFER_LOAD_FORMAT_XYZW:
-        return BUFFER_LOAD_FORMAT(4, inst);
+        return BUFFER_LOAD(4, false, true, inst);

     case Opcode::BUFFER_LOAD_DWORD:
-        return BUFFER_LOAD(1, false, inst);
+        return BUFFER_LOAD(1, false, false, inst);
     case Opcode::BUFFER_LOAD_DWORDX2:
-        return BUFFER_LOAD(2, false, inst);
+        return BUFFER_LOAD(2, false, false, inst);
     case Opcode::BUFFER_LOAD_DWORDX3:
-        return BUFFER_LOAD(3, false, inst);
+        return BUFFER_LOAD(3, false, false, inst);
     case Opcode::BUFFER_LOAD_DWORDX4:
-        return BUFFER_LOAD(4, false, inst);
+        return BUFFER_LOAD(4, false, false, inst);

         // Buffer store operations
     case Opcode::BUFFER_STORE_FORMAT_X:
-        return BUFFER_STORE_FORMAT(1, inst);
+        return BUFFER_STORE(1, false, true, inst);
     case Opcode::BUFFER_STORE_FORMAT_XY:
-        return BUFFER_STORE_FORMAT(2, inst);
+        return BUFFER_STORE(2, false, true, inst);
     case Opcode::BUFFER_STORE_FORMAT_XYZ:
-        return BUFFER_STORE_FORMAT(3, inst);
+        return BUFFER_STORE(3, false, true, inst);
     case Opcode::BUFFER_STORE_FORMAT_XYZW:
-        return BUFFER_STORE_FORMAT(4, inst);
+        return BUFFER_STORE(4, false, true, inst);

     case Opcode::TBUFFER_STORE_FORMAT_X:
-        return BUFFER_STORE(1, true, inst);
+        return BUFFER_STORE(1, true, false, inst);
     case Opcode::TBUFFER_STORE_FORMAT_XY:
-        return BUFFER_STORE(2, true, inst);
+        return BUFFER_STORE(2, true, false, inst);
     case Opcode::TBUFFER_STORE_FORMAT_XYZ:
-        return BUFFER_STORE(3, true, inst);
+        return BUFFER_STORE(3, true, false, inst);
     case Opcode::TBUFFER_STORE_FORMAT_XYZW:
-        return BUFFER_STORE(4, true, inst);
+        return BUFFER_STORE(4, true, false, inst);

     case Opcode::BUFFER_STORE_DWORD:
-        return BUFFER_STORE(1, false, inst);
+        return BUFFER_STORE(1, false, false, inst);
     case Opcode::BUFFER_STORE_DWORDX2:
-        return BUFFER_STORE(2, false, inst);
+        return BUFFER_STORE(2, false, false, inst);
     case Opcode::BUFFER_STORE_DWORDX3:
-        return BUFFER_STORE(3, false, inst);
+        return BUFFER_STORE(3, false, false, inst);
     case Opcode::BUFFER_STORE_DWORDX4:
-        return BUFFER_STORE(4, false, inst);
+        return BUFFER_STORE(4, false, false, inst);

         // Buffer atomic operations
     case Opcode::BUFFER_ATOMIC_ADD:

@@ -165,7 +165,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
     }
 }

-void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
+void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
+                             const GcnInst& inst) {
     const auto& mubuf = inst.control.mubuf;
     const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
@@ -195,66 +196,38 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
     buffer_info.inst_offset.Assign(mubuf.offset);
     buffer_info.globally_coherent.Assign(mubuf.glc);
     buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(is_typed);
-    if (is_typed) {
+    buffer_info.typed.Assign(is_inst_typed || is_buffer_typed);
+    if (is_inst_typed) {
         const auto& mtbuf = inst.control.mtbuf;
-        const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
-        const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
-        ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
-               (dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
+        buffer_info.inst_data_fmt.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
+        buffer_info.inst_num_fmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
+    } else {
+        buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
     }

     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
     const IR::VectorReg dst_reg{inst.src[1].code};
-    if (num_dwords == 1) {
-        ir.SetVectorReg(dst_reg, IR::U32{value});
-        return;
-    }
-    for (u32 i = 0; i < num_dwords; i++) {
-        ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
+    if (buffer_info.typed) {
+        const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
+        for (u32 i = 0; i < num_dwords; i++) {
+            ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
+        }
+    } else {
+        const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
+        if (num_dwords == 1) {
+            ir.SetVectorReg(dst_reg, IR::U32{value});
+            return;
+        }
+        for (u32 i = 0; i < num_dwords; i++) {
+            ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
+        }
     }
 }

-void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
-    const auto& mubuf = inst.control.mubuf;
-    const IR::VectorReg vaddr{inst.src[0].code};
-    const IR::ScalarReg sharp{inst.src[2].code * 4};
-    const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        return {};
-    }();
-    const IR::Value soffset{GetSrc(inst.src[3])};
-    ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
-
-    IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen);
-    buffer_info.inst_offset.Assign(mubuf.offset);
-    buffer_info.globally_coherent.Assign(mubuf.glc);
-    buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(true);
-
-    const IR::Value handle =
-        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
-                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
-    const IR::VectorReg dst_reg{inst.src[1].code};
-    for (u32 i = 0; i < num_dwords; i++) {
-        ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
-    }
-}
-
-void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
+void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
+                              const GcnInst& inst) {
     const auto& mubuf = inst.control.mubuf;
     const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};

@@ -285,80 +258,38 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
     buffer_info.inst_offset.Assign(mubuf.offset);
     buffer_info.globally_coherent.Assign(mubuf.glc);
     buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(is_typed);
-    if (is_typed) {
+    buffer_info.typed.Assign(is_inst_typed || is_buffer_typed);
+    if (is_inst_typed) {
         const auto& mtbuf = inst.control.mtbuf;
-        const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
-        const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
-        ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
-               (dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32_32 ||
-                dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
+        buffer_info.inst_data_fmt.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
+        buffer_info.inst_num_fmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
+    } else {
+        buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
     }

-    IR::Value value{};
     const IR::VectorReg src_reg{inst.src[1].code};
-    switch (num_dwords) {
-    case 1:
-        value = ir.GetVectorReg(src_reg);
-        break;
-    case 2:
-        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1));
-        break;
-    case 3:
-        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
-                                      ir.GetVectorReg(src_reg + 2));
-        break;
-    case 4:
-        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
-                                      ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3));
-        break;
-    }
+    boost::container::static_vector<IR::Value, 4> comps;
+    for (u32 i = 0; i < num_dwords; i++) {
+        const auto src_reg_i = src_reg + i;
+        if (buffer_info.typed) {
+            comps.push_back(ir.GetVectorReg<IR::F32>(src_reg_i));
+        } else {
+            comps.push_back(ir.GetVectorReg<IR::U32>(src_reg_i));
+        }
+    }
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
-}
-
-void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
-    const auto& mubuf = inst.control.mubuf;
-    const IR::VectorReg vaddr{inst.src[0].code};
-    const IR::ScalarReg sharp{inst.src[2].code * 4};
-    const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        return {};
-    }();
-    const IR::Value soffset{GetSrc(inst.src[3])};
-    ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
-
-    IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen);
-    buffer_info.inst_offset.Assign(mubuf.offset);
-    buffer_info.globally_coherent.Assign(mubuf.glc);
-    buffer_info.system_coherent.Assign(mubuf.slc);
-    buffer_info.typed.Assign(true);
-
-    const IR::VectorReg src_reg{inst.src[1].code};
-
-    std::array<IR::F32, 4> comps{};
-    for (u32 i = 0; i < num_dwords; i++) {
-        comps[i] = ir.GetVectorReg<IR::F32>(src_reg + i);
-    }
-    for (u32 i = num_dwords; i < 4; i++) {
-        comps[i] = ir.Imm32(0.f);
-    }
-
-    const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]);
-    const IR::Value handle =
-        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
-                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    ir.StoreBufferFormat(handle, address, value, buffer_info);
+    if (buffer_info.typed) {
+        for (u32 i = num_dwords; i < 4; i++) {
+            comps.push_back(ir.Imm32(0.f));
+        }
+        ir.StoreBufferFormat(handle, address, ir.CompositeConstruct(comps), buffer_info);
+    } else {
+        const auto value = num_dwords == 1 ? comps[0] : ir.CompositeConstruct(comps);
+        ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
+    }
 }

 void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
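Two details worth noting in the rewritten load/store bodies above: components are gathered as IR::F32 for typed accesses but IR::U32 for raw ones, and the typed store pads the value to four components because StoreBufferFormat always consumes a full vec4. A standalone model of that padding, using plain floats in place of IR values (an assumed simplification, not repository code):

#include <array>
#include <cstddef>

// Pad num_dwords components to a vec4, zero-filling like ir.Imm32(0.f) above.
std::array<float, 4> PadToVec4(const float* comps, std::size_t num_dwords) {
    std::array<float, 4> out{}; // zero-initialized
    for (std::size_t i = 0; i < num_dwords && i < 4; ++i) {
        out[i] = comps[i];
    }
    return out;
}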
@@ -638,7 +638,8 @@ Value IREmitter::CompositeConstruct(std::span<const Value> elements) {
     case 4:
         return CompositeConstruct(elements[0], elements[1], elements[2], elements[3]);
     default:
-        UNREACHABLE_MSG("Composite construct with greater than 4 elements");
+        UNREACHABLE_MSG("Composite construct with {} elements, only 2-4 are supported",
+                        elements.size());
     }
 }
@@ -10,6 +10,14 @@

 namespace Shader::Optimization {

+struct FormatInfo {
+    AmdGpu::DataFormat data_format;
+    AmdGpu::NumberFormat num_format;
+    AmdGpu::CompMapping swizzle;
+    AmdGpu::NumberConversion num_conversion;
+    int num_components;
+};
+
 static bool IsBufferFormatLoad(const IR::Inst& inst) {
     return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32;
 }
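The new FormatInfo struct decouples the format lowering from the V# sharp: the caller now decides once where the format, swizzle, and conversion come from. Restated as a sketch (this mirrors LowerBufferFormatInst later in this diff and assumes the repository's AmdGpu/IR headers; the helper name is invented):

// Sketch: instruction-encoded formats win, otherwise fall back to the sharp.
FormatInfo MakeFormatInfo(const IR::BufferInstInfo flags, const AmdGpu::Buffer& buffer) {
    const bool is_inst_typed = flags.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
    const auto data_format = is_inst_typed ? flags.inst_data_fmt.Value() : buffer.GetDataFmt();
    const auto num_format = is_inst_typed ? flags.inst_num_fmt.Value() : buffer.GetNumberFmt();
    return FormatInfo{
        .data_format = data_format,
        .num_format = num_format,
        // MTBUF formats address components directly, so no sharp swizzle.
        .swizzle = is_inst_typed ? AmdGpu::IdentityMapping : buffer.DstSelect(),
        .num_conversion = AmdGpu::MapNumberConversion(num_format),
        .num_components = AmdGpu::NumComponents(data_format),
    };
}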
@@ -18,152 +26,151 @@ static bool IsBufferFormatStore(const IR::Inst& inst) {
     return inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
 }

-static IR::Value LoadBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
-                                  const IR::Value handle, const IR::U32 address,
-                                  const IR::BufferInstInfo info) {
-    const auto data_fmt = buffer.GetDataFmt();
-    const auto num_fmt = buffer.GetNumberFmt();
-    const auto num_conv = buffer.GetNumberConversion();
-    const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
-
+static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, const IR::U32 address,
+                                  const IR::BufferInstInfo info, const FormatInfo& format_info) {
     IR::Value interpreted;
-    switch (data_fmt) {
+    switch (format_info.data_format) {
     case AmdGpu::DataFormat::FormatInvalid:
         interpreted = ir.Imm32(0.f);
         break;
     case AmdGpu::DataFormat::Format8: {
-        const auto unpacked = ir.Unpack4x8(num_fmt, ir.LoadBufferU8(handle, address, info));
+        const auto unpacked =
+            ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info));
         interpreted = ir.CompositeExtract(unpacked, 0);
         break;
     }
     case AmdGpu::DataFormat::Format8_8: {
         const auto raw = ir.LoadBufferU16(handle, address, info);
-        const auto unpacked = ir.Unpack4x8(num_fmt, raw);
+        const auto unpacked = ir.Unpack4x8(format_info.num_format, raw);
         interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0),
                                             ir.CompositeExtract(unpacked, 1));
         break;
     }
     case AmdGpu::DataFormat::Format8_8_8_8:
-        interpreted = ir.Unpack4x8(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack4x8(format_info.num_format,
+                                   IR::U32{ir.LoadBufferU32(1, handle, address, info)});
         break;
     case AmdGpu::DataFormat::Format16: {
-        const auto unpacked = ir.Unpack2x16(num_fmt, ir.LoadBufferU16(handle, address, info));
+        const auto unpacked =
+            ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info));
         interpreted = ir.CompositeExtract(unpacked, 0);
         break;
     }
     case AmdGpu::DataFormat::Format16_16:
-        interpreted = ir.Unpack2x16(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack2x16(format_info.num_format,
+                                    IR::U32{ir.LoadBufferU32(1, handle, address, info)});
         break;
     case AmdGpu::DataFormat::Format10_11_11:
-        interpreted =
-            ir.Unpack10_11_11(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack10_11_11(format_info.num_format,
+                                        IR::U32{ir.LoadBufferU32(1, handle, address, info)});
         break;
     case AmdGpu::DataFormat::Format2_10_10_10:
-        interpreted =
-            ir.Unpack2_10_10_10(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
+        interpreted = ir.Unpack2_10_10_10(format_info.num_format,
+                                          IR::U32{ir.LoadBufferU32(1, handle, address, info)});
         break;
     case AmdGpu::DataFormat::Format16_16_16_16: {
         const auto raw = ir.LoadBufferU32(2, handle, address, info);
-        interpreted =
-            ir.CompositeConstruct(ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 0)}),
-                                  ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 1)}));
+        interpreted = ir.CompositeConstruct(
+            ir.Unpack2x16(format_info.num_format, IR::U32{ir.CompositeExtract(raw, 0)}),
+            ir.Unpack2x16(format_info.num_format, IR::U32{ir.CompositeExtract(raw, 1)}));
         break;
     }
     case AmdGpu::DataFormat::Format32:
     case AmdGpu::DataFormat::Format32_32:
     case AmdGpu::DataFormat::Format32_32_32:
     case AmdGpu::DataFormat::Format32_32_32_32: {
-        ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint ||
-               num_fmt == AmdGpu::NumberFormat::Float);
-        interpreted = ir.LoadBufferF32(num_components, handle, address, info);
+        ASSERT(format_info.num_format == AmdGpu::NumberFormat::Uint ||
+               format_info.num_format == AmdGpu::NumberFormat::Sint ||
+               format_info.num_format == AmdGpu::NumberFormat::Float);
+        interpreted = ir.LoadBufferF32(format_info.num_components, handle, address, info);
         break;
     }
     default:
-        UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt);
+        UNREACHABLE_MSG("Unsupported buffer data format: {}", format_info.data_format);
     }

     // Pad to 4 components and apply additional modifications.
     boost::container::static_vector<IR::Value, 4> components;
     for (u32 i = 0; i < 4; i++) {
-        if (i < num_components) {
+        if (i < format_info.num_components) {
             const auto component =
-                IR::F32{num_components == 1 ? interpreted : ir.CompositeExtract(interpreted, i)};
-            components.push_back(ApplyReadNumberConversion(ir, component, num_conv));
+                IR::F32{format_info.num_components == 1 ? interpreted
+                                                        : ir.CompositeExtract(interpreted, i)};
+            components.push_back(
+                ApplyReadNumberConversion(ir, component, format_info.num_conversion));
         } else {
             components.push_back(ir.Imm32(0.f));
         }
     }
-    const auto swizzled = ApplySwizzle(ir, ir.CompositeConstruct(components), buffer.DstSelect());
+    const auto swizzled = ApplySwizzle(ir, ir.CompositeConstruct(components), format_info.swizzle);
     return swizzled;
 }

-static void StoreBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
-                              const IR::Value handle, const IR::U32 address, const IR::Value& value,
-                              const IR::BufferInstInfo info) {
-    const auto data_fmt = buffer.GetDataFmt();
-    const auto num_fmt = buffer.GetNumberFmt();
-    const auto num_conv = buffer.GetNumberConversion();
-    const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
-
+static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const IR::U32 address,
+                              const IR::Value& value, const IR::BufferInstInfo info,
+                              const FormatInfo& format_info) {
     // Extract actual number of components and apply additional modifications.
-    const auto swizzled = ApplySwizzle(ir, value, buffer.DstSelect().Inverse());
+    const auto swizzled = ApplySwizzle(ir, value, format_info.swizzle.Inverse());
     boost::container::static_vector<IR::Value, 4> components;
-    for (u32 i = 0; i < num_components; i++) {
+    for (u32 i = 0; i < format_info.num_components; i++) {
         const auto component = IR::F32{ir.CompositeExtract(swizzled, i)};
-        components.push_back(ApplyWriteNumberConversion(ir, component, num_conv));
+        components.push_back(ApplyWriteNumberConversion(ir, component, format_info.num_conversion));
     }
     const auto real_value =
         components.size() == 1 ? components[0] : ir.CompositeConstruct(components);

-    switch (data_fmt) {
+    switch (format_info.data_format) {
     case AmdGpu::DataFormat::FormatInvalid:
         break;
     case AmdGpu::DataFormat::Format8: {
         const auto packed =
-            ir.Pack4x8(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f),
-                                                      ir.Imm32(0.f)));
+            ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f),
+                                                                     ir.Imm32(0.f), ir.Imm32(0.f)));
         ir.StoreBufferU8(handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format8_8: {
-        const auto packed =
-            ir.Pack4x8(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
-                                                      ir.CompositeExtract(real_value, 1),
-                                                      ir.Imm32(0.f), ir.Imm32(0.f)));
+        const auto packed = ir.Pack4x8(format_info.num_format,
+                                       ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
+                                                             ir.CompositeExtract(real_value, 1),
+                                                             ir.Imm32(0.f), ir.Imm32(0.f)));
         ir.StoreBufferU16(handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format8_8_8_8: {
-        auto packed = ir.Pack4x8(num_fmt, real_value);
+        auto packed = ir.Pack4x8(format_info.num_format, real_value);
         ir.StoreBufferU32(1, handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format16: {
-        const auto packed = ir.Pack2x16(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f)));
+        const auto packed =
+            ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f)));
         ir.StoreBufferU16(handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format16_16: {
-        const auto packed = ir.Pack2x16(num_fmt, real_value);
+        const auto packed = ir.Pack2x16(format_info.num_format, real_value);
         ir.StoreBufferU32(1, handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format10_11_11: {
-        const auto packed = ir.Pack10_11_11(num_fmt, real_value);
+        const auto packed = ir.Pack10_11_11(format_info.num_format, real_value);
         ir.StoreBufferU32(1, handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format2_10_10_10: {
-        const auto packed = ir.Pack2_10_10_10(num_fmt, real_value);
+        const auto packed = ir.Pack2_10_10_10(format_info.num_format, real_value);
         ir.StoreBufferU32(1, handle, address, packed, info);
         break;
     }
     case AmdGpu::DataFormat::Format16_16_16_16: {
         const auto packed = ir.CompositeConstruct(
-            ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
-                                                       ir.CompositeExtract(real_value, 1))),
-            ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 2),
-                                                       ir.CompositeExtract(real_value, 3))));
+            ir.Pack2x16(format_info.num_format,
+                        ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
+                                              ir.CompositeExtract(real_value, 1))),
+            ir.Pack2x16(format_info.num_format,
+                        ir.CompositeConstruct(ir.CompositeExtract(real_value, 2),
+                                              ir.CompositeExtract(real_value, 3))));
         ir.StoreBufferU32(2, handle, address, packed, info);
         break;
     }
@@ -171,28 +178,40 @@ static void StoreBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
     case AmdGpu::DataFormat::Format32_32:
     case AmdGpu::DataFormat::Format32_32_32:
     case AmdGpu::DataFormat::Format32_32_32_32: {
-        ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint ||
-               num_fmt == AmdGpu::NumberFormat::Float);
-        ir.StoreBufferF32(num_components, handle, address, real_value, info);
+        ASSERT(format_info.num_format == AmdGpu::NumberFormat::Uint ||
+               format_info.num_format == AmdGpu::NumberFormat::Sint ||
+               format_info.num_format == AmdGpu::NumberFormat::Float);
+        ir.StoreBufferF32(format_info.num_components, handle, address, real_value, info);
         break;
     }
     default:
-        UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt);
+        UNREACHABLE_MSG("Unsupported buffer data format: {}", format_info.data_format);
     }
 }

 static void LowerBufferFormatInst(IR::Block& block, IR::Inst& inst, Info& info) {
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    const auto flags = inst.Flags<IR::BufferInstInfo>();
     const auto desc{info.buffers[inst.Arg(0).U32()]};
     const auto buffer{desc.GetSharp(info)};

+    const auto is_inst_typed = flags.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
+    const auto data_format = is_inst_typed ? flags.inst_data_fmt.Value() : buffer.GetDataFmt();
+    const auto num_format = is_inst_typed ? flags.inst_num_fmt.Value() : buffer.GetNumberFmt();
+    const auto format_info = FormatInfo{
+        .data_format = data_format,
+        .num_format = num_format,
+        .swizzle = is_inst_typed ? AmdGpu::IdentityMapping : buffer.DstSelect(),
+        .num_conversion = AmdGpu::MapNumberConversion(num_format),
+        .num_components = AmdGpu::NumComponents(data_format),
+    };
+
     if (IsBufferFormatLoad(inst)) {
-        const auto interpreted = LoadBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)},
-                                                  inst.Flags<IR::BufferInstInfo>());
+        const auto interpreted =
+            LoadBufferFormat(ir, inst.Arg(0), IR::U32{inst.Arg(1)}, flags, format_info);
         inst.ReplaceUsesWithAndRemove(interpreted);
     } else if (IsBufferFormatStore(inst)) {
-        StoreBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)}, inst.Arg(2),
-                          inst.Flags<IR::BufferInstInfo>());
+        StoreBufferFormat(ir, inst.Arg(0), IR::U32{inst.Arg(1)}, inst.Arg(2), flags, format_info);
         inst.Invalidate();
     }
 }
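For orientation, the Unpack*/Pack* emitter helpers used throughout the lowering interpret a packed integer according to the number format. A standalone illustration of what Unpack4x8 plausibly does for the common unorm case (assumed behavior; the real helper also covers snorm, uint, sint, and other variants selected by the number format):

#include <array>
#include <cstdint>

// Split a 32-bit word into four bytes and normalize each to [0, 1].
std::array<float, 4> Unpack4x8Unorm(std::uint32_t raw) {
    std::array<float, 4> out{};
    for (int i = 0; i < 4; ++i) {
        out[i] = static_cast<float>((raw >> (8 * i)) & 0xFFu) / 255.0f;
    }
    return out;
}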
@@ -7,6 +7,7 @@
 #include "common/bit_field.h"
 #include "common/enum.h"
 #include "common/types.h"
+#include "video_core/amdgpu/types.h"

 namespace Shader::IR {

@@ -52,6 +53,8 @@ union BufferInstInfo {
     BitField<14, 1, u32> system_coherent;
     BitField<15, 1, u32> globally_coherent;
     BitField<16, 1, u32> typed;
+    BitField<17, 4, AmdGpu::DataFormat> inst_data_fmt;
+    BitField<21, 3, AmdGpu::NumberFormat> inst_num_fmt;
 };

 enum class ScalarReg : u32 {
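The two new fields fit because dfmt is a 4-bit and nfmt a 3-bit field in the MTBUF instruction encoding, and bits 17 through 23 of the flags word were free. A standalone sketch of the same packing with plain bit operations (BitField does this generically):

#include <cstdint>

// Bits 17..20 hold the data format, bits 21..23 the number format.
constexpr std::uint32_t PackInstFormats(std::uint32_t flags, std::uint32_t dfmt,
                                        std::uint32_t nfmt) {
    flags &= ~((0xFu << 17) | (0x7u << 21)); // clear both fields first
    return flags | ((dfmt & 0xFu) << 17) | ((nfmt & 0x7u) << 21);
}

static_assert(PackInstFormats(0, 0xF, 0x7) == ((0xFu << 17) | (0x7u << 21)));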
@@ -262,6 +262,13 @@ private:
     }
 };

+static constexpr CompMapping IdentityMapping = {
+    .r = CompSwizzle::Red,
+    .g = CompSwizzle::Green,
+    .b = CompSwizzle::Blue,
+    .a = CompSwizzle::Alpha,
+};
+
 inline DataFormat RemapDataFormat(const DataFormat format) {
     switch (format) {
     case DataFormat::Format11_11_10:
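IdentityMapping gives instruction-typed operations a way to bypass the sharp's DstSelect() swizzle, since MTBUF formats address components directly. A simplified standalone model of component mapping (stand-in types with assumed semantics; the real CompMapping is defined in this header):

#include <array>

enum class Swizzle { Zero, One, Red, Green, Blue, Alpha };

struct Mapping {
    Swizzle r, g, b, a;
};

constexpr Mapping kIdentity{Swizzle::Red, Swizzle::Green, Swizzle::Blue, Swizzle::Alpha};

inline float Select(const std::array<float, 4>& v, Swizzle s) {
    switch (s) {
    case Swizzle::Zero:  return 0.0f;
    case Swizzle::One:   return 1.0f;
    case Swizzle::Red:   return v[0];
    case Swizzle::Green: return v[1];
    case Swizzle::Blue:  return v[2];
    case Swizzle::Alpha: return v[3];
    }
    return 0.0f;
}

// Applying kIdentity returns the input unchanged; a sharp's mapping may not.
inline std::array<float, 4> Apply(const Mapping& m, const std::array<float, 4>& v) {
    return {Select(v, m.r), Select(v, m.g), Select(v, m.b), Select(v, m.a)};
}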