mirror of https://github.com/shadps4-emu/shadPS4.git
shader_recompiler: Replace texel buffers with in-shader buffer format interpretation (#2363)
* shader_recompiler: Replace texel buffers with in-shader buffer format interpretation
* shader_recompiler: Move 10/11-bit float conversion to functions and address some comments.
* vulkan: Remove VK_KHR_maintenance5 as it is no longer needed for buffer views.
* shader_recompiler: Add helpers for composites and bitfields in pack/unpack.
* shader_recompiler: Use initializer_list for bitfield insert helper.
This commit is contained in: parent 78b4f10cc6, commit cfe249debe
35 changed files with 1037 additions and 562 deletions
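In short, typed texel-buffer reads and writes are replaced by raw 32-bit buffer accesses plus explicit pack/unpack arithmetic emitted into the shader (see the new lower_buffer_format_to_raw pass and the pack/unpack helpers below). As a host-side illustration of what the emitted code now computes for an 8-bit unorm RGBA texel; this is a sketch for intuition only, not code from this commit:

#include <array>
#include <cstdint>

// Decode one raw dword as four 8-bit unorm components, equivalent to what
// the OpUnpackUnorm4x8 emitted by the backend computes for this format.
std::array<float, 4> UnpackUnorm4x8(uint32_t dword) {
    std::array<float, 4> out{};
    for (int i = 0; i < 4; ++i) {
        const uint32_t bits = (dword >> (8 * i)) & 0xFFu;
        out[i] = static_cast<float>(bits) / 255.0f;
    }
    return out;
}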
@ -753,6 +753,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/ir/passes/hull_shader_transform.cpp
src/shader_recompiler/ir/passes/identity_removal_pass.cpp
src/shader_recompiler/ir/passes/ir_passes.h
src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
src/shader_recompiler/ir/passes/ring_access_elimination.cpp
externals/sirit (vendored)
@ -1 +1 @@
Subproject commit d6f3c0d99862ab2ff8f95e9ac221560f1f97e29a
Subproject commit 8b9b12c2089505ac8b10fa56bf56b3ed49d9d7b0
@ -250,7 +250,7 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
ctx.AddCapability(spv::Capability::Float64);
}
ctx.AddCapability(spv::Capability::Int64);
if (info.has_storage_images || info.has_image_buffers) {
if (info.has_storage_images) {
ctx.AddCapability(spv::Capability::StorageImageExtendedFormats);
ctx.AddCapability(spv::Capability::StorageImageReadWithoutFormat);
ctx.AddCapability(spv::Capability::StorageImageWriteWithoutFormat);
@ -259,12 +259,6 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
ctx.AddCapability(spv::Capability::ImageReadWriteLodAMD);
}
}
if (info.has_texel_buffers) {
ctx.AddCapability(spv::Capability::SampledBuffer);
}
if (info.has_image_buffers) {
ctx.AddCapability(spv::Capability::ImageBuffer);
}
if (info.has_image_gather) {
ctx.AddCapability(spv::Capability::ImageGatherExtended);
}
|
|
@ -6,6 +6,56 @@
|
|||
|
||||
namespace Shader::Backend::SPIRV {
|
||||
|
||||
struct R {
|
||||
R(u32 a, u32 b) : offset(a), size(b) {}
|
||||
u32 offset;
|
||||
u32 size;
|
||||
};
|
||||
template <bool is_signed, typename... Args>
|
||||
static std::array<Id, sizeof...(Args)> ExtractBitFields(EmitContext& ctx, const Id value,
|
||||
const Args... args) {
|
||||
const auto op_func =
|
||||
is_signed ? &EmitContext::OpBitFieldSExtract : &EmitContext::OpBitFieldUExtract;
|
||||
std::array<Id, sizeof...(Args)> result{};
|
||||
u32 i = 0;
|
||||
(
|
||||
[&] {
|
||||
result[i++] = (ctx.*op_func)(ctx.U32[1], value, ctx.ConstU32(args.offset),
|
||||
ctx.ConstU32(args.size));
|
||||
}(),
|
||||
...);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
static Id InsertBitFields(EmitContext& ctx, const std::initializer_list<Id> values,
|
||||
const Args... args) {
|
||||
Id result{};
|
||||
auto it = values.begin();
|
||||
(
|
||||
[&] {
|
||||
if (it == values.begin()) {
|
||||
result = *it;
|
||||
} else {
|
||||
result = ctx.OpBitFieldInsert(ctx.U32[1], result, *it, ctx.ConstU32(args.offset),
|
||||
ctx.ConstU32(args.size));
|
||||
}
|
||||
++it;
|
||||
}(),
|
||||
...);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <u32 num_components>
|
||||
static std::array<Id, num_components> ExtractComposite(EmitContext& ctx, const VectorIds type,
|
||||
const Id value) {
|
||||
std::array<Id, num_components> result{};
|
||||
for (u32 i = 0; i < num_components; i++) {
|
||||
result[i] = ctx.OpCompositeExtract(type[1], value, i);
|
||||
}
|
||||
return result;
|
||||
}
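ExtractBitFields/InsertBitFields use a C++17 fold expression over a parameter pack of R(offset, size) descriptors, so each format's lane layout can be written on one line at the call site. A minimal host-side analogue of the same pattern, using plain integers instead of SPIR-V Ids (illustration only, not part of the diff):

#include <array>
#include <cstdint>
#include <initializer_list>

struct R {
    uint32_t offset;
    uint32_t size;
};

// Extract each described bit range of 'value' into its own array element.
template <typename... Args>
std::array<uint32_t, sizeof...(Args)> ExtractBitFields(uint32_t value, Args... args) {
    std::array<uint32_t, sizeof...(Args)> result{};
    uint32_t i = 0;
    ((result[i++] = (value >> args.offset) & ((1u << args.size) - 1u)), ...);
    return result;
}

// Insert the n-th value into the n-th described bit range of the result.
template <typename... Args>
uint32_t InsertBitFields(std::initializer_list<uint32_t> values, Args... args) {
    uint32_t result = 0;
    auto it = values.begin();
    ((result |= (*it++ & ((1u << args.size) - 1u)) << args.offset), ...);
    return result;
}

// Example, matching the 2x16 layout used by EmitPackUint2x16/EmitUnpackUint2x16:
// InsertBitFields({0x1234u, 0xABCDu}, R{0, 16}, R{16, 16}) == 0xABCD1234u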
|
||||
|
||||
Id EmitBitCastU16F16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpBitcast(ctx.U16, value);
|
||||
}
|
||||
|
@ -42,22 +92,6 @@ Id EmitPackFloat2x32(EmitContext& ctx, Id value) {
|
|||
return ctx.OpBitcast(ctx.F64[1], value);
|
||||
}
|
||||
|
||||
Id EmitPackFloat2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpBitcast(ctx.U32[1], value);
|
||||
}
|
||||
|
||||
Id EmitUnpackFloat2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpBitcast(ctx.F16[2], value);
|
||||
}
|
||||
|
||||
Id EmitPackHalf2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpPackHalf2x16(ctx.U32[1], value);
|
||||
}
|
||||
|
||||
Id EmitUnpackHalf2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpUnpackHalf2x16(ctx.F32[2], value);
|
||||
}
|
||||
|
||||
Id EmitPackUnorm2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpPackUnorm2x16(ctx.U32[1], value);
|
||||
}
|
||||
|
@ -75,31 +109,157 @@ Id EmitUnpackSnorm2x16(EmitContext& ctx, Id value) {
|
|||
}
|
||||
|
||||
Id EmitPackUint2x16(EmitContext& ctx, Id value) {
|
||||
// No SPIR-V instruction for this, do it manually.
|
||||
const auto x{ctx.OpCompositeExtract(ctx.U32[1], value, 0)};
|
||||
const auto y{ctx.OpCompositeExtract(ctx.U32[1], value, 1)};
|
||||
return ctx.OpBitFieldInsert(ctx.U32[1], x, y, ctx.ConstU32(16U), ctx.ConstU32(16U));
|
||||
const auto unpacked{ctx.OpBitcast(ctx.U32[2], value)};
|
||||
const auto [x, y] = ExtractComposite<2>(ctx, ctx.U32, unpacked);
|
||||
return InsertBitFields(ctx, {x, y}, R(0, 16), R(16, 16));
|
||||
}
|
||||
|
||||
Id EmitUnpackUint2x16(EmitContext& ctx, Id value) {
|
||||
// No SPIR-V instruction for this, do it manually.
|
||||
const auto x{ctx.OpBitFieldUExtract(ctx.U32[1], value, ctx.ConstU32(0U), ctx.ConstU32(16U))};
|
||||
const auto y{ctx.OpBitFieldUExtract(ctx.U32[1], value, ctx.ConstU32(16U), ctx.ConstU32(16U))};
|
||||
return ctx.OpCompositeConstruct(ctx.U32[2], x, y);
|
||||
const auto [x, y] = ExtractBitFields<false>(ctx, value, R(0, 16), R(16, 16));
|
||||
const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[2], x, y)};
|
||||
return ctx.OpBitcast(ctx.F32[2], unpacked);
|
||||
}
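EmitUnpackUint2x16 and EmitUnpackSint2x16 differ only in whether the 16-bit lanes are zero- or sign-extended, and both bitcast the result to F32x2 because the IR treats every unpack as producing a generic vector of 32-bit lanes. Host-side sketch of the two extractions (illustration only):

#include <cstdint>

// Zero-extend a 16-bit lane (EmitUnpackUint2x16, OpBitFieldUExtract).
uint32_t UnpackU16Lane(uint32_t packed, unsigned lane) {
    return (packed >> (16 * lane)) & 0xFFFFu;
}

// Sign-extend a 16-bit lane (EmitUnpackSint2x16, OpBitFieldSExtract);
// the int16_t cast performs the two's-complement narrowing.
int32_t UnpackS16Lane(uint32_t packed, unsigned lane) {
    return static_cast<int16_t>((packed >> (16 * lane)) & 0xFFFFu);
}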
|
||||
|
||||
Id EmitPackSint2x16(EmitContext& ctx, Id value) {
|
||||
// No SPIR-V instruction for this, do it manually.
|
||||
const auto x{ctx.OpCompositeExtract(ctx.U32[1], value, 0)};
|
||||
const auto y{ctx.OpCompositeExtract(ctx.U32[1], value, 1)};
|
||||
return ctx.OpBitFieldInsert(ctx.U32[1], x, y, ctx.ConstU32(16U), ctx.ConstU32(16U));
|
||||
return EmitPackUint2x16(ctx, value);
|
||||
}
|
||||
|
||||
Id EmitUnpackSint2x16(EmitContext& ctx, Id value) {
|
||||
// No SPIR-V instruction for this, do it manually.
|
||||
const auto x{ctx.OpBitFieldSExtract(ctx.U32[1], value, ctx.ConstU32(0U), ctx.ConstU32(16U))};
|
||||
const auto y{ctx.OpBitFieldSExtract(ctx.U32[1], value, ctx.ConstU32(16U), ctx.ConstU32(16U))};
|
||||
return ctx.OpCompositeConstruct(ctx.U32[2], x, y);
|
||||
const auto [x, y] = ExtractBitFields<true>(ctx, value, R(0, 16), R(16, 16));
|
||||
const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[2], x, y)};
|
||||
return ctx.OpBitcast(ctx.F32[2], unpacked);
|
||||
}
|
||||
|
||||
Id EmitPackHalf2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpPackHalf2x16(ctx.U32[1], value);
|
||||
}
|
||||
|
||||
Id EmitUnpackHalf2x16(EmitContext& ctx, Id value) {
|
||||
return ctx.OpUnpackHalf2x16(ctx.F32[2], value);
|
||||
}
|
||||
|
||||
Id EmitPackUnorm4x8(EmitContext& ctx, Id value) {
|
||||
return ctx.OpPackUnorm4x8(ctx.U32[1], value);
|
||||
}
|
||||
|
||||
Id EmitUnpackUnorm4x8(EmitContext& ctx, Id value) {
|
||||
return ctx.OpUnpackUnorm4x8(ctx.F32[4], value);
|
||||
}
|
||||
|
||||
Id EmitPackSnorm4x8(EmitContext& ctx, Id value) {
|
||||
return ctx.OpPackSnorm4x8(ctx.U32[1], value);
|
||||
}
|
||||
|
||||
Id EmitUnpackSnorm4x8(EmitContext& ctx, Id value) {
|
||||
return ctx.OpUnpackSnorm4x8(ctx.F32[4], value);
|
||||
}
|
||||
|
||||
Id EmitPackUint4x8(EmitContext& ctx, Id value) {
|
||||
const auto unpacked{ctx.OpBitcast(ctx.U32[4], value)};
|
||||
const auto [x, y, z, w] = ExtractComposite<4>(ctx, ctx.U32, unpacked);
|
||||
return InsertBitFields(ctx, {x, y, z, w}, R(0, 8), R(8, 8), R(16, 8), R(24, 8));
|
||||
}
|
||||
|
||||
Id EmitUnpackUint4x8(EmitContext& ctx, Id value) {
|
||||
const auto [x, y, z, w] =
|
||||
ExtractBitFields<false>(ctx, value, R(0, 8), R(8, 8), R(16, 8), R(24, 8));
|
||||
const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)};
|
||||
return ctx.OpBitcast(ctx.F32[4], unpacked);
|
||||
}
|
||||
|
||||
Id EmitPackSint4x8(EmitContext& ctx, Id value) {
|
||||
return EmitPackUint4x8(ctx, value);
|
||||
}
|
||||
|
||||
Id EmitUnpackSint4x8(EmitContext& ctx, Id value) {
|
||||
const auto [x, y, z, w] =
|
||||
ExtractBitFields<true>(ctx, value, R(0, 8), R(8, 8), R(16, 8), R(24, 8));
|
||||
const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)};
|
||||
return ctx.OpBitcast(ctx.F32[4], unpacked);
|
||||
}
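Note that EmitPackSint4x8 simply forwards to EmitPackUint4x8: packing keeps only the low 8 bits of each lane, so the signedness of the source does not matter; only the unpack direction needs separate zero- versus sign-extension. A short host-side check of that property (illustrative):

#include <cassert>
#include <cstdint>

uint32_t PackLane8(uint32_t lane_bits, unsigned lane) {
    return (lane_bits & 0xFFu) << (8 * lane);
}

int main() {
    // -1 stored as a signed byte and 255 stored as an unsigned byte produce
    // the same packed bits; the distinction only matters when unpacking.
    assert(PackLane8(static_cast<uint32_t>(-1), 0) == PackLane8(255u, 0));
    return 0;
}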
|
||||
|
||||
Id EmitPackUfloat10_11_11(EmitContext& ctx, Id value) {
|
||||
const auto [x, y, z] = ExtractComposite<3>(ctx, ctx.F32, value);
|
||||
const auto cvt_x{ctx.OpFunctionCall(ctx.U32[1], ctx.f32_to_uf11, x)};
|
||||
const auto cvt_y{ctx.OpFunctionCall(ctx.U32[1], ctx.f32_to_uf11, y)};
|
||||
const auto cvt_z{ctx.OpFunctionCall(ctx.U32[1], ctx.f32_to_uf10, z)};
|
||||
return InsertBitFields(ctx, {cvt_x, cvt_y, cvt_z}, R(0, 11), R(11, 11), R(22, 10));
|
||||
}
|
||||
|
||||
Id EmitUnpackUfloat10_11_11(EmitContext& ctx, Id value) {
|
||||
const auto [x, y, z] = ExtractBitFields<false>(ctx, value, R(0, 11), R(11, 11), R(22, 10));
|
||||
const auto cvt_x{ctx.OpFunctionCall(ctx.F32[1], ctx.uf11_to_f32, x)};
|
||||
const auto cvt_y{ctx.OpFunctionCall(ctx.F32[1], ctx.uf11_to_f32, y)};
|
||||
const auto cvt_z{ctx.OpFunctionCall(ctx.F32[1], ctx.uf10_to_f32, z)};
|
||||
return ctx.OpCompositeConstruct(ctx.F32[3], cvt_x, cvt_y, cvt_z);
|
||||
}
|
||||
|
||||
Id EmitPackUnorm2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto unorm_min{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(0.f), ctx.ConstF32(0.f),
|
||||
ctx.ConstF32(0.f), ctx.ConstF32(0.f))};
|
||||
const auto unorm_max{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1.f), ctx.ConstF32(1.f),
|
||||
ctx.ConstF32(1.f), ctx.ConstF32(1.f))};
|
||||
const auto clamped{ctx.OpFClamp(ctx.F32[4], value, unorm_min, unorm_max)};
|
||||
const auto unorm_mul{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1023.f),
|
||||
ctx.ConstF32(1023.f), ctx.ConstF32(1023.f),
|
||||
ctx.ConstF32(3.f))};
|
||||
const auto as_float{ctx.OpFMul(ctx.F32[4], clamped, unorm_mul)};
|
||||
const auto as_uint{ctx.OpConvertFToU(ctx.U32[4], ctx.OpRoundEven(ctx.F32[4], as_float))};
|
||||
return EmitPackUint2_10_10_10(ctx, ctx.OpBitcast(ctx.F32[4], as_uint));
|
||||
}
|
||||
|
||||
Id EmitUnpackUnorm2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto unpacked{ctx.OpBitcast(ctx.U32[4], EmitUnpackUint2_10_10_10(ctx, value))};
|
||||
const auto as_float{ctx.OpConvertUToF(ctx.F32[4], unpacked)};
|
||||
const auto unorm_div{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1023.f),
|
||||
ctx.ConstF32(1023.f), ctx.ConstF32(1023.f),
|
||||
ctx.ConstF32(3.f))};
|
||||
return ctx.OpFDiv(ctx.F32[4], as_float, unorm_div);
|
||||
}
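EmitPackUnorm2_10_10_10 clamps each component to [0, 1], scales the first three by 1023 and the fourth by 3 (its field is only 2 bits wide), rounds with OpRoundEven, and then reuses the plain Uint2_10_10_10 bit packing; the unpack path is the inverse division. The same math on the CPU, as an illustrative sketch:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Encode one unorm channel into 'bits' bits: clamp, scale, then round with the
// current rounding mode (to-nearest-even by default, matching OpRoundEven).
uint32_t EncodeUnorm(float v, unsigned bits) {
    const float max = static_cast<float>((1u << bits) - 1u);
    return static_cast<uint32_t>(std::nearbyint(std::clamp(v, 0.0f, 1.0f) * max));
}

// Pack x/y/z into 10-bit fields and w into the top 2 bits (UNORM 2_10_10_10).
uint32_t PackUnorm2_10_10_10(float x, float y, float z, float w) {
    return EncodeUnorm(x, 10) | (EncodeUnorm(y, 10) << 10) |
           (EncodeUnorm(z, 10) << 20) | (EncodeUnorm(w, 2) << 30);
}

// Decode one channel back to [0, 1].
float DecodeUnorm(uint32_t field, unsigned bits) {
    return static_cast<float>(field) / static_cast<float>((1u << bits) - 1u);
}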
|
||||
|
||||
Id EmitPackSnorm2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto snorm_min{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(-1.f), ctx.ConstF32(-1.f),
|
||||
ctx.ConstF32(-1.f), ctx.ConstF32(-1.f))};
|
||||
const auto snorm_max{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1.f), ctx.ConstF32(1.f),
|
||||
ctx.ConstF32(1.f), ctx.ConstF32(1.f))};
|
||||
const auto clamped{ctx.OpFClamp(ctx.F32[4], value, snorm_min, snorm_max)};
|
||||
const auto snorm_mul{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(511.f), ctx.ConstF32(511.f),
|
||||
ctx.ConstF32(511.f), ctx.ConstF32(1.f))};
|
||||
const auto as_float{ctx.OpFMul(ctx.F32[4], clamped, snorm_mul)};
|
||||
const auto as_sint{ctx.OpConvertFToS(ctx.U32[4], ctx.OpRoundEven(ctx.F32[4], as_float))};
|
||||
return EmitPackSint2_10_10_10(ctx, ctx.OpBitcast(ctx.F32[4], as_sint));
|
||||
}
|
||||
|
||||
Id EmitUnpackSnorm2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto unpacked{ctx.OpBitcast(ctx.U32[4], EmitUnpackSint2_10_10_10(ctx, value))};
|
||||
const auto as_float{ctx.OpConvertSToF(ctx.F32[4], unpacked)};
|
||||
const auto snorm_div{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(511.f), ctx.ConstF32(511.f),
|
||||
ctx.ConstF32(511.f), ctx.ConstF32(1.f))};
|
||||
return ctx.OpFDiv(ctx.F32[4], as_float, snorm_div);
|
||||
}
|
||||
|
||||
Id EmitPackUint2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto unpacked{ctx.OpBitcast(ctx.U32[4], value)};
|
||||
const auto [x, y, z, w] = ExtractComposite<4>(ctx, ctx.U32, unpacked);
|
||||
return InsertBitFields(ctx, {x, y, z, w}, R(0, 10), R(10, 10), R(20, 10), R(30, 2));
|
||||
}
|
||||
|
||||
Id EmitUnpackUint2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto [x, y, z, w] =
|
||||
ExtractBitFields<false>(ctx, value, R(0, 10), R(10, 10), R(20, 10), R(30, 2));
|
||||
const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)};
|
||||
return ctx.OpBitcast(ctx.F32[4], unpacked);
|
||||
}
|
||||
|
||||
Id EmitPackSint2_10_10_10(EmitContext& ctx, Id value) {
|
||||
return EmitPackUint2_10_10_10(ctx, value);
|
||||
}
|
||||
|
||||
Id EmitUnpackSint2_10_10_10(EmitContext& ctx, Id value) {
|
||||
const auto [x, y, z, w] =
|
||||
ExtractBitFields<true>(ctx, value, R(0, 10), R(10, 10), R(20, 10), R(30, 2));
|
||||
const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)};
|
||||
return ctx.OpBitcast(ctx.F32[4], unpacked);
|
||||
}
|
||||
|
||||
} // namespace Shader::Backend::SPIRV
|
||||
|
|
|
@ -24,6 +24,10 @@ Id EmitCompositeConstructU32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, I
|
|||
return EmitCompositeConstruct(ctx, inst, ctx.U32[4], e1, e2, e3, e4);
|
||||
}
|
||||
|
||||
Id EmitCompositeConstructU32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2) {
|
||||
return EmitCompositeConstruct(ctx, inst, ctx.U32[4], e1, e2);
|
||||
}
|
||||
|
||||
Id EmitCompositeExtractU32x2(EmitContext& ctx, Id composite, u32 index) {
|
||||
return ctx.OpCompositeExtract(ctx.U32[1], composite, index);
|
||||
}
|
||||
|
@ -124,6 +128,10 @@ Id EmitCompositeConstructF32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, I
|
|||
return EmitCompositeConstruct(ctx, inst, ctx.F32[4], e1, e2, e3, e4);
|
||||
}
|
||||
|
||||
Id EmitCompositeConstructF32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2) {
|
||||
return EmitCompositeConstruct(ctx, inst, ctx.F32[4], e1, e2);
|
||||
}
|
||||
|
||||
Id EmitCompositeExtractF32x2(EmitContext& ctx, Id composite, u32 index) {
|
||||
return ctx.OpCompositeExtract(ctx.F32[1], composite, index);
|
||||
}
|
||||
|
|
|
@ -416,6 +416,20 @@ static Id EmitLoadBufferU32xN(EmitContext& ctx, u32 handle, Id address) {
|
|||
}
|
||||
}
|
||||
|
||||
Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
|
||||
const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(3u))};
|
||||
const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
|
||||
const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
|
||||
return ctx.OpBitFieldUExtract(ctx.U32[1], dword, bit_offset, ctx.ConstU32(8u));
|
||||
}
|
||||
|
||||
Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
|
||||
const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(2u))};
|
||||
const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
|
||||
const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
|
||||
return ctx.OpBitFieldUExtract(ctx.U32[1], dword, bit_offset, ctx.ConstU32(16u));
|
||||
}
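EmitLoadBufferU8/U16 implement sub-dword loads on top of plain 32-bit buffer accesses: the dword containing the address is loaded and the wanted bits are extracted, with bit_offset = byte_index * 8. For U16 the address is masked with 2 rather than 3, so 2-byte alignment is assumed. Equivalent host-side logic, assuming the containing dword has already been fetched (sketch only):

#include <cstdint>

// Extract an 8-bit value from the dword that contains byte address 'addr'.
uint32_t LoadU8FromDword(uint32_t dword, uint32_t addr) {
    const uint32_t bit_offset = (addr & 3u) * 8u;
    return (dword >> bit_offset) & 0xFFu;
}

// Extract a 16-bit value; only bit 1 of the address is honoured, mirroring
// the `address & 2` mask above (2-byte alignment assumed).
uint32_t LoadU16FromDword(uint32_t dword, uint32_t addr) {
    const uint32_t bit_offset = (addr & 2u) * 8u;
    return (dword >> bit_offset) & 0xFFFFu;
}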
|
||||
|
||||
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
|
||||
return EmitLoadBufferU32xN<1>(ctx, handle, address);
|
||||
}
|
||||
|
@ -432,18 +446,24 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
|
|||
return EmitLoadBufferU32xN<4>(ctx, handle, address);
|
||||
}
|
||||
|
||||
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
return ctx.OpBitcast(ctx.F32[1], EmitLoadBufferU32(ctx, inst, handle, address));
|
||||
}
|
||||
|
||||
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
return ctx.OpBitcast(ctx.F32[2], EmitLoadBufferU32x2(ctx, inst, handle, address));
|
||||
}
|
||||
|
||||
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
return ctx.OpBitcast(ctx.F32[3], EmitLoadBufferU32x3(ctx, inst, handle, address));
|
||||
}
|
||||
|
||||
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
return ctx.OpBitcast(ctx.F32[4], EmitLoadBufferU32x4(ctx, inst, handle, address));
|
||||
}
|
||||
|
||||
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
const auto& buffer = ctx.texture_buffers[handle];
|
||||
const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
|
||||
const Id coord =
|
||||
ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift),
|
||||
buffer.coord_offset);
|
||||
Id texel = buffer.is_storage ? ctx.OpImageRead(buffer.result_type, tex_buffer, coord)
|
||||
: ctx.OpImageFetch(buffer.result_type, tex_buffer, coord);
|
||||
if (buffer.is_integer) {
|
||||
texel = ctx.OpBitcast(ctx.F32[4], texel);
|
||||
}
|
||||
return texel;
|
||||
UNREACHABLE_MSG("SPIR-V instruction");
|
||||
}
|
||||
|
||||
template <u32 N>
|
||||
|
@ -464,32 +484,56 @@ static void EmitStoreBufferU32xN(EmitContext& ctx, u32 handle, Id address, Id va
|
|||
}
|
||||
}
|
||||
|
||||
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
|
||||
const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(3u))};
|
||||
const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
|
||||
const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
|
||||
const Id new_val{ctx.OpBitFieldInsert(ctx.U32[1], dword, value, bit_offset, ctx.ConstU32(8u))};
|
||||
EmitStoreBufferU32xN<1>(ctx, handle, address, new_val);
|
||||
}
|
||||
|
||||
void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
|
||||
const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(2u))};
|
||||
const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
|
||||
const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
|
||||
const Id new_val{ctx.OpBitFieldInsert(ctx.U32[1], dword, value, bit_offset, ctx.ConstU32(16u))};
|
||||
EmitStoreBufferU32xN<1>(ctx, handle, address, new_val);
|
||||
}
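The store path is the mirror image: the containing dword is loaded, the 8- or 16-bit field is replaced with OpBitFieldInsert, and the whole dword is written back as a read-modify-write (not atomic). Host-side equivalent of the insert step (illustrative):

#include <cstdint>

// Replace 'size' bits of 'dword' starting at 'bit_offset' with 'value';
// the CPU analogue of the OpBitFieldInsert used by EmitStoreBufferU8/U16.
uint32_t BitFieldInsert(uint32_t dword, uint32_t value, uint32_t bit_offset, uint32_t size) {
    const uint32_t mask = ((1u << size) - 1u) << bit_offset;
    return (dword & ~mask) | ((value << bit_offset) & mask);
}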
|
||||
|
||||
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32xN<1>(ctx, handle, address, value);
|
||||
}
|
||||
|
||||
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32xN<2>(ctx, handle, address, value);
|
||||
}
|
||||
|
||||
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32xN<3>(ctx, handle, address, value);
|
||||
}
|
||||
|
||||
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32xN<4>(ctx, handle, address, value);
|
||||
}
|
||||
|
||||
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[1], value));
|
||||
}
|
||||
|
||||
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32x2(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[2], value));
|
||||
}
|
||||
|
||||
void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32x3(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[3], value));
|
||||
}
|
||||
|
||||
void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
EmitStoreBufferU32x4(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[4], value));
|
||||
}
|
||||
|
||||
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
const auto& buffer = ctx.texture_buffers[handle];
|
||||
const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
|
||||
const Id coord =
|
||||
ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift),
|
||||
buffer.coord_offset);
|
||||
if (buffer.is_integer) {
|
||||
value = ctx.OpBitcast(buffer.result_type, value);
|
||||
}
|
||||
ctx.OpImageWrite(tex_buffer, coord, value);
|
||||
UNREACHABLE_MSG("SPIR-V instruction");
|
||||
}
|
||||
|
||||
} // namespace Shader::Backend::SPIRV
|
||||
|
|
|
@ -63,15 +63,27 @@ void EmitGetGotoVariable(EmitContext& ctx);
|
|||
void EmitSetScc(EmitContext& ctx);
|
||||
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst);
|
||||
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
|
||||
Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
void EmitStoreBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
|
@ -123,6 +135,7 @@ Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value);
|
|||
Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2);
|
||||
Id EmitCompositeConstructU32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3);
|
||||
Id EmitCompositeConstructU32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3, Id e4);
|
||||
Id EmitCompositeConstructU32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2);
|
||||
Id EmitCompositeExtractU32x2(EmitContext& ctx, Id composite, u32 index);
|
||||
Id EmitCompositeExtractU32x3(EmitContext& ctx, Id composite, u32 index);
|
||||
Id EmitCompositeExtractU32x4(EmitContext& ctx, Id composite, u32 index);
|
||||
|
@ -151,6 +164,7 @@ Id EmitCompositeShuffleF16x4(EmitContext& ctx, Id composite1, Id composite2, u32
|
|||
Id EmitCompositeConstructF32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2);
|
||||
Id EmitCompositeConstructF32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3);
|
||||
Id EmitCompositeConstructF32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3, Id e4);
|
||||
Id EmitCompositeConstructF32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2);
|
||||
Id EmitCompositeExtractF32x2(EmitContext& ctx, Id composite, u32 index);
|
||||
Id EmitCompositeExtractF32x3(EmitContext& ctx, Id composite, u32 index);
|
||||
Id EmitCompositeExtractF32x4(EmitContext& ctx, Id composite, u32 index);
|
||||
|
@ -193,10 +207,6 @@ void EmitBitCastF64U64(EmitContext& ctx);
|
|||
Id EmitPackUint2x32(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUint2x32(EmitContext& ctx, Id value);
|
||||
Id EmitPackFloat2x32(EmitContext& ctx, Id value);
|
||||
Id EmitPackFloat2x16(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackFloat2x16(EmitContext& ctx, Id value);
|
||||
Id EmitPackHalf2x16(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackHalf2x16(EmitContext& ctx, Id value);
|
||||
Id EmitPackUnorm2x16(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUnorm2x16(EmitContext& ctx, Id value);
|
||||
Id EmitPackSnorm2x16(EmitContext& ctx, Id value);
|
||||
|
@ -205,6 +215,26 @@ Id EmitPackUint2x16(EmitContext& ctx, Id value);
|
|||
Id EmitUnpackUint2x16(EmitContext& ctx, Id value);
|
||||
Id EmitPackSint2x16(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackSint2x16(EmitContext& ctx, Id value);
|
||||
Id EmitPackHalf2x16(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackHalf2x16(EmitContext& ctx, Id value);
|
||||
Id EmitPackUnorm4x8(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUnorm4x8(EmitContext& ctx, Id value);
|
||||
Id EmitPackSnorm4x8(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackSnorm4x8(EmitContext& ctx, Id value);
|
||||
Id EmitPackUint4x8(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUint4x8(EmitContext& ctx, Id value);
|
||||
Id EmitPackSint4x8(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackSint4x8(EmitContext& ctx, Id value);
|
||||
Id EmitPackUfloat10_11_11(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUfloat10_11_11(EmitContext& ctx, Id value);
|
||||
Id EmitPackUnorm2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUnorm2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitPackSnorm2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackSnorm2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitPackUint2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackUint2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitPackSint2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitUnpackSint2_10_10_10(EmitContext& ctx, Id value);
|
||||
Id EmitFPAbs16(EmitContext& ctx, Id value);
|
||||
Id EmitFPAbs32(EmitContext& ctx, Id value);
|
||||
Id EmitFPAbs64(EmitContext& ctx, Id value);
|
||||
|
|
|
@ -74,8 +74,8 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
|
|||
DefineInterfaces();
|
||||
DefineSharedMemory();
|
||||
DefineBuffers();
|
||||
DefineTextureBuffers();
|
||||
DefineImagesAndSamplers();
|
||||
DefineFunctions();
|
||||
}
|
||||
|
||||
EmitContext::~EmitContext() = default;
|
||||
|
@ -205,19 +205,6 @@ void EmitContext::DefineBufferOffsets() {
|
|||
buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U));
|
||||
Name(buffer.offset_dwords, fmt::format("buf{}_dword_off", binding));
|
||||
}
|
||||
for (TextureBufferDefinition& tex_buffer : texture_buffers) {
|
||||
const u32 binding = tex_buffer.binding;
|
||||
const u32 half = PushData::BufOffsetIndex + (binding >> 4);
|
||||
const u32 comp = (binding & 0xf) >> 2;
|
||||
const u32 offset = (binding & 0x3) << 3;
|
||||
const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]),
|
||||
push_data_block, ConstU32(half), ConstU32(comp))};
|
||||
const Id value{OpLoad(U32[1], ptr)};
|
||||
tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(6U));
|
||||
tex_buffer.coord_shift =
|
||||
OpBitFieldUExtract(U32[1], value, ConstU32(offset + 6U), ConstU32(2U));
|
||||
Name(tex_buffer.coord_offset, fmt::format("texbuf{}_off", binding));
|
||||
}
|
||||
}
|
||||
|
||||
void EmitContext::DefineInterpolatedAttribs() {
|
||||
|
@ -676,32 +663,6 @@ void EmitContext::DefineBuffers() {
|
|||
}
|
||||
}
|
||||
|
||||
void EmitContext::DefineTextureBuffers() {
|
||||
for (const auto& desc : info.texture_buffers) {
|
||||
const auto sharp = desc.GetSharp(info);
|
||||
const auto nfmt = sharp.GetNumberFmt();
|
||||
const bool is_integer = AmdGpu::IsInteger(nfmt);
|
||||
const VectorIds& sampled_type{GetAttributeType(*this, nfmt)};
|
||||
const u32 sampled = desc.is_written ? 2 : 1;
|
||||
const Id image_type{TypeImage(sampled_type[1], spv::Dim::Buffer, false, false, false,
|
||||
sampled, spv::ImageFormat::Unknown)};
|
||||
const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, image_type)};
|
||||
const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)};
|
||||
Decorate(id, spv::Decoration::Binding, binding.unified++);
|
||||
Decorate(id, spv::Decoration::DescriptorSet, 0U);
|
||||
Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sharp_idx));
|
||||
texture_buffers.push_back({
|
||||
.id = id,
|
||||
.binding = binding.buffer++,
|
||||
.image_type = image_type,
|
||||
.result_type = sampled_type[4],
|
||||
.is_integer = is_integer,
|
||||
.is_storage = desc.is_written,
|
||||
});
|
||||
interfaces.push_back(id);
|
||||
}
|
||||
}
|
||||
|
||||
spv::ImageFormat GetFormat(const AmdGpu::Image& image) {
|
||||
if (image.GetDataFmt() == AmdGpu::DataFormat::Format32 &&
|
||||
image.GetNumberFmt() == AmdGpu::NumberFormat::Uint) {
|
||||
|
@ -893,4 +854,117 @@ void EmitContext::DefineSharedMemory() {
|
|||
}
|
||||
}
|
||||
|
||||
Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
|
||||
// https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/util/format_r11g11b10f.h
|
||||
const auto func_type{TypeFunction(U32[1], F32[1])};
|
||||
const auto func{OpFunction(U32[1], spv::FunctionControlMask::MaskNone, func_type)};
|
||||
const auto value{OpFunctionParameter(F32[1])};
|
||||
Name(func, name);
|
||||
AddLabel();
|
||||
|
||||
const auto raw_value{OpBitcast(U32[1], value)};
|
||||
const auto exponent{
|
||||
OpBitcast(S32[1], OpBitFieldSExtract(U32[1], raw_value, ConstU32(23U), ConstU32(8U)))};
|
||||
const auto sign{OpBitFieldUExtract(U32[1], raw_value, ConstU32(31U), ConstU32(1U))};
|
||||
|
||||
const auto is_zero{OpLogicalOr(U1[1], OpIEqual(U1[1], raw_value, ConstU32(0U)),
|
||||
OpIEqual(U1[1], sign, ConstU32(1U)))};
|
||||
const auto is_nan{OpIsNan(U1[1], value)};
|
||||
const auto is_inf{OpIsInf(U1[1], value)};
|
||||
const auto is_denorm{OpSLessThanEqual(U1[1], exponent, ConstS32(-15))};
|
||||
|
||||
const auto denorm_mantissa{OpConvertFToU(
|
||||
U32[1],
|
||||
OpRoundEven(F32[1], OpFMul(F32[1], value,
|
||||
ConstF32(static_cast<float>(1 << (mantissa_bits + 14))))))};
|
||||
const auto denorm_overflow{
|
||||
OpINotEqual(U1[1], OpShiftRightLogical(U32[1], denorm_mantissa, ConstU32(mantissa_bits)),
|
||||
ConstU32(0U))};
|
||||
const auto denorm{
|
||||
OpSelect(U32[1], denorm_overflow, ConstU32(1U << mantissa_bits), denorm_mantissa)};
|
||||
|
||||
const auto norm_mantissa{OpConvertFToU(
|
||||
U32[1],
|
||||
OpRoundEven(F32[1],
|
||||
OpLdexp(F32[1], value,
|
||||
OpISub(S32[1], ConstS32(static_cast<int>(mantissa_bits)), exponent))))};
|
||||
const auto norm_overflow{
|
||||
OpUGreaterThanEqual(U1[1], norm_mantissa, ConstU32(2U << mantissa_bits))};
|
||||
const auto norm_final_mantissa{OpBitwiseAnd(
|
||||
U32[1],
|
||||
OpSelect(U32[1], norm_overflow, OpShiftRightLogical(U32[1], norm_mantissa, ConstU32(1U)),
|
||||
norm_mantissa),
|
||||
ConstU32((1U << mantissa_bits) - 1))};
|
||||
const auto norm_final_exponent{OpBitcast(
|
||||
U32[1],
|
||||
OpIAdd(S32[1],
|
||||
OpSelect(S32[1], norm_overflow, OpIAdd(S32[1], exponent, ConstS32(1)), exponent),
|
||||
ConstS32(15)))};
|
||||
const auto norm{OpBitFieldInsert(U32[1], norm_final_mantissa, norm_final_exponent,
|
||||
ConstU32(mantissa_bits), ConstU32(5U))};
|
||||
|
||||
const auto result{OpSelect(U32[1], is_zero, ConstU32(0U),
|
||||
OpSelect(U32[1], is_nan, ConstU32(31u << mantissa_bits | 1U),
|
||||
OpSelect(U32[1], is_inf, ConstU32(31U << mantissa_bits),
|
||||
OpSelect(U32[1], is_denorm, denorm, norm))))};
|
||||
|
||||
OpReturnValue(result);
|
||||
OpFunctionEnd();
|
||||
return func;
|
||||
}
|
||||
|
||||
Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_view name) {
|
||||
// https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/util/format_r11g11b10f.h
|
||||
const auto func_type{TypeFunction(F32[1], U32[1])};
|
||||
const auto func{OpFunction(F32[1], spv::FunctionControlMask::MaskNone, func_type)};
|
||||
const auto value{OpFunctionParameter(U32[1])};
|
||||
Name(func, name);
|
||||
AddLabel();
|
||||
|
||||
const auto raw_mantissa{
|
||||
OpBitFieldUExtract(U32[1], value, ConstU32(0U), ConstU32(mantissa_bits))};
|
||||
const auto mantissa{OpConvertUToF(F32[1], raw_mantissa)};
|
||||
const auto exponent{OpBitcast(
|
||||
S32[1], OpBitFieldSExtract(U32[1], value, ConstU32(mantissa_bits), ConstU32(5U)))};
|
||||
|
||||
const auto is_exp_neg_one{OpIEqual(U1[1], exponent, ConstS32(-1))};
|
||||
const auto is_exp_zero{OpIEqual(U1[1], exponent, ConstS32(0))};
|
||||
|
||||
const auto is_zero{OpIEqual(U1[1], value, ConstU32(0u))};
|
||||
const auto is_nan{
|
||||
OpLogicalAnd(U1[1], is_exp_neg_one, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))};
|
||||
const auto is_inf{
|
||||
OpLogicalAnd(U1[1], is_exp_neg_one, OpIEqual(U1[1], raw_mantissa, ConstU32(0u)))};
|
||||
const auto is_denorm{
|
||||
OpLogicalAnd(U1[1], is_exp_zero, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))};
|
||||
|
||||
const auto denorm{OpFMul(F32[1], mantissa, ConstF32(1.f / (1 << 20)))};
|
||||
const auto norm{OpLdexp(
|
||||
F32[1],
|
||||
OpFAdd(F32[1],
|
||||
OpFMul(F32[1], mantissa, ConstF32(1.f / static_cast<float>(1 << mantissa_bits))),
|
||||
ConstF32(1.f)),
|
||||
exponent)};
|
||||
|
||||
const auto result{OpSelect(F32[1], is_zero, ConstF32(0.f),
|
||||
OpSelect(F32[1], is_nan, ConstF32(NAN),
|
||||
OpSelect(F32[1], is_inf, ConstF32(INFINITY),
|
||||
OpSelect(F32[1], is_denorm, denorm, norm))))};
|
||||
|
||||
OpReturnValue(result);
|
||||
OpFunctionEnd();
|
||||
return func;
|
||||
}
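Both helper functions implement the unsigned "small float" layout used by the 10_11_11 buffer format: no sign bit, a 5-bit exponent with bias 15, and a 6-bit (UF11) or 5-bit (UF10) mantissa, following the Mesa reference linked in the comments. A CPU-side decoder for the same layout, useful for sanity-checking the generated functions (a sketch that assumes the standard E5M6/E5M5 interpretation):

#include <cmath>
#include <cstdint>
#include <limits>

// Decode an unsigned small float with a 5-bit exponent (bias 15) and
// 'mantissa_bits' mantissa bits: 6 for UF11, 5 for UF10.
float UfloatM5ToFloat32(uint32_t value, unsigned mantissa_bits) {
    const uint32_t mantissa = value & ((1u << mantissa_bits) - 1u);
    const uint32_t exponent = (value >> mantissa_bits) & 0x1Fu;
    if (exponent == 0) {
        // Zero or denormal: mantissa * 2^(1 - 15 - mantissa_bits)
        return std::ldexp(static_cast<float>(mantissa), -14 - static_cast<int>(mantissa_bits));
    }
    if (exponent == 31) {
        return mantissa == 0 ? std::numeric_limits<float>::infinity()
                             : std::numeric_limits<float>::quiet_NaN();
    }
    // Normal: (1 + mantissa / 2^mantissa_bits) * 2^(exponent - 15)
    return std::ldexp(1.0f + static_cast<float>(mantissa) / static_cast<float>(1u << mantissa_bits),
                      static_cast<int>(exponent) - 15);
}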
|
||||
|
||||
void EmitContext::DefineFunctions() {
|
||||
if (info.uses_pack_10_11_11) {
|
||||
f32_to_uf11 = DefineFloat32ToUfloatM5(6, "f32_to_uf11");
|
||||
f32_to_uf10 = DefineFloat32ToUfloatM5(5, "f32_to_uf10");
|
||||
}
|
||||
if (info.uses_unpack_10_11_11) {
|
||||
uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32");
|
||||
uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Shader::Backend::SPIRV
|
||||
|
|
|
@ -235,20 +235,9 @@ public:
|
|||
const VectorIds* data_types;
|
||||
Id pointer_type;
|
||||
};
|
||||
struct TextureBufferDefinition {
|
||||
Id id;
|
||||
Id coord_offset;
|
||||
Id coord_shift;
|
||||
u32 binding;
|
||||
Id image_type;
|
||||
Id result_type;
|
||||
bool is_integer = false;
|
||||
bool is_storage = false;
|
||||
};
|
||||
|
||||
Bindings& binding;
|
||||
boost::container::small_vector<BufferDefinition, 16> buffers;
|
||||
boost::container::small_vector<TextureBufferDefinition, 8> texture_buffers;
|
||||
BufferDefinition srt_flatbuf;
|
||||
boost::container::small_vector<TextureDefinition, 8> images;
|
||||
boost::container::small_vector<Id, 4> samplers;
|
||||
|
@ -271,6 +260,11 @@ public:
|
|||
std::array<SpirvAttribute, IR::NumParams> output_params{};
|
||||
std::array<SpirvAttribute, IR::NumRenderTargets> frag_outputs{};
|
||||
|
||||
Id uf11_to_f32{};
|
||||
Id f32_to_uf11{};
|
||||
Id uf10_to_f32{};
|
||||
Id f32_to_uf10{};
|
||||
|
||||
private:
|
||||
void DefineArithmeticTypes();
|
||||
void DefineInterfaces();
|
||||
|
@ -278,12 +272,15 @@ private:
|
|||
void DefineOutputs();
|
||||
void DefinePushDataBlock();
|
||||
void DefineBuffers();
|
||||
void DefineTextureBuffers();
|
||||
void DefineImagesAndSamplers();
|
||||
void DefineSharedMemory();
|
||||
void DefineFunctions();
|
||||
|
||||
SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id, u32 num_components,
|
||||
bool output);
|
||||
|
||||
Id DefineFloat32ToUfloatM5(u32 mantissa_bits, std::string_view name);
|
||||
Id DefineUfloatM5ToFloat32(u32 mantissa_bits, std::string_view name);
|
||||
};
|
||||
|
||||
} // namespace Shader::Backend::SPIRV
|
||||
|
|
|
@ -30,28 +30,25 @@ void Translator::ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR:
|
|||
static_cast<u32>(attribute) - static_cast<u32>(IR::Attribute::RenderTarget0);
|
||||
const auto color_buffer = runtime_info.fs_info.color_buffers[color_buffer_idx];
|
||||
|
||||
IR::Value unpacked_value;
|
||||
bool is_integer = false;
|
||||
AmdGpu::NumberFormat num_format;
|
||||
switch (color_buffer.export_format) {
|
||||
case AmdGpu::Liverpool::ShaderExportFormat::Zero:
|
||||
// No export
|
||||
return;
|
||||
case AmdGpu::Liverpool::ShaderExportFormat::ABGR_FP16:
|
||||
unpacked_value = ir.UnpackHalf2x16(value);
|
||||
num_format = AmdGpu::NumberFormat::Float;
|
||||
break;
|
||||
case AmdGpu::Liverpool::ShaderExportFormat::ABGR_UNORM16:
|
||||
unpacked_value = ir.UnpackUnorm2x16(value);
|
||||
num_format = AmdGpu::NumberFormat::Unorm;
|
||||
break;
|
||||
case AmdGpu::Liverpool::ShaderExportFormat::ABGR_SNORM16:
|
||||
unpacked_value = ir.UnpackSnorm2x16(value);
|
||||
num_format = AmdGpu::NumberFormat::Snorm;
|
||||
break;
|
||||
case AmdGpu::Liverpool::ShaderExportFormat::ABGR_UINT16:
|
||||
unpacked_value = ir.UnpackUint2x16(value);
|
||||
is_integer = true;
|
||||
num_format = AmdGpu::NumberFormat::Uint;
|
||||
break;
|
||||
case AmdGpu::Liverpool::ShaderExportFormat::ABGR_SINT16:
|
||||
unpacked_value = ir.UnpackSint2x16(value);
|
||||
is_integer = true;
|
||||
num_format = AmdGpu::NumberFormat::Sint;
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE_MSG("Unimplemented compressed MRT export format {}",
|
||||
|
@ -59,16 +56,15 @@ void Translator::ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR:
|
|||
break;
|
||||
}
|
||||
|
||||
const auto r = ir.CompositeExtract(unpacked_value, 0);
|
||||
const auto g = ir.CompositeExtract(unpacked_value, 1);
|
||||
const IR::F32 float_r = is_integer ? ir.BitCast<IR::F32>(IR::U32{r}) : IR::F32{r};
|
||||
const IR::F32 float_g = is_integer ? ir.BitCast<IR::F32>(IR::U32{g}) : IR::F32{g};
|
||||
const auto unpacked_value = ir.Unpack2x16(num_format, value);
|
||||
const IR::F32 r = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
|
||||
const IR::F32 g = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
|
||||
|
||||
const auto swizzled_r = SwizzleMrtComponent(color_buffer, idx * 2);
|
||||
const auto swizzled_g = SwizzleMrtComponent(color_buffer, idx * 2 + 1);
|
||||
|
||||
ExportMrtValue(attribute, swizzled_r, float_r, color_buffer);
|
||||
ExportMrtValue(attribute, swizzled_g, float_g, color_buffer);
|
||||
ExportMrtValue(attribute, swizzled_r, r, color_buffer);
|
||||
ExportMrtValue(attribute, swizzled_g, g, color_buffer);
|
||||
}
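After this change ExportMrtCompressed no longer special-cases integer export formats: the Unpack*2x16 implementations already return their lanes bitcast to F32, so the translator can treat every unpacked component as an opaque 32-bit value carried in a float. On the CPU the same bit-preserving reinterpretation is spelled with memcpy (or std::bit_cast), shown here as an illustrative analogue of the IR-level BitCast:

#include <cstdint>
#include <cstring>

// Carry raw 32-bit lane contents in a float without changing the bits.
float BitsToFloat(uint32_t bits) {
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

uint32_t FloatToBits(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return bits;
}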
|
||||
|
||||
void Translator::ExportMrtUncompressed(IR::Attribute attribute, u32 comp, const IR::F32& value) {
|
||||
|
@ -115,7 +111,7 @@ void Translator::ExportCompressed(IR::Attribute attribute, u32 idx, const IR::U3
|
|||
ExportMrtCompressed(attribute, idx, value);
|
||||
return;
|
||||
}
|
||||
const IR::Value unpacked_value = ir.UnpackHalf2x16(value);
|
||||
const IR::Value unpacked_value = ir.Unpack2x16(AmdGpu::NumberFormat::Float, value);
|
||||
const IR::F32 r = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
|
||||
const IR::F32 g = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
|
||||
ir.SetAttribute(attribute, r, idx * 2);
|
||||
|
|
|
@ -651,19 +651,19 @@ void Translator::V_LDEXP_F32(const GcnInst& inst) {
|
|||
void Translator::V_CVT_PKNORM_U16_F32(const GcnInst& inst) {
|
||||
const IR::Value vec_f32 =
|
||||
ir.CompositeConstruct(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1]));
|
||||
SetDst(inst.dst[0], ir.PackUnorm2x16(vec_f32));
|
||||
SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Unorm, vec_f32));
|
||||
}
|
||||
|
||||
void Translator::V_CVT_PKNORM_I16_F32(const GcnInst& inst) {
|
||||
const IR::Value vec_f32 =
|
||||
ir.CompositeConstruct(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1]));
|
||||
SetDst(inst.dst[0], ir.PackSnorm2x16(vec_f32));
|
||||
SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Snorm, vec_f32));
|
||||
}
|
||||
|
||||
void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) {
|
||||
const IR::Value vec_f32 =
|
||||
ir.CompositeConstruct(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1]));
|
||||
SetDst(inst.dst[0], ir.PackHalf2x16(vec_f32));
|
||||
SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Float, vec_f32));
|
||||
}
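V_CVT_PKNORM_U16_F32 and V_CVT_PKNORM_I16_F32 now route through the generic Pack2x16 helper with the Unorm/Snorm number formats, which the backend lowers to OpPackUnorm2x16/OpPackSnorm2x16. Their per-lane behaviour written out on the CPU for reference (clamping and scale factors follow the GLSL pack*2x16 definitions; illustration only):

#include <algorithm>
#include <cmath>
#include <cstdint>

// packUnorm2x16 lane: round(clamp(x, 0, 1) * 65535)
uint16_t PackUnorm16(float x) {
    return static_cast<uint16_t>(std::lround(std::clamp(x, 0.0f, 1.0f) * 65535.0f));
}

// packSnorm2x16 lane: round(clamp(x, -1, 1) * 32767)
int16_t PackSnorm16(float x) {
    return static_cast<int16_t>(std::lround(std::clamp(x, -1.0f, 1.0f) * 32767.0f));
}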
|
||||
|
||||
// VOP1
|
||||
|
@ -1245,14 +1245,16 @@ void Translator::V_SAD_U32(const GcnInst& inst) {
|
|||
|
||||
void Translator::V_CVT_PK_U16_U32(const GcnInst& inst) {
|
||||
const IR::Value vec_u32 =
|
||||
ir.CompositeConstruct(GetSrc<IR::U32>(inst.src[0]), GetSrc<IR::U32>(inst.src[1]));
|
||||
SetDst(inst.dst[0], ir.PackUint2x16(vec_u32));
|
||||
ir.CompositeConstruct(ir.BitCast<IR::F32>(GetSrc<IR::U32>(inst.src[0])),
|
||||
ir.BitCast<IR::F32>(GetSrc<IR::U32>(inst.src[1])));
|
||||
SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Uint, vec_u32));
|
||||
}
|
||||
|
||||
void Translator::V_CVT_PK_I16_I32(const GcnInst& inst) {
|
||||
const IR::Value vec_u32 =
|
||||
ir.CompositeConstruct(GetSrc<IR::U32>(inst.src[0]), GetSrc<IR::U32>(inst.src[1]));
|
||||
SetDst(inst.dst[0], ir.PackSint2x16(vec_u32));
|
||||
ir.CompositeConstruct(ir.BitCast<IR::F32>(GetSrc<IR::U32>(inst.src[0])),
|
||||
ir.BitCast<IR::F32>(GetSrc<IR::U32>(inst.src[1])));
|
||||
SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Sint, vec_u32));
|
||||
}
|
||||
|
||||
void Translator::V_CVT_PK_U8_F32(const GcnInst& inst) {
|
||||
|
|
|
@ -208,7 +208,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
|
|||
const IR::Value handle =
|
||||
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
|
||||
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
|
||||
const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, buffer_info);
|
||||
const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
|
||||
const IR::VectorReg dst_reg{inst.src[1].code};
|
||||
if (num_dwords == 1) {
|
||||
ir.SetVectorReg(dst_reg, IR::U32{value});
|
||||
|
@ -314,16 +314,18 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
|
|||
const IR::Value handle =
|
||||
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
|
||||
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
|
||||
ir.StoreBuffer(num_dwords, handle, address, value, buffer_info);
|
||||
ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
|
||||
}
|
||||
|
||||
void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
|
||||
const auto& mubuf = inst.control.mubuf;
|
||||
const IR::VectorReg vaddr{inst.src[0].code};
|
||||
const IR::ScalarReg sharp{inst.src[2].code * 4};
|
||||
ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
|
||||
const IR::Value address = [&] -> IR::Value {
|
||||
if (mubuf.idxen) {
|
||||
if (mubuf.idxen && mubuf.offen) {
|
||||
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
|
||||
}
|
||||
if (mubuf.idxen || mubuf.offen) {
|
||||
return ir.GetVectorReg(vaddr);
|
||||
}
|
||||
return {};
|
||||
|
|
|
@ -48,6 +48,7 @@ struct BufferResource {
|
|||
bool is_instance_data{};
|
||||
u8 instance_attrib{};
|
||||
bool is_written{};
|
||||
bool is_formatted{};
|
||||
|
||||
[[nodiscard]] bool IsStorage(const AmdGpu::Buffer& buffer) const noexcept {
|
||||
return buffer.GetSize() > MaxUboSize || is_written || is_gds_buffer;
|
||||
|
@ -57,14 +58,6 @@ struct BufferResource {
|
|||
};
|
||||
using BufferResourceList = boost::container::small_vector<BufferResource, 16>;
|
||||
|
||||
struct TextureBufferResource {
|
||||
u32 sharp_idx;
|
||||
bool is_written{};
|
||||
|
||||
[[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept;
|
||||
};
|
||||
using TextureBufferResourceList = boost::container::small_vector<TextureBufferResource, 16>;
|
||||
|
||||
struct ImageResource {
|
||||
u32 sharp_idx;
|
||||
bool is_depth{};
|
||||
|
@ -114,11 +107,6 @@ struct PushData {
|
|||
ASSERT(offset < 256 && binding < buf_offsets.size());
|
||||
buf_offsets[binding] = offset;
|
||||
}
|
||||
|
||||
void AddTexelOffset(u32 binding, u32 multiplier, u32 texel_offset) {
|
||||
ASSERT(texel_offset < 64 && multiplier < 16);
|
||||
buf_offsets[binding] = texel_offset | ((std::bit_width(multiplier) - 1) << 6);
|
||||
}
|
||||
};
|
||||
static_assert(sizeof(PushData) <= 128,
|
||||
"PushData size is greater than minimum size guaranteed by Vulkan spec");
|
||||
|
@ -175,7 +163,6 @@ struct Info {
|
|||
u32 uses_patches{};
|
||||
|
||||
BufferResourceList buffers;
|
||||
TextureBufferResourceList texture_buffers;
|
||||
ImageResourceList images;
|
||||
SamplerResourceList samplers;
|
||||
FMaskResourceList fmasks;
|
||||
|
@ -193,8 +180,6 @@ struct Info {
|
|||
u64 pgm_hash{};
|
||||
VAddr pgm_base;
|
||||
bool has_storage_images{};
|
||||
bool has_image_buffers{};
|
||||
bool has_texel_buffers{};
|
||||
bool has_discard{};
|
||||
bool has_image_gather{};
|
||||
bool has_image_query{};
|
||||
|
@ -204,6 +189,8 @@ struct Info {
|
|||
bool uses_shared{};
|
||||
bool uses_fp16{};
|
||||
bool uses_fp64{};
|
||||
bool uses_pack_10_11_11{};
|
||||
bool uses_unpack_10_11_11{};
|
||||
bool stores_tess_level_outer{};
|
||||
bool stores_tess_level_inner{};
|
||||
bool translation_failed{}; // indicates that shader has unsupported instructions
|
||||
|
@ -246,8 +233,7 @@ struct Info {
|
|||
}
|
||||
|
||||
void AddBindings(Backend::Bindings& bnd) const {
|
||||
const auto total_buffers =
|
||||
buffers.size() + texture_buffers.size() + (has_readconst ? 1 : 0);
|
||||
const auto total_buffers = buffers.size() + (has_readconst ? 1 : 0);
|
||||
bnd.buffer += total_buffers;
|
||||
bnd.unified += total_buffers + images.size() + samplers.size();
|
||||
bnd.user_data += ud_mask.NumRegs();
|
||||
|
@ -278,10 +264,6 @@ constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexce
|
|||
return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
|
||||
}
|
||||
|
||||
constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept {
|
||||
return info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
|
||||
}
|
||||
|
||||
constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept {
|
||||
const auto image = info.ReadUdSharp<AmdGpu::Image>(sharp_idx);
|
||||
if (!image.Valid()) {
|
||||
|
|
|
@ -370,8 +370,16 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
|
|||
return Inst<U32>(Opcode::ReadConstBuffer, handle, index);
|
||||
}
|
||||
|
||||
Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info) {
|
||||
U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
|
||||
return Inst<U32>(Opcode::LoadBufferU8, Flags{info}, handle, address);
|
||||
}
|
||||
|
||||
U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) {
|
||||
return Inst<U32>(Opcode::LoadBufferU16, Flags{info}, handle, address);
|
||||
}
|
||||
|
||||
Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info) {
|
||||
switch (num_dwords) {
|
||||
case 1:
|
||||
return Inst(Opcode::LoadBufferU32, Flags{info}, handle, address);
|
||||
|
@ -386,12 +394,38 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad
|
|||
}
|
||||
}
|
||||
|
||||
Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info) {
|
||||
switch (num_dwords) {
|
||||
case 1:
|
||||
return Inst(Opcode::LoadBufferF32, Flags{info}, handle, address);
|
||||
case 2:
|
||||
return Inst(Opcode::LoadBufferF32x2, Flags{info}, handle, address);
|
||||
case 3:
|
||||
return Inst(Opcode::LoadBufferF32x3, Flags{info}, handle, address);
|
||||
case 4:
|
||||
return Inst(Opcode::LoadBufferF32x4, Flags{info}, handle, address);
|
||||
default:
|
||||
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
|
||||
}
|
||||
}
|
||||
|
||||
Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info) {
|
||||
return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
|
||||
}
|
||||
|
||||
void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address,
|
||||
const Value& data, BufferInstInfo info) {
|
||||
void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data,
|
||||
BufferInstInfo info) {
|
||||
Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data);
|
||||
}
|
||||
|
||||
void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data,
|
||||
BufferInstInfo info) {
|
||||
Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data);
|
||||
}
|
||||
|
||||
void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value& address,
|
||||
const Value& data, BufferInstInfo info) {
|
||||
switch (num_dwords) {
|
||||
case 1:
|
||||
Inst(Opcode::StoreBufferU32, Flags{info}, handle, address, data);
|
||||
|
@ -410,6 +444,31 @@ void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& ad
|
|||
}
|
||||
}
|
||||
|
||||
void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address,
|
||||
const Value& data, BufferInstInfo info) {
|
||||
switch (num_dwords) {
|
||||
case 1:
|
||||
Inst(Opcode::StoreBufferF32, Flags{info}, handle, address, data);
|
||||
break;
|
||||
case 2:
|
||||
Inst(Opcode::StoreBufferF32x2, Flags{info}, handle, address, data);
|
||||
break;
|
||||
case 3:
|
||||
Inst(Opcode::StoreBufferF32x3, Flags{info}, handle, address, data);
|
||||
break;
|
||||
case 4:
|
||||
Inst(Opcode::StoreBufferF32x4, Flags{info}, handle, address, data);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
|
||||
}
|
||||
}
|
||||
|
||||
void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
|
||||
BufferInstInfo info) {
|
||||
Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
|
||||
}
|
||||
|
||||
Value IREmitter::BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value,
|
||||
BufferInstInfo info) {
|
||||
return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value);
|
||||
|
@ -457,11 +516,6 @@ Value IREmitter::BufferAtomicSwap(const Value& handle, const Value& address, con
|
|||
return Inst(Opcode::BufferAtomicSwap32, Flags{info}, handle, address, value);
|
||||
}
|
||||
|
||||
void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
|
||||
BufferInstInfo info) {
|
||||
Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
|
||||
}
|
||||
|
||||
U32 IREmitter::DataAppend(const U32& counter) {
|
||||
return Inst<U32>(Opcode::DataAppend, counter, Imm32(0));
|
||||
}
|
||||
|
@ -527,10 +581,14 @@ Value IREmitter::CompositeConstruct(const Value& e1, const Value& e2) {
|
|||
switch (e1.Type()) {
|
||||
case Type::U32:
|
||||
return Inst(Opcode::CompositeConstructU32x2, e1, e2);
|
||||
case Type::U32x2:
|
||||
return Inst(Opcode::CompositeConstructU32x2x2, e1, e2);
|
||||
case Type::F16:
|
||||
return Inst(Opcode::CompositeConstructF16x2, e1, e2);
|
||||
case Type::F32:
|
||||
return Inst(Opcode::CompositeConstructF32x2, e1, e2);
|
||||
case Type::F32x2:
|
||||
return Inst(Opcode::CompositeConstructF32x2x2, e1, e2);
|
||||
case Type::F64:
|
||||
return Inst(Opcode::CompositeConstructF64x2, e1, e2);
|
||||
default:
|
||||
|
@ -779,52 +837,116 @@ F64 IREmitter::PackFloat2x32(const Value& vector) {
|
|||
return Inst<F64>(Opcode::PackFloat2x32, vector);
|
||||
}
|
||||
|
||||
U32 IREmitter::PackFloat2x16(const Value& vector) {
|
||||
return Inst<U32>(Opcode::PackFloat2x16, vector);
|
||||
U32 IREmitter::Pack2x16(const AmdGpu::NumberFormat number_format, const Value& vector) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
return Inst<U32>(Opcode::PackUnorm2x16, vector);
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
return Inst<U32>(Opcode::PackSnorm2x16, vector);
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
return Inst<U32>(Opcode::PackUint2x16, vector);
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
return Inst<U32>(Opcode::PackSint2x16, vector);
|
||||
case AmdGpu::NumberFormat::Float:
|
||||
return Inst<U32>(Opcode::PackHalf2x16, vector);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 2x16 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
Value IREmitter::UnpackFloat2x16(const U32& value) {
|
||||
return Inst(Opcode::UnpackFloat2x16, value);
|
||||
Value IREmitter::Unpack2x16(const AmdGpu::NumberFormat number_format, const U32& value) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
return Inst(Opcode::UnpackUnorm2x16, value);
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
return Inst(Opcode::UnpackSnorm2x16, value);
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
return Inst(Opcode::UnpackUint2x16, value);
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
return Inst(Opcode::UnpackSint2x16, value);
|
||||
case AmdGpu::NumberFormat::Float:
|
||||
return Inst(Opcode::UnpackHalf2x16, value);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 2x16 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
U32 IREmitter::PackHalf2x16(const Value& vector) {
|
||||
return Inst<U32>(Opcode::PackHalf2x16, vector);
|
||||
U32 IREmitter::Pack4x8(const AmdGpu::NumberFormat number_format, const Value& vector) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
return Inst<U32>(Opcode::PackUnorm4x8, vector);
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
return Inst<U32>(Opcode::PackSnorm4x8, vector);
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
return Inst<U32>(Opcode::PackUint4x8, vector);
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
return Inst<U32>(Opcode::PackSint4x8, vector);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 4x8 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
Value IREmitter::UnpackHalf2x16(const U32& value) {
|
||||
return Inst(Opcode::UnpackHalf2x16, value);
|
||||
Value IREmitter::Unpack4x8(const AmdGpu::NumberFormat number_format, const U32& value) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
return Inst(Opcode::UnpackUnorm4x8, value);
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
return Inst(Opcode::UnpackSnorm4x8, value);
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
return Inst(Opcode::UnpackUint4x8, value);
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
return Inst(Opcode::UnpackSint4x8, value);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 4x8 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
U32 IREmitter::PackUnorm2x16(const Value& vector) {
|
||||
return Inst<U32>(Opcode::PackUnorm2x16, vector);
|
||||
U32 IREmitter::Pack10_11_11(const AmdGpu::NumberFormat number_format, const Value& vector) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Float:
|
||||
return Inst<U32>(Opcode::PackUfloat10_11_11, vector);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 10_11_11 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
Value IREmitter::UnpackUnorm2x16(const U32& value) {
|
||||
return Inst(Opcode::UnpackUnorm2x16, value);
|
||||
U32 IREmitter::Pack2_10_10_10(const AmdGpu::NumberFormat number_format, const Value& vector) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
return Inst<U32>(Opcode::PackUnorm2_10_10_10, vector);
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
return Inst<U32>(Opcode::PackSnorm2_10_10_10, vector);
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
return Inst<U32>(Opcode::PackUint2_10_10_10, vector);
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
return Inst<U32>(Opcode::PackSint2_10_10_10, vector);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 2_10_10_10 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
U32 IREmitter::PackSnorm2x16(const Value& vector) {
|
||||
return Inst<U32>(Opcode::PackSnorm2x16, vector);
|
||||
Value IREmitter::Unpack2_10_10_10(const AmdGpu::NumberFormat number_format, const U32& value) {
|
||||
switch (number_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
return Inst(Opcode::UnpackUnorm2_10_10_10, value);
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
return Inst(Opcode::UnpackSnorm2_10_10_10, value);
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
return Inst(Opcode::UnpackUint2_10_10_10, value);
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
return Inst(Opcode::UnpackSint2_10_10_10, value);
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported 2_10_10_10 number format: {}", number_format);
|
||||
}
|
||||
}
|
||||
|
||||
Value IREmitter::UnpackSnorm2x16(const U32& value) {
|
||||
return Inst(Opcode::UnpackSnorm2x16, value);
|
||||
}
|
||||
|
||||
U32 IREmitter::PackUint2x16(const Value& value) {
|
||||
return Inst<U32>(Opcode::PackUint2x16, value);
|
||||
}
|
||||
|
||||
Value IREmitter::UnpackUint2x16(const U32& value) {
|
||||
return Inst(Opcode::UnpackUint2x16, value);
|
||||
}
|
||||
|
||||
U32 IREmitter::PackSint2x16(const Value& value) {
|
||||
return Inst<U32>(Opcode::PackSint2x16, value);
|
||||
}
|
||||
|
||||
Value IREmitter::UnpackSint2x16(const U32& value) {
    return Inst(Opcode::UnpackSint2x16, value);

Value IREmitter::Unpack10_11_11(const AmdGpu::NumberFormat number_format, const U32& value) {
    switch (number_format) {
    case AmdGpu::NumberFormat::Float:
        return Inst(Opcode::UnpackUfloat10_11_11, value);
    default:
        UNREACHABLE_MSG("Unsupported 10_11_11 number format: {}", number_format);
    }
}
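The Pack2x16/Unpack2x16, Pack4x8/Unpack4x8 and related helpers above pick a concrete pack opcode from the buffer's AmdGpu::NumberFormat instead of relying on a texel-buffer view to do the conversion. As a point of reference, a minimal CPU-side sketch of the standard unorm16/snorm16 rules those opcodes correspond to (helper names here are illustrative, not part of the emitter API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative only: pack two floats into one dword using unorm16 rules.
    inline uint32_t PackUnorm2x16(float x, float y) {
        const auto to_unorm16 = [](float v) -> uint32_t {
            return static_cast<uint32_t>(std::lround(std::clamp(v, 0.0f, 1.0f) * 65535.0f));
        };
        return to_unorm16(x) | (to_unorm16(y) << 16);
    }

    // Illustrative only: snorm16 keeps the low 16 bits of the two's-complement result.
    inline uint32_t PackSnorm2x16(float x, float y) {
        const auto to_snorm16 = [](float v) -> uint32_t {
            const int32_t s =
                static_cast<int32_t>(std::lround(std::clamp(v, -1.0f, 1.0f) * 32767.0f));
            return static_cast<uint32_t>(s) & 0xFFFFu;
        };
        return to_snorm16(x) | (to_snorm16(y) << 16);
    }

    // Illustrative only: the unpack direction of a single unorm16 component.
    inline float UnpackUnorm16(uint32_t bits) {
        return static_cast<float>(bits & 0xFFFFu) / 65535.0f;
    }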
|
||||
|
||||
F32F64 IREmitter::FPMul(const F32F64& a, const F32F64& b) {
|
||||
|
|
|
@ -109,12 +109,22 @@ public:
|
|||
[[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
|
||||
[[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
|
||||
|
||||
[[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info);
|
||||
[[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
|
||||
[[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
|
||||
[[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info);
|
||||
[[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info);
|
||||
[[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address,
|
||||
BufferInstInfo info);
|
||||
void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data,
|
||||
BufferInstInfo info);
|
||||
void StoreBufferU8(const Value& handle, const Value& address, const U32& data,
|
||||
BufferInstInfo info);
|
||||
void StoreBufferU16(const Value& handle, const Value& address, const U32& data,
|
||||
BufferInstInfo info);
|
||||
void StoreBufferU32(int num_dwords, const Value& handle, const Value& address,
|
||||
const Value& data, BufferInstInfo info);
|
||||
void StoreBufferF32(int num_dwords, const Value& handle, const Value& address,
|
||||
const Value& data, BufferInstInfo info);
|
||||
void StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
|
||||
BufferInstInfo info);
|
||||
|
||||
|
@ -167,22 +177,19 @@ public:
|
|||
|
||||
[[nodiscard]] U64 PackUint2x32(const Value& vector);
|
||||
[[nodiscard]] Value UnpackUint2x32(const U64& value);
|
||||
|
||||
[[nodiscard]] F64 PackFloat2x32(const Value& vector);
|
||||
|
||||
[[nodiscard]] U32 PackFloat2x16(const Value& vector);
|
||||
[[nodiscard]] Value UnpackFloat2x16(const U32& value);
|
||||
[[nodiscard]] U32 Pack2x16(AmdGpu::NumberFormat number_format, const Value& vector);
|
||||
[[nodiscard]] Value Unpack2x16(AmdGpu::NumberFormat number_format, const U32& value);
|
||||
|
||||
[[nodiscard]] U32 PackHalf2x16(const Value& vector);
|
||||
[[nodiscard]] Value UnpackHalf2x16(const U32& value);
|
||||
[[nodiscard]] U32 PackUnorm2x16(const Value& vector);
|
||||
[[nodiscard]] Value UnpackUnorm2x16(const U32& value);
|
||||
[[nodiscard]] U32 PackSnorm2x16(const Value& vector);
|
||||
[[nodiscard]] Value UnpackSnorm2x16(const U32& value);
|
||||
[[nodiscard]] U32 PackUint2x16(const Value& value);
|
||||
[[nodiscard]] Value UnpackUint2x16(const U32& value);
|
||||
[[nodiscard]] U32 PackSint2x16(const Value& value);
|
||||
[[nodiscard]] Value UnpackSint2x16(const U32& value);
|
||||
[[nodiscard]] U32 Pack4x8(AmdGpu::NumberFormat number_format, const Value& vector);
|
||||
[[nodiscard]] Value Unpack4x8(AmdGpu::NumberFormat number_format, const U32& value);
|
||||
|
||||
[[nodiscard]] U32 Pack10_11_11(AmdGpu::NumberFormat number_format, const Value& vector);
|
||||
[[nodiscard]] Value Unpack10_11_11(AmdGpu::NumberFormat number_format, const U32& value);
|
||||
|
||||
[[nodiscard]] U32 Pack2_10_10_10(AmdGpu::NumberFormat number_format, const Value& vector);
|
||||
[[nodiscard]] Value Unpack2_10_10_10(AmdGpu::NumberFormat number_format, const U32& value);
|
||||
|
||||
[[nodiscard]] F32F64 FPAdd(const F32F64& a, const F32F64& b);
|
||||
[[nodiscard]] F32F64 FPSub(const F32F64& a, const F32F64& b);
|
||||
|
|
|
@ -54,10 +54,16 @@ bool Inst::MayHaveSideEffects() const noexcept {
|
|||
case Opcode::SetAttribute:
|
||||
case Opcode::SetTcsGenericAttribute:
|
||||
case Opcode::SetPatch:
|
||||
case Opcode::StoreBufferU8:
|
||||
case Opcode::StoreBufferU16:
|
||||
case Opcode::StoreBufferU32:
|
||||
case Opcode::StoreBufferU32x2:
|
||||
case Opcode::StoreBufferU32x3:
|
||||
case Opcode::StoreBufferU32x4:
|
||||
case Opcode::StoreBufferF32:
|
||||
case Opcode::StoreBufferF32x2:
|
||||
case Opcode::StoreBufferF32x3:
|
||||
case Opcode::StoreBufferF32x4:
|
||||
case Opcode::StoreBufferFormatF32:
|
||||
case Opcode::BufferAtomicIAdd32:
|
||||
case Opcode::BufferAtomicSMin32:
|
||||
|
|
|
@ -90,15 +90,27 @@ OPCODE(UndefU32, U32,
|
|||
OPCODE(UndefU64, U64, )
|
||||
|
||||
// Buffer operations
|
||||
OPCODE(LoadBufferU8, U32, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferU16, U32, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferU32, U32, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferF32, F32, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, )
|
||||
OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, )
|
||||
OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, )
|
||||
OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, )
|
||||
OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, )
|
||||
OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, )
|
||||
OPCODE(StoreBufferU32x4, Void, Opaque, Opaque, U32x4, )
|
||||
OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, )
|
||||
OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, )
|
||||
OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, )
|
||||
OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, )
|
||||
OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32x4, )
|
||||
|
||||
// Buffer atomic operations
|
||||
|
@ -118,6 +130,7 @@ OPCODE(BufferAtomicSwap32, U32, Opaq
|
|||
OPCODE(CompositeConstructU32x2, U32x2, U32, U32, )
|
||||
OPCODE(CompositeConstructU32x3, U32x3, U32, U32, U32, )
|
||||
OPCODE(CompositeConstructU32x4, U32x4, U32, U32, U32, U32, )
|
||||
OPCODE(CompositeConstructU32x2x2, U32x4, U32x2, U32x2, )
|
||||
OPCODE(CompositeExtractU32x2, U32, U32x2, U32, )
|
||||
OPCODE(CompositeExtractU32x3, U32, U32x3, U32, )
|
||||
OPCODE(CompositeExtractU32x4, U32, U32x4, U32, )
|
||||
|
@ -142,6 +155,7 @@ OPCODE(CompositeShuffleF16x4, F16x4, F16x
|
|||
OPCODE(CompositeConstructF32x2, F32x2, F32, F32, )
|
||||
OPCODE(CompositeConstructF32x3, F32x3, F32, F32, F32, )
|
||||
OPCODE(CompositeConstructF32x4, F32x4, F32, F32, F32, F32, )
|
||||
OPCODE(CompositeConstructF32x2x2, F32x4, F32x2, F32x2, )
|
||||
OPCODE(CompositeExtractF32x2, F32, F32x2, U32, )
|
||||
OPCODE(CompositeExtractF32x3, F32, F32x3, U32, )
|
||||
OPCODE(CompositeExtractF32x4, F32, F32x4, U32, )
|
||||
|
@ -180,21 +194,42 @@ OPCODE(BitCastU64F64, U64, F64,
|
|||
OPCODE(BitCastF16U16, F16, U16, )
|
||||
OPCODE(BitCastF32U32, F32, U32, )
|
||||
OPCODE(BitCastF64U64, F64, U64, )
|
||||
|
||||
OPCODE(PackUint2x32, U64, U32x2, )
|
||||
OPCODE(UnpackUint2x32, U32x2, U64, )
|
||||
OPCODE(PackFloat2x32, F64, F32x2, )
|
||||
OPCODE(PackFloat2x16, U32, F16x2, )
|
||||
OPCODE(UnpackFloat2x16, F16x2, U32, )
|
||||
OPCODE(PackHalf2x16, U32, F32x2, )
|
||||
OPCODE(UnpackHalf2x16, F32x2, U32, )
|
||||
|
||||
OPCODE(PackUnorm2x16, U32, F32x2, )
|
||||
OPCODE(UnpackUnorm2x16, F32x2, U32, )
|
||||
OPCODE(PackSnorm2x16, U32, F32x2, )
|
||||
OPCODE(UnpackSnorm2x16, F32x2, U32, )
|
||||
OPCODE(PackUint2x16, U32, U32x2, )
|
||||
OPCODE(UnpackUint2x16, U32x2, U32, )
|
||||
OPCODE(PackSint2x16, U32, U32x2, )
|
||||
OPCODE(UnpackSint2x16, U32x2, U32, )
|
||||
OPCODE(PackUint2x16, U32, F32x2, )
|
||||
OPCODE(UnpackUint2x16, F32x2, U32, )
|
||||
OPCODE(PackSint2x16, U32, F32x2, )
|
||||
OPCODE(UnpackSint2x16, F32x2, U32, )
|
||||
OPCODE(PackHalf2x16, U32, F32x2, )
|
||||
OPCODE(UnpackHalf2x16, F32x2, U32, )
|
||||
|
||||
OPCODE(PackUnorm4x8, U32, F32x4, )
OPCODE(UnpackUnorm4x8, F32x4, U32, )
OPCODE(PackSnorm4x8, U32, F32x4, )
OPCODE(UnpackSnorm4x8, F32x4, U32, )
OPCODE(PackUint4x8, U32, F32x4, )
OPCODE(UnpackUint4x8, F32x4, U32, )
OPCODE(PackSint4x8, U32, F32x4, )
OPCODE(UnpackSint4x8, F32x4, U32, )

OPCODE(PackUfloat10_11_11, U32, F32x3, )
OPCODE(UnpackUfloat10_11_11, F32x3, U32, )
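The PackUfloat10_11_11/UnpackUfloat10_11_11 opcodes model the packed unsigned-float components of the 10_11_11 data format: the two 11-bit channels use a 5-bit exponent and 6-bit mantissa, the 10-bit channel a 5-bit exponent and 5-bit mantissa, and there is no sign bit. A rough, hedged sketch of a truncating float32-to-11-bit conversion, ignoring NaN/Inf, denormals and rounding; this is not the recompiler's actual code:

    #include <bit>
    #include <cstdint>

    inline uint32_t ToUfloat11(float value) {
        if (value <= 0.0f) {
            return 0; // no sign bit: negative inputs clamp to zero
        }
        const uint32_t bits = std::bit_cast<uint32_t>(value);
        const int32_t exp = static_cast<int32_t>((bits >> 23) & 0xFF) - 127 + 15; // re-bias
        if (exp <= 0) {
            return 0; // too small for this sketch: flush to zero
        }
        if (exp >= 31) {
            return 0x7C0; // clamp to the infinity encoding (exponent 31, mantissa 0)
        }
        const uint32_t mantissa = (bits >> (23 - 6)) & 0x3F; // keep the top 6 mantissa bits
        return (static_cast<uint32_t>(exp) << 6) | mantissa;
    }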
|
||||
|
||||
OPCODE(PackUnorm2_10_10_10, U32, F32x4, )
|
||||
OPCODE(UnpackUnorm2_10_10_10, F32x4, U32, )
|
||||
OPCODE(PackSnorm2_10_10_10, U32, F32x4, )
|
||||
OPCODE(UnpackSnorm2_10_10_10, F32x4, U32, )
|
||||
OPCODE(PackUint2_10_10_10, U32, F32x4, )
|
||||
OPCODE(UnpackUint2_10_10_10, F32x4, U32, )
|
||||
OPCODE(PackSint2_10_10_10, U32, F32x4, )
|
||||
OPCODE(UnpackSint2_10_10_10, F32x4, U32, )
|
||||
|
||||
// Floating-point operations
|
||||
OPCODE(FPAbs32, F32, F32, )
|
||||
|
|
|
@ -340,14 +340,7 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
|
|||
return FoldBitCast<IR::Opcode::BitCastF32U32, f32, u32>(inst, IR::Opcode::BitCastU32F32);
|
||||
case IR::Opcode::BitCastU32F32:
|
||||
return FoldBitCast<IR::Opcode::BitCastU32F32, u32, f32>(inst, IR::Opcode::BitCastF32U32);
|
||||
case IR::Opcode::PackHalf2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16);
|
||||
case IR::Opcode::UnpackHalf2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16);
|
||||
case IR::Opcode::PackFloat2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackFloat2x16);
|
||||
case IR::Opcode::UnpackFloat2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackFloat2x16);
|
||||
// 2x16
|
||||
case IR::Opcode::PackUnorm2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackUnorm2x16);
|
||||
case IR::Opcode::UnpackUnorm2x16:
|
||||
|
@ -364,6 +357,49 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
|
|||
return FoldInverseFunc(inst, IR::Opcode::UnpackSint2x16);
|
||||
case IR::Opcode::UnpackSint2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackSint2x16);
|
||||
case IR::Opcode::PackHalf2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16);
|
||||
case IR::Opcode::UnpackHalf2x16:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16);
|
||||
// 4x8
|
||||
case IR::Opcode::PackUnorm4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackUnorm4x8);
|
||||
case IR::Opcode::UnpackUnorm4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackUnorm4x8);
|
||||
case IR::Opcode::PackSnorm4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackSnorm4x8);
|
||||
case IR::Opcode::UnpackSnorm4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackSnorm4x8);
|
||||
case IR::Opcode::PackUint4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackUint4x8);
|
||||
case IR::Opcode::UnpackUint4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackUint4x8);
|
||||
case IR::Opcode::PackSint4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackSint4x8);
|
||||
case IR::Opcode::UnpackSint4x8:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackSint4x8);
|
||||
// 10_11_11
|
||||
case IR::Opcode::PackUfloat10_11_11:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackUfloat10_11_11);
|
||||
case IR::Opcode::UnpackUfloat10_11_11:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackUfloat10_11_11);
|
||||
// 2_10_10_10
|
||||
case IR::Opcode::PackUnorm2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackUnorm2_10_10_10);
|
||||
case IR::Opcode::UnpackUnorm2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackUnorm2_10_10_10);
|
||||
case IR::Opcode::PackSnorm2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackSnorm2_10_10_10);
|
||||
case IR::Opcode::UnpackSnorm2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackSnorm2_10_10_10);
|
||||
case IR::Opcode::PackUint2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackUint2_10_10_10);
|
||||
case IR::Opcode::UnpackUint2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackUint2_10_10_10);
|
||||
case IR::Opcode::PackSint2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::UnpackSint2_10_10_10);
|
||||
case IR::Opcode::UnpackSint2_10_10_10:
|
||||
return FoldInverseFunc(inst, IR::Opcode::PackSint2_10_10_10);
|
||||
case IR::Opcode::SelectU1:
|
||||
case IR::Opcode::SelectU8:
|
||||
case IR::Opcode::SelectU16:
|
||||
|
|
|
@ -19,6 +19,7 @@ void ConstantPropagationPass(IR::BlockList& program);
|
|||
void FlattenExtendedUserdataPass(IR::Program& program);
|
||||
void ResourceTrackingPass(IR::Program& program);
|
||||
void CollectShaderInfoPass(IR::Program& program);
|
||||
void LowerBufferFormatToRaw(IR::Program& program);
|
||||
void LowerSharedMemToRegisters(IR::Program& program);
|
||||
void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
|
||||
Stage stage);
|
||||
|
|
211  src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp  Normal file

@ -0,0 +1,211 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#include "shader_recompiler/info.h"
|
||||
#include "shader_recompiler/ir/basic_block.h"
|
||||
#include "shader_recompiler/ir/ir_emitter.h"
|
||||
#include "shader_recompiler/ir/program.h"
|
||||
#include "shader_recompiler/ir/reinterpret.h"
|
||||
#include "video_core/amdgpu/resource.h"
|
||||
|
||||
namespace Shader::Optimization {
|
||||
|
||||
static bool IsBufferFormatLoad(const IR::Inst& inst) {
|
||||
return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32;
|
||||
}
|
||||
|
||||
static bool IsBufferFormatStore(const IR::Inst& inst) {
|
||||
return inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
|
||||
}
|
||||
|
||||
static IR::Value LoadBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
|
||||
const IR::Value handle, const IR::U32 address,
|
||||
const IR::BufferInstInfo info) {
|
||||
const auto data_fmt = buffer.GetDataFmt();
|
||||
const auto num_fmt = buffer.GetNumberFmt();
|
||||
const auto num_conv = buffer.GetNumberConversion();
|
||||
const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
|
||||
|
||||
IR::Value interpreted;
|
||||
switch (data_fmt) {
|
||||
case AmdGpu::DataFormat::FormatInvalid:
|
||||
interpreted = ir.Imm32(0.f);
|
||||
break;
|
||||
case AmdGpu::DataFormat::Format8: {
|
||||
const auto unpacked = ir.Unpack4x8(num_fmt, ir.LoadBufferU8(handle, address, info));
|
||||
interpreted = ir.CompositeExtract(unpacked, 0);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format8_8: {
|
||||
const auto raw = ir.LoadBufferU16(handle, address, info);
|
||||
const auto unpacked = ir.Unpack4x8(num_fmt, raw);
|
||||
interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0),
|
||||
ir.CompositeExtract(unpacked, 1));
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format8_8_8_8:
|
||||
interpreted = ir.Unpack4x8(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
|
||||
break;
|
||||
case AmdGpu::DataFormat::Format16: {
|
||||
const auto unpacked = ir.Unpack2x16(num_fmt, ir.LoadBufferU16(handle, address, info));
|
||||
interpreted = ir.CompositeExtract(unpacked, 0);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format16_16:
|
||||
interpreted = ir.Unpack2x16(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
|
||||
break;
|
||||
case AmdGpu::DataFormat::Format10_11_11:
|
||||
interpreted =
|
||||
ir.Unpack10_11_11(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
|
||||
break;
|
||||
case AmdGpu::DataFormat::Format2_10_10_10:
|
||||
interpreted =
|
||||
ir.Unpack2_10_10_10(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)});
|
||||
break;
|
||||
case AmdGpu::DataFormat::Format16_16_16_16: {
|
||||
const auto raw = ir.LoadBufferU32(2, handle, address, info);
|
||||
interpreted =
|
||||
ir.CompositeConstruct(ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 0)}),
|
||||
ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 1)}));
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format32:
|
||||
case AmdGpu::DataFormat::Format32_32:
|
||||
case AmdGpu::DataFormat::Format32_32_32:
|
||||
case AmdGpu::DataFormat::Format32_32_32_32: {
|
||||
ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint ||
|
||||
num_fmt == AmdGpu::NumberFormat::Float);
|
||||
interpreted = ir.LoadBufferF32(num_components, handle, address, info);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt);
|
||||
}
|
||||
|
||||
    // Pad to 4 components and apply additional modifications.
    boost::container::static_vector<IR::Value, 4> components;
    for (u32 i = 0; i < 4; i++) {
        if (i < num_components) {
            const auto component =
                IR::F32{num_components == 1 ? interpreted : ir.CompositeExtract(interpreted, i)};
            components.push_back(ApplyReadNumberConversion(ir, component, num_conv));
        } else {
            components.push_back(ir.Imm32(0.f));
        }
    }
    const auto swizzled = ApplySwizzle(ir, ir.CompositeConstruct(components), buffer.DstSelect());
    return swizzled;
}
|
||||
|
||||
static void StoreBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer,
|
||||
const IR::Value handle, const IR::U32 address, const IR::Value& value,
|
||||
const IR::BufferInstInfo info) {
|
||||
const auto data_fmt = buffer.GetDataFmt();
|
||||
const auto num_fmt = buffer.GetNumberFmt();
|
||||
const auto num_conv = buffer.GetNumberConversion();
|
||||
const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
|
||||
|
||||
// Extract actual number of components and apply additional modifications.
|
||||
const auto swizzled = ApplySwizzle(ir, value, buffer.DstSelect().Inverse());
|
||||
boost::container::static_vector<IR::Value, 4> components;
|
||||
for (u32 i = 0; i < num_components; i++) {
|
||||
const auto component = IR::F32{ir.CompositeExtract(swizzled, i)};
|
||||
components.push_back(ApplyWriteNumberConversion(ir, component, num_conv));
|
||||
}
|
||||
const auto real_value =
|
||||
components.size() == 1 ? components[0] : ir.CompositeConstruct(components);
|
||||
|
||||
switch (data_fmt) {
|
||||
case AmdGpu::DataFormat::FormatInvalid:
|
||||
break;
|
||||
case AmdGpu::DataFormat::Format8: {
|
||||
const auto packed =
|
||||
ir.Pack4x8(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f),
|
||||
ir.Imm32(0.f)));
|
||||
ir.StoreBufferU8(handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format8_8: {
|
||||
const auto packed =
|
||||
ir.Pack4x8(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
|
||||
ir.CompositeExtract(real_value, 1),
|
||||
ir.Imm32(0.f), ir.Imm32(0.f)));
|
||||
ir.StoreBufferU16(handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format8_8_8_8: {
|
||||
auto packed = ir.Pack4x8(num_fmt, real_value);
|
||||
ir.StoreBufferU32(1, handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format16: {
|
||||
const auto packed = ir.Pack2x16(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f)));
|
||||
ir.StoreBufferU16(handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format16_16: {
|
||||
const auto packed = ir.Pack2x16(num_fmt, real_value);
|
||||
ir.StoreBufferU32(1, handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format10_11_11: {
|
||||
const auto packed = ir.Pack10_11_11(num_fmt, real_value);
|
||||
ir.StoreBufferU32(1, handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format2_10_10_10: {
|
||||
const auto packed = ir.Pack2_10_10_10(num_fmt, real_value);
|
||||
ir.StoreBufferU32(1, handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format16_16_16_16: {
|
||||
const auto packed = ir.CompositeConstruct(
|
||||
ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
|
||||
ir.CompositeExtract(real_value, 1))),
|
||||
ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 2),
|
||||
ir.CompositeExtract(real_value, 3))));
|
||||
ir.StoreBufferU32(2, handle, address, packed, info);
|
||||
break;
|
||||
}
|
||||
case AmdGpu::DataFormat::Format32:
|
||||
case AmdGpu::DataFormat::Format32_32:
|
||||
case AmdGpu::DataFormat::Format32_32_32:
|
||||
case AmdGpu::DataFormat::Format32_32_32_32: {
|
||||
ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint ||
|
||||
num_fmt == AmdGpu::NumberFormat::Float);
|
||||
ir.StoreBufferF32(num_components, handle, address, real_value, info);
|
||||
break;
|
||||
}
|
||||
    default:
        UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt);
    }
}

static void LowerBufferFormatInst(IR::Block& block, IR::Inst& inst, Info& info) {
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
    const auto desc{info.buffers[inst.Arg(0).U32()]};
    const auto buffer{desc.GetSharp(info)};

    if (IsBufferFormatLoad(inst)) {
        const auto interpreted = LoadBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)},
                                                  inst.Flags<IR::BufferInstInfo>());
        inst.ReplaceUsesWithAndRemove(interpreted);
    } else if (IsBufferFormatStore(inst)) {
        StoreBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)}, inst.Arg(2),
                          inst.Flags<IR::BufferInstInfo>());
        inst.Invalidate();
    }
}

void LowerBufferFormatToRaw(IR::Program& program) {
    auto& info = program.info;
    for (IR::Block* const block : program.blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            if (IsBufferFormatLoad(inst) || IsBufferFormatStore(inst)) {
                LowerBufferFormatInst(*block, inst, info);
            }
        }
    }
}

} // namespace Shader::Optimization
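To make the effect of the pass concrete, here is a hedged host-side sketch of the arithmetic the lowered IR performs for a Format8_8 buffer with the Unorm number format. The pass itself emits IR rather than host code, the swizzle and number-conversion steps are omitted, and the byte-addressing assumption is illustrative only:

    #include <cstdint>

    struct Float4 {
        float x, y, z, w;
    };

    inline Float4 LoadFormat8_8Unorm(const uint8_t* buffer, uint32_t index) {
        // LoadBufferU16: one raw 16-bit read at the element address.
        const uint16_t raw = static_cast<uint16_t>(buffer[index * 2]) |
                             (static_cast<uint16_t>(buffer[index * 2 + 1]) << 8);
        // Unpack4x8(Unorm): split into bytes and normalize; only two components are kept.
        const float r = static_cast<float>(raw & 0xFF) / 255.0f;
        const float g = static_cast<float>((raw >> 8) & 0xFF) / 255.0f;
        // Pad to four components, matching the "Pad to 4 components" loop above.
        return {r, g, 0.0f, 0.0f};
    }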
|
|
@ -1,8 +1,6 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <algorithm>
|
||||
#include <boost/container/small_vector.hpp>
|
||||
#include "shader_recompiler/info.h"
|
||||
#include "shader_recompiler/ir/basic_block.h"
|
||||
#include "shader_recompiler/ir/breadth_first_search.h"
|
||||
|
@ -37,10 +35,17 @@ bool IsBufferAtomic(const IR::Inst& inst) {
|
|||
|
||||
bool IsBufferStore(const IR::Inst& inst) {
|
||||
switch (inst.GetOpcode()) {
|
||||
case IR::Opcode::StoreBufferU8:
|
||||
case IR::Opcode::StoreBufferU16:
|
||||
case IR::Opcode::StoreBufferU32:
|
||||
case IR::Opcode::StoreBufferU32x2:
|
||||
case IR::Opcode::StoreBufferU32x3:
|
||||
case IR::Opcode::StoreBufferU32x4:
|
||||
case IR::Opcode::StoreBufferF32:
|
||||
case IR::Opcode::StoreBufferF32x2:
|
||||
case IR::Opcode::StoreBufferF32x3:
|
||||
case IR::Opcode::StoreBufferF32x4:
|
||||
case IR::Opcode::StoreBufferFormatF32:
|
||||
return true;
|
||||
default:
|
||||
return IsBufferAtomic(inst);
|
||||
|
@ -49,10 +54,17 @@ bool IsBufferStore(const IR::Inst& inst) {
|
|||
|
||||
bool IsBufferInstruction(const IR::Inst& inst) {
|
||||
switch (inst.GetOpcode()) {
|
||||
case IR::Opcode::LoadBufferU8:
|
||||
case IR::Opcode::LoadBufferU16:
|
||||
case IR::Opcode::LoadBufferU32:
|
||||
case IR::Opcode::LoadBufferU32x2:
|
||||
case IR::Opcode::LoadBufferU32x3:
|
||||
case IR::Opcode::LoadBufferU32x4:
|
||||
case IR::Opcode::LoadBufferF32:
|
||||
case IR::Opcode::LoadBufferF32x2:
|
||||
case IR::Opcode::LoadBufferF32x3:
|
||||
case IR::Opcode::LoadBufferF32x4:
|
||||
case IR::Opcode::LoadBufferFormatF32:
|
||||
case IR::Opcode::ReadConstBuffer:
|
||||
return true;
|
||||
default:
|
||||
|
@ -65,34 +77,6 @@ bool IsDataRingInstruction(const IR::Inst& inst) {
|
|||
inst.GetOpcode() == IR::Opcode::DataConsume;
|
||||
}
|
||||
|
||||
bool IsTextureBufferInstruction(const IR::Inst& inst) {
|
||||
return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
|
||||
inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
|
||||
}
|
||||
|
||||
bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
|
||||
switch (num_format) {
|
||||
case AmdGpu::NumberFormat::Float:
|
||||
switch (data_format) {
|
||||
case AmdGpu::DataFormat::Format16:
|
||||
case AmdGpu::DataFormat::Format16_16:
|
||||
case AmdGpu::DataFormat::Format16_16_16_16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
case AmdGpu::NumberFormat::Uscaled:
|
||||
case AmdGpu::NumberFormat::Sscaled:
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
case AmdGpu::NumberFormat::SnormNz:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
|
||||
return IR::Type::U32;
|
||||
}
|
||||
|
@ -132,8 +116,7 @@ bool IsImageInstruction(const IR::Inst& inst) {
|
|||
class Descriptors {
|
||||
public:
|
||||
explicit Descriptors(Info& info_)
|
||||
: info{info_}, buffer_resources{info_.buffers},
|
||||
texture_buffer_resources{info_.texture_buffers}, image_resources{info_.images},
|
||||
: info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images},
|
||||
sampler_resources{info_.samplers}, fmask_resources(info_.fmasks) {}
|
||||
|
||||
u32 Add(const BufferResource& desc) {
|
||||
|
@ -147,15 +130,7 @@ public:
|
|||
auto& buffer = buffer_resources[index];
|
||||
buffer.used_types |= desc.used_types;
|
||||
buffer.is_written |= desc.is_written;
|
||||
return index;
|
||||
}
|
||||
|
||||
u32 Add(const TextureBufferResource& desc) {
|
||||
const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) {
|
||||
return desc.sharp_idx == existing.sharp_idx;
|
||||
})};
|
||||
auto& buffer = texture_buffer_resources[index];
|
||||
buffer.is_written |= desc.is_written;
|
||||
buffer.is_formatted |= desc.is_formatted;
|
||||
return index;
|
||||
}
|
||||
|
||||
|
@ -196,7 +171,6 @@ private:
|
|||
|
||||
const Info& info;
|
||||
BufferResourceList& buffer_resources;
|
||||
TextureBufferResourceList& texture_buffer_resources;
|
||||
ImageResourceList& image_resources;
|
||||
SamplerResourceList& sampler_resources;
|
||||
FMaskResourceList& fmask_resources;
|
||||
|
@ -313,6 +287,8 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
|
|||
.sharp_idx = sharp,
|
||||
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
|
||||
.is_written = IsBufferStore(inst),
|
||||
.is_formatted = inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
|
||||
inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -321,21 +297,6 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
|
|||
inst.SetArg(0, ir.Imm32(binding));
|
||||
}
|
||||
|
||||
void PatchTextureBufferSharp(IR::Block& block, IR::Inst& inst, Info& info,
|
||||
Descriptors& descriptors) {
|
||||
const IR::Inst* handle = inst.Arg(0).InstRecursive();
|
||||
const IR::Inst* producer = handle->Arg(0).InstRecursive();
|
||||
const auto sharp = TrackSharp(producer, info);
|
||||
const s32 binding = descriptors.Add(TextureBufferResource{
|
||||
.sharp_idx = sharp,
|
||||
.is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
|
||||
});
|
||||
|
||||
// Replace handle with binding index in texture buffer resource list.
|
||||
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
|
||||
inst.SetArg(0, ir.Imm32(binding));
|
||||
}
|
||||
|
||||
void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
|
||||
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
|
||||
const auto opcode = inst->GetOpcode();
|
||||
|
@ -553,36 +514,6 @@ void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
|
|||
inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
|
||||
}
|
||||
|
||||
void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
|
||||
const auto handle = inst.Arg(0);
|
||||
const auto buffer_res = info.texture_buffers[handle.U32()];
|
||||
const auto buffer = buffer_res.GetSharp(info);
|
||||
|
||||
// Only linear addressing with index is supported currently, since we cannot yet
|
||||
// address with sub-texel granularity.
|
||||
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
|
||||
ASSERT_MSG(!buffer.swizzle_enable && !inst_info.offset_enable && inst_info.inst_offset == 0,
|
||||
"Unsupported texture buffer address mode.");
|
||||
|
||||
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
|
||||
// Stride of 1 to get an index into formatted data. See above addressing limitations.
|
||||
inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, 1U));
|
||||
|
||||
if (inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32) {
|
||||
const auto swizzled = ApplySwizzle(ir, inst.Arg(2), buffer.DstSelect().Inverse());
|
||||
const auto converted =
|
||||
ApplyWriteNumberConversionVec4(ir, swizzled, buffer.GetNumberConversion());
|
||||
inst.SetArg(2, converted);
|
||||
} else if (inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32) {
|
||||
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
|
||||
const auto texel = ir.LoadBufferFormat(inst.Arg(0), inst.Arg(1), inst_info);
|
||||
const auto swizzled = ApplySwizzle(ir, texel, buffer.DstSelect());
|
||||
const auto converted =
|
||||
ApplyReadNumberConversionVec4(ir, swizzled, buffer.GetNumberConversion());
|
||||
inst.ReplaceUsesWith(converted);
|
||||
}
|
||||
}
|
||||
|
||||
IR::Value FixCubeCoords(IR::IREmitter& ir, const AmdGpu::Image& image, const IR::Value& x,
|
||||
const IR::Value& y, const IR::Value& face) {
|
||||
if (!image.IsCube()) {
|
||||
|
@ -861,8 +792,6 @@ void ResourceTrackingPass(IR::Program& program) {
|
|||
for (IR::Inst& inst : block->Instructions()) {
|
||||
if (IsBufferInstruction(inst)) {
|
||||
PatchBufferSharp(*block, inst, info, descriptors);
|
||||
} else if (IsTextureBufferInstruction(inst)) {
|
||||
PatchTextureBufferSharp(*block, inst, info, descriptors);
|
||||
} else if (IsImageInstruction(inst)) {
|
||||
PatchImageSharp(*block, inst, info, descriptors);
|
||||
} else if (IsDataRingInstruction(inst)) {
|
||||
|
@ -876,8 +805,6 @@ void ResourceTrackingPass(IR::Program& program) {
|
|||
for (IR::Inst& inst : block->Instructions()) {
|
||||
if (IsBufferInstruction(inst)) {
|
||||
PatchBufferArgs(*block, inst, info);
|
||||
} else if (IsTextureBufferInstruction(inst)) {
|
||||
PatchTextureBufferArgs(*block, inst, info);
|
||||
} else if (IsImageInstruction(inst)) {
|
||||
PatchImageArgs(*block, inst, info);
|
||||
}
|
||||
|
|
|
@ -50,12 +50,6 @@ void Visit(Info& info, const IR::Inst& inst) {
|
|||
case IR::Opcode::ImageWrite:
|
||||
info.has_storage_images = true;
|
||||
break;
|
||||
case IR::Opcode::LoadBufferFormatF32:
|
||||
info.has_texel_buffers = true;
|
||||
break;
|
||||
case IR::Opcode::StoreBufferFormatF32:
|
||||
info.has_image_buffers = true;
|
||||
break;
|
||||
case IR::Opcode::QuadShuffle:
|
||||
info.uses_group_quad = true;
|
||||
break;
|
||||
|
@ -82,6 +76,12 @@ void Visit(Info& info, const IR::Inst& inst) {
|
|||
case IR::Opcode::ReadConst:
|
||||
info.has_readconst = true;
|
||||
break;
|
||||
case IR::Opcode::PackUfloat10_11_11:
|
||||
info.uses_pack_10_11_11 = true;
|
||||
break;
|
||||
case IR::Opcode::UnpackUfloat10_11_11:
|
||||
info.uses_unpack_10_11_11 = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -88,6 +88,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
|
|||
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
|
||||
Shader::Optimization::FlattenExtendedUserdataPass(program);
|
||||
Shader::Optimization::ResourceTrackingPass(program);
|
||||
Shader::Optimization::LowerBufferFormatToRaw(program);
|
||||
Shader::Optimization::IdentityRemovalPass(program.blocks);
|
||||
Shader::Optimization::DeadCodeEliminationPass(program);
|
||||
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
|
||||
|
|
|
@ -19,30 +19,30 @@ struct VsAttribSpecialization {
|
|||
};
|
||||
|
||||
struct BufferSpecialization {
    u16 stride : 14;
    u16 is_storage : 1;
    u16 swizzle_enable : 1;
    u8 index_stride : 2 = 0;
    u8 element_size : 2 = 0;
    u32 stride : 14;
    u32 is_storage : 1;
    u32 is_formatted : 1;
    u32 swizzle_enable : 1;
    u32 data_format : 6;
    u32 num_format : 4;
    u32 index_stride : 2;
    u32 element_size : 2;
    u32 size = 0;
    AmdGpu::CompMapping dst_select{};
    AmdGpu::NumberConversion num_conversion{};

    bool operator==(const BufferSpecialization& other) const {
        return stride == other.stride && is_storage == other.is_storage &&
               swizzle_enable == other.swizzle_enable &&
               is_formatted == other.is_formatted && swizzle_enable == other.swizzle_enable &&
               (!is_formatted ||
                (data_format == other.data_format && num_format == other.num_format &&
                 dst_select == other.dst_select && num_conversion == other.num_conversion)) &&
               (!swizzle_enable ||
                (index_stride == other.index_stride && element_size == other.element_size)) &&
               (size >= other.is_storage || is_storage);
    }
};
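The specialization entry widens its bitfields from 16 to 32 bits so the new format information fits in the same key, and the operator== above only compares the format fields when is_formatted is set. A quick, hedged check of the bit budget (mirror struct for illustration only; bitfield layout is formally implementation-defined, but mainstream ABIs pack these into one 32-bit word):

    #include <cstdint>

    // stride(14) + is_storage(1) + is_formatted(1) + swizzle_enable(1) +
    // data_format(6) + num_format(4) + index_stride(2) + element_size(2) = 31 bits.
    struct BufferKeyBits {
        uint32_t stride : 14;
        uint32_t is_storage : 1;
        uint32_t is_formatted : 1;
        uint32_t swizzle_enable : 1;
        uint32_t data_format : 6;
        uint32_t num_format : 4;
        uint32_t index_stride : 2;
        uint32_t element_size : 2;
    };
    static_assert(sizeof(BufferKeyBits) == sizeof(uint32_t)); // holds on mainstream ABIs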
|
||||
|
||||
struct TextureBufferSpecialization {
|
||||
bool is_integer = false;
|
||||
AmdGpu::CompMapping dst_select{};
|
||||
AmdGpu::NumberConversion num_conversion{};
|
||||
|
||||
auto operator<=>(const TextureBufferSpecialization&) const = default;
|
||||
};
|
||||
|
||||
struct ImageSpecialization {
|
||||
AmdGpu::ImageType type = AmdGpu::ImageType::Color2D;
|
||||
bool is_integer = false;
|
||||
|
@ -82,7 +82,6 @@ struct StageSpecialization {
|
|||
boost::container::small_vector<VsAttribSpecialization, 32> vs_attribs;
|
||||
std::bitset<MaxStageResources> bitset{};
|
||||
boost::container::small_vector<BufferSpecialization, 16> buffers;
|
||||
boost::container::small_vector<TextureBufferSpecialization, 8> tex_buffers;
|
||||
boost::container::small_vector<ImageSpecialization, 16> images;
|
||||
boost::container::small_vector<FMaskSpecialization, 8> fmasks;
|
||||
boost::container::small_vector<SamplerSpecialization, 16> samplers;
|
||||
|
@ -111,7 +110,14 @@ struct StageSpecialization {
|
|||
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
|
||||
spec.stride = sharp.GetStride();
|
||||
spec.is_storage = desc.IsStorage(sharp);
|
||||
spec.is_formatted = desc.is_formatted;
|
||||
spec.swizzle_enable = sharp.swizzle_enable;
|
||||
if (spec.is_formatted) {
|
||||
spec.data_format = static_cast<u32>(sharp.GetDataFmt());
|
||||
spec.num_format = static_cast<u32>(sharp.GetNumberFmt());
|
||||
spec.dst_select = sharp.DstSelect();
|
||||
spec.num_conversion = sharp.GetNumberConversion();
|
||||
}
|
||||
if (spec.swizzle_enable) {
|
||||
spec.index_stride = sharp.index_stride;
|
||||
spec.element_size = sharp.element_size;
|
||||
|
@ -120,12 +126,6 @@ struct StageSpecialization {
|
|||
spec.size = sharp.GetSize();
|
||||
}
|
||||
});
|
||||
ForEachSharp(binding, tex_buffers, info->texture_buffers,
|
||||
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
|
||||
spec.is_integer = AmdGpu::IsInteger(sharp.GetNumberFmt());
|
||||
spec.dst_select = sharp.DstSelect();
|
||||
spec.num_conversion = sharp.GetNumberConversion();
|
||||
});
|
||||
ForEachSharp(binding, images, info->images,
|
||||
[](auto& spec, const auto& desc, AmdGpu::Image sharp) {
|
||||
spec.type = sharp.GetViewType(desc.is_array);
|
||||
|
@ -217,11 +217,6 @@ struct StageSpecialization {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
for (u32 i = 0; i < tex_buffers.size(); i++) {
|
||||
if (other.bitset[binding++] && tex_buffers[i] != other.tex_buffers[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
for (u32 i = 0; i < images.size(); i++) {
|
||||
if (other.bitset[binding++] && images[i] != other.images[i]) {
|
||||
return false;
|
||||
|
|
|
@ -95,8 +95,7 @@ Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
|
|||
// Create buffer object.
|
||||
const vk::BufferCreateInfo buffer_ci = {
|
||||
.size = size_bytes,
|
||||
// When maintenance5 is not supported, use all flags since we can't add flags to views.
|
||||
.usage = instance->IsMaintenance5Supported() ? flags : AllFlags,
|
||||
.usage = flags,
|
||||
};
|
||||
VmaAllocationInfo alloc_info{};
|
||||
buffer.Create(buffer_ci, usage, &alloc_info);
|
||||
|
@ -113,29 +112,6 @@ Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
|
|||
is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
|
||||
}
|
||||
|
||||
vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt,
|
||||
AmdGpu::NumberFormat nfmt) {
|
||||
const vk::BufferUsageFlags2CreateInfoKHR usage_flags = {
|
||||
.usage = is_written ? vk::BufferUsageFlagBits2KHR::eStorageTexelBuffer
|
||||
: vk::BufferUsageFlagBits2KHR::eUniformTexelBuffer,
|
||||
};
|
||||
const vk::BufferViewCreateInfo view_ci = {
|
||||
.pNext = instance->IsMaintenance5Supported() ? &usage_flags : nullptr,
|
||||
.buffer = buffer.buffer,
|
||||
.format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt),
|
||||
.offset = offset,
|
||||
.range = size,
|
||||
};
|
||||
const auto [view_result, view] = instance->GetDevice().createBufferView(view_ci);
|
||||
ASSERT_MSG(view_result == vk::Result::eSuccess, "Failed to create buffer view: {}",
|
||||
vk::to_string(view_result));
|
||||
scheduler->DeferOperation(
|
||||
[view, device = instance->GetDevice()] { device.destroyBufferView(view); });
|
||||
Vulkan::SetObjectName(instance->GetDevice(), view, "BufferView {:#x}:{:#x}", cpu_addr + offset,
|
||||
size);
|
||||
return view;
|
||||
}
|
||||
|
||||
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
|
||||
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
|
||||
|
||||
|
|
|
@ -32,13 +32,12 @@ enum class MemoryUsage {
|
|||
};
|
||||
|
||||
constexpr vk::BufferUsageFlags ReadFlags =
|
||||
vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eUniformTexelBuffer |
|
||||
vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eIndexBuffer |
|
||||
vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndirectBuffer;
|
||||
vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eUniformBuffer |
|
||||
vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer |
|
||||
vk::BufferUsageFlagBits::eIndirectBuffer;
|
||||
|
||||
constexpr vk::BufferUsageFlags AllFlags = ReadFlags | vk::BufferUsageFlagBits::eTransferDst |
|
||||
vk::BufferUsageFlagBits::eStorageTexelBuffer |
|
||||
vk::BufferUsageFlagBits::eStorageBuffer;
|
||||
constexpr vk::BufferUsageFlags AllFlags =
|
||||
ReadFlags | vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eStorageBuffer;
|
||||
|
||||
struct UniqueBuffer {
|
||||
explicit UniqueBuffer(vk::Device device, VmaAllocator allocator);
|
||||
|
@ -83,9 +82,6 @@ public:
|
|||
Buffer& operator=(Buffer&&) = default;
|
||||
Buffer(Buffer&&) = default;
|
||||
|
||||
vk::BufferView View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt,
|
||||
AmdGpu::NumberFormat nfmt);
|
||||
|
||||
/// Increases the likeliness of this being a stream buffer
|
||||
void IncreaseStreamScore(int score) noexcept {
|
||||
stream_score += score;
|
||||
|
|
|
@ -352,12 +352,9 @@ vk::ComponentMapping ComponentMapping(AmdGpu::CompMapping comp_mapping) {
|
|||
};
|
||||
}
|
||||
|
||||
static constexpr vk::FormatFeatureFlags2 BufferRead =
|
||||
vk::FormatFeatureFlagBits2::eUniformTexelBuffer | vk::FormatFeatureFlagBits2::eVertexBuffer;
|
||||
static constexpr vk::FormatFeatureFlags2 BufferWrite =
|
||||
vk::FormatFeatureFlagBits2::eStorageTexelBuffer |
|
||||
vk::FormatFeatureFlagBits2::eStorageReadWithoutFormat |
|
||||
vk::FormatFeatureFlagBits2::eStorageWriteWithoutFormat;
|
||||
// Texel buffer feature flags are not needed as format is interpreted in-shader.
|
||||
static constexpr vk::FormatFeatureFlags2 BufferRead = vk::FormatFeatureFlagBits2::eVertexBuffer;
|
||||
static constexpr vk::FormatFeatureFlags2 BufferWrite = static_cast<vk::FormatFeatureFlags2>(0);
|
||||
static constexpr vk::FormatFeatureFlags2 ImageRead = vk::FormatFeatureFlagBits2::eTransferSrc |
|
||||
vk::FormatFeatureFlagBits2::eTransferDst |
|
||||
vk::FormatFeatureFlagBits2::eSampledImage;
|
||||
|
|
|
@ -55,15 +55,6 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
|
|||
.stageFlags = vk::ShaderStageFlagBits::eCompute,
|
||||
});
|
||||
}
|
||||
for (const auto& tex_buffer : info->texture_buffers) {
|
||||
bindings.push_back({
|
||||
.binding = binding++,
|
||||
.descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
|
||||
: vk::DescriptorType::eUniformTexelBuffer,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = vk::ShaderStageFlagBits::eCompute,
|
||||
});
|
||||
}
|
||||
for (const auto& image : info->images) {
|
||||
bindings.push_back({
|
||||
.binding = binding++,
|
||||
|
|
|
@ -375,15 +375,6 @@ void GraphicsPipeline::BuildDescSetLayout() {
|
|||
.stageFlags = gp_stage_flags,
|
||||
});
|
||||
}
|
||||
for (const auto& tex_buffer : stage->texture_buffers) {
|
||||
bindings.push_back({
|
||||
.binding = binding++,
|
||||
.descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
|
||||
: vk::DescriptorType::eUniformTexelBuffer,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = gp_stage_flags,
|
||||
});
|
||||
}
|
||||
for (const auto& image : stage->images) {
|
||||
bindings.push_back({
|
||||
.binding = binding++,
|
||||
|
|
|
@ -268,7 +268,6 @@ bool Instance::CreateDevice() {
|
|||
null_descriptor =
|
||||
feature_chain.get<vk::PhysicalDeviceRobustness2FeaturesEXT>().nullDescriptor;
|
||||
}
|
||||
maintenance5 = add_extension(VK_KHR_MAINTENANCE_5_EXTENSION_NAME);
|
||||
custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME);
|
||||
depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
|
||||
vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
|
||||
|
@ -376,9 +375,6 @@ bool Instance::CreateDevice() {
|
|||
.maintenance4 = true,
|
||||
},
|
||||
// Other extensions
|
||||
vk::PhysicalDeviceMaintenance5FeaturesKHR{
|
||||
.maintenance5 = true,
|
||||
},
|
||||
vk::PhysicalDeviceCustomBorderColorFeaturesEXT{
|
||||
.customBorderColors = true,
|
||||
.customBorderColorWithoutFormat = true,
|
||||
|
@ -414,9 +410,6 @@ bool Instance::CreateDevice() {
|
|||
if (!maintenance4) {
|
||||
device_chain.unlink<vk::PhysicalDeviceMaintenance4FeaturesKHR>();
|
||||
}
|
||||
if (!maintenance5) {
|
||||
device_chain.unlink<vk::PhysicalDeviceMaintenance5FeaturesKHR>();
|
||||
}
|
||||
if (!custom_border_color) {
|
||||
device_chain.unlink<vk::PhysicalDeviceCustomBorderColorFeaturesEXT>();
|
||||
}
|
||||
|
|
|
@ -114,11 +114,6 @@ public:
|
|||
return null_descriptor;
|
||||
}
|
||||
|
||||
/// Returns true when VK_KHR_maintenance5 is supported.
|
||||
bool IsMaintenance5Supported() const {
|
||||
return maintenance5;
|
||||
}
|
||||
|
||||
/// Returns true when VK_KHR_fragment_shader_barycentric is supported.
|
||||
bool IsFragmentShaderBarycentricSupported() const {
|
||||
return fragment_shader_barycentric;
|
||||
|
@ -209,11 +204,6 @@ public:
|
|||
return properties.limits.minStorageBufferOffsetAlignment;
|
||||
}
|
||||
|
||||
/// Returns the minimum required alignment for texel buffers
|
||||
vk::DeviceSize TexelBufferMinAlignment() const {
|
||||
return properties.limits.minTexelBufferOffsetAlignment;
|
||||
}
|
||||
|
||||
/// Returns the minimum alignemt required for accessing host-mapped device memory
|
||||
vk::DeviceSize NonCoherentAtomSize() const {
|
||||
return properties.limits.nonCoherentAtomSize;
|
||||
|
@ -229,11 +219,6 @@ public:
|
|||
return properties.limits.maxComputeSharedMemorySize;
|
||||
}
|
||||
|
||||
/// Returns the maximum supported elements in a texel buffer
|
||||
u32 MaxTexelBufferElements() const {
|
||||
return properties.limits.maxTexelBufferElements;
|
||||
}
|
||||
|
||||
/// Returns the maximum sampler LOD bias.
|
||||
float MaxSamplerLodBias() const {
|
||||
return properties.limits.maxSamplerLodBias;
|
||||
|
@ -317,7 +302,6 @@ private:
|
|||
bool dynamic_color_write_mask{};
|
||||
bool vertex_input_dynamic_state{};
|
||||
bool null_descriptor{};
|
||||
bool maintenance5{};
|
||||
bool list_restart{};
|
||||
bool legacy_vertex_attributes{};
|
||||
bool shader_stencil_export{};
|
||||
|
|
|
@ -29,8 +29,6 @@ using Shader::VsOutput;
|
|||
constexpr static std::array DescriptorHeapSizes = {
|
||||
vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192},
|
||||
vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024},
|
||||
vk::DescriptorPoolSize{vk::DescriptorType::eUniformTexelBuffer, 128},
|
||||
vk::DescriptorPoolSize{vk::DescriptorType::eStorageTexelBuffer, 128},
|
||||
vk::DescriptorPoolSize{vk::DescriptorType::eSampledImage, 8192},
|
||||
vk::DescriptorPoolSize{vk::DescriptorType::eSampler, 1024},
|
||||
};
|
||||
|
|
|
@ -435,28 +435,6 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
|
|||
if (pipeline->IsCompute()) {
|
||||
const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute);
|
||||
|
||||
// Most of the time when a metadata is updated with a shader it gets cleared. It means
|
||||
// we can skip the whole dispatch and update the tracked state instead. Also, it is not
|
||||
// intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we
|
||||
// will need its full emulation anyways. For cases of metadata read a warning will be
|
||||
// logged.
|
||||
const auto IsMetaUpdate = [&](const auto& desc) {
|
||||
const auto sharp = desc.GetSharp(info);
|
||||
const VAddr address = sharp.base_address;
|
||||
if (desc.is_written) {
|
||||
// Assume all slices were updates
|
||||
if (texture_cache.ClearMeta(address)) {
|
||||
LOG_TRACE(Render_Vulkan, "Metadata update skipped");
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (texture_cache.IsMeta(address)) {
|
||||
LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// Assume if a shader reads and writes metas at the same time, it is a copy shader.
|
||||
bool meta_read = false;
|
||||
for (const auto& desc : info.buffers) {
|
||||
|
@ -469,23 +447,26 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
|
|||
}
|
||||
}
|
||||
|
||||
for (const auto& desc : info.texture_buffers) {
|
||||
if (!desc.is_written) {
|
||||
const VAddr address = desc.GetSharp(info).base_address;
|
||||
meta_read = texture_cache.IsMeta(address);
|
||||
}
|
||||
}
|
||||
|
||||
// Most of the time when a metadata is updated with a shader it gets cleared. It means
|
||||
// we can skip the whole dispatch and update the tracked state instead. Also, it is not
|
||||
// intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we
|
||||
// will need its full emulation anyways. For cases of metadata read a warning will be
|
||||
// logged.
|
||||
if (!meta_read) {
|
||||
for (const auto& desc : info.buffers) {
|
||||
if (IsMetaUpdate(desc)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& desc : info.texture_buffers) {
|
||||
if (IsMetaUpdate(desc)) {
|
||||
return false;
|
||||
const auto sharp = desc.GetSharp(info);
|
||||
const VAddr address = sharp.base_address;
|
||||
if (desc.is_written) {
|
||||
// Assume all slices were updates
|
||||
if (texture_cache.ClearMeta(address)) {
|
||||
LOG_TRACE(Render_Vulkan, "Metadata update skipped");
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (texture_cache.IsMeta(address)) {
|
||||
LOG_WARNING(Render_Vulkan,
|
||||
"Unexpected metadata read by a CS shader (buffer)");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -541,19 +522,6 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
|
|||
}
|
||||
}
|
||||
|
||||
texbuffer_bindings.clear();
|
||||
|
||||
for (const auto& desc : stage.texture_buffers) {
|
||||
const auto vsharp = desc.GetSharp(stage);
|
||||
if (vsharp.base_address != 0 && vsharp.GetSize() > 0 &&
|
||||
vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) {
|
||||
const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, vsharp.GetSize());
|
||||
texbuffer_bindings.emplace_back(buffer_id, vsharp);
|
||||
} else {
|
||||
texbuffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp);
|
||||
}
|
||||
}
|
||||
|
||||
// Bind a SSBO to act as shared memory in case of not being able to use a workgroup buffer
|
||||
// (e.g. when the compute shared memory is bigger than the GPU's shared memory)
|
||||
if (stage.has_emulated_shared_memory) {
|
||||
|
@ -601,8 +569,9 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
|
|||
buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE);
|
||||
}
|
||||
} else {
|
||||
const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
|
||||
vsharp.base_address, vsharp.GetSize(), desc.is_written, false, buffer_id);
|
||||
const auto [vk_buffer, offset] =
|
||||
buffer_cache.ObtainBuffer(vsharp.base_address, vsharp.GetSize(), desc.is_written,
|
||||
desc.is_formatted, buffer_id);
|
||||
const u32 alignment =
|
||||
is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment();
|
||||
const u32 offset_aligned = Common::AlignDown(offset, alignment);
|
||||
|
@ -617,6 +586,9 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
|
|||
vk::PipelineStageFlagBits2::eAllCommands)) {
|
||||
buffer_barriers.emplace_back(*barrier);
|
||||
}
|
||||
if (desc.is_written && desc.is_formatted) {
|
||||
texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize());
|
||||
}
|
||||
}
|
||||
|
||||
set_writes.push_back({
|
||||
|
@ -630,56 +602,6 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
|
|||
});
|
||||
++binding.buffer;
|
||||
}
|
||||
|
||||
for (u32 i = 0; i < texbuffer_bindings.size(); i++) {
|
||||
const auto& [buffer_id, vsharp] = texbuffer_bindings[i];
|
||||
const auto& desc = stage.texture_buffers[i];
|
||||
// Fallback format for null buffer view; never used in valid buffer case.
|
||||
const auto data_fmt = vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid
|
||||
? vsharp.GetDataFmt()
|
||||
: AmdGpu::DataFormat::Format8;
|
||||
const u32 fmt_stride = AmdGpu::NumBits(data_fmt) >> 3;
|
||||
vk::BufferView buffer_view;
|
||||
if (buffer_id) {
|
||||
const u32 alignment = instance.TexelBufferMinAlignment();
|
||||
const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
|
||||
vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id);
|
||||
const u32 buf_stride = vsharp.GetStride();
|
||||
ASSERT_MSG(buf_stride % fmt_stride == 0,
|
||||
"Texel buffer stride must match format stride");
|
||||
const u32 offset_aligned = Common::AlignDown(offset, alignment);
|
||||
const u32 adjust = offset - offset_aligned;
|
||||
ASSERT(adjust % fmt_stride == 0);
|
||||
push_data.AddTexelOffset(binding.buffer, buf_stride / fmt_stride, adjust / fmt_stride);
|
||||
buffer_view = vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust,
|
||||
desc.is_written, data_fmt, vsharp.GetNumberFmt());
|
||||
if (auto barrier =
|
||||
vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite
|
||||
: vk::AccessFlagBits2::eShaderRead,
|
||||
vk::PipelineStageFlagBits2::eAllCommands)) {
|
||||
buffer_barriers.emplace_back(*barrier);
|
||||
}
|
||||
if (desc.is_written) {
|
||||
texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize());
|
||||
}
|
||||
} else if (instance.IsNullDescriptorSupported()) {
|
||||
buffer_view = VK_NULL_HANDLE;
|
||||
} else {
|
||||
buffer_view =
|
||||
null_buffer.View(0, fmt_stride, desc.is_written, data_fmt, vsharp.GetNumberFmt());
|
||||
}
|
||||
|
||||
set_writes.push_back({
|
||||
.dstSet = VK_NULL_HANDLE,
|
||||
.dstBinding = binding.unified++,
|
||||
.dstArrayElement = 0,
|
||||
.descriptorCount = 1,
|
||||
.descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer
|
||||
: vk::DescriptorType::eUniformTexelBuffer,
|
||||
.pTexelBufferView = &buffer_views.emplace_back(buffer_view),
|
||||
});
|
||||
++binding.buffer;
|
||||
}
|
||||
}
|
||||
|
||||
void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding,
|
||||
|
|
|
@ -120,8 +120,6 @@ private:
|
|||
|
||||
using BufferBindingInfo = std::pair<VideoCore::BufferId, AmdGpu::Buffer>;
|
||||
boost::container::static_vector<BufferBindingInfo, 32> buffer_bindings;
|
||||
using TexBufferBindingInfo = std::pair<VideoCore::BufferId, AmdGpu::Buffer>;
|
||||
boost::container::static_vector<TexBufferBindingInfo, 32> texbuffer_bindings;
|
||||
using ImageBindingInfo = std::pair<VideoCore::ImageId, VideoCore::TextureCache::TextureDesc>;
|
||||
boost::container::static_vector<ImageBindingInfo, 64> image_bindings;
|
||||
};
|
||||
|
|
|
@ -19,9 +19,9 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info,
|
|||
auto& buffer_cache = rasterizer.GetBufferCache();
|
||||
|
||||
// Copy shader defines three formatted buffers as inputs: control, source, and destination.
|
||||
const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info);
|
||||
const auto src_buf_sharp = info.texture_buffers[1].GetSharp(info);
|
||||
const auto dst_buf_sharp = info.texture_buffers[2].GetSharp(info);
|
||||
const auto ctl_buf_sharp = info.buffers[0].GetSharp(info);
|
||||
const auto src_buf_sharp = info.buffers[1].GetSharp(info);
|
||||
const auto dst_buf_sharp = info.buffers[2].GetSharp(info);
|
||||
const auto buf_stride = src_buf_sharp.GetStride();
|
||||
ASSERT(buf_stride == dst_buf_sharp.GetStride());
|
||||
|
||||
|
@ -95,12 +95,10 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info,
|
|||
}
|
||||
|
||||
// Obtain buffers for the total source and destination ranges.
|
||||
const auto [src_buf, src_buf_offset] =
|
||||
buffer_cache.ObtainBuffer(src_buf_sharp.base_address + src_offset_min,
|
||||
src_offset_max - src_offset_min, false, false);
|
||||
const auto [dst_buf, dst_buf_offset] =
|
||||
buffer_cache.ObtainBuffer(dst_buf_sharp.base_address + dst_offset_min,
|
||||
dst_offset_max - dst_offset_min, true, false);
|
||||
const auto [src_buf, src_buf_offset] = buffer_cache.ObtainBuffer(
|
||||
src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min, false);
|
||||
const auto [dst_buf, dst_buf_offset] = buffer_cache.ObtainBuffer(
|
||||
dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min, true);
|
||||
|
||||
// Apply found buffer base.
|
||||
const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);
|
||||
|
|