Handle compute packets that are split between the ends of two command buffers (#2476)
Some checks are pending
Build and Release / reuse (push) Waiting to run
Build and Release / clang-format (push) Waiting to run
Build and Release / get-info (push) Waiting to run
Build and Release / windows-sdl (push) Blocked by required conditions
Build and Release / windows-qt (push) Blocked by required conditions
Build and Release / macos-sdl (push) Blocked by required conditions
Build and Release / macos-qt (push) Blocked by required conditions
Build and Release / linux-sdl (push) Blocked by required conditions
Build and Release / linux-qt (push) Blocked by required conditions
Build and Release / linux-sdl-gcc (push) Blocked by required conditions
Build and Release / linux-qt-gcc (push) Blocked by required conditions
Build and Release / pre-release (push) Blocked by required conditions

* Squashed initial implementation

* Add logging to check whether the buffers are contiguous in memory

* Add check to see if first instruction is valid in the next buffer to avoid false positives

* Oof

* Replace old code with IndecisiveTurtle's new, better implementation

* Add the `[[unlikely]]` attribute to the split packet handling branches

Co-authored-by: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com>

---------

Co-authored-by: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com>
This commit is contained in:
kalaposfos13 2025-03-25 23:01:21 +01:00 committed by GitHub
parent 5c72030fb8
commit a1ec8b0a88
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 41 additions and 19 deletions

View file

@ -726,20 +726,39 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
template <bool is_indirect>
Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) {
FIBER_ENTER(acb_task_name[vqid]);
const auto& queue = asc_queues[{vqid}];
auto& queue = asc_queues[{vqid}];
auto base_addr = reinterpret_cast<uintptr_t>(acb.data());
while (!acb.empty()) {
const auto* header = reinterpret_cast<const PM4Header*>(acb.data());
const u32 type = header->type;
if (type != 3) {
// No other types of packets were spotted so far
UNREACHABLE_MSG("Invalid PM4 type {}", type);
auto base_addr = reinterpret_cast<VAddr>(acb);
while (acb_dwords > 0) {
auto* header = reinterpret_cast<const PM4Header*>(acb);
u32 next_dw_off = header->type3.NumWords() + 1;
// If we have a buffered packet, use it.
if (queue.tmp_dwords > 0) [[unlikely]] {
header = reinterpret_cast<const PM4Header*>(queue.tmp_packet.data());
next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords;
std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32));
queue.tmp_dwords = 0;
}
// If the packet is split across ring boundary, buffer until next submission
if (next_dw_off > acb_dwords) [[unlikely]] {
std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32));
queue.tmp_dwords = acb_dwords;
if constexpr (!is_indirect) {
*queue.read_addr += acb_dwords;
*queue.read_addr %= queue.ring_size_dw;
}
break;
}
if (header->type != 3) {
// No other types of packets were spotted so far
UNREACHABLE_MSG("Invalid PM4 type {}", header->type.Value());
}
const u32 count = header->type3.NumWords();
const PM4ItOpcode opcode = header->type3.opcode;
const auto* it_body = reinterpret_cast<const u32*>(header) + 1;
switch (opcode) {
@ -749,8 +768,8 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
}
case PM4ItOpcode::IndirectBuffer: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessCompute<true>(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
auto task = ProcessCompute<true>(indirect_buffer->Address<const u32>(),
indirect_buffer->ib_size, vqid);
RESUME_ASC(task, vqid);
while (!task.handle.done()) {
@ -800,7 +819,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
}
case PM4ItOpcode::SetShReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
const auto set_size = (count - 1) * sizeof(u32);
const auto set_size = (header->type3.NumWords() - 1) * sizeof(u32);
if (set_data->reg_offset >= 0x200 &&
set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) {
@ -895,14 +914,14 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
}
default:
UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
static_cast<u32>(opcode), count);
static_cast<u32>(opcode), header->type3.NumWords());
}
const auto packet_size_dw = header->type3.NumWords() + 1;
acb = NextPacket(acb, packet_size_dw);
acb += next_dw_off;
acb_dwords -= next_dw_off;
if constexpr (!is_indirect) {
*queue.read_addr += packet_size_dw;
*queue.read_addr += next_dw_off;
*queue.read_addr %= queue.ring_size_dw;
}
}
@ -969,7 +988,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
auto& queue = mapped_queues[gnm_vqid];
const auto vqid = gnm_vqid - 1;
const auto& task = ProcessCompute(acb, vqid);
const auto& task = ProcessCompute(acb.data(), acb.size(), vqid);
{
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);

View file

@ -1496,10 +1496,13 @@ public:
}
/// State kept for each entry of `asc_queues`; read and updated by ProcessCompute.
struct AscQueueInfo {
    /// Capacity, in dwords, of `tmp_packet`.
    static constexpr size_t Pm4BufferSize = 1024;

    // NOTE(review): presumably the address the ring is mapped at; not referenced by the
    // code visible here -- confirm against the code that fills this struct in.
    VAddr map_addr;
    // Points at the ring read offset, counted in dwords. ProcessCompute adds the number of
    // dwords it consumed and then wraps the value modulo `ring_size_dw`.
    u32* read_addr;
    // Modulus applied to `*read_addr` after every advance.
    u32 ring_size_dw;
    // NOTE(review): not referenced by the code visible here -- meaning to be confirmed.
    u32 pipe_id;
    // Staging area for a packet whose leading dwords arrived at the end of one submission;
    // the remaining dwords are appended from the next submission before it is processed.
    // NOTE(review): the copies into this buffer that are visible in ProcessCompute do not
    // check the dword count against Pm4BufferSize.
    std::array<u32, Pm4BufferSize> tmp_packet;
    // Number of dwords currently held in `tmp_packet`; 0 means no partial packet is pending.
    // Explicitly zero-initialized because ProcessCompute tests this field before it is ever
    // written, so an indeterminate value would be mistaken for a buffered packet.
    u32 tmp_dwords{0};
};
Common::SlotVector<AscQueueInfo> asc_queues{};
@ -1541,7 +1544,7 @@ private:
Task ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb);
Task ProcessCeUpdate(std::span<const u32> ccb);
template <bool is_indirect = false>
Task ProcessCompute(std::span<const u32> acb, u32 vqid);
Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid);
void Process(std::stop_token stoken);