// Copyright 2012 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.

#include <algorithm>

#include "Common/CommonFuncs.h"
#include "Common/CommonTypes.h"
#include "Common/Logging/Log.h"
#include "VideoBackends/D3D12/D3DBase.h"
#include "VideoBackends/D3D12/D3DCommandListManager.h"
#include "VideoBackends/D3D12/PerfQuery.h"
#include "VideoCommon/RenderBase.h"

namespace DX12
{
// Creates the occlusion query heap (PERF_QUERY_BUFFER_SIZE entries) and a readback-heap buffer
// of QUERY_READBACK_BUFFER_SIZE bytes that starts in the copy-destination state, then registers
// QueueFenceCallback with the command list manager and keeps the fence it returns in
// m_tracking_fence.
PerfQuery::PerfQuery()
{
  D3D12_QUERY_HEAP_DESC query_heap_desc = {D3D12_QUERY_HEAP_TYPE_OCCLUSION,
                                           PERF_QUERY_BUFFER_SIZE, 0};
  CheckHR(D3D::device12->CreateQueryHeap(&query_heap_desc, IID_PPV_ARGS(&m_query_heap)));

  // Named locals rather than temporaries: taking the address of a temporary is not valid
  // standard C++ and only compiles as a compiler extension.
  const CD3DX12_HEAP_PROPERTIES readback_heap_properties(D3D12_HEAP_TYPE_READBACK);
  const CD3DX12_RESOURCE_DESC readback_buffer_desc =
      CD3DX12_RESOURCE_DESC::Buffer(QUERY_READBACK_BUFFER_SIZE);

  CheckHR(D3D::device12->CreateCommittedResource(
      &readback_heap_properties, D3D12_HEAP_FLAG_NONE, &readback_buffer_desc,
      D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_query_readback_buffer)));

  m_tracking_fence =
      D3D::command_list_mgr->RegisterQueueFenceCallback(this, &PerfQuery::QueueFenceCallback);
}

// Tears down the objects set up by the constructor: removes this object's queue fence callback
// from the command list manager, then hands the query heap and the readback buffer to
// SAFE_RELEASE.
PerfQuery::~PerfQuery()
{
  // Unregister before anything else; presumably this stops the manager from invoking
  // QueueFenceCallback on an object that is being destroyed.
  D3D::command_list_mgr->RemoveQueueFenceCallback(this);

  // SAFE_RELEASE is a project macro defined elsewhere; presumably it releases the COM reference.
  SAFE_RELEASE(m_query_heap);
  SAFE_RELEASE(m_query_readback_buffer);
}

// Begins an occlusion query for the given group.
//
// Regardless of the group, the ring buffer is serviced first: once more than half of the slots
// are in use WeakFlush() is called, and if every slot is taken FlushOne() retires the oldest
// query. A query is only started for PQG_ZCOMP_ZCOMPLOC and PQG_ZCOMP.
void PerfQuery::EnableQuery(PerfQueryGroup type)
{
  const auto capacity = m_query_buffer.size();

  // Past the halfway mark: try to retire queries before the buffer fills up.
  if (m_query_count > capacity / 2)
    WeakFlush();

  // No free slot is left, so the oldest query has to be retired now.
  if (capacity == m_query_count)
  {
    FlushOne();
    // WARN_LOG(VIDEO, "Flushed query buffer early!");
  }

  if (type != PQG_ZCOMP_ZCOMPLOC && type != PQG_ZCOMP)
    return;

  // The new query goes into the slot directly behind the pending ones in the ring.
  const size_t slot = (m_query_read_pos + m_query_count) % capacity;
  auto& pending = m_query_buffer[slot];

  D3D::current_command_list->BeginQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION,
                                        static_cast<UINT>(slot));
  pending.query_type = type;
  // -1 is stored until DisableQuery() assigns the real fence value.
  pending.fence_value = -1;

  ++m_query_count;
}

// Ends the most recently started occlusion query, records a resolve of its result into the
// readback buffer, and tags the entry with m_next_fence_value. Groups other than
// PQG_ZCOMP_ZCOMPLOC and PQG_ZCOMP are ignored.
void PerfQuery::DisableQuery(PerfQueryGroup type)
{
  if (type != PQG_ZCOMP_ZCOMPLOC && type != PQG_ZCOMP)
    return;

  // Slot of the newest pending query: one position before the next free slot, wrapping around
  // the ring.
  const auto capacity = m_query_buffer.size();
  const size_t slot = (m_query_read_pos + m_query_count + capacity - 1) % capacity;
  const UINT heap_slot = static_cast<UINT>(slot);

  D3D::current_command_list->EndQuery(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION, heap_slot);

  // Each slot owns one UINT64 in the readback buffer, at the matching position.
  D3D::current_command_list->ResolveQueryData(m_query_heap, D3D12_QUERY_TYPE_OCCLUSION,
                                              heap_slot, 1, m_query_readback_buffer,
                                              slot * sizeof(UINT64));

  m_query_buffer[slot].fence_value = m_next_fence_value;
}

// Drops all pending queries by zeroing the pending count, and clears every accumulated result.
void PerfQuery::ResetQuery()
{
  m_query_count = 0;

  for (auto& accumulated : m_results)
    accumulated = 0;
}

// Returns the accumulated counter that backs the requested query type.
//
// - PQ_ZCOMP_INPUT_ZCOMPLOC / PQ_ZCOMP_OUTPUT_ZCOMPLOC: the PQG_ZCOMP_ZCOMPLOC total.
// - PQ_ZCOMP_INPUT / PQ_ZCOMP_OUTPUT: the PQG_ZCOMP total.
// - PQ_BLEND_INPUT: the sum of the two totals above.
// - PQ_EFB_COPY_CLOCKS: the PQG_EFB_COPY_CLOCKS total.
// Any other type yields 0.
u32 PerfQuery::GetQueryResult(PerfQueryType type)
{
  if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC)
    return m_results[PQG_ZCOMP_ZCOMPLOC];

  if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT)
    return m_results[PQG_ZCOMP];

  if (type == PQ_BLEND_INPUT)
    return m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC];

  if (type == PQ_EFB_COPY_CLOCKS)
    return m_results[PQG_EFB_COPY_CLOCKS];

  return 0;
}

void PerfQuery::FlushOne()
{
  size_t index = m_query_read_pos;
  ActiveQuery& entry = m_query_buffer[index];

  // Has the command list been executed yet?
  if (entry.fence_value == m_next_fence_value)
    D3D::command_list_mgr->ExecuteQueuedWork(false);

  // Block until the fence is reached
  D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, entry.fence_value);

  // Copy from readback buffer to local
  void* readback_buffer_map;
  D3D12_RANGE read_range = {sizeof(UINT64) * index, sizeof(UINT64) * (index + 1)};
  CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map));

  UINT64 result;
  memcpy(&result, reinterpret_cast<u8*>(readback_buffer_map) + sizeof(UINT64) * index,
         sizeof(UINT64));

  D3D12_RANGE write_range = {};
  m_query_readback_buffer->Unmap(0, &write_range);

  // NOTE: Reported pixel metrics should be referenced to native resolution
  // TODO: Dropping the lower 2 bits from this count should be closer to actual
  // hardware behavior when drawing triangles.
  m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() *
                                       EFB_HEIGHT / g_renderer->GetTargetHeight());

  m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size();
  m_query_count--;
}

// Walks every pending query, starting at the read position and wrapping around the ring, and
// returns the largest fence value found. Returns 0 when nothing is pending.
UINT64 PerfQuery::FindLastPendingFenceValue() const
{
  UINT64 highest_fence = 0;

  u32 remaining = m_query_count;
  for (u32 cursor = m_query_read_pos; remaining > 0; --remaining)
  {
    highest_fence = std::max(m_query_buffer[cursor].fence_value, highest_fence);
    cursor = (cursor + 1) % m_query_buffer.size();
  }

  return highest_fence;
}

void PerfQuery::FlushResults()
{
  if (IsFlushed())
    return;

  // Find the fence value we have to wait for.
  UINT64 last_fence_value = FindLastPendingFenceValue();
  if (last_fence_value == m_next_fence_value)
    D3D::command_list_mgr->ExecuteQueuedWork(false);

  // Wait for all queries to be resolved.
  D3D::command_list_mgr->WaitOnCPUForFence(m_tracking_fence, last_fence_value);

  // Map the whole readback buffer. Shouldn't have much overhead, and saves taking the
  // wrapped-around cases into consideration.
  void* readback_buffer_map;
  D3D12_RANGE read_range = {0, QUERY_READBACK_BUFFER_SIZE};
  CheckHR(m_query_readback_buffer->Map(0, &read_range, &readback_buffer_map));

  // Read all pending queries.
  while (m_query_count > 0)
  {
    ActiveQuery& entry = m_query_buffer[m_query_read_pos];

    UINT64 result;
    memcpy(&result, reinterpret_cast<u8*>(readback_buffer_map) + sizeof(UINT64) * m_query_read_pos,
           sizeof(UINT64));

    // NOTE: Reported pixel metrics should be referenced to native resolution
    // TODO: Dropping the lower 2 bits from this count should be closer to actual
    // hardware behavior when drawing triangles.
    m_results[entry.query_type] += (u32)(result * EFB_WIDTH / g_renderer->GetTargetWidth() *
                                         EFB_HEIGHT / g_renderer->GetTargetHeight());

    m_query_read_pos = (m_query_read_pos + 1) % m_query_buffer.size();
    m_query_count--;
  }

  D3D12_RANGE write_range = {};
  m_query_readback_buffer->Unmap(0, &write_range);
}

void PerfQuery::WeakFlush()
{
  UINT64 completed_fence = m_tracking_fence->GetCompletedValue();

  while (!IsFlushed())
  {
    ActiveQuery& entry = m_query_buffer[m_query_read_pos];
    if (entry.fence_value > completed_fence)
      break;

    FlushOne();
  }
}

// Reports whether no queries are pending, i.e. the pending query count is zero.
bool PerfQuery::IsFlushed() const
{
  const bool nothing_pending = (m_query_count == 0);
  return nothing_pending;
}

void PerfQuery::QueueFenceCallback(void* owning_object, UINT64 fence_value)
{
  PerfQuery* owning_perf_query = static_cast<PerfQuery*>(owning_object);
  owning_perf_query->QueueFence(fence_value);
}

// Stores the value that follows the supplied fence value in m_next_fence_value, which
// DisableQuery() uses to tag the queries it ends.
void PerfQuery::QueueFence(UINT64 fence_value)
{
  const UINT64 upcoming_fence_value = fence_value + 1;
  m_next_fence_value = upcoming_fence_value;
}

}  // namespace