Core: Create fastmem mappings for page address translation

Previously, we only set up fastmem mappings for block address
translation; now we also do it for page address translation. This
increases performance when games access memory using page tables, but
decreases performance when games set up page tables.

The tlbie instruction is used as an indication that the mappings need to
be updated.
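
For orientation, here is a self-contained toy sketch of that flow. Only the
names PageTableUpdated and UpdatePageTableMappings come from this commit;
everything else is invented for illustration.

#include <cstdint>
#include <iostream>
#include <map>

// Toy stand-in for MMU::m_page_mappings: guest logical page -> physical page.
static std::map<uint32_t, uint32_t> s_page_mappings;

// Models MemoryManager::UpdatePageTableMappings(): tear down the existing
// page-translation fastmem views and recreate them from the given mappings.
static void UpdatePageTableMappings(const std::map<uint32_t, uint32_t>& mappings)
{
  std::cout << "rebuilding " << mappings.size() << " fastmem page views\n";
}

// Models MMU::PageTableUpdated(): throw away the cached mappings, rescan the
// guest's hashed page table (omitted in this toy), and hand the result to
// the memory manager.
static void PageTableUpdated()
{
  s_page_mappings.clear();
  UpdatePageTableMappings(s_page_mappings);
}

// Models the tlbie path: tlbie is the only signal that the page tables may
// have changed, so a full rescan is triggered whenever it executes.
static void InvalidateTLBEntry(uint32_t /*effective_address*/)
{
  PageTableUpdated();
}

int main()
{
  s_page_mappings.emplace(0x90001000u, 0x00001000u);
  InvalidateTLBEntry(0x90001234u);  // guest executed tlbie
  return 0;
}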

There are some accuracy downsides:

* The TLB is now effectively infinitely large, which matters if games
  don't use tlbie when modifying page tables.
* The R and C bits for page table entries get set pessimistically rather
  than when the page is actually accessed (see the sketch after this list).
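
To make the second point concrete, here is a tiny illustrative helper (not
from the commit) using the PTE2 bit layout that the diff's UPTE_Hi implies;
the bit positions follow the PowerPC OEA PTE format:

#include <cstdint>

// Second word of a page table entry: PP in bits 0-1, WIMG in bits 3-6,
// C (changed) in bit 7, R (referenced) in bit 8, RPN in bits 12-31.
constexpr uint32_t PTE2_C = 1u << 7;
constexpr uint32_t PTE2_R = 1u << 8;

// Hardware sets R on the first access to a page and C on the first write.
// With this commit, both bits are set as soon as the page is mapped, since
// later fastmem accesses won't trap and give Dolphin a chance to set them.
constexpr uint32_t MarkPessimistically(uint32_t pte2)
{
  return pte2 | PTE2_R | PTE2_C;
}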

No games are known to be broken by these inaccuracies, but unfortunately
the second inaccuracy causes a large performance regression in Rogue
Squadron 3. You still get the old, more accurate behavior if Enable
Write-Back Cache is on.
JosJuice 2025-06-20 09:17:21 +02:00
commit 8d9ea7fb83
6 changed files with 245 additions and 17 deletions

View file

@@ -11,6 +11,7 @@
#include <algorithm>
#include <array>
#include <cstring>
#include <map>
#include <memory>
#include <span>
#include <tuple>
@@ -41,7 +42,9 @@
namespace Memory
{
MemoryManager::MemoryManager(Core::System& system) : m_system(system)
MemoryManager::MemoryManager(Core::System& system)
: m_page_size(m_arena.GetPageSize()), m_page_alignment(m_arena.GetPageAlignment()),
m_system(system)
{
}
@@ -233,13 +236,19 @@ bool MemoryManager::InitFastmemArena()
return true;
}
void MemoryManager::UpdateLogicalMemory(const PowerPC::BatTable& dbat_table)
void MemoryManager::UpdateDBATMappings(const PowerPC::BatTable& dbat_table)
{
for (auto& entry : m_logical_mapped_entries)
for (auto& entry : m_dbat_mapped_entries)
{
m_arena.UnmapFromMemoryRegion(entry.mapped_pointer, entry.mapped_size);
}
m_logical_mapped_entries.clear();
m_dbat_mapped_entries.clear();
for (auto& entry : m_page_table_mapped_entries)
{
m_arena.UnmapFromMemoryRegion(entry.mapped_pointer, entry.mapped_size);
}
m_page_table_mapped_entries.clear();
m_logical_page_mappings.fill(nullptr);
@@ -288,13 +297,12 @@ void MemoryManager::UpdateLogicalMemory(const PowerPC::BatTable& dbat_table)
void* mapped_pointer = m_arena.MapInMemoryRegion(position, mapped_size, base);
if (!mapped_pointer)
{
PanicAlertFmt(
"Memory::UpdateLogicalMemory(): Failed to map memory region at 0x{:08X} "
"(size 0x{:08X}) into logical fastmem region at 0x{:08X}.",
intersection_start, mapped_size, logical_address);
exit(0);
PanicAlertFmt("Memory::UpdateDBATMappings(): Failed to map memory region at 0x{:08X} "
"(size 0x{:08X}) into logical fastmem region at 0x{:08X}.",
intersection_start, mapped_size, logical_address);
continue;
}
m_logical_mapped_entries.push_back({mapped_pointer, mapped_size});
m_dbat_mapped_entries.push_back({mapped_pointer, mapped_size});
}
m_logical_page_mappings[i] =
@@ -305,6 +313,57 @@ void MemoryManager::UpdateLogicalMemory(const PowerPC::BatTable& dbat_table)
}
}
void MemoryManager::UpdatePageTableMappings(const std::map<u32, u32>& page_mappings)
{
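// If host pages are larger than the guest's 4 KiB pages, a single guest
// page can't get its own host mapping, so page-translation fastmem is
// skipped entirely.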
if (m_page_size > PowerPC::HW_PAGE_SIZE)
return;
for (auto& entry : m_page_table_mapped_entries)
{
m_arena.UnmapFromMemoryRegion(entry.mapped_pointer, entry.mapped_size);
}
m_page_table_mapped_entries.clear();
for (const auto [logical_address, translated_address] : page_mappings)
{
if (logical_address % m_page_alignment != 0)
continue;
constexpr u32 logical_size = PowerPC::HW_PAGE_SIZE;
for (const auto& physical_region : m_physical_regions)
{
if (!physical_region.active)
continue;
u32 mapping_address = physical_region.physical_address;
u32 mapping_end = mapping_address + physical_region.size;
u32 intersection_start = std::max(mapping_address, translated_address);
u32 intersection_end = std::min(mapping_end, translated_address + logical_size);
if (intersection_start < intersection_end)
{
// Found an overlapping region; map it.
if (m_is_fastmem_arena_initialized)
{
u32 position = physical_region.shm_position + intersection_start - mapping_address;
u8* base = m_logical_base + logical_address + intersection_start - translated_address;
u32 mapped_size = intersection_end - intersection_start;
void* mapped_pointer = m_arena.MapInMemoryRegion(position, mapped_size, base);
if (!mapped_pointer)
{
PanicAlertFmt(
"Memory::UpdatePageTableMappings(): Failed to map memory region at 0x{:08X} "
"(size 0x{:08X}) into logical fastmem region at 0x{:08X}.",
intersection_start, mapped_size, logical_address);
continue;
}
m_page_table_mapped_entries.push_back({mapped_pointer, mapped_size});
}
}
}
}
}
void MemoryManager::DoState(PointerWrap& p)
{
const u32 current_ram_size = GetRamSize();
@@ -386,11 +445,17 @@ void MemoryManager::ShutdownFastmemArena()
m_arena.UnmapFromMemoryRegion(base, region.size);
}
for (auto& entry : m_logical_mapped_entries)
for (auto& entry : m_dbat_mapped_entries)
{
m_arena.UnmapFromMemoryRegion(entry.mapped_pointer, entry.mapped_size);
}
m_logical_mapped_entries.clear();
m_dbat_mapped_entries.clear();
for (auto& entry : m_page_table_mapped_entries)
{
m_arena.UnmapFromMemoryRegion(entry.mapped_pointer, entry.mapped_size);
}
m_page_table_mapped_entries.clear();
m_arena.ReleaseMemoryRegion();

View file

@@ -4,6 +4,7 @@
#pragma once
#include <array>
#include <map>
#include <memory>
#include <span>
#include <string>
@@ -99,7 +100,8 @@
void ShutdownFastmemArena();
void DoState(PointerWrap& p);
void UpdateLogicalMemory(const PowerPC::BatTable& dbat_table);
void UpdateDBATMappings(const PowerPC::BatTable& dbat_table);
void UpdatePageTableMappings(const std::map<u32, u32>& page_mappings);
void Clear();
@@ -207,6 +209,9 @@ private:
// The MemArena class
Common::MemArena m_arena;
const size_t m_page_size;
const size_t m_page_alignment;
// Dolphin allocates memory to represent four regions:
// - 32MB RAM (actually 24MB on hardware), available on GameCube and Wii
// - 64MB "EXRAM", RAM only available on Wii
@@ -247,7 +252,8 @@ private:
// TODO: Do we want to handle the mirrors of the GC RAM?
std::array<PhysicalMemoryRegion, 4> m_physical_regions{};
std::vector<LogicalMemoryView> m_logical_mapped_entries;
std::vector<LogicalMemoryView> m_dbat_mapped_entries;
std::vector<LogicalMemoryView> m_page_table_mapped_entries;
std::array<void*, PowerPC::BAT_PAGE_COUNT> m_physical_page_mappings{};
std::array<void*, PowerPC::BAT_PAGE_COUNT> m_logical_page_mappings{};

View file

@@ -33,6 +33,7 @@
#include "Common/Align.h"
#include "Common/Assert.h"
#include "Common/BitUtils.h"
#include "Common/ChunkFile.h"
#include "Common/CommonTypes.h"
#include "Common/Logging/Log.h"
@@ -58,6 +59,22 @@ MMU::MMU(Core::System& system, Memory::MemoryManager& memory, PowerPC::PowerPCMa
MMU::~MMU() = default;
void MMU::Reset()
{
m_page_mappings.clear();
#ifndef _ARCH_32
m_memory.UpdatePageTableMappings(m_page_mappings);
#endif
}
void MMU::DoState(PointerWrap& p)
{
// Instead of storing m_page_mappings in savestates, we *could* recalculate it based on memory
// here in DoState, but this could lead to us getting a more up-to-date set of page mappings
// than we had when the savestate was created, which could be a problem for TAS determinism.
p.Do(m_page_mappings);
}
// Overloaded byteswap functions, for use within the templated functions below.
[[maybe_unused]] static u8 bswap(u8 val)
{
@@ -1323,10 +1340,13 @@ void MMU::SDRUpdated()
m_ppc_state.pagetable_base = htaborg << 16;
m_ppc_state.pagetable_hashmask = ((htabmask << 10) | 0x3ff);
PageTableUpdated();
}
void MMU::SRUpdated()
{
PageTableUpdated();
}
enum class TLBLookupResult
@@ -1416,6 +1436,124 @@ void MMU::InvalidateTLBEntry(u32 address)
m_ppc_state.tlb[PowerPC::DATA_TLB_INDEX][entry_index].Invalidate();
m_ppc_state.tlb[PowerPC::INST_TLB_INDEX][entry_index].Invalidate();
PageTableUpdated();
}
void MMU::PageTableUpdated()
{
#ifndef _ARCH_32
m_page_mappings.clear();
if (m_ppc_state.m_enable_dcache)
{
// Because fastmem isn't in use when accurate dcache emulation is enabled, setting up mappings
// would be a waste of time. Skipping setting up mappings also comes with the bonus of skipping
// the inaccurate behavior of setting the R and C bits of PTE2 as soon as a page is mapped.
return;
}
const u32 page_table_mask = m_ppc_state.pagetable_hashmask;
const u32 page_table_base = m_ppc_state.pagetable_base;
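// A PTEG holds 8 PTEs of 8 bytes each (64 bytes), hence the shifts by 6 here.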
const u32 page_table_end = (page_table_base | (page_table_mask << 6)) + (1 << 6);
const u32 page_table_size = page_table_end - page_table_base;
u8* page_table_view = m_system.GetMemory().GetPointerForRange(page_table_base, page_table_size);
if (!page_table_view)
{
WARN_LOG_FMT(POWERPC, "Failed to read page table at {:#010x}-{:#010x}", page_table_base,
page_table_end);
m_memory.UpdatePageTableMappings(m_page_mappings);
return;
}
const auto read_page_table = [&](u32 H) {
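// i selects a PTEG (a 64-byte group of 8 PTEs); j walks the PTEs inside it.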
for (u32 i = 0; i <= page_table_mask; ++i)
{
for (u32 j = 0; j < 8; ++j)
{
const u32 pte_addr = (page_table_base | ((i & page_table_mask) << 6)) + j * 8;
UPTE_Lo pte1(Common::swap32(page_table_view + pte_addr - page_table_base));
UPTE_Hi pte2(Common::swap32(page_table_view + pte_addr - page_table_base + 4));
if (!pte1.V)
continue;
if (pte1.H != H)
continue;
// There are quirks related to uncached memory that can't be correctly emulated by fast
// accesses, so we don't map uncached memory. (However, no software at all is known to
// trigger these quirks through page address translation, only through block address
// translation.)
const bool wi = (pte2.WIMG & 0b1100) != 0;
if (wi)
continue;
// Due to hash masking, the upper bits of page_index_from_hash might not match the actual
// page index. But these bits fully overlap with the API (abbreviated page index), so we can
// overwrite these bits with the API from pte1 and thereby get the correct page index.
//
// In other words: logical_address.API must be written to after logical_address.page_index!
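//
// For example, with the smallest mask, pagetable_hashmask = 0x3ff, so
// i ^ pte1.VSID is only trustworthy in bits 0-9 of the 16-bit page index.
// Bits 10-15 are exactly the 6-bit API, so the assignment order below
// repairs every bit the mask could have corrupted.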
u32 page_index_from_hash = i ^ pte1.VSID;
if (pte1.H)
page_index_from_hash = ~page_index_from_hash;
EffectiveAddress logical_address;
logical_address.offset = 0;
logical_address.page_index = page_index_from_hash;
logical_address.API = pte1.API;
for (u32 k = 0; k < std::size(m_ppc_state.sr); ++k)
{
const auto sr = UReg_SR{m_ppc_state.sr[k]};
if (sr.VSID != pte1.VSID || sr.T != 0)
continue;
logical_address.SR = k;
// Block address translation takes priority over page address translation.
if (m_dbat_table[logical_address.Hex >> PowerPC::BAT_INDEX_SHIFT] &
PowerPC::BAT_MAPPED_BIT)
{
continue;
}
// Fast accesses don't support memchecks, so force slow accesses by removing fastmem
// mappings for all overlapping virtual pages.
constexpr u32 logical_size = PowerPC::HW_PAGE_SIZE;
if (m_power_pc.GetMemChecks().OverlapsMemcheck(logical_address.Hex, logical_size))
continue;
const u32 physical_address = pte2.RPN << 12;
// Important: This doesn't overwrite anything already present in m_page_mappings.
m_page_mappings.emplace(logical_address.Hex, physical_address);
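// (std::map::emplace is a no-op when the key already exists, which is what
// gives PTEs read earlier, in particular all H=0 PTEs, priority here.)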
// HACK: We set R and C, which indicate whether a page has been read from and written to
// respectively, when a page is mapped rather than when it's actually accessed. The latter
// is probably possible using some fault handling logic, but for now it seems like more
// work than it's worth.
if (!pte2.R || !pte2.C)
{
pte2.R = 1;
pte2.C = 1;
const u32 pte2_swapped = Common::swap32(pte2.Hex);
std::memcpy(page_table_view + pte_addr - page_table_base + 4, &pte2_swapped,
sizeof(pte2_swapped));
}
}
}
}
};
// We need to read all H=0 PTEs first, because H=0 takes priority over H=1.
read_page_table(0);
read_page_table(1);
m_memory.UpdatePageTableMappings(m_page_mappings);
#endif
}
// Page Address Translation
@@ -1643,7 +1781,8 @@ void MMU::DBATUpdated()
}
#ifndef _ARCH_32
m_memory.UpdateLogicalMemory(m_dbat_table);
m_memory.UpdateDBATMappings(m_dbat_table);
m_memory.UpdatePageTableMappings(m_page_mappings);
#endif
// IsOptimizable*Address and dcbz depends on the BAT mapping, so we need a flush here.

View file

@@ -5,12 +5,15 @@
#include <array>
#include <cstddef>
#include <map>
#include <optional>
#include <string>
#include "Common/BitField.h"
#include "Common/CommonTypes.h"
class PointerWrap;
namespace Core
{
class CPUThreadGuard;
@@ -116,6 +119,9 @@
MMU& operator=(MMU&& other) = delete;
~MMU();
void Reset();
void DoState(PointerWrap& p);
// Routines for debugger UI, cheats, etc. to access emulated memory from the
// perspective of the CPU. Not for use by core emulation routines.
// Use "Host" prefix.
@@ -240,6 +246,7 @@
void SDRUpdated();
void SRUpdated();
void InvalidateTLBEntry(u32 address);
void PageTableUpdated();
void DBATUpdated();
void IBATUpdated();
@@ -326,6 +333,10 @@ private:
PowerPC::PowerPCManager& m_power_pc;
PowerPC::PowerPCState& m_ppc_state;
// STATE_TO_SAVE
std::map<u32, u32> m_page_mappings;
// END STATE_TO_SAVE
BatTable m_ibat_table;
BatTable m_dbat_table;
};

View file

@@ -105,6 +105,9 @@ void PowerPCManager::DoState(PointerWrap& p)
m_ppc_state.iCache.DoState(memory, p);
m_ppc_state.dCache.DoState(memory, p);
auto& mmu = m_system.GetMMU();
mmu.DoState(p);
if (p.IsReadMode())
{
if (!m_ppc_state.m_enable_dcache)
@@ -116,7 +119,6 @@ void PowerPCManager::DoState(PointerWrap& p)
RoundingModeUpdated(m_ppc_state);
RecalculateAllFeatureFlags(m_ppc_state);
auto& mmu = m_system.GetMMU();
mmu.IBATUpdated();
mmu.DBATUpdated();
}
@@ -253,6 +255,10 @@ void PowerPCManager::RefreshConfig()
{
INFO_LOG_FMT(POWERPC, "Flushing data cache");
m_ppc_state.dCache.FlushAll(m_system.GetMemory());
// No page table mappings are created when accurate dcache emulation is enabled.
// If there are any that can be created, let's create them now.
m_system.GetMMU().PageTableUpdated();
}
}
@@ -282,6 +288,7 @@
ResetRegisters();
m_ppc_state.iCache.Reset(m_system.GetJitInterface());
m_ppc_state.dCache.Reset();
m_system.GetMMU().Reset();
}
void PowerPCManager::ScheduleInvalidateCacheThreadSafe(u32 address)

View file

@@ -95,7 +95,7 @@ static size_t s_state_writes_in_queue;
static std::condition_variable s_state_write_queue_is_empty;
// Don't forget to increase this after doing changes on the savestate system
constexpr u32 STATE_VERSION = 175; // Last changed in PR 13751
constexpr u32 STATE_VERSION = 176; // Last changed in PR 13768
// Increase this if the StateExtendedHeader definition changes
constexpr u32 EXTENDED_HEADER_VERSION = 1; // Last changed in PR 12217