mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-19 19:15:26 +00:00
LLVM: enable some JIT events (Intel, Perf)
Made some related adjustments. Currently incomplete.
This commit is contained in:
parent
510041a873
commit
d836033212
16 changed files with 233 additions and 162 deletions
15
3rdparty/llvm.cmake
vendored
15
3rdparty/llvm.cmake
vendored
|
@ -13,6 +13,15 @@ if(WITH_LLVM)
|
|||
option(LLVM_INCLUDE_UTILS OFF)
|
||||
option(LLVM_CCACHE_BUILD ON)
|
||||
|
||||
# Turn on the LLVM_USE_INTEL_JITEVENTS option for Windows builds.
# NOTE(review): LLVMIntelJITEvents is appended to LLVM_LIBS unconditionally further
# down in this file, while the option is only set for WIN32 and Linux here --
# confirm that other platforms still configure and link.
if(WIN32)
|
||||
set(LLVM_USE_INTEL_JITEVENTS ON)
|
||||
endif()
|
||||
|
||||
# On Linux, turn on both the Intel JIT events option and the perf option.
if(CMAKE_SYSTEM MATCHES "Linux")
|
||||
set(LLVM_USE_INTEL_JITEVENTS ON)
|
||||
set(LLVM_USE_PERF ON)
|
||||
endif()
|
||||
|
||||
set(CXX_FLAGS_OLD ${CMAKE_CXX_FLAGS})
|
||||
|
||||
if (MSVC)
|
||||
|
@ -52,7 +61,11 @@ if(WITH_LLVM)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser)
|
||||
set(LLVM_LIBS LLVMMCJIT LLVMX86CodeGen LLVMX86AsmParser LLVMIntelJITEvents)
|
||||
|
||||
# On Linux, additionally append LLVMPerfJITEvents to the list of LLVM libraries
# that is linked into the 3rdparty_llvm interface target below.
if(CMAKE_SYSTEM MATCHES "Linux")
|
||||
set(LLVM_LIBS ${LLVM_LIBS} LLVMPerfJITEvents)
|
||||
endif()
|
||||
|
||||
add_library(3rdparty_llvm INTERFACE)
|
||||
target_link_libraries(3rdparty_llvm INTERFACE ${LLVM_LIBS})
|
||||
|
|
|
@ -16,6 +16,15 @@
|
|||
|
||||
LOG_CHANNEL(jit_log, "JIT");
|
||||
|
||||
// Announce a piece of JIT-generated code to external tooling.
//   func - start address of the generated code
//   size - size of the generated code
//   name - symbol name to report for it
// When __linux__ is defined, one line "<func> <size> <name>" (func and size formatted
// with %x) is written to /tmp/perf-<pid>.map; on other platforms this is a no-op.
// Presumably this is the per-process symbol map consumed by Linux `perf` -- TODO confirm.
void jit_announce(uptr func, usz size, std::string_view name)
|
||||
{
|
||||
#ifdef __linux__
|
||||
// Function-local static: the map file is opened once, on the first call, and kept open.
static const fs::file s_map(fmt::format("/tmp/perf-%d.map", getpid()), fs::rewrite + fs::append);
|
||||
|
||||
// One entry per announced function.
s_map.write(fmt::format("%x %x %s\n", func, size, name));
|
||||
#endif
|
||||
}
|
||||
|
||||
static u8* get_jit_memory()
|
||||
{
|
||||
// Reserve 2G memory (magic static)
|
||||
|
@ -230,7 +239,7 @@ asmjit::Runtime& asmjit::get_global_runtime()
|
|||
return asmjit::kErrorNoCodeGenerated;
|
||||
}
|
||||
|
||||
void* p = m_pos.fetch_add(utils::align(codeSize, 4096));
|
||||
void* p = m_pos.fetch_add(utils::align(codeSize, 64));
|
||||
if (!p || m_pos > m_max) [[unlikely]]
|
||||
{
|
||||
*dst = nullptr;
|
||||
|
@ -245,7 +254,6 @@ asmjit::Runtime& asmjit::get_global_runtime()
|
|||
return asmjit::kErrorInvalidState;
|
||||
}
|
||||
|
||||
utils::memory_protect(p, utils::align(codeSize, 4096), utils::protection::rx);
|
||||
flush(p, relocSize);
|
||||
*dst = p;
|
||||
|
||||
|
@ -331,6 +339,9 @@ asmjit::inline_runtime::~inline_runtime()
|
|||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
|
||||
#include "llvm/ExecutionEngine/ObjectCache.h"
|
||||
#include "llvm/ExecutionEngine/JITEventListener.h"
|
||||
#include "llvm/Object/ObjectFile.h"
|
||||
#include "llvm/Object/SymbolSize.h"
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#else
|
||||
|
@ -386,7 +397,7 @@ static u64 make_null_function(const std::string& name)
|
|||
using namespace asmjit;
|
||||
|
||||
// Build a "null" function that contains its name
|
||||
const auto func = build_function_asm<void (*)()>([&](X86Assembler& c, auto& args)
|
||||
const auto func = build_function_asm<void (*)()>("NULL", [&](X86Assembler& c, auto& args)
|
||||
{
|
||||
Label data = c.newLabel();
|
||||
c.lea(args[0], x86::qword_ptr(data, 0));
|
||||
|
@ -406,6 +417,34 @@ static u64 make_null_function(const std::string& name)
|
|||
}
|
||||
}
|
||||
|
||||
// JIT event listener that reports every function symbol of a newly loaded
// object through jit_announce().
struct JITAnnouncer : llvm::JITEventListener
|
||||
{
|
||||
// Override of llvm::JITEventListener::notifyObjectLoaded.
// The first (u64) argument is unused; obj is the loaded object file and
// info is used to obtain its debug view.
void notifyObjectLoaded(u64, const llvm::object::ObjectFile& obj, const llvm::RuntimeDyld::LoadedObjectInfo& info) override
|
||||
{
|
||||
using namespace llvm;
|
||||
|
||||
// Symbols are read from the object returned by getObjectForDebug(), not from obj itself.
// Presumably that copy carries the final load addresses -- TODO confirm.
object::OwningBinary<object::ObjectFile> debug_obj_ = info.getObjectForDebug(obj);
|
||||
// NOTE(review): getBinary() is dereferenced without a null check -- confirm that
// getObjectForDebug() cannot return an empty OwningBinary here.
const object::ObjectFile& debug_obj = *debug_obj_.getBinary();
|
||||
|
||||
// Walk all (symbol, size) pairs computed for the debug object.
for (const auto& [sym, size] : computeSymbolSizes(debug_obj))
|
||||
{
|
||||
// Skip symbols whose type cannot be queried or that are not functions.
// NOTE(review): on failure the Expected<> values in this loop are dropped without
// consuming their error -- verify this is acceptable for LLVM builds that enforce
// checked errors.
Expected<object::SymbolRef::Type> type_ = sym.getType();
|
||||
if (!type_ || *type_ != object::SymbolRef::ST_Function)
|
||||
continue;
|
||||
|
||||
// Skip symbols without a retrievable name.
Expected<StringRef> name = sym.getName();
|
||||
if (!name)
|
||||
continue;
|
||||
|
||||
// Skip symbols without a retrievable address.
Expected<u64> addr = sym.getAddress();
|
||||
if (!addr)
|
||||
continue;
|
||||
|
||||
// Report address, size and name; the StringRef is passed as a (data, size) pair
// for the std::string_view parameter.
jit_announce(*addr, size, {name->data(), name->size()});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Simple memory manager
|
||||
struct MemoryManager1 : llvm::RTDyldMemoryManager
|
||||
{
|
||||
|
@ -429,7 +468,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
|
|||
|
||||
~MemoryManager1() override
|
||||
{
|
||||
utils::memory_release(ptr, c_max_size * 2);
|
||||
// Hack: don't release to prevent reuse of address space, see jit_announce
|
||||
utils::memory_decommit(ptr, c_max_size * 2);
|
||||
}
|
||||
|
||||
llvm::JITSymbol findSymbol(const std::string& name) override
|
||||
|
@ -812,6 +852,12 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
|
|||
}
|
||||
}
|
||||
|
||||
if (!_link.empty() || !(flags & 0x1))
|
||||
{
|
||||
m_engine->RegisterJITEventListener(llvm::JITEventListener::createIntelJITEventListener());
|
||||
m_engine->RegisterJITEventListener(new JITAnnouncer);
|
||||
}
|
||||
|
||||
if (!m_engine)
|
||||
{
|
||||
fmt::throw_exception("LLVM: Failed to create ExecutionEngine: %s", result);
|
||||
|
|
|
@ -34,6 +34,13 @@
|
|||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
void jit_announce(uptr func, usz size, std::string_view name);
|
||||
|
||||
// Convenience overload: accepts any pointer type, converts it to uptr and
// forwards to the jit_announce(uptr, usz, std::string_view) overload.
void jit_announce(auto* func, usz size, std::string_view name)
|
||||
{
|
||||
jit_announce(uptr(func), size, name);
|
||||
}
|
||||
|
||||
enum class jit_class
|
||||
{
|
||||
ppu_code,
|
||||
|
@ -161,7 +168,7 @@ namespace asmjit
|
|||
|
||||
// Build runtime function with asmjit::X86Assembler
|
||||
template <typename FT, typename F>
|
||||
inline FT build_function_asm(F&& builder)
|
||||
inline FT build_function_asm(std::string_view name, F&& builder)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -195,6 +202,7 @@ inline FT build_function_asm(F&& builder)
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
jit_announce(result, code.getCodeSize(), name);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -210,8 +218,8 @@ public:
|
|||
built_function& operator=(const built_function&) = delete;
|
||||
|
||||
template <typename F>
|
||||
built_function(F&& builder)
|
||||
: m_func(ensure(build_function_asm<FT>(std::forward<F>(builder))))
|
||||
built_function(std::string_view name, F&& builder)
|
||||
: m_func(ensure(build_function_asm<FT>(name, std::forward<F>(builder))))
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -238,7 +246,7 @@ public:
|
|||
built_function& operator=(const built_function&) = delete;
|
||||
|
||||
template <typename F>
|
||||
built_function(F&& builder)
|
||||
built_function(std::string_view name, F&& builder)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -270,6 +278,10 @@ public:
|
|||
{
|
||||
ensure(false);
|
||||
}
|
||||
else
|
||||
{
|
||||
jit_announce(result, code.getCodeSize(), name);
|
||||
}
|
||||
}
|
||||
|
||||
operator FT() const noexcept
|
||||
|
|
|
@ -2190,7 +2190,7 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
|
|||
|
||||
thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
|
||||
{
|
||||
return build_function_asm<native_entry>([&](asmjit::X86Assembler& c, auto& args)
|
||||
return build_function_asm<native_entry>("thread_base_trampoline", [&](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ImportGroup Label="PropertySheets" />
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
|
@ -61,6 +61,7 @@
|
|||
LLVMBitWriter.lib;
|
||||
LLVMCoroutines.lib;
|
||||
LLVMObjCARCOpts.lib;
|
||||
LLVMIntelJITEvents.lib;
|
||||
</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ImportGroup Label="PropertySheets" />
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
|
@ -62,6 +62,7 @@
|
|||
LLVMBitWriter.lib;
|
||||
LLVMCoroutines.lib;
|
||||
LLVMObjCARCOpts.lib;
|
||||
LLVMIntelJITEvents.lib;
|
||||
</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
|
|
2
llvm
2
llvm
|
@ -1 +1 @@
|
|||
Subproject commit 318b8fe3746615f914522d4e177c537ce80d1d08
|
||||
Subproject commit a670c459ea782411885b1e9861c89d04609d648f
|
|
@ -39,9 +39,9 @@
|
|||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros">
|
||||
<CmakeReleaseCLI>call vsdevcmd.bat -arch=amd64
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeReleaseCLI>
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeReleaseCLI>
|
||||
<CmakeDebugCLI>call vsdevcmd.bat -arch=amd64
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeDebugCLI>
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="cl.exe" -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeDebugCLI>
|
||||
<CmakeCleanCLI>echo Cleaning..
|
||||
for /F "delims= eol=|" %%f in ('
|
||||
dir /b ^| findstr /V "[^.]*\build[^.]*\.vcxproj"') do (
|
||||
|
|
|
@ -39,9 +39,9 @@
|
|||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros">
|
||||
<CmakeReleaseCLI>call vsdevcmd.bat -arch=amd64
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeReleaseCLI>
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Release" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Release" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeReleaseCLI>
|
||||
<CmakeDebugCLI>call vsdevcmd.bat -arch=amd64
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT ../llvm</CmakeDebugCLI>
|
||||
cmake -G Ninja -DCMAKE_CXX_COMPILER="clang-cl.exe" -DCMAKE_C_COMPILER="clang-cl.exe" -DCMAKE_BUILD_TYPE="Debug" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_DEFAULT_TARGET_TRIPLE:STRING=x86_64-pc-windows-msvc -DLLVM_HOST_TRIPLE:STRING=x86_64-pc-windows-msvc -DCMAKE_INSTALL_PREFIX="./Debug" -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_TOOLS=OFF -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_TOOLS=OFF -DLLVM_INCLUDE_UTILS=OFF -DCMAKE_SYSTEM_VERSION=6.1 -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON -DCMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION=$(WindowsTargetPlatformVersion) -DLLVM_USE_CRT_DEBUG=MTd -DLLVM_USE_CRT_RELEASE=MT -DLLVM_USE_INTEL_JITEVENTS=ON ../llvm</CmakeDebugCLI>
|
||||
<CmakeCleanCLI>echo Cleaning..
|
||||
for /F "delims= eol=|" %%f in ('
|
||||
dir /b ^| findstr /V "[^.]*\build[^.]*\.vcxproj"') do (
|
||||
|
|
|
@ -1910,14 +1910,14 @@ std::vector<ppu_function_t>& ppu_function_manager::access(bool ghc)
|
|||
|
||||
static std::vector<ppu_function_t> list_ghc
|
||||
{
|
||||
build_function_asm<ppu_function_t>([](asmjit::X86Assembler& c, auto& args)
|
||||
build_function_asm<ppu_function_t>("ppu_unregistered", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
c.mov(args[0], x86::rbp);
|
||||
c.jmp(imm_ptr(list[0]));
|
||||
}),
|
||||
build_function_asm<ppu_function_t>([](asmjit::X86Assembler& c, auto& args)
|
||||
build_function_asm<ppu_function_t>("ppu_return", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -1937,7 +1937,7 @@ u32 ppu_function_manager::add_function(ppu_function_t function)
|
|||
list.push_back(function);
|
||||
|
||||
// Generate trampoline
|
||||
list2.push_back(build_function_asm<ppu_function_t>([&](asmjit::X86Assembler& c, auto& args)
|
||||
list2.push_back(build_function_asm<ppu_function_t>("ppu_trampolinea", [&](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
|
|
@ -147,7 +147,7 @@ static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op);
|
|||
|
||||
extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
|
||||
|
||||
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
// Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape
|
||||
using namespace asmjit;
|
||||
|
@ -248,7 +248,7 @@ const auto ppu_gateway = built_function<void(*)(ppu_thread*)>([](asmjit::X86Asse
|
|||
c.ret();
|
||||
});
|
||||
|
||||
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_escape", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -256,12 +256,13 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>([](asmji
|
|||
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
|
||||
|
||||
// Return to the return location
|
||||
c.jmp(x86::qword_ptr(x86::rsp, -8));
|
||||
c.sub(x86::rsp, 8);
|
||||
c.ret();
|
||||
});
|
||||
|
||||
void ppu_recompiler_fallback(ppu_thread& ppu);
|
||||
|
||||
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>("ppu_trampolineb", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -1816,7 +1817,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
|
|||
return ppu_load_acquire_reservation<u64>(ppu, addr);
|
||||
}
|
||||
|
||||
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -1832,11 +1833,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
|
||||
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
|
||||
c.push(x86::rbp);
|
||||
c.push(x86::r13);
|
||||
c.push(x86::r12);
|
||||
c.push(x86::rbx);
|
||||
c.push(x86::r14);
|
||||
c.push(x86::r15);
|
||||
c.sub(x86::rsp, 40);
|
||||
#ifdef _WIN32
|
||||
if (!s_tsx_avx)
|
||||
|
@ -1847,7 +1844,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
build_swap_rdx_with(c, args, x86::r10);
|
||||
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
|
||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||
c.and_(x86::rbp, -128);
|
||||
|
@ -1855,11 +1852,9 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
c.movzx(args[0].r32(), args[0].r16());
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
c.and_(x86::rbx, -128 / 2);
|
||||
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
c.and_(x86::r11, -128 / 2);
|
||||
c.and_(args[0].r32(), 63);
|
||||
c.mov(x86::r13, args[1]);
|
||||
|
||||
// Prepare data
|
||||
if (s_tsx_avx)
|
||||
|
@ -1894,8 +1889,6 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
|
@ -1939,7 +1932,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
|
||||
|
||||
c.xend();
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 64);
|
||||
c.lock().add(x86::qword_ptr(x86::r11), 64);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
c.jmp(_ret);
|
||||
|
@ -1975,7 +1968,7 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
c.jmp(_ret);
|
||||
|
||||
c.bind(fail2);
|
||||
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
|
||||
c.lock().sub(x86::qword_ptr(x86::r11), 64);
|
||||
c.bind(load);
|
||||
|
||||
// Store previous data back to rdata
|
||||
|
@ -2019,12 +2012,17 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
|
|||
}
|
||||
|
||||
c.add(x86::rsp, 40);
|
||||
c.pop(x86::r15);
|
||||
c.pop(x86::r14);
|
||||
c.pop(x86::rbx);
|
||||
c.pop(x86::r12);
|
||||
c.pop(x86::r13);
|
||||
c.pop(x86::rbp);
|
||||
|
||||
#ifdef __linux__
|
||||
// Hack for perf profiling (TODO)
|
||||
Label ret2 = c.newLabel();
|
||||
c.lea(x86::rdx, x86::qword_ptr(ret2));
|
||||
c.push(x86::rdx);
|
||||
c.push(x86::rdx);
|
||||
c.bind(ret2);
|
||||
#endif
|
||||
c.ret();
|
||||
});
|
||||
|
||||
|
|
|
@ -904,6 +904,10 @@ spu_function_t spu_recompiler::compile(spu_program&& _func)
|
|||
|
||||
spu_log.fatal("Failed to build a function");
|
||||
}
|
||||
else
|
||||
{
|
||||
jit_announce(fn, code.getCodeSize(), fmt::format("spu-b-%s", fmt::base57(be_t<u64>(m_hash_start))));
|
||||
}
|
||||
|
||||
// Install compiled function pointer
|
||||
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
|
||||
|
|
|
@ -1733,7 +1733,7 @@ bool spu_interpreter::SHUFB(spu_thread& spu, spu_opcode_t op)
|
|||
return true;
|
||||
}
|
||||
|
||||
const spu_inter_func_t optimized_shufb = build_function_asm<spu_inter_func_t>([](asmjit::X86Assembler& c, auto& /*args*/)
|
||||
const spu_inter_func_t optimized_shufb = build_function_asm<spu_inter_func_t>("spu_shufb", [](asmjit::X86Assembler& c, auto& /*args*/)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
|
|
@ -160,7 +160,7 @@ DECLARE(spu_runtime::tr_all) = []
|
|||
return reinterpret_cast<spu_function_t>(trptr);
|
||||
}();
|
||||
|
||||
DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
|
||||
DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>("spu_gateway", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
|
||||
using namespace asmjit;
|
||||
|
@ -249,7 +249,7 @@ DECLARE(spu_runtime::g_gateway) = built_function<spu_function_t>([](asmjit::X86A
|
|||
c.ret();
|
||||
});
|
||||
|
||||
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>([](asmjit::X86Assembler& c, auto& args)
|
||||
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>("spu_escape", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -257,10 +257,11 @@ DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>([](asm
|
|||
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
|
||||
|
||||
// Return to the return location
|
||||
c.jmp(x86::qword_ptr(x86::rsp, -8));
|
||||
c.sub(x86::rsp, 8);
|
||||
c.ret();
|
||||
});
|
||||
|
||||
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>([](asmjit::X86Assembler& c, auto& args)
|
||||
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>("spu_tail_escape", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -268,14 +269,15 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
|
|||
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
|
||||
|
||||
// Adjust stack for initial call instruction in the gateway
|
||||
c.sub(x86::rsp, 8);
|
||||
c.sub(x86::rsp, 16);
|
||||
|
||||
// Tail call, GHC CC (second arg)
|
||||
c.mov(x86::r13, args[0]);
|
||||
c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
|
||||
c.mov(x86::r12, args[2]);
|
||||
c.xor_(x86::ebx, x86::ebx);
|
||||
c.jmp(args[1]);
|
||||
c.mov(x86::qword_ptr(x86::rsp), args[1]);
|
||||
c.ret();
|
||||
});
|
||||
|
||||
DECLARE(spu_runtime::g_interpreter_table) = {};
|
||||
|
@ -1066,6 +1068,8 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
|
|||
|
||||
workload.clear();
|
||||
result = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
|
||||
|
||||
jit_announce(wxptr, raw - wxptr, "spu_ubertrampoline");
|
||||
}
|
||||
|
||||
if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result))
|
||||
|
@ -3480,7 +3484,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
|||
#endif
|
||||
|
||||
// Get function chunk name
|
||||
const std::string name = fmt::format("spu-chunk-0x%05x", addr);
|
||||
const std::string name = fmt::format("spu-cx%05x-%s", addr, fmt::base57(be_t<u64>{m_hash_start}));
|
||||
llvm::Function* result = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(name, chunk_type).getCallee());
|
||||
|
||||
// Set parameters
|
||||
|
@ -3505,7 +3509,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
|
|||
// 5. $3
|
||||
const auto func_type = get_ftype<u32[4], u8*, u8*, u32, u32[4], u32[4]>();
|
||||
|
||||
const std::string fname = fmt::format("spu-function-0x%05x", addr);
|
||||
const std::string fname = fmt::format("spu-fx%05x-%s", addr, fmt::base57(be_t<u64>{m_hash_start}));
|
||||
llvm::Function* fn = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(fname, func_type).getCallee());
|
||||
|
||||
fn->setLinkage(llvm::GlobalValue::InternalLinkage);
|
||||
|
|
|
@ -405,7 +405,7 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
|
|||
return res;
|
||||
}
|
||||
|
||||
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>("spu_putllc_tx", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -420,12 +420,8 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
//}
|
||||
|
||||
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
|
||||
c.push(x86::rbp);
|
||||
c.push(x86::r13);
|
||||
c.push(x86::r12);
|
||||
c.push(x86::rbx);
|
||||
c.sub(x86::rsp, 168);
|
||||
#ifdef _WIN32
|
||||
c.sub(x86::rsp, 168);
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
|
||||
|
@ -447,16 +443,14 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
|
||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
build_swap_rdx_with(c, args, x86::r10);
|
||||
c.mov(args[1], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
|
||||
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
|
||||
c.prefetchw(x86::byte_ptr(args[1], 0));
|
||||
c.prefetchw(x86::byte_ptr(args[1], 64));
|
||||
c.and_(args[0].r32(), 0xff80);
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||
c.mov(x86::r13, args[1]);
|
||||
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
|
||||
// Prepare data
|
||||
if (s_tsx_avx)
|
||||
|
@ -504,8 +498,6 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
|
||||
c.jae(fall);
|
||||
});
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
|
||||
|
@ -514,10 +506,10 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(args[1], 0));
|
||||
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(args[1], 32));
|
||||
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(args[1], 64));
|
||||
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(args[1], 96));
|
||||
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
|
||||
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
|
||||
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
|
||||
|
@ -525,14 +517,14 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
}
|
||||
else
|
||||
{
|
||||
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||
c.xorps(x86::xmm0, x86::oword_ptr(args[1], 0));
|
||||
c.xorps(x86::xmm1, x86::oword_ptr(args[1], 16));
|
||||
c.xorps(x86::xmm2, x86::oword_ptr(args[1], 32));
|
||||
c.xorps(x86::xmm3, x86::oword_ptr(args[1], 48));
|
||||
c.xorps(x86::xmm4, x86::oword_ptr(args[1], 64));
|
||||
c.xorps(x86::xmm5, x86::oword_ptr(args[1], 80));
|
||||
c.xorps(x86::xmm6, x86::oword_ptr(args[1], 96));
|
||||
c.xorps(x86::xmm7, x86::oword_ptr(args[1], 112));
|
||||
c.orps(x86::xmm0, x86::xmm1);
|
||||
c.orps(x86::xmm2, x86::xmm3);
|
||||
c.orps(x86::xmm4, x86::xmm5);
|
||||
|
@ -547,25 +539,25 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
|
||||
c.vmovaps(x86::yword_ptr(args[1], 0), x86::ymm4);
|
||||
c.vmovaps(x86::yword_ptr(args[1], 32), x86::ymm5);
|
||||
c.vmovaps(x86::yword_ptr(args[1], 64), x86::ymm6);
|
||||
c.vmovaps(x86::yword_ptr(args[1], 96), x86::ymm7);
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
|
||||
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm8);
|
||||
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm9);
|
||||
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm10);
|
||||
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm11);
|
||||
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm12);
|
||||
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm13);
|
||||
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm14);
|
||||
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm15);
|
||||
}
|
||||
|
||||
c.xend();
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 64);
|
||||
c.lock().add(x86::qword_ptr(x86::r11), 64);
|
||||
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
|
||||
build_get_tsc(c);
|
||||
c.sub(x86::rax, stamp0);
|
||||
|
@ -577,21 +569,21 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
// Load previous data to store back to rdata
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
||||
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
||||
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
||||
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
||||
c.vmovaps(x86::ymm0, x86::yword_ptr(args[1], 0));
|
||||
c.vmovaps(x86::ymm1, x86::yword_ptr(args[1], 32));
|
||||
c.vmovaps(x86::ymm2, x86::yword_ptr(args[1], 64));
|
||||
c.vmovaps(x86::ymm3, x86::yword_ptr(args[1], 96));
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
||||
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
||||
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
||||
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
||||
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
||||
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
||||
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
||||
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
||||
c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
|
||||
c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
|
||||
c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
|
||||
c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
|
||||
c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
|
||||
c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
|
||||
c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
|
||||
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
|
||||
}
|
||||
|
||||
c.xend();
|
||||
|
@ -603,7 +595,7 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
c.jmp(_ret);
|
||||
|
||||
c.bind(fail2);
|
||||
c.lock().sub(x86::qword_ptr(x86::rbx), 64);
|
||||
c.lock().sub(x86::qword_ptr(x86::r11), 64);
|
||||
c.bind(load);
|
||||
|
||||
// Store previous data back to rdata
|
||||
|
@ -652,6 +644,7 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128));
|
||||
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
|
||||
}
|
||||
c.add(x86::rsp, 168);
|
||||
#endif
|
||||
|
||||
if (s_tsx_avx)
|
||||
|
@ -659,15 +652,18 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
|
|||
c.vzeroupper();
|
||||
}
|
||||
|
||||
c.add(x86::rsp, 168);
|
||||
c.pop(x86::rbx);
|
||||
c.pop(x86::r12);
|
||||
c.pop(x86::r13);
|
||||
c.pop(x86::rbp);
|
||||
#ifdef __linux__
|
||||
// Hack for perf profiling (TODO)
|
||||
Label ret2 = c.newLabel();
|
||||
c.lea(x86::rdx, x86::qword_ptr(ret2));
|
||||
c.push(x86::rdx);
|
||||
c.push(x86::rdx);
|
||||
c.bind(ret2);
|
||||
#endif
|
||||
c.ret();
|
||||
});
|
||||
|
||||
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>("spu_putlluc_tx", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -680,30 +676,20 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
|
|||
//}
|
||||
|
||||
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
|
||||
c.push(x86::rbp);
|
||||
c.push(x86::r13);
|
||||
c.push(x86::r12);
|
||||
c.push(x86::rbx);
|
||||
c.sub(x86::rsp, 40);
|
||||
#ifdef _WIN32
|
||||
c.sub(x86::rsp, 40);
|
||||
if (!s_tsx_avx)
|
||||
{
|
||||
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
|
||||
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
|
||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
c.and_(args[0].r32(), 0xff80);
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbx));
|
||||
c.mov(x86::r13, args[1]);
|
||||
build_swap_rdx_with(c, args, x86::r10);
|
||||
c.mov(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
|
||||
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
|
||||
c.prefetchw(x86::byte_ptr(x86::r11, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::r11, 64));
|
||||
|
||||
// Prepare data
|
||||
if (s_tsx_avx)
|
||||
|
@ -725,6 +711,10 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
|
|||
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
|
||||
}
|
||||
|
||||
c.and_(args[0].r32(), 0xff80);
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(args[1], x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
build_get_tsc(c, stamp0);
|
||||
|
@ -739,35 +729,29 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
|
|||
c.jae(fall);
|
||||
});
|
||||
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
|
||||
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
|
||||
|
||||
// // Check pause flag
|
||||
// c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
||||
// c.jc(fall);
|
||||
c.xbegin(tx1);
|
||||
|
||||
if (s_tsx_avx)
|
||||
{
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
|
||||
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
|
||||
c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0);
|
||||
c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
|
||||
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
|
||||
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
|
||||
}
|
||||
else
|
||||
{
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
|
||||
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 0), x86::xmm0);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 16), x86::xmm1);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 32), x86::xmm2);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 48), x86::xmm3);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 64), x86::xmm4);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 80), x86::xmm5);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 96), x86::xmm6);
|
||||
c.movaps(x86::oword_ptr(x86::r11, 112), x86::xmm7);
|
||||
}
|
||||
|
||||
c.xend();
|
||||
c.lock().add(x86::qword_ptr(x86::rbx), 32);
|
||||
c.lock().add(x86::qword_ptr(args[1]), 32);
|
||||
// stx++
|
||||
c.add(x86::qword_ptr(args[2]), 1);
|
||||
build_get_tsc(c);
|
||||
|
@ -786,6 +770,7 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
|
|||
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
|
||||
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
|
||||
}
|
||||
c.add(x86::rsp, 40);
|
||||
#endif
|
||||
|
||||
if (s_tsx_avx)
|
||||
|
@ -793,15 +778,18 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
|
|||
c.vzeroupper();
|
||||
}
|
||||
|
||||
c.add(x86::rsp, 40);
|
||||
c.pop(x86::rbx);
|
||||
c.pop(x86::r12);
|
||||
c.pop(x86::r13);
|
||||
c.pop(x86::rbp);
|
||||
#ifdef __linux__
|
||||
// Hack for perf profiling (TODO)
|
||||
Label ret2 = c.newLabel();
|
||||
c.lea(x86::rdx, x86::qword_ptr(ret2));
|
||||
c.push(x86::rdx);
|
||||
c.push(x86::rdx);
|
||||
c.bind(ret2);
|
||||
#endif
|
||||
c.ret();
|
||||
});
|
||||
|
||||
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
|
||||
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>("spu_getllar_tx", [](asmjit::X86Assembler& c, auto& args)
|
||||
{
|
||||
using namespace asmjit;
|
||||
|
||||
|
@ -815,8 +803,6 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
|
|||
|
||||
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
|
||||
c.push(x86::rbp);
|
||||
c.push(x86::r13);
|
||||
c.push(x86::r12);
|
||||
c.push(x86::rbx);
|
||||
c.sub(x86::rsp, 40);
|
||||
#ifdef _WIN32
|
||||
|
@ -828,13 +814,12 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
|
|||
#endif
|
||||
|
||||
// Prepare registers
|
||||
build_swap_rdx_with(c, args, x86::r12);
|
||||
build_swap_rdx_with(c, args, x86::r10);
|
||||
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
|
||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
||||
c.and_(args[0].r32(), 0xff80);
|
||||
c.shr(args[0].r32(), 1);
|
||||
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
c.mov(x86::r13, args[1]);
|
||||
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
|
||||
|
||||
// Alloc args[0] to stamp0
|
||||
const auto stamp0 = args[0];
|
||||
|
@ -853,7 +838,7 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
|
|||
// Check pause flag
|
||||
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
|
||||
c.jc(fall);
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
||||
c.mov(x86::rax, x86::qword_ptr(x86::r11));
|
||||
c.and_(x86::rax, -128);
|
||||
c.cmp(x86::rax, args[3]);
|
||||
c.jne(fall);
|
||||
|
@ -926,9 +911,16 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
|
|||
|
||||
c.add(x86::rsp, 40);
|
||||
c.pop(x86::rbx);
|
||||
c.pop(x86::r12);
|
||||
c.pop(x86::r13);
|
||||
c.pop(x86::rbp);
|
||||
|
||||
#ifdef __linux__
|
||||
// Hack for perf profiling (TODO)
|
||||
Label ret2 = c.newLabel();
|
||||
c.lea(x86::rdx, x86::qword_ptr(ret2));
|
||||
c.push(x86::rdx);
|
||||
c.push(x86::rdx);
|
||||
c.bind(ret2);
|
||||
#endif
|
||||
c.ret();
|
||||
});
|
||||
|
||||
|
|
|
@ -256,9 +256,9 @@ namespace
|
|||
}
|
||||
}
|
||||
|
||||
built_function<void(*)(void*, const void*, u32)> copy_data_swap_u32(&build_copy_data_swap_u32<false>);
|
||||
built_function<void(*)(void*, const void*, u32)> copy_data_swap_u32("copy_data_swap_u32", &build_copy_data_swap_u32<false>);
|
||||
|
||||
built_function<bool(*)(void*, const void*, u32)> copy_data_swap_u32_cmp(&build_copy_data_swap_u32<true>);
|
||||
built_function<bool(*)(void*, const void*, u32)> copy_data_swap_u32_cmp("copy_data_swap_u32_cmp", &build_copy_data_swap_u32<true>);
|
||||
|
||||
namespace
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue