Merge branch 'master' into spurs0optimize

Elad 2024-11-03 09:48:29 +02:00 committed by GitHub
commit d09bc6a0dc
98 changed files with 1756 additions and 730 deletions


@@ -12,8 +12,8 @@ git submodule -q update --init $(awk '/path/ && !/llvm/ { print $3 }' .gitmodule
mkdir build && cd build || exit 1
export CC=clang
export CXX=clang++
export CC="${CLANG_BINARY}"
export CXX="${CLANGXX_BINARY}"
cmake .. \
-DCMAKE_INSTALL_PREFIX=/usr \
@@ -27,8 +27,8 @@ cmake .. \
-DUSE_SYSTEM_FFMPEG=OFF \
-DUSE_DISCORD_RPC=ON \
-DOpenGL_GL_PREFERENCE=LEGACY \
-DLLVM_DIR=/opt/llvm/lib/cmake/llvm \
-DSTATIC_LINK_LLVM=ON \
-DBUILD_LLVM=OFF \
-G Ninja
ninja; build_status=$?;


@@ -42,6 +42,7 @@ cmake .. \
-DCMAKE_RANLIB="$RANLIB" \
-DUSE_SYSTEM_CURL=ON \
-DUSE_SDL=ON \
-DUSE_SYSTEM_SDL=ON \
-DUSE_SYSTEM_FFMPEG=OFF \
-DUSE_DISCORD_RPC=ON \
-DOpenGL_GL_PREFERENCE=LEGACY \
@@ -61,5 +62,5 @@ shellcheck .ci/*.sh
} && SHOULD_DEPLOY="true" || SHOULD_DEPLOY="false"
if [ "$build_status" -eq 0 ] && [ "$SHOULD_DEPLOY" = "true" ]; then
.ci/deploy-linux-legacy.sh "x86_64"
.ci/deploy-linux.sh "x86_64"
fi


@@ -3,18 +3,18 @@
# shellcheck disable=SC2086
brew_arm64_install_packages() {
for pkg in "$@"; do
echo "Fetching bottle for $pkg..."
bottle_path="$("$BREW_ARM64_PATH/bin/brew" --cache --bottle-tag=arm64_sonoma "$pkg")"
echo "Fetching bottle for $pkg (arm64)..."
bottle_path="$("$BREW_ARM64_PATH/bin/brew" --cache --bottle-tag=arm64_ventura "$pkg")"
if [ ! -f "$bottle_path" ]; then
if ! "$BREW_ARM64_PATH/bin/brew" fetch --force --bottle-tag=arm64_sonoma "$pkg"; then
if ! "$BREW_ARM64_PATH/bin/brew" fetch --force --verbose --debug --bottle-tag=arm64_ventura "$pkg"; then
echo "Failed to fetch bottle for $pkg"
return 1
fi
bottle_path="$("$BREW_ARM64_PATH/bin/brew" --cache --bottle-tag=arm64_sonoma "$pkg")"
bottle_path="$("$BREW_ARM64_PATH/bin/brew" --cache --bottle-tag=arm64_ventura "$pkg")"
fi
echo "Installing $pkg..."
"$BREW_ARM64_PATH/bin/brew" install --ignore-dependencies "$bottle_path" || true
echo "Installing $pkg (arm64)..."
"$BREW_ARM64_PATH/bin/brew" install --force --force-bottle --ignore-dependencies "$bottle_path" || true
done
}
@@ -23,9 +23,14 @@ export HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1
export HOMEBREW_NO_INSTALL_CLEANUP=1
/usr/local/bin/brew update
sudo rm -rf /usr/local/Cellar/curl /usr/local/opt/curl
/usr/local/bin/brew install -f --overwrite curl
/usr/local/bin/brew uninstall -f --ignore-dependencies ffmpeg
/usr/local/bin/brew install -f --build-from-source ffmpeg@5 || true
/usr/local/bin/brew install -f --overwrite python || true
/usr/local/bin/brew link --overwrite python || true
/usr/local/bin/brew install -f --overwrite nasm ninja p7zip ccache pipenv #create-dmg
/usr/local/bin/brew link -f curl || true
/usr/local/bin/brew install llvm@$LLVM_COMPILER_VER glew cmake sdl2 vulkan-headers coreutils
/usr/local/bin/brew link -f llvm@$LLVM_COMPILER_VER ffmpeg@5 || true
@@ -34,12 +39,17 @@ sudo mkdir -p "$BREW_ARM64_PATH"
sudo chmod 777 "$BREW_ARM64_PATH"
curl -L https://github.com/Homebrew/brew/tarball/master | tar xz --strip 1 -C "$BREW_ARM64_PATH"
"$BREW_ARM64_PATH/bin/brew" update
brew_arm64_install_packages 0mq aom aribb24 ca-certificates cjson curl dav1d ffmpeg@5 fontconfig freetype freetype2 gettext glew gmp gnutls lame libbluray libidn2 libnettle libogg libpng librist libsodium libsoxr libtasn libtasn1 libunistring libvmaf libvorbis libvpx libx11 libxau libxcb libxdmcp llvm@$LLVM_COMPILER_VER mbedtls molten-vk nettle opencore-amr openjpeg openssl opus p11-kit pkg-config pkgconfig pzstd rav1e sdl2 snappy speex srt svt-av1 theora vulkan-headers webp x264 x265 xz z3 zeromq zmq zstd
#"$BREW_ARM64_PATH/bin/brew" update
# libvorbis requires Homebrew-installed curl, but the arm64 curl can't run on the x64 host, and we also need the aarch64 libs, so we swap in the x64 binary
brew_arm64_install_packages curl
mv /opt/homebrew1/opt/curl/bin/curl /opt/homebrew1/opt/curl/bin/curl.bak
ln -s /usr/local/opt/curl/bin/curl /opt/homebrew1/opt/curl/bin/curl
brew_arm64_install_packages 0mq aom aribb24 ca-certificates cjson dav1d ffmpeg@5 fontconfig freetype freetype2 gettext glew gmp gnutls lame libbluray libidn2 libnettle libogg libpng librist libsodium libsoxr libtasn libtasn1 libunistring libvmaf libvorbis libvpx libx11 libxau libxcb libxdmcp llvm@$LLVM_COMPILER_VER mbedtls molten-vk nettle opencore-amr openjpeg openssl opus p11-kit pkg-config pkgconfig pzstd rav1e sdl2 snappy speex srt svt-av1 theora vulkan-headers webp x264 x265 xz z3 zeromq zmq zstd
"$BREW_ARM64_PATH/bin/brew" link -f ffmpeg@5
# moltenvk based on commit for 1.2.10 release
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/0d9f25fbd1658e975d00bd0e8cccd20a0c2cb74b/Formula/m/molten-vk.rb
# moltenvk based on commit for 1.2.11 release
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/6bfc8950c696d1f952425e8af2a6248603dc0df9/Formula/m/molten-vk.rb
/usr/local/bin/brew install -f --overwrite ./molten-vk.rb
export CXX=clang++
export CC=clang
@@ -125,6 +135,7 @@ export MACOSX_DEPLOYMENT_TARGET=13.0
-DLLVM_TARGET_ARCH=arm64 \
-DCMAKE_OSX_ARCHITECTURES=arm64 \
-DCMAKE_IGNORE_PATH="$BREW_X64_PATH/lib" \
-DCMAKE_IGNORE_PREFIX_PATH=/usr/local/opt \
-DCMAKE_SYSTEM_PROCESSOR=arm64 \
-DCMAKE_TOOLCHAIN_FILE=buildfiles/cmake/TCDarwinARM64.cmake \
-DCMAKE_CXX_FLAGS="-D__MAC_OS_X_VERSION_MIN_REQUIRED=130000" \


@@ -9,15 +9,15 @@ brew install -f --overwrite nasm ninja p7zip ccache pipenv #create-dmg
#/usr/sbin/softwareupdate --install-rosetta --agree-to-license
arch -x86_64 /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
arch -x86_64 /usr/local/bin/brew update
arch -x86_64 /usr/local/bin/brew install -f --overwrite python@3.12 || arch -x86_64 /usr/local/bin/brew link --overwrite python@3.12
arch -x86_64 /usr/local/bin/brew install -f --overwrite python || arch -x86_64 /usr/local/bin/brew link --overwrite python
arch -x86_64 /usr/local/bin/brew uninstall -f --ignore-dependencies ffmpeg
arch -x86_64 /usr/local/bin/brew install -f --build-from-source ffmpeg@5
arch -x86_64 /usr/local/bin/brew reinstall -f --build-from-source gnutls freetype
arch -x86_64 /usr/local/bin/brew install llvm@$LLVM_COMPILER_VER glew cmake sdl2 vulkan-headers coreutils
arch -x86_64 /usr/local/bin/brew link -f llvm@$LLVM_COMPILER_VER ffmpeg@5
# moltenvk based on commit for 1.2.10 release
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/0d9f25fbd1658e975d00bd0e8cccd20a0c2cb74b/Formula/m/molten-vk.rb
# moltenvk based on commit for 1.2.11 release
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/6bfc8950c696d1f952425e8af2a6248603dc0df9/Formula/m/molten-vk.rb
arch -x86_64 /usr/local/bin/brew install -f --overwrite ./molten-vk.rb
export CXX=clang++
export CC=clang


@@ -1,48 +0,0 @@
#!/bin/sh -ex
cd build || exit 1
if [ "$DEPLOY_APPIMAGE" = "true" ]; then
DESTDIR=AppDir ninja install
curl -fsSLo /usr/bin/linuxdeploy https://github.com/linuxdeploy/linuxdeploy/releases/download/continuous/linuxdeploy-x86_64.AppImage
chmod +x /usr/bin/linuxdeploy
curl -fsSLo /usr/bin/linuxdeploy-plugin-qt https://github.com/linuxdeploy/linuxdeploy-plugin-qt/releases/download/continuous/linuxdeploy-plugin-qt-x86_64.AppImage
chmod +x /usr/bin/linuxdeploy-plugin-qt
curl -fsSLo linuxdeploy-plugin-checkrt.sh https://github.com/linuxdeploy/linuxdeploy-plugin-checkrt/releases/download/continuous/linuxdeploy-plugin-checkrt-x86_64.sh
chmod +x ./linuxdeploy-plugin-checkrt.sh
export EXTRA_PLATFORM_PLUGINS="libqwayland-egl.so;libqwayland-generic.so"
export EXTRA_QT_PLUGINS="svg;wayland-decoration-client;wayland-graphics-integration-client;wayland-shell-integration"
APPIMAGE_EXTRACT_AND_RUN=1 linuxdeploy --appdir AppDir --plugin qt
# Remove libwayland-client because it has platform-dependent exports and breaks other OSes
rm -f ./AppDir/usr/lib/libwayland-client.so*
# Remove libvulkan because it causes issues with gamescope
rm -f ./AppDir/usr/lib/libvulkan.so*
# Remove git directory containing local commit history file
rm -rf ./AppDir/usr/share/rpcs3/git
./linuxdeploy-plugin-checkrt.sh --appdir AppDir
linuxdeploy --appimage-extract
./squashfs-root/plugins/linuxdeploy-plugin-appimage/usr/bin/appimagetool AppDir -g
COMM_TAG=$(awk '/version{.*}/ { printf("%d.%d.%d", $5, $6, $7) }' ../rpcs3/rpcs3_version.cpp)
COMM_COUNT="$(git rev-list --count HEAD)"
COMM_HASH="$(git rev-parse --short=8 HEAD)"
RPCS3_APPIMAGE="rpcs3-v${COMM_TAG}-${COMM_COUNT}-${COMM_HASH}_linux64.AppImage"
mv ./RPCS3*.AppImage "$RPCS3_APPIMAGE"
# If we're building using a CI, let's copy over the AppImage artifact
if [ -n "$BUILD_ARTIFACTSTAGINGDIRECTORY" ]; then
cp "$RPCS3_APPIMAGE" "$ARTDIR"
fi
FILESIZE=$(stat -c %s ./rpcs3*.AppImage)
SHA256SUM=$(sha256sum ./rpcs3*.AppImage | awk '{ print $1 }')
echo "${SHA256SUM};${FILESIZE}B" > "$RELEASE_MESSAGE"
fi


@@ -17,7 +17,7 @@ if [ "$DEPLOY_APPIMAGE" = "true" ]; then
export EXTRA_PLATFORM_PLUGINS="libqwayland-egl.so;libqwayland-generic.so"
export EXTRA_QT_PLUGINS="svg;wayland-decoration-client;wayland-graphics-integration-client;wayland-shell-integration;waylandcompositor"
APPIMAGE_EXTRACT_AND_RUN=1 linuxdeploy --appdir AppDir --plugin qt
APPIMAGE_EXTRACT_AND_RUN=1 linuxdeploy --appdir AppDir --plugin qt --plugin checkrt
# Remove libwayland-client because it has platform-dependent exports and breaks other OSes
rm -f ./AppDir/usr/lib/libwayland-client.so*
@@ -28,8 +28,6 @@ if [ "$DEPLOY_APPIMAGE" = "true" ]; then
# Remove git directory containing local commit history file
rm -rf ./AppDir/usr/share/rpcs3/git
./linuxdeploy-plugin-checkrt.sh --appdir AppDir
linuxdeploy --appimage-extract
./squashfs-root/plugins/linuxdeploy-plugin-appimage/usr/bin/appimagetool AppDir -g


@@ -132,7 +132,7 @@ linux_aarch64_task:
matrix:
- name: Cirrus Linux AArch64 Clang
arm_container:
image: 'docker.io/kd117/rpcs3-ci-aarch64:latest'
image: 'docker.io/rpcs3/rpcs3-ci-focal-aarch64:1.0'
cpu: 8
memory: 8G
clang_script:


@@ -4,7 +4,7 @@ include(ExternalProject)
ExternalProject_Add(moltenvk
GIT_REPOSITORY https://github.com/KhronosGroup/MoltenVK.git
GIT_TAG edbdcf0
GIT_TAG 81541f6
BUILD_IN_SOURCE 1
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK
CONFIGURE_COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/fetchDependencies" --macos


@@ -5,8 +5,8 @@ if(USE_SYSTEM_OPENAL)
target_include_directories(3rdparty_openal INTERFACE ${OPENAL_INCLUDE_DIR})
target_link_libraries(3rdparty_openal INTERFACE ${OPENAL_LIBRARY})
else()
option(ALSOFT_UTILS "Build utility programs" OFF)
option(ALSOFT_EXAMPLES "Build example programs" OFF)
option(ALSOFT_UTILS "Build utility programs" OFF)
option(ALSOFT_EXAMPLES "Build example programs" OFF)
add_subdirectory(openal-soft EXCLUDE_FROM_ALL)
add_library(3rdparty_openal INTERFACE)
target_link_libraries(3rdparty_openal INTERFACE OpenAL::OpenAL)


@@ -17,7 +17,10 @@ if(WITH_LLVM)
option(LLVM_CCACHE_BUILD OFF)
set(LLVM_ENABLE_WARNINGS OFF CACHE BOOL "Enable compiler warnings.")
if(WIN32 AND COMPILER_X86)
# For Windows x86 (not Windows AArch64).
# The check on MSVC is needed because COMPILER_X86, COMPILER_ARM etc. are not set/supported when the MSVC compiler is used.
# Furthermore, the MSVC compiler is not available/supported on Windows AArch64
if(WIN32 AND (COMPILER_X86 OR MSVC))
set(LLVM_USE_INTEL_JITEVENTS ON)
endif()
@@ -70,12 +73,18 @@ if(WITH_LLVM)
set(LLVM_TARGETS_TO_BUILD "X86" CACHE STRING "Semicolon-separated list of targets to build, or \"all\".")
endif()
endif()
if((WIN32 AND BUILD_LLVM) OR (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND COMPILER_X86))
# For Windows x86 (not Windows AArch64) only when BUILD_LLVM is enabled and
# for Linux x86 (not Linux AArch64) even if BUILD_LLVM is disabled (precompiled llvm used)
if(LLVM_USE_INTEL_JITEVENTS OR (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND COMPILER_X86))
list (APPEND LLVM_ADDITIONAL_LIBS IntelJITEvents)
endif()
# For Linux even if BUILD_LLVM is disabled (precompiled llvm used)
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
list (APPEND LLVM_ADDITIONAL_LIBS PerfJITEvents)
endif()
llvm_map_components_to_libnames(LLVM_LIBS
${LLVM_TARGETS_TO_BUILD}
${LLVM_ADDITIONAL_LIBS}


@@ -7,25 +7,40 @@ Other instructions may be found [here](https://wiki.rpcs3.net/index.php?title=Bu
### Windows 10 or later
* [CMake 3.28.0+](https://www.cmake.org/download/) (add to PATH)
* [Python 3.6+](https://www.python.org/downloads/) (add to PATH)
* [Qt 6.7.3](https://www.qt.io/download-qt-installer)
* [Visual Studio 2022](https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community) (or at least Visual Studio 2019 16.11.xx+ as C++20 is not included in previous versions)
* [Vulkan SDK 1.3.268.0](https://vulkan.lunarg.com/sdk/home) (See "Install the SDK" [here](https://vulkan.lunarg.com/doc/sdk/latest/windows/getting_started.html)) for now future SDKs don't work. You need precisely 1.3.268.0.
The following tools are required to build RPCS3 on Windows 10 or later:
- [Visual Studio 2022](https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community) (or at least Visual Studio 2019 16.11.xx+ as C++20 is not included in previous versions)
- **Optional** - [CMake 3.28.0+](https://www.cmake.org/download/) (add to PATH)
**Either add the** `QTDIR` **environment variable, e.g.** `<QtInstallFolder>\6.7.3\msvc2019_64\` **, or use the [Visual Studio Qt Plugin](https://marketplace.visualstudio.com/items?itemName=TheQtCompany.QtVisualStudioTools2019)**
**NOTES:**
- **Visual Studio 2022** integrates **CMake 3.29+** and supports both the `sln` solution (`.sln`, `.vcxproj`) and the `CMake` solution (`CMakeLists.txt`, `CMakePresets.json`).
See sections [Building with Visual Studio sln solution](#building-with-visual-studio-sln-solution) and [Building with Visual Studio CMake solution](#building-with-visual-studio-cmake-solution)
on how to build the project with **Visual Studio**.
- Install and use the standalone **CMake** tool only if you prefer it. See section [Building with standalone CMake tool](#building-with-standalone-cmake-tool) on how to build the project
with the standalone **CMake** tool.
**NOTE: If you have issues with the Qt plugin, you may want to uninstall the Qt Plugin and install the [Legacy Qt Plugin](https://marketplace.visualstudio.com/items?itemName=TheQtCompany.LEGACYQtVisualStudioTools2019) instead.**
- [Python 3.6+](https://www.python.org/downloads/) (add to PATH)
- [Qt 6.7.3](https://www.qt.io/download-qt-installer). In case you can't download from the official installer, you can use [Another Qt installer](https://github.com/miurahr/aqtinstall); in that case you will need to manually add the "qtmultimedia" module when installing Qt (see the sketch after this list)
- [Vulkan SDK 1.3.268.0](https://vulkan.lunarg.com/sdk/home) (see "Install the SDK" [here](https://vulkan.lunarg.com/doc/sdk/latest/windows/getting_started.html)); for now, newer SDKs don't work, you need precisely 1.3.268.0.
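For reference, a minimal aqtinstall invocation might look like this (illustrative only; the version and compiler arch here are assumptions, match them to your setup):

```
pip install aqtinstall
aqt install-qt windows desktop 6.7.3 win64_msvc2019_64 -m qtmultimedia
```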
The `sln` solution, available only in **Visual Studio**, is the preferred building solution. It easily allows building the **RPCS3** application in `Release` and `Debug` mode.
In order to build **RPCS3** with the `sln` solution (with **Visual Studio**), **Qt** libs need to be detected. To detect the libs:
- add and set the `QTDIR` environment variable, e.g. `<QtInstallFolder>\6.7.3\msvc2019_64\`
- or use the [Visual Studio Qt Plugin](https://marketplace.visualstudio.com/items?itemName=TheQtCompany.QtVisualStudioTools2019)
**NOTE:** If you have issues with the **Visual Studio Qt Plugin**, you may want to uninstall it and install the [Legacy Qt Plugin](https://marketplace.visualstudio.com/items?itemName=TheQtCompany.LEGACYQtVisualStudioTools2019) instead.
In order to build **RPCS3** with the `CMake` solution (with both **Visual Studio** and the standalone **CMake** tool):
- add and set the `CMAKE_PREFIX_PATH` environment variable to the **Qt** libs path, e.g. `<QtInstallFolder>\6.7.3\msvc2019_64\`
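For example, a sketch of handing the Qt path to CMake at configure time instead of via the environment (the `msvc` preset comes from `CMakePresets.json`; the install folder is an assumption):

```
cmake --preset msvc -DCMAKE_PREFIX_PATH=C:\Qt\6.7.3\msvc2019_64
```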
### Linux
These are the essentials tools to build RPCS3 on Linux. Some of them can be installed through your favorite package manager.
* Clang 17+ or GCC 13+
* [CMake 3.28.0+](https://www.cmake.org/download/)
* [Qt 6.7.3](https://www.qt.io/download-qt-installer)
* [Vulkan SDK 1.3.268.0](https://vulkan.lunarg.com/sdk/home) (See "Install the SDK" [here](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html)) for now future SDKs don't work. You need precisely 1.3.268.0.
* [SDL2](https://github.com/libsdl-org/SDL/releases) (for the FAudio backend)
These are the essential tools to build RPCS3 on Linux. Some of them can be installed through your favorite package manager:
- Clang 17+ or GCC 13+
- [CMake 3.28.0+](https://www.cmake.org/download/)
- [Qt 6.7.3](https://www.qt.io/download-qt-installer)
- [Vulkan SDK 1.3.268.0](https://vulkan.lunarg.com/sdk/home) (See "Install the SDK" [here](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html)); for now, newer SDKs don't work, you need precisely 1.3.268.0.
- [SDL2](https://github.com/libsdl-org/SDL/releases) (for the FAudio backend)
**If you have an NVIDIA GPU, you may need to install the libglvnd package.**
@@ -38,6 +53,7 @@ These are the essentials tools to build RPCS3 on Linux. Some of them can be inst
sudo apt-get install build-essential libasound2-dev libpulse-dev libopenal-dev libglew-dev zlib1g-dev libedit-dev libvulkan-dev libudev-dev git libevdev-dev libsdl2-2.0 libsdl2-dev libjack-dev libsndio-dev
Ubuntu is usually horrendously out of date, and some packages need to be downloaded by hand. This part covers Qt, GCC, Vulkan, and CMake.
##### Qt PPA
Ubuntu usually does not have a new enough Qt package to suit rpcs3's needs. There is currently no PPA available to work around this.
@@ -65,6 +81,7 @@ sudo apt install vulkan-sdk
```
##### CMake
```
. /etc/os-release
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add -
@@ -95,26 +112,64 @@ git submodule update --init
### Windows
#### Configuring the Qt plugin (if used)
#### Building with Visual Studio sln solution
1) Go to `Extensions->Qt VS Tools->Qt Versions`.
2) Add the path to your Qt installation with compiler e.g. `<QtInstallFolder>\6.7.3\msvc2019_64`, version will fill in automatically.
3) Go to `Extensions->Qt VS Tools->Options->Legacy Project Format`. (Only available in the legacy Qt plugin)
4) Set `Build: Run pre-build setup` to `true`. (Only available in the legacy Qt plugin)
Start **Visual Studio**, click on `Open a project or solution` and select the `rpcs3.sln` file inside the RPCS3 root folder
#### Building the projects
##### Configuring the Qt Plugin (if used)
Open `rpcs3.sln`. The recommended build configuration is `Release`. (On older revisions: `Release - LLVM`)
1) go to `Extensions->Qt VS Tools->Qt Versions`
2) add the path to your Qt installation with compiler e.g. `<QtInstallFolder>\6.7.3\msvc2019_64`, version will fill in automatically
3) go to `Extensions->Qt VS Tools->Options->Legacy Project Format`. (Only available in the **Legacy Qt Plugin**)
4) set `Build: Run pre-build setup` to `true`. (Only available in the **Legacy Qt Plugin**)
You may want to download the precompiled [LLVM libs](https://github.com/RPCS3/llvm-mirror/releases/download/custom-build-win-16.0.1/llvmlibs_mt.7z) and extract them to `3rdparty\llvm\`, as well as download and extract the [additional libs](https://github.com/RPCS3/glslang/releases/latest/download/glslanglibs_mt.7z) to `lib\%CONFIGURATION%-x64\` to speed up compilation time (unoptimised/debug libs are currently not available precompiled).
##### Building the projects
If you're not using the precompiled libs, build the following projects in *__BUILD_BEFORE* folder by right-clicking on a project > *Build*.:
* glslang
* **Either** llvm_build **or** llvm_build_clang_cl
**NOTE:** The recommended build configuration is `Release`. (On older revisions: `Release - LLVM`)
Afterwards:
You may want to download the precompiled [LLVM libs](https://github.com/RPCS3/llvm-mirror/releases/download/custom-build-win-16.0.1/llvmlibs_mt.7z) and extract them to `3rdparty\llvm\`,
as well as download and extract the [additional libs](https://github.com/RPCS3/glslang/releases/latest/download/glslanglibs_mt.7z) to `lib\%CONFIGURATION%-x64\` to speed up compilation
time (unoptimised/debug libs are currently not available precompiled).
`Build > Build Solution`
If you're not using the precompiled libs, those dependency libs need to be compiled first. From the `Solution Explorer` panel:
1) expand `__BUILD_BEFORE`
2) from the `Solution Configurations` drop-down menu, select `Release` (select `Debug` if you want to build in `Debug` mode)
3) one after another, right-click on the following projects and then click on `Build` to build the selected lib:
- `glslang`
- either `llvm_build`
- or `llvm_build_clang_cl` (if you installed **clang** on VS)
In order to build the **RPCS3** application:
1) from the `Solution Configurations` drop-down menu, select `Release` (select `Debug` if you want to build in `Debug` mode)
**NOTE:** In case you previously compiled the dependency libs under `__BUILD_BEFORE`, you also have to select the same build configuration (e.g. `Release`, if you compiled the dependency libs in `Release` mode)
2) click on `Build` menu and then on `Build Solution`
3) once the build is completed, the **RPCS3** application will be available under the `<rpcs3_root>\bin` folder
#### Building with Visual Studio CMake solution
Start **Visual Studio**, click on `Open a local folder` and select the RPCS3 root folder
Once the project is open on VS, from the `Solution Explorer` panel:
1) right-click on `rpcs3` and then click on `Switch to CMake Targets View`
2) from the `Configuration` drop-down menu, select `msvc-release` (select `msvc-debug` if you want to build in `Debug` mode)
3) right-click on `CMakeLists.txt Project` and then click on `Configure Cache`
4) once the cache is created, the `rpcs3 project` will be available
5) right-click on `rpcs3 Project` and then click on `Build All`, or click on `Build` menu and then on `Build All`
6) once the build is completed, the **RPCS3** application will be available under the `<rpcs3_root>\build-msvc\bin` folder
#### Building with standalone CMake tool
If you prefer to install and use the standalone **CMake** tool:
1) move to the RPCS3 root folder
2) execute the following commands to create the cache and to build the application (for the build, use `--preset msvc-debug` if you want to build in `Debug` mode), respectively:
```
cmake --preset msvc
cmake --build --preset msvc-release
```
3) once the build is completed, the **RPCS3** application will be available under the `<rpcs3_root>\build-msvc\bin` folder
### Linux
@@ -122,7 +177,7 @@ While still in the project root:
1) `cd .. && mkdir --parents rpcs3_build && cd rpcs3_build`
2) `cmake ../rpcs3/ && make` or `CXX=g++-13 CC=gcc-13 cmake ../rpcs3/ && make` to force these compilers
3) Run RPCS3 with `./bin/rpcs3`
3) run RPCS3 with `./bin/rpcs3`
If compiling for ARM, pass the flag `-DUSE_NATIVE_INSTRUCTIONS=OFF` to the cmake command. This resolves some Neon errors when compiling our SIMD headers.
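For example, a minimal sketch of that configure step (same build directory layout as above):

```
cmake ../rpcs3/ -DUSE_NATIVE_INSTRUCTIONS=OFF && make
```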


@@ -25,7 +25,7 @@ option(USE_SYSTEM_ZLIB "Prefer system ZLIB instead of the builtin one" ON)
option(USE_VULKAN "Vulkan render backend" ON)
option(USE_PRECOMPILED_HEADERS "Use precompiled headers" OFF)
option(USE_SDL "Enables SDL input handler" OFF)
option(USE_SYSTEM_SDL "Prefer system SDL instead of the builtin one" OFF)
option(USE_SYSTEM_SDL "Prefer system SDL instead of the builtin one" ON)
option(USE_SYSTEM_FFMPEG "Prefer system ffmpeg instead of the prebuilt one" OFF)
option(USE_SYSTEM_OPENAL "Prefer system OpenAL instead of the prebuilt one" ON)
option(USE_SYSTEM_CURL "Prefer system Curl instead of the prebuilt one" ON)


@@ -7,12 +7,12 @@
"binaryDir": "build-gcc",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"USE_NATIVE_INSTRUCTIONS": "ON",
"USE_PRECOMPILED_HEADERS": "ON",
"USE_FAUDIO": "OFF",
"USE_SYSTEM_CURL": "OFF",
"USE_SYSTEM_ZLIB": "OFF",
"USE_SYSTEM_LIBPNG": "OFF",
"USE_NATIVE_INSTRUCTIONS": "ON",
"USE_PRECOMPILED_HEADERS": "ON",
"BUILD_LLVM": "OFF",
"STATIC_LINK_LLVM": "ON"
}
@@ -23,13 +23,13 @@
"binaryDir": "build-clang64",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"USE_NATIVE_INSTRUCTIONS": "ON",
"USE_PRECOMPILED_HEADERS": "ON",
"USE_FAUDIO": "OFF",
"USE_SYSTEM_CURL": "OFF",
"USE_SYSTEM_ZLIB": "OFF",
"USE_SYSTEM_LIBPNG": "OFF",
"LLVM_ENABLE_LIBCXX": "ON",
"USE_NATIVE_INSTRUCTIONS": "ON",
"USE_PRECOMPILED_HEADERS": "ON",
"BUILD_LLVM": "OFF",
"STATIC_LINK_LLVM": "ON"
},
@@ -48,11 +48,14 @@
"strategy": "external"
},
"cacheVariables": {
"CMAKE_CONFIGURATION_TYPES": "Debug;Release",
"CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
"USE_FAUDIO": "OFF",
"USE_PRECOMPILED_HEADERS": "ON",
"USE_SYSTEM_ZLIB": "OFF",
"USE_NATIVE_INSTRUCTIONS": "ON",
"USE_PRECOMPILED_HEADERS": "ON",
"USE_FAUDIO": "OFF",
"USE_SYSTEM_CURL": "OFF",
"USE_SYSTEM_ZLIB": "OFF",
"USE_SYSTEM_OPENAL": "OFF",
"BUILD_LLVM": "ON",
"STATIC_LINK_LLVM": "ON"
},
@@ -64,5 +67,17 @@
}
}
}
],
"buildPresets": [
{
"name": "msvc-debug",
"configurePreset": "msvc",
"configuration": "Debug"
},
{
"name": "msvc-release",
"configurePreset": "msvc",
"configuration": "Release"
}
]
}


@@ -2412,6 +2412,13 @@ fs::file fs::make_gather(std::vector<fs::file> files)
return result;
}
std::string fs::generate_neighboring_path(std::string_view source, [[maybe_unused]] u64 seed)
{
// Seed is currently not used
return fmt::format(u8"%s/%s.%s.tmp", get_parent_dir(source), source.substr(source.find_last_of(fs::delim) + 1), fmt::base57(utils::get_unique_tsc()));
}
bool fs::pending_file::open(std::string_view path)
{
file.close();
@@ -2430,7 +2437,7 @@ bool fs::pending_file::open(std::string_view path)
do
{
m_path = fmt::format(u8"%s/%s.%s.tmp", get_parent_dir(path), path.substr(path.find_last_of(fs::delim) + 1), fmt::base57(utils::get_unique_tsc()));
m_path = fs::generate_neighboring_path(path, 0);
if (file.open(m_path, fs::create + fs::write + fs::read + fs::excl))
{
@@ -2475,7 +2482,6 @@ bool fs::pending_file::commit(bool overwrite)
{
file.sync();
}
#endif
#ifdef _WIN32
@@ -2486,16 +2492,130 @@ bool fs::pending_file::commit(bool overwrite)
disp.DeleteFileW = false;
ensure(SetFileInformationByHandle(file.get_handle(), FileDispositionInfo, &disp, sizeof(disp)));
}
std::vector<std::wstring> hardlink_paths;
const auto ws1 = to_wchar(m_path);
const HANDLE file_handle = !overwrite ? INVALID_HANDLE_VALUE
: CreateFileW(ws1.get(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, nullptr);
while (file_handle != INVALID_HANDLE_VALUE)
{
// Get file ID (used to check for hardlinks)
BY_HANDLE_FILE_INFORMATION file_info;
if (!GetFileInformationByHandle(file_handle, &file_info) || file_info.nNumberOfLinks == 1)
{
CloseHandle(file_handle);
break;
}
// Buffer for holding link name
std::wstring link_name_buffer(MAX_PATH, wchar_t{});
DWORD buffer_size{};
HANDLE find_handle = INVALID_HANDLE_VALUE;
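// Enumerate the file's link names, growing the buffer and retrying while FindFirstFileNameW reports ERROR_MORE_DATA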
while (true)
{
buffer_size = static_cast<DWORD>(link_name_buffer.size() - 1);
find_handle = FindFirstFileNameW(ws1.get(), 0, &buffer_size, link_name_buffer.data());
if (find_handle != INVALID_HANDLE_VALUE || GetLastError() != ERROR_MORE_DATA)
{
break;
}
link_name_buffer.resize(buffer_size + 1);
}
if (find_handle != INVALID_HANDLE_VALUE)
{
const std::wstring_view ws1_sv = ws1.get();
while (true)
{
if (link_name_buffer.c_str() != ws1_sv)
{
// Note: link_name_buffer is a buffer which may contain zeroes so truncate it
hardlink_paths.push_back(link_name_buffer.c_str());
}
buffer_size = static_cast<DWORD>(link_name_buffer.size() - 1);
if (!FindNextFileNameW(find_handle, &buffer_size, link_name_buffer.data()))
{
if (GetLastError() != ERROR_MORE_DATA)
{
break;
}
link_name_buffer.resize(buffer_size + 1);
}
}
}
// Clean up
FindClose(find_handle);
CloseHandle(file_handle);
break;
}
if (!hardlink_paths.empty())
{
// REPLACEFILE_WRITE_THROUGH is not supported
file.sync();
}
#endif
file.close();
#ifdef _WIN32
const auto ws1 = to_wchar(m_path);
const auto ws2 = to_wchar(m_dest);
const auto wdest = to_wchar(m_dest);
if (MoveFileExW(ws1.get(), ws2.get(), overwrite ? MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH : MOVEFILE_WRITE_THROUGH))
bool ok = false;
if (hardlink_paths.empty())
{
ok = MoveFileExW(ws1.get(), wdest.get(), overwrite ? MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH : MOVEFILE_WRITE_THROUGH);
}
else
{
ok = ReplaceFileW(ws1.get(), wdest.get(), nullptr, 0, nullptr, nullptr);
}
if (ok)
{
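// Re-point each remaining hardlink name at the freshly renamed destination through a temporary link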
for (const std::wstring& link_name : hardlink_paths)
{
std::unique_ptr<wchar_t[]> write_temp_path;
do
{
write_temp_path = to_wchar(fs::generate_neighboring_path(m_dest, 0));
// Generate a temporary hard link
if (CreateHardLinkW(write_temp_path.get(), wdest.get(), nullptr))
{
if (MoveFileExW(write_temp_path.get(), link_name.data(), MOVEFILE_REPLACE_EXISTING))
{
// Success
write_temp_path.reset();
break;
}
break;
}
}
while (fs::g_tls_error == fs::error::exist); // Only retry if failed due to existing file
if (write_temp_path)
{
// Failure
g_tls_error = to_error(GetLastError());
return false;
}
}
// Disable the destructor
m_path.clear();
return true;
@@ -2557,6 +2677,17 @@ void fmt_class_string<fs::seek_mode>::format(std::string& out, u64 arg)
template<>
void fmt_class_string<fs::error>::format(std::string& out, u64 arg)
{
if (arg == static_cast<u64>(fs::error::unknown))
{
// Note: may not be the correct error code because it only prints the last error that occurred
#ifdef _WIN32
fmt::append(out, "Unknown error [errno=%d]", GetLastError());
#else
fmt::append(out, "Unknown error [errno=%d]", errno);
#endif
return;
}
format_enum(out, arg, [](auto arg)
{
switch (arg)


@@ -601,6 +601,8 @@ namespace fs
// Temporary directory
const std::string& get_temp_dir();
std::string generate_neighboring_path(std::string_view source, u64 seed);
// Unique pending file creation destined to be renamed to the destination file
struct pending_file
{


@@ -38,13 +38,13 @@ jobs:
displayName: ccache
- bash: |
docker pull --quiet rpcs3/rpcs3-ci-focal:1.7.1
docker pull --quiet rpcs3/rpcs3-ci-focal:1.9
docker run \
-v $(pwd):/rpcs3 \
--env-file .ci/docker.env \
-v $CCACHE_DIR:/root/.ccache \
-v $BUILD_ARTIFACTSTAGINGDIRECTORY:/root/artifacts \
rpcs3/rpcs3-ci-focal:1.7.1 \
rpcs3/rpcs3-ci-focal:1.9 \
/rpcs3/.ci/build-linux.sh
displayName: Docker setup and build


@@ -132,29 +132,29 @@ if(APPLE)
endif()
qt_finalize_target(rpcs3)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/rpcs3.icns $<TARGET_FILE_DIR:rpcs3>/../Resources/rpcs3.icns
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/Icons $<TARGET_FILE_DIR:rpcs3>/../Resources/Icons
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/../Resources/GuiConfigs
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/../Resources/git
COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app" "${QT_DEPLOY_FLAGS}")
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/rpcs3.icns $<TARGET_FILE_DIR:rpcs3>/../Resources/rpcs3.icns
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/Icons $<TARGET_FILE_DIR:rpcs3>/../Resources/Icons
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/../Resources/GuiConfigs
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/../Resources/git
COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app" "${QT_DEPLOY_FLAGS}")
elseif(UNIX)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/Icons $<TARGET_FILE_DIR:rpcs3>/Icons)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/GuiConfigs)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/git)
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/Icons $<TARGET_FILE_DIR:rpcs3>/Icons
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/GuiConfigs
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/git)
elseif(WIN32)
if(MSVC)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/3rdparty/OpenAL/openal-soft/$<CONFIG>/OpenAL32.dll $<TARGET_FILE_DIR:rpcs3>)
endif()
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND "${CMAKE_COMMAND}" -E copy_directory "${CMAKE_SOURCE_DIR}/bin" "$<TARGET_FILE_DIR:rpcs3>"
COMMAND "${WINDEPLOYQT_EXECUTABLE}" --no-compiler-runtime --no-opengl-sw --no-patchqt --no-translations --no-quick --no-system-d3d-compiler --no-quick-import --plugindir "$<IF:$<CXX_COMPILER_ID:MSVC>,$<TARGET_FILE_DIR:rpcs3>/plugins,$<TARGET_FILE_DIR:rpcs3>/share/qt6/plugins>" --verbose 0 "$<TARGET_FILE:rpcs3>")
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/Icons $<TARGET_FILE_DIR:rpcs3>/Icons
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/GuiConfigs
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/git
COMMAND "${WINDEPLOYQT_EXECUTABLE}" --no-compiler-runtime --no-opengl-sw --no-patchqt
--no-translations --no-quick --no-system-d3d-compiler --no-quick-import
--plugindir "$<IF:$<CXX_COMPILER_ID:MSVC>,$<TARGET_FILE_DIR:rpcs3>/plugins,$<TARGET_FILE_DIR:rpcs3>/share/qt6/plugins>"
--verbose 0 "$<TARGET_FILE:rpcs3>")
endif()
# Unix installation


@@ -478,6 +478,7 @@ target_sources(rpcs3_emu PRIVATE
RSX/gcm_printing.cpp
RSX/GL/GLCommonDecompiler.cpp
RSX/GL/GLCompute.cpp
RSX/GL/GLDMA.cpp
RSX/GL/GLDraw.cpp
RSX/GL/GLFragmentProgram.cpp
RSX/GL/GLGSRender.cpp
@@ -503,6 +504,7 @@ target_sources(rpcs3_emu PRIVATE
RSX/GL/OpenGL.cpp
RSX/GL/upscalers/fsr1/fsr_pass.cpp
RSX/GSRender.cpp
RSX/Host/RSXDMAWriter.cpp
RSX/Null/NullGSRender.cpp
RSX/NV47/FW/draw_call.cpp
RSX/NV47/FW/reg_context.cpp


@@ -472,7 +472,7 @@ error_code cellSyncQueuePush(ppu_thread& ppu, vm::ptr<CellSyncQueue> queue, vm::
u32 position;
while (!queue->ctrl.atomic_op([&](auto& ctrl)
while (!queue->ctrl.atomic_op([&](CellSyncQueue::ctrl_t& ctrl)
{
return CellSyncQueue::try_push_begin(ctrl, depth, &position);
}))
@@ -509,7 +509,7 @@ error_code cellSyncQueueTryPush(vm::ptr<CellSyncQueue> queue, vm::cptr<void> buf
u32 position;
while (!queue->ctrl.atomic_op([&](auto& ctrl)
while (!queue->ctrl.atomic_op([&](CellSyncQueue::ctrl_t& ctrl)
{
return CellSyncQueue::try_push_begin(ctrl, depth, &position);
}))
@@ -543,7 +543,7 @@ error_code cellSyncQueuePop(ppu_thread& ppu, vm::ptr<CellSyncQueue> queue, vm::p
u32 position;
while (!queue->ctrl.atomic_op([&](auto& ctrl)
while (!queue->ctrl.atomic_op([&](CellSyncQueue::ctrl_t& ctrl)
{
return CellSyncQueue::try_pop_begin(ctrl, depth, &position);
}))
@@ -580,7 +580,7 @@ error_code cellSyncQueueTryPop(vm::ptr<CellSyncQueue> queue, vm::ptr<void> buffe
u32 position;
while (!queue->ctrl.atomic_op([&](auto& ctrl)
while (!queue->ctrl.atomic_op([&](CellSyncQueue::ctrl_t& ctrl)
{
return CellSyncQueue::try_pop_begin(ctrl, depth, &position);
}))
@@ -614,7 +614,7 @@ error_code cellSyncQueuePeek(ppu_thread& ppu, vm::ptr<CellSyncQueue> queue, vm::
u32 position;
while (!queue->ctrl.atomic_op([&](auto& ctrl)
while (!queue->ctrl.atomic_op([&](CellSyncQueue::ctrl_t& ctrl)
{
return CellSyncQueue::try_peek_begin(ctrl, depth, &position);
}))
@@ -651,7 +651,7 @@ error_code cellSyncQueueTryPeek(vm::ptr<CellSyncQueue> queue, vm::ptr<void> buff
u32 position;
while (!queue->ctrl.atomic_op([&](auto& ctrl)
while (!queue->ctrl.atomic_op([&](CellSyncQueue::ctrl_t& ctrl)
{
return CellSyncQueue::try_peek_begin(ctrl, depth, &position);
}))


@@ -7,20 +7,30 @@
inline void try_start(spu_thread& spu)
{
reader_lock lock(spu.run_ctrl_mtx);
bool notify = false;
if (spu.status_npc.fetch_op([](spu_thread::status_npc_sync_var& value)
if (~spu.status_npc.load().status & SPU_STATUS_RUNNING)
{
if (value.status & SPU_STATUS_RUNNING)
reader_lock lock(spu.run_ctrl_mtx);
if (spu.status_npc.fetch_op([](spu_thread::status_npc_sync_var& value)
{
return false;
}
if (value.status & SPU_STATUS_RUNNING)
{
return false;
}
value.status = SPU_STATUS_RUNNING | (value.status & SPU_STATUS_IS_ISOLATED);
return true;
}).second)
value.status = SPU_STATUS_RUNNING | (value.status & SPU_STATUS_IS_ISOLATED);
return true;
}).second)
{
spu.state -= cpu_flag::stop;
notify = true;
}
}
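// Notify after run_ctrl_mtx has been released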
if (notify)
{
spu.state -= cpu_flag::stop;
spu.state.notify_one();
}
};


@@ -543,9 +543,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
template <typename T = u8>
llvm::Value* _ptr(llvm::Value* base, llvm::Value* offset)
{
const auto off = m_ir->CreateGEP(get_type<u8>(), base, offset);
const auto ptr = m_ir->CreateBitCast(off, get_type<T*>());
return ptr;
return m_ir->CreateGEP(get_type<u8>(), base, offset);
}
template <typename T, typename... Args>


@@ -1678,7 +1678,6 @@ void spu_thread::cpu_init()
spurs_average_task_duration = 0;
spurs_waited = false;
spurs_entered_wait = false;
spurs_read_events = false;
int_ctrl[0].clear();
int_ctrl[1].clear();
@@ -2699,7 +2698,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
bool ok = false;
std::tie(old, ok) = bits->fetch_op([&](auto& v)
std::tie(old, ok) = bits->fetch_op([&](u128& v)
{
if (v & wmask)
{
@@ -2797,7 +2796,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
res += 127;
// Release bits and notify
bits->atomic_op([&](auto& v)
bits->atomic_op([&](u128& v)
{
v &= ~wmask;
});
@@ -4807,7 +4806,7 @@ bool spu_thread::process_mfc_cmd()
getllar_spin_count = 0;
getllar_busy_waiting_switch = umax;
u64 ntime;
u64 ntime = 0;
rsx::reservation_lock rsx_lock(addr, 128);
for (u64 i = 0; i != umax; [&]()
@@ -4913,21 +4912,14 @@ bool spu_thread::process_mfc_cmd()
// Avoid logging useless commands if there is no reservation
const bool dump = g_cfg.core.mfc_debug && raddr;
const bool is_spurs_task_wait = pc == 0x11e4;
const bool is_spurs_task_wait = pc == 0x11e4 && spurs_addr != 0u - 0x80u;
do
if (!is_spurs_task_wait || spurs_addr != raddr || spurs_waited)
{
//
}
else if ((_ref<u8>(0x100 + 0x73) & (1u << index)) == 0 && (static_cast<u8>(rdata[0x73]) & (1u << index)) != 0)
{
if (!is_spurs_task_wait)
{
break;
}
if (spurs_addr != raddr || g_cfg.core.max_spurs_threads == g_cfg.core.max_spurs_threads.def || spurs_waited || spurs_read_events)
{
spurs_read_events = false;
break;
}
// Wait for other threads to complete their tasks (temporarily)
u32 max_run = group->max_run;
@@ -4973,14 +4965,25 @@ bool spu_thread::process_mfc_cmd()
spurs_waited = true;
spurs_entered_wait = true;
// Wait the duration of 4 tasks
const u64 spurs_wait_time = std::clamp<u64>(spurs_average_task_duration / spurs_task_count_to_calculate * 4, 3000, 100'000);
// Wait the duration of one and a half tasks
const u64 spurs_wait_time = std::clamp<u64>(spurs_average_task_duration / spurs_task_count_to_calculate * 3 / 2, 10'000, 100'000);
spurs_wait_duration_last = spurs_wait_time;
if (spurs_last_task_timestamp)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
spurs_last_task_timestamp = 0;
}
while (true)
{
if (is_stopped())
if (is_stopped() || current - before >= spurs_wait_time)
{
// Timed-out
group->spurs_running++;
break;
}
@@ -5008,20 +5011,12 @@ bool spu_thread::process_mfc_cmd()
}
current = get_system_time();
if (current - before >= spurs_wait_time)
{
// Timed-out
group->spurs_running++;
break;
}
}
state += cpu_flag::temp;
static_cast<void>(test_stopped());
}
}
while (false);
if (do_putllc(ch_mfc_cmd))
{
@@ -5029,19 +5024,50 @@ bool spu_thread::process_mfc_cmd()
if (is_spurs_task_wait)
{
const u64 current = get_system_time();
const bool is_idle = (_ref<u8>(0x100 + 0x73) & (1u << index)) != 0;
const bool was_idle = (static_cast<u8>(rdata[0x73]) & (1u << index)) != 0;
if (spurs_last_task_timestamp)
if (!was_idle && is_idle)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= spurs_waited && !is_stopped() ? spurs_wait_duration_last + avg_entry : avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
}
const u32 prev_running = group->spurs_running.fetch_op([](u32& x)
{
if (x)
{
x--;
return true;
}
spurs_last_task_timestamp = current;
spurs_read_events = false;
spurs_waited = false;
spurs_entered_wait = false;
return false;
}).first;
if (prev_running)
{
spurs_entered_wait = true;
}
if (prev_running == group->max_run && prev_running < group->max_num)
{
group->spurs_running.notify_one();
}
}
else if (was_idle && !is_idle)
{
// Cleanup
const u64 current = get_system_time();
if (spurs_last_task_timestamp)
{
const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
spurs_average_task_duration -= avg_entry;
spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
spurs_last_task_timestamp = 0;
}
spurs_last_task_timestamp = current;
spurs_waited = false;
spurs_entered_wait = false;
}
}
}
else
@@ -5560,6 +5586,8 @@ s64 spu_thread::get_ch_value(u32 ch)
thread_ctrl::wait_on(state, old);
}
fmt::throw_exception("Unreachable"); // Fix unannotated fallthrough warning
}
case MFC_RdTagStat:
@@ -5642,53 +5670,11 @@ s64 spu_thread::get_ch_value(u32 ch)
auto events = get_events(mask1, false, true);
const bool is_spurs_task_wait = pc == 0x11a8 && spurs_addr == raddr;
if (events.count)
{
if (is_spurs_task_wait)
{
spurs_read_events = true;
}
return events.events & mask1;
}
if (is_spurs_task_wait)
{
spurs_read_events = true;
if (g_cfg.core.max_spurs_threads != g_cfg.core.max_spurs_threads.def && !spurs_entered_wait)
{
const u32 prev_running = group->spurs_running.fetch_op([](u32& x)
{
if (x)
{
x--;
return true;
}
return false;
}).first;
if (prev_running)
{
spurs_entered_wait = true;
}
if (prev_running == group->max_run && prev_running < group->max_num)
{
group->spurs_running.notify_one();
if (group->spurs_running == prev_running - 1)
{
// Try to let another thread slip in and take over execution
thread_ctrl::wait_for(300);
}
}
}
}
spu_function_logger logger(*this, "MFC Events read");
lv2_obj::prepare_for_sleep(*this);


@@ -188,10 +188,10 @@ struct spu_channel_op_state
struct alignas(16) spu_channel
{
// Low 32 bits contain value
atomic_t<u64> data;
atomic_t<u64> data{};
// Pending value to be inserted when it is possible in pop() or pop_wait()
atomic_t<u64> jostling_value;
atomic_t<u64> jostling_value{};
public:
static constexpr u32 off_wait = 32;
@@ -667,11 +667,11 @@ public:
u8* reserv_base_addr = vm::g_reservations;
// General-Purpose Registers
std::array<v128, 128> gpr;
SPU_FPSCR fpscr;
std::array<v128, 128> gpr{};
SPU_FPSCR fpscr{};
// MFC command data
spu_mfc_cmd ch_mfc_cmd;
spu_mfc_cmd ch_mfc_cmd{};
// MFC command queue
spu_mfc_cmd mfc_queue[16]{};
@@ -683,9 +683,9 @@ public:
u64 mfc_last_timestamp = 0;
// MFC proxy command data
spu_mfc_cmd mfc_prxy_cmd;
spu_mfc_cmd mfc_prxy_cmd{};
shared_mutex mfc_prxy_mtx;
atomic_t<u32> mfc_prxy_mask;
atomic_t<u32> mfc_prxy_mask = 0;
// Tracks writes to MFC proxy command data
union
@@ -707,11 +707,11 @@ public:
// Range Lock pointer
atomic_t<u64, 64>* range_lock{};
u32 srr0;
u32 ch_tag_upd;
u32 ch_tag_mask;
u32 srr0 = 0;
u32 ch_tag_upd = 0;
u32 ch_tag_mask = 0;
spu_channel ch_tag_stat;
u32 ch_stall_mask;
u32 ch_stall_mask = 0;
spu_channel ch_stall_stat;
spu_channel ch_atomic_stat;
@@ -736,14 +736,14 @@ public:
};
atomic_t<ch_events_t> ch_events;
bool interrupts_enabled;
bool interrupts_enabled = false;
u64 ch_dec_start_timestamp; // timestamp of writing decrementer value
u32 ch_dec_value; // written decrementer value
u64 ch_dec_start_timestamp = 0; // timestamp of writing decrementer value
u32 ch_dec_value = 0; // written decrementer value
bool is_dec_frozen = false;
std::pair<u32, u32> read_dec() const; // Read decrementer
atomic_t<u32> run_ctrl; // SPU Run Control register (only provided to get latest data written)
atomic_t<u32> run_ctrl = 0; // SPU Run Control register (only provided to get latest data written)
shared_mutex run_ctrl_mtx;
struct alignas(8) status_npc_sync_var
@@ -752,10 +752,10 @@ public:
u32 npc; // SPU Next Program Counter register
};
atomic_t<status_npc_sync_var> status_npc;
std::array<spu_int_ctrl_t, 3> int_ctrl; // SPU Class 0, 1, 2 Interrupt Management
atomic_t<status_npc_sync_var> status_npc{};
std::array<spu_int_ctrl_t, 3> int_ctrl{}; // SPU Class 0, 1, 2 Interrupt Management
std::array<std::pair<u32, std::shared_ptr<lv2_event_queue>>, 32> spuq; // Event Queue Keys for SPU Thread
std::array<std::pair<u32, std::shared_ptr<lv2_event_queue>>, 32> spuq{}; // Event Queue Keys for SPU Thread
std::shared_ptr<lv2_event_queue> spup[64]; // SPU Ports
spu_channel exit_status{}; // Threaded SPU exit status (not a channel, but the interface fits)
atomic_t<u32> last_exit_status; // Value to be written in exit_status after checking group termination
@@ -769,7 +769,6 @@ public:
u32 spurs_addr = 0;
bool spurs_waited = false;
bool spurs_entered_wait = false;
bool spurs_read_events = false;
u64 spurs_wait_duration_last = 0;
u64 spurs_average_task_duration = 0;
u64 spurs_last_task_timestamp = 0;


@@ -1852,7 +1852,7 @@ void lv2_obj::schedule_all(u64 current_time)
target->start_time = 0;
if ((target->state.fetch_op(FN(x += cpu_flag::signal, x -= cpu_flag::suspend, x-= remove_yield, void())) & (cpu_flag::wait + cpu_flag::signal)) != cpu_flag::wait)
if ((target->state.fetch_op(AOFN(x += cpu_flag::signal, x -= cpu_flag::suspend, x-= remove_yield, void())) & (cpu_flag::wait + cpu_flag::signal)) != cpu_flag::wait)
{
continue;
}


@@ -119,8 +119,13 @@ std::shared_ptr<lv2_event_queue> lv2_event_queue::find(u64 ipc_key)
extern void resume_spu_thread_group_from_waiting(spu_thread& spu);
CellError lv2_event_queue::send(lv2_event event)
CellError lv2_event_queue::send(lv2_event event, bool* notified_thread, lv2_event_port* port)
{
if (notified_thread)
{
*notified_thread = false;
}
std::lock_guard lock(mutex);
if (!exists)
@@ -162,6 +167,15 @@ CellError lv2_event_queue::send(lv2_event event)
std::tie(ppu.gpr[4], ppu.gpr[5], ppu.gpr[6], ppu.gpr[7]) = event;
awake(&ppu);
if (port && ppu.prio.load().prio < ensure(cpu_thread::get_current<ppu_thread>())->prio.load().prio)
{
// Block event port disconnection for the duration of event sending
// PPU -> lower prio PPU is the only case that can cause thread blocking
port->is_busy++;
ensure(notified_thread);
*notified_thread = true;
}
}
else
{
@@ -709,7 +723,10 @@ error_code sys_event_port_disconnect(ppu_thread& ppu, u32 eport_id)
return CELL_ENOTCONN;
}
// TODO: return CELL_EBUSY if necessary (can't detect the condition)
if (port->is_busy)
{
return CELL_EBUSY;
}
port->queue.reset();
@@ -718,20 +735,32 @@ error_code sys_event_port_disconnect(ppu_thread& ppu, u32 eport_id)
error_code sys_event_port_send(u32 eport_id, u64 data1, u64 data2, u64 data3)
{
if (auto cpu = get_current_cpu_thread())
const auto cpu = cpu_thread::get_current();
const auto ppu = cpu ? cpu->try_get<ppu_thread>() : nullptr;
if (cpu)
{
cpu->state += cpu_flag::wait;
}
sys_event.trace("sys_event_port_send(eport_id=0x%x, data1=0x%llx, data2=0x%llx, data3=0x%llx)", eport_id, data1, data2, data3);
bool notified_thread = false;
const auto port = idm::check<lv2_obj, lv2_event_port>(eport_id, [&, notify = lv2_obj::notify_all_t()](lv2_event_port& port) -> CellError
{
if (ppu && ppu->loaded_from_savestate)
{
port.is_busy++;
notified_thread = true;
return {};
}
if (lv2_obj::check(port.queue))
{
const u64 source = port.name ? port.name : (u64{process_getpid() + 0u} << 32) | u64{eport_id};
return port.queue->send(source, data1, data2, data3);
return port.queue->send(source, data1, data2, data3, &notified_thread, ppu && port.queue->type == SYS_PPU_QUEUE ? &port : nullptr);
}
return CELL_ENOTCONN;
@@ -742,6 +771,19 @@ error_code sys_event_port_send(u32 eport_id, u64 data1, u64 data2, u64 data3)
return CELL_ESRCH;
}
if (ppu && notified_thread)
{
// Wait to be requeued
if (ppu->test_stopped())
{
// Wait again on savestate load
ppu->state += cpu_flag::again;
}
port->is_busy--;
return CELL_OK;
}
if (port.ret)
{
if (port.ret == CELL_EAGAIN)


@@ -79,6 +79,8 @@ struct sys_event_t
// Source, data1, data2, data3
using lv2_event = std::tuple<u64, u64, u64, u64>;
struct lv2_event_port;
struct lv2_event_queue final : public lv2_obj
{
static const u32 id_base = 0x8d000000;
@@ -103,11 +105,11 @@ struct lv2_event_queue final : public lv2_obj
static void save_ptr(utils::serial&, lv2_event_queue*);
static std::shared_ptr<lv2_event_queue> load_ptr(utils::serial& ar, std::shared_ptr<lv2_event_queue>& queue, std::string_view msg = {});
CellError send(lv2_event event);
CellError send(lv2_event event, bool* notified_thread = nullptr, lv2_event_port* port = nullptr);
CellError send(u64 source, u64 d1, u64 d2, u64 d3)
CellError send(u64 source, u64 d1, u64 d2, u64 d3, bool* notified_thread = nullptr, lv2_event_port* port = nullptr)
{
return send(std::make_tuple(source, d1, d2, d3));
return send(std::make_tuple(source, d1, d2, d3), notified_thread, port);
}
// Get event queue by its global key
@@ -121,6 +123,7 @@ struct lv2_event_port final : lv2_obj
const s32 type; // Port type, either IPC or local
const u64 name; // Event source (generated from id and process id if not set)
atomic_t<usz> is_busy = 0; // Counts threads waiting on event sending
std::shared_ptr<lv2_event_queue> queue; // Event queue this port is connected to
lv2_event_port(s32 type, u64 name)


@@ -357,16 +357,14 @@ error_code sys_event_flag_set(cpu_thread& cpu, u32 id, u64 bitptn)
}
}
// Process all waiters in single atomic op
const u32 count = flag->pattern.atomic_op([&](u64& value)
{
value |= bitptn;
u32 count = 0;
u32 count = 0;
if (!flag->sq)
{
return count;
}
// Process all waiters in single atomic op
for (u64 pattern = flag->pattern, to_write = pattern, dependant_mask = 0;; to_write = pattern, dependant_mask = 0)
{
count = 0;
to_write |= bitptn;
dependant_mask = 0;
for (auto ppu = +flag->sq; ppu; ppu = ppu->next_cpu)
{
@@ -405,10 +403,20 @@ error_code sys_event_flag_set(cpu_thread& cpu, u32 id, u64 bitptn)
const u64 pattern = ppu.gpr[4];
const u64 mode = ppu.gpr[5];
if (lv2_event_flag::check_pattern(value, pattern, mode, &ppu.gpr[6]))
// If it's OR mode, the set bits must have woken up the thread, therefore there is no
// dependency on the old value
const u64 dependant_mask_or = ((mode & 0xf) == SYS_EVENT_FLAG_WAIT_OR || (bitptn & pattern & to_write) == pattern ? 0 : pattern);
if (lv2_event_flag::check_pattern(to_write, pattern, mode, &ppu.gpr[6]))
{
dependant_mask |= dependant_mask_or;
ppu.gpr[3] = CELL_OK;
count++;
if (!to_write)
{
break;
}
}
else
{
@@ -416,8 +424,29 @@ error_code sys_event_flag_set(cpu_thread& cpu, u32 id, u64 bitptn)
}
}
return count;
});
dependant_mask &= ~bitptn;
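// Commit the new pattern only if no dependent bits changed concurrently; otherwise retry with the observed value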
auto [new_val, ok] = flag->pattern.fetch_op([&](u64& x)
{
if ((x ^ pattern) & dependant_mask)
{
return false;
}
x |= bitptn;
// Clear the bit-wise difference
x &= ~((pattern | bitptn) & ~to_write);
return true;
});
if (ok)
{
break;
}
pattern = new_val;
}
if (!count)
{


@@ -142,7 +142,7 @@ error_code _sys_lwmutex_lock(ppu_thread& ppu, u32 lwmutex_id, u64 timeout)
const auto mutex = idm::get<lv2_obj, lv2_lwmutex>(lwmutex_id, [&, notify = lv2_obj::notify_all_t()](lv2_lwmutex& mutex)
{
if (s32 signal = mutex.lv2_control.fetch_op([](auto& data)
if (s32 signal = mutex.lv2_control.fetch_op([](lv2_lwmutex::control_data_t& data)
{
if (data.signaled)
{
@@ -297,7 +297,7 @@ error_code _sys_lwmutex_trylock(ppu_thread& ppu, u32 lwmutex_id)
const auto mutex = idm::check<lv2_obj, lv2_lwmutex>(lwmutex_id, [&](lv2_lwmutex& mutex)
{
auto [_, ok] = mutex.lv2_control.fetch_op([](auto& data)
auto [_, ok] = mutex.lv2_control.fetch_op([](lv2_lwmutex::control_data_t& data)
{
if (data.signaled & 1)
{


@@ -93,7 +93,7 @@ std::shared_ptr<vm::block_t> reserve_map(u32 alloc_size, u32 align)
// Todo: fix order of error checks
error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32> alloc_addr)
error_code sys_memory_allocate(cpu_thread& cpu, u64 size, u64 flags, vm::ptr<u32> alloc_addr)
{
cpu.state += cpu_flag::wait;
@@ -129,9 +129,9 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32
return {CELL_ENOMEM, dct.size - dct.used};
}
if (const auto area = reserve_map(size, align))
if (const auto area = reserve_map(static_cast<u32>(size), align))
{
if (const u32 addr = area->alloc(size, nullptr, align))
if (const u32 addr = area->alloc(static_cast<u32>(size), nullptr, align))
{
ensure(!g_fxo->get<sys_memory_address_table>().addrs[addr >> 16].exchange(&dct));
@@ -139,7 +139,7 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32
{
sys_memory.notice("sys_memory_allocate(): Allocated 0x%x address (size=0x%x)", addr, size);
vm::lock_sudo(addr, size);
vm::lock_sudo(addr, static_cast<u32>(size));
cpu.check_state();
*alloc_addr = addr;
return CELL_OK;
@@ -155,7 +155,7 @@ error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32
return CELL_ENOMEM;
}
error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid, u64 flags, vm::ptr<u32> alloc_addr)
error_code sys_memory_allocate_from_container(cpu_thread& cpu, u64 size, u32 cid, u64 flags, vm::ptr<u32> alloc_addr)
{
cpu.state += cpu_flag::wait;
@@ -203,15 +203,15 @@ error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid
return {ct.ret, ct->size - ct->used};
}
if (const auto area = reserve_map(size, align))
if (const auto area = reserve_map(static_cast<u32>(size), align))
{
if (const u32 addr = area->alloc(size))
if (const u32 addr = area->alloc(static_cast<u32>(size)))
{
ensure(!g_fxo->get<sys_memory_address_table>().addrs[addr >> 16].exchange(ct.ptr.get()));
if (alloc_addr)
{
vm::lock_sudo(addr, size);
vm::lock_sudo(addr, static_cast<u32>(size));
cpu.check_state();
*alloc_addr = addr;
return CELL_OK;
@@ -320,7 +320,7 @@ error_code sys_memory_get_user_memory_stat(cpu_thread& cpu, vm::ptr<sys_memory_u
return CELL_OK;
}
error_code sys_memory_container_create(cpu_thread& cpu, vm::ptr<u32> cid, u32 size)
error_code sys_memory_container_create(cpu_thread& cpu, vm::ptr<u32> cid, u64 size)
{
cpu.state += cpu_flag::wait;
@@ -345,7 +345,7 @@ error_code sys_memory_container_create(cpu_thread& cpu, vm::ptr<u32> cid, u32 si
}
// Create the memory container
if (const u32 id = idm::make<lv2_memory_container>(size, true))
if (const u32 id = idm::make<lv2_memory_container>(static_cast<u32>(size), true))
{
cpu.check_state();
*cid = id;


@@ -128,13 +128,13 @@ struct sys_memory_user_memory_stat_t
};
// SysCalls
error_code sys_memory_allocate(cpu_thread& cpu, u32 size, u64 flags, vm::ptr<u32> alloc_addr);
error_code sys_memory_allocate_from_container(cpu_thread& cpu, u32 size, u32 cid, u64 flags, vm::ptr<u32> alloc_addr);
error_code sys_memory_allocate(cpu_thread& cpu, u64 size, u64 flags, vm::ptr<u32> alloc_addr);
error_code sys_memory_allocate_from_container(cpu_thread& cpu, u64 size, u32 cid, u64 flags, vm::ptr<u32> alloc_addr);
error_code sys_memory_free(cpu_thread& cpu, u32 start_addr);
error_code sys_memory_get_page_attribute(cpu_thread& cpu, u32 addr, vm::ptr<sys_page_attr_t> attr);
error_code sys_memory_get_user_memory_size(cpu_thread& cpu, vm::ptr<sys_memory_info_t> mem_info);
error_code sys_memory_get_user_memory_stat(cpu_thread& cpu, vm::ptr<sys_memory_user_memory_stat_t> mem_stat);
error_code sys_memory_container_create(cpu_thread& cpu, vm::ptr<u32> cid, u32 size);
error_code sys_memory_container_create(cpu_thread& cpu, vm::ptr<u32> cid, u64 size);
error_code sys_memory_container_destroy(cpu_thread& cpu, u32 cid);
error_code sys_memory_container_get_size(cpu_thread& cpu, vm::ptr<sys_memory_info_t> mem_info, u32 cid);
error_code sys_memory_container_destroy_parent_with_childs(cpu_thread& cpu, u32 cid, u32 must_0, vm::ptr<u32> mc_child);


@@ -562,20 +562,48 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
sys_spu.warning("sys_spu_thread_initialize(thread=*0x%x, group=0x%x, spu_num=%d, img=*0x%x, attr=*0x%x, arg=*0x%x)", thread, group_id, spu_num, img, attr, arg);
const u32 option = attr->option;
if (attr->name_len > 0x80 || option & ~(SYS_SPU_THREAD_OPTION_DEC_SYNC_TB_ENABLE | SYS_SPU_THREAD_OPTION_ASYNC_INTR_ENABLE))
if (spu_num >= std::size(decltype(lv2_spu_group::threads_map){}))
{
return CELL_EINVAL;
}
sys_spu_image image;
if (!attr)
{
return CELL_EFAULT;
}
switch (img->type)
const sys_spu_thread_attribute attr_data = *attr;
if (attr_data.name_len > 0x80)
{
return CELL_EINVAL;
}
if (!arg)
{
return CELL_EFAULT;
}
const sys_spu_thread_argument args = *arg;
const u32 option = attr_data.option;
if (option & ~(SYS_SPU_THREAD_OPTION_DEC_SYNC_TB_ENABLE | SYS_SPU_THREAD_OPTION_ASYNC_INTR_ENABLE))
{
return CELL_EINVAL;
}
if (!img)
{
return CELL_EFAULT;
}
sys_spu_image image = *img;
switch (image.type)
{
case SYS_SPU_IMAGE_TYPE_KERNEL:
{
const auto handle = idm::get<lv2_obj, lv2_spu_image>(img->entry_point);
const auto handle = idm::get<lv2_obj, lv2_spu_image>(image.entry_point);
if (!handle)
{
@ -591,12 +619,11 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
}
case SYS_SPU_IMAGE_TYPE_USER:
{
if (img->entry_point > 0x3fffc || img->nsegs <= 0 || img->nsegs > 0x20)
if (image.entry_point > 0x3fffc || image.nsegs <= 0 || image.nsegs > 0x20)
{
return CELL_EINVAL;
}
image = *img;
break;
}
default: return CELL_EINVAL;
@ -672,7 +699,7 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
}
// Read thread name
const std::string thread_name(attr->name.get_ptr(), std::max<u32>(attr->name_len, 1) - 1);
const std::string thread_name(attr_data.name.get_ptr(), std::max<u32>(attr_data.name_len, 1) - 1);
const auto group = idm::get<lv2_spu_group>(group_id);
@ -681,11 +708,6 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
return CELL_ESRCH;
}
if (spu_num >= group->threads_map.size())
{
return CELL_EINVAL;
}
std::unique_lock lock(group->mutex);
if (auto state = +group->run_state; state != SPU_THREAD_GROUP_STATUS_NOT_INITIALIZED)
@ -725,7 +747,7 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
ensure(vm::get(vm::spu)->falloc(spu->vm_offset(), SPU_LS_SIZE, &spu->shm, static_cast<u64>(vm::page_size_64k) | static_cast<u64>(vm::alloc_hidden)));
spu->map_ls(*spu->shm, spu->ls);
group->args[inited] = {arg->arg1, arg->arg2, arg->arg3, arg->arg4};
group->args[inited] = {args.arg1, args.arg2, args.arg3, args.arg4};
group->imgs[inited].first = image.entry_point;
group->imgs[inited].second = std::move(spu_segs);
@ -800,12 +822,14 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
const s32 min_prio = g_ps3_process_info.has_root_perm() ? 0 : 16;
if (attr->nsize > 0x80 || !num)
const sys_spu_thread_group_attribute attr_data = *attr;
if (attr_data.nsize > 0x80 || !num)
{
return CELL_EINVAL;
}
const s32 type = attr->type;
const s32 type = attr_data.type;
bool use_scheduler = true;
bool use_memct = !!(type & SYS_SPU_THREAD_GROUP_TYPE_MEMORY_FROM_CONTAINER);
@ -902,7 +926,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
if (use_memct && mem_size)
{
const auto sct = idm::get<lv2_memory_container>(attr->ct);
const auto sct = idm::get<lv2_memory_container>(attr_data.ct);
if (!sct)
{
@ -936,7 +960,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
return CELL_EBUSY;
}
const auto group = idm::make_ptr<lv2_spu_group>(std::string(attr->name.get_ptr(), std::max<u32>(attr->nsize, 1) - 1), num, prio, type, ct, use_scheduler, mem_size);
const auto group = idm::make_ptr<lv2_spu_group>(std::string(attr_data.name.get_ptr(), std::max<u32>(attr_data.nsize, 1) - 1), num, prio, type, ct, use_scheduler, mem_size);
if (!group)
{
@ -1807,6 +1831,11 @@ error_code sys_spu_thread_write_snr(ppu_thread& ppu, u32 id, u32 number, u32 val
sys_spu.trace("sys_spu_thread_write_snr(id=0x%x, number=%d, value=0x%x)", id, number, value);
if (number > 1)
{
return CELL_EINVAL;
}
const auto [thread, group] = lv2_spu_group::get_thread(id);
if (!thread) [[unlikely]]
@ -1814,11 +1843,6 @@ error_code sys_spu_thread_write_snr(ppu_thread& ppu, u32 id, u32 number, u32 val
return CELL_ESRCH;
}
if (number > 1)
{
return CELL_EINVAL;
}
thread->push_snr(number, value);
return CELL_OK;
@ -1895,21 +1919,19 @@ error_code sys_spu_thread_group_disconnect_event(ppu_thread& ppu, u32 id, u32 et
if (!ep)
{
sys_spu.error("sys_spu_thread_group_disconnect_event(): unknown event type (%d)", et);
return CELL_EINVAL;
return CELL_OK;
}
// No error checking is performed
std::lock_guard lock(group->mutex);
if (!lv2_obj::check(*ep))
{
return CELL_EINVAL;
}
ep->reset();
return CELL_OK;
}
error_code sys_spu_thread_connect_event(ppu_thread& ppu, u32 id, u32 eq, u32 et, u8 spup)
error_code sys_spu_thread_connect_event(ppu_thread& ppu, u32 id, u32 eq, u32 et, u32 spup)
{
ppu.state += cpu_flag::wait;
@ -1943,7 +1965,7 @@ error_code sys_spu_thread_connect_event(ppu_thread& ppu, u32 id, u32 eq, u32 et,
return CELL_OK;
}
error_code sys_spu_thread_disconnect_event(ppu_thread& ppu, u32 id, u32 et, u8 spup)
error_code sys_spu_thread_disconnect_event(ppu_thread& ppu, u32 id, u32 et, u32 spup)
{
ppu.state += cpu_flag::wait;
@ -2068,6 +2090,11 @@ error_code sys_spu_thread_group_connect_event_all_threads(ppu_thread& ppu, u32 i
sys_spu.warning("sys_spu_thread_group_connect_event_all_threads(id=0x%x, eq=0x%x, req=0x%llx, spup=*0x%x)", id, eq, req, spup);
if (!req)
{
return CELL_EINVAL;
}
const auto group = idm::get<lv2_spu_group>(id);
const auto queue = idm::get<lv2_obj, lv2_event_queue>(eq);
@ -2076,11 +2103,6 @@ error_code sys_spu_thread_group_connect_event_all_threads(ppu_thread& ppu, u32 i
return CELL_ESRCH;
}
if (!req)
{
return CELL_EINVAL;
}
std::unique_lock lock(group->mutex);
if (auto state = +group->run_state;
@ -2144,12 +2166,17 @@ error_code sys_spu_thread_group_connect_event_all_threads(ppu_thread& ppu, u32 i
return CELL_OK;
}
error_code sys_spu_thread_group_disconnect_event_all_threads(ppu_thread& ppu, u32 id, u8 spup)
error_code sys_spu_thread_group_disconnect_event_all_threads(ppu_thread& ppu, u32 id, u32 spup)
{
ppu.state += cpu_flag::wait;
sys_spu.warning("sys_spu_thread_group_disconnect_event_all_threads(id=0x%x, spup=%d)", id, spup);
if (spup > 63)
{
return CELL_EINVAL;
}
const auto group = idm::get<lv2_spu_group>(id);
if (!group)
@ -2157,11 +2184,6 @@ error_code sys_spu_thread_group_disconnect_event_all_threads(ppu_thread& ppu, u3
return CELL_ESRCH;
}
if (spup > 63)
{
return CELL_EINVAL;
}
std::lock_guard lock(group->mutex);
for (auto& t : group->threads)


@ -372,7 +372,7 @@ error_code sys_spu_thread_group_get_priority(ppu_thread&, u32 id, vm::ptr<s32> p
error_code sys_spu_thread_group_connect_event(ppu_thread&, u32 id, u32 eq, u32 et);
error_code sys_spu_thread_group_disconnect_event(ppu_thread&, u32 id, u32 et);
error_code sys_spu_thread_group_connect_event_all_threads(ppu_thread&, u32 id, u32 eq_id, u64 req, vm::ptr<u8> spup);
error_code sys_spu_thread_group_disconnect_event_all_threads(ppu_thread&, u32 id, u8 spup);
error_code sys_spu_thread_group_disconnect_event_all_threads(ppu_thread&, u32 id, u32 spup);
error_code sys_spu_thread_group_set_cooperative_victims(ppu_thread&, u32 id, u32 threads_mask);
error_code sys_spu_thread_group_syscall_253(ppu_thread& ppu, u32 id, vm::ptr<sys_spu_thread_group_syscall_253_info> info);
error_code sys_spu_thread_group_log(ppu_thread&, s32 command, vm::ptr<s32> stat);
@ -382,8 +382,8 @@ error_code sys_spu_thread_write_spu_mb(ppu_thread&, u32 id, u32 value);
error_code sys_spu_thread_set_spu_cfg(ppu_thread&, u32 id, u64 value);
error_code sys_spu_thread_get_spu_cfg(ppu_thread&, u32 id, vm::ptr<u64> value);
error_code sys_spu_thread_write_snr(ppu_thread&, u32 id, u32 number, u32 value);
error_code sys_spu_thread_connect_event(ppu_thread&, u32 id, u32 eq, u32 et, u8 spup);
error_code sys_spu_thread_disconnect_event(ppu_thread&, u32 id, u32 et, u8 spup);
error_code sys_spu_thread_connect_event(ppu_thread&, u32 id, u32 eq, u32 et, u32 spup);
error_code sys_spu_thread_disconnect_event(ppu_thread&, u32 id, u32 et, u32 spup);
error_code sys_spu_thread_bind_queue(ppu_thread&, u32 id, u32 spuq, u32 spuq_num);
error_code sys_spu_thread_unbind_queue(ppu_thread&, u32 id, u32 spuq_num);
error_code sys_spu_thread_get_exit_status(ppu_thread&, u32 id, vm::ptr<s32> status);
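Note on the sys_spu_thread_initialize/sys_spu_thread_group_create rework above: the guest structures (*attr, *arg) are now copied into locals (attr_data, args) before validation, so every later use reads the snapshot instead of re-dereferencing guest memory that another thread could mutate between check and use. A minimal sketch of the idea, with hypothetical names:

#include <cstdint>

struct spu_attr_t { uint32_t name_len; uint32_t option; };

// Snapshot a guest-visible struct once, then validate and use only the copy.
bool init_from_guest(const spu_attr_t* attr)
{
    if (!attr)
        return false;              // the EFAULT path in the hunks above
    const spu_attr_t snap = *attr; // single read of guest memory
    if (snap.name_len > 0x80)
        return false;              // EINVAL, checked on the copy
    // ... all later uses read 'snap'; '*attr' is never dereferenced again
    return true;
}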


@ -156,7 +156,7 @@ error_code sys_ss_random_number_generator(u64 pkg_id, vm::ptr<void> buf, u64 siz
error_code sys_ss_access_control_engine(u64 pkg_id, u64 a2, u64 a3)
{
sys_ss.todo("sys_ss_access_control_engine(pkg_id=0x%llx, a2=0x%llx, a3=0x%llx)", pkg_id, a2, a3);
sys_ss.success("sys_ss_access_control_engine(pkg_id=0x%llx, a2=0x%llx, a3=0x%llx)", pkg_id, a2, a3);
const u64 authid = g_ps3_process_info.self_info.valid ?
g_ps3_process_info.self_info.prog_id_hdr.program_authority_id : 0;
@ -167,7 +167,7 @@ error_code sys_ss_access_control_engine(u64 pkg_id, u64 a2, u64 a3)
{
if (!g_ps3_process_info.debug_or_root())
{
return CELL_ENOSYS;
return not_an_error(CELL_ENOSYS);
}
if (!a2)


@ -7,6 +7,7 @@
#include "Emu/Cell/timers.hpp"
#include "util/asm.hpp"
#include "util/sysinfo.hpp"
static u64 timebase_offset;
static u64 systemtime_offset;
@ -146,6 +147,18 @@ u64 convert_to_timebased_time(u64 time)
u64 get_timebased_time()
{
if (u64 freq = utils::get_tsc_freq())
{
const u64 tsc = utils::get_tsc();
#if _MSC_VER
const u64 result = static_cast<u64>(u128_from_mul(tsc, g_timebase_freq) / freq) * g_cfg.core.clocks_scale / 100u;
#else
const u64 result = (tsc / freq * g_timebase_freq + tsc % freq * g_timebase_freq / freq) * g_cfg.core.clocks_scale / 100u;
#endif
return result - timebase_offset;
}
while (true)
{
#ifdef _WIN32
@ -155,7 +168,11 @@ u64 get_timebased_time()
const u64 time = count.QuadPart;
const u64 freq = s_time_aux_info.perf_freq;
#if _MSC_VER
const u64 result = static_cast<u64>(u128_from_mul(time * g_cfg.core.clocks_scale, g_timebase_freq) / freq / 100u);
#else
const u64 result = (time / freq * g_timebase_freq + time % freq * g_timebase_freq / freq) * g_cfg.core.clocks_scale / 100u;
#endif
#else
struct timespec ts;
ensure(::clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
@ -190,6 +207,18 @@ void initialize_timebased_time(u64 timebased_init, bool reset)
// Returns some relative time in microseconds, don't change this fact
u64 get_system_time()
{
if (u64 freq = utils::get_tsc_freq())
{
const u64 tsc = utils::get_tsc();
#if _MSC_VER
const u64 result = static_cast<u64>(u128_from_mul(tsc, 1000000ull) / freq);
#else
const u64 result = (tsc / freq * 1000000ull + tsc % freq * 1000000ull / freq);
#endif
return result;
}
while (true)
{
#ifdef _WIN32
@ -199,7 +228,11 @@ u64 get_system_time()
const u64 time = count.QuadPart;
const u64 freq = s_time_aux_info.perf_freq;
#if _MSC_VER
const u64 result = static_cast<u64>(u128_from_mul(time, 1000000ull) / freq);
#else
const u64 result = time / freq * 1000000ull + (time % freq) * 1000000ull / freq;
#endif
#else
struct timespec ts;
ensure(::clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
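The TSC fast paths added above convert a counter running at freq Hz to another rate without overflowing 64 bits: MSVC builds use a 128-bit multiply (u128_from_mul), everything else decomposes the conversion as t / f * F + t % f * F / f. A worked sketch of the non-MSVC form (standalone, hypothetical helper name):

#include <cstdint>

// Convert 'ticks' at 'freq' Hz to microseconds. Splitting into whole
// seconds plus remainder keeps every intermediate product inside u64 as
// long as (ticks % freq) * 1'000'000 fits, i.e. freq below ~18.4 THz.
uint64_t ticks_to_us(uint64_t ticks, uint64_t freq)
{
    const uint64_t whole = ticks / freq * 1'000'000;        // full seconds
    const uint64_t frac  = ticks % freq * 1'000'000 / freq; // sub-second part
    return whole + frac;
}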


@ -48,13 +48,18 @@ sys_vm_t::sys_vm_t(utils::serial& ar)
g_fxo->get<sys_vm_global_t>().total_vsize += size;
}
error_code sys_vm_memory_map(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr)
error_code sys_vm_memory_map(ppu_thread& ppu, u64 vsize, u64 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr)
{
ppu.state += cpu_flag::wait;
sys_vm.warning("sys_vm_memory_map(vsize=0x%x, psize=0x%x, cid=0x%x, flags=0x%x, policy=0x%x, addr=*0x%x)", vsize, psize, cid, flag, policy, addr);
if (!vsize || !psize || vsize % 0x2000000 || vsize > 0x10000000 || psize > 0x10000000 || policy != SYS_VM_POLICY_AUTO_RECOMMENDED)
if (!vsize || !psize || vsize % 0x200'0000 || vsize > 0x1000'0000 || psize % 0x1'0000 || policy != SYS_VM_POLICY_AUTO_RECOMMENDED)
{
return CELL_EINVAL;
}
if (ppu.gpr[11] == 300 && psize < 0x10'0000)
{
return CELL_EINVAL;
}
@ -68,16 +73,16 @@ error_code sys_vm_memory_map(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64
return CELL_ESRCH;
}
if (!g_fxo->get<sys_vm_global_t>().total_vsize.fetch_op([vsize](u32& size)
if (!g_fxo->get<sys_vm_global_t>().total_vsize.fetch_op([vsize, has_root = g_ps3_process_info.has_root_perm()](u32& size)
{
// A single process can hold up to 256MB of virtual memory, even on DECR
// VSH can hold more
if ((g_ps3_process_info.has_root_perm() ? 0x1E000000 : 0x10000000) - size < vsize)
if ((has_root ? 0x1E000000 : 0x10000000) - size < vsize)
{
return false;
}
size += vsize;
size += static_cast<u32>(vsize);
return true;
}).second)
{
@ -86,7 +91,7 @@ error_code sys_vm_memory_map(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64
if (!ct->take(psize))
{
g_fxo->get<sys_vm_global_t>().total_vsize -= vsize;
g_fxo->get<sys_vm_global_t>().total_vsize -= static_cast<u32>(vsize);
return CELL_ENOMEM;
}
@ -96,10 +101,10 @@ error_code sys_vm_memory_map(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64
sys_vm.warning("sys_vm_memory_map(): Found VM 0x%x area (vsize=0x%x)", addr, vsize);
// Alloc all memory (shall not fail)
ensure(area->alloc(vsize));
vm::lock_sudo(area->addr, vsize);
ensure(area->alloc(static_cast<u32>(vsize)));
vm::lock_sudo(area->addr, static_cast<u32>(vsize));
idm::make<sys_vm_t>(area->addr, vsize, ct, psize);
idm::make<sys_vm_t>(area->addr, static_cast<u32>(vsize), ct, static_cast<u32>(psize));
// Write a pointer for the allocated memory
ppu.check_state();
@ -108,11 +113,11 @@ error_code sys_vm_memory_map(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64
}
ct->free(psize);
g_fxo->get<sys_vm_global_t>().total_vsize -= vsize;
g_fxo->get<sys_vm_global_t>().total_vsize -= static_cast<u32>(vsize);
return CELL_ENOMEM;
}
error_code sys_vm_memory_map_different(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr)
error_code sys_vm_memory_map_different(ppu_thread& ppu, u64 vsize, u64 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr)
{
ppu.state += cpu_flag::wait;
@ -153,7 +158,7 @@ error_code sys_vm_unmap(ppu_thread& ppu, u32 addr)
return CELL_OK;
}
error_code sys_vm_append_memory(ppu_thread& ppu, u32 addr, u32 size)
error_code sys_vm_append_memory(ppu_thread& ppu, u32 addr, u64 size)
{
ppu.state += cpu_flag::wait;
@ -176,7 +181,7 @@ error_code sys_vm_append_memory(ppu_thread& ppu, u32 addr, u32 size)
return CELL_ENOMEM;
}
vmo.psize += size;
vmo.psize += static_cast<u32>(size);
return {};
});
@ -193,7 +198,7 @@ error_code sys_vm_append_memory(ppu_thread& ppu, u32 addr, u32 size)
return CELL_OK;
}
error_code sys_vm_return_memory(ppu_thread& ppu, u32 addr, u32 size)
error_code sys_vm_return_memory(ppu_thread& ppu, u32 addr, u64 size)
{
ppu.state += cpu_flag::wait;
@ -213,12 +218,12 @@ error_code sys_vm_return_memory(ppu_thread& ppu, u32 addr, u32 size)
auto [_, ok] = vmo.psize.fetch_op([&](u32& value)
{
if (value < 0x100000ull + size)
if (value <= size || value - size < 0x100000ull)
{
return false;
}
value -= size;
value -= static_cast<u32>(size);
return true;
});
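The rewritten guard above is the overflow-safe form of the old check: with size widened to u64, value < 0x100000ull + size could wrap on the addition, while value <= size || value - size < 0x100000ull never performs an operation that can overflow. In isolation (illustrative helper name):

#include <cstdint>

// True when at least 0x100000 bytes would remain after taking 'size'
// out of 'value'. 'value <= size' guards the subtraction, so nothing
// here can wrap even with a guest-controlled 64-bit 'size'.
bool keeps_minimum(uint32_t value, uint64_t size)
{
    return !(value <= size || value - size < 0x100000ull);
}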


@ -58,11 +58,11 @@ struct sys_vm_t
class ppu_thread;
// SysCalls
error_code sys_vm_memory_map(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr);
error_code sys_vm_memory_map_different(ppu_thread& ppu, u32 vsize, u32 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr);
error_code sys_vm_memory_map(ppu_thread& ppu, u64 vsize, u64 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr);
error_code sys_vm_memory_map_different(ppu_thread& ppu, u64 vsize, u64 psize, u32 cid, u64 flag, u64 policy, vm::ptr<u32> addr);
error_code sys_vm_unmap(ppu_thread& ppu, u32 addr);
error_code sys_vm_append_memory(ppu_thread& ppu, u32 addr, u32 size);
error_code sys_vm_return_memory(ppu_thread& ppu, u32 addr, u32 size);
error_code sys_vm_append_memory(ppu_thread& ppu, u32 addr, u64 size);
error_code sys_vm_return_memory(ppu_thread& ppu, u32 addr, u64 size);
error_code sys_vm_lock(ppu_thread& ppu, u32 addr, u32 size);
error_code sys_vm_unlock(ppu_thread& ppu, u32 addr, u32 size);
error_code sys_vm_touch(ppu_thread& ppu, u32 addr, u32 size);


@ -350,7 +350,6 @@ void PadHandlerBase::get_motion_sensors(const std::string& pad_id, const motion_
// Get the current motion values
std::shared_ptr<Pad> pad = std::make_shared<Pad>(m_type, 0, 0, 0, 0);
pad->m_sensors.resize(preview_values.size(), AnalogSensor(0, 0, 0, 0, 0));
pad_ensemble binding{pad, device, nullptr};
get_extended_info(binding);
@ -505,18 +504,18 @@ bool PadHandlerBase::bindPadToDevice(std::shared_ptr<Pad> pad)
pad->m_buttons.emplace_back(CELL_PAD_BTN_OFFSET_PRESS_PIGGYBACK, mapping[button::skateboard_tilt_right], CELL_PAD_CTRL_PRESS_R1);
}
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_X, mapping[button::ls_left], mapping[button::ls_right]);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_Y, mapping[button::ls_down], mapping[button::ls_up]);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_X, mapping[button::rs_left], mapping[button::rs_right]);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_Y, mapping[button::rs_down], mapping[button::rs_up]);
pad->m_sticks[0] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_X, mapping[button::ls_left], mapping[button::ls_right]);
pad->m_sticks[1] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_Y, mapping[button::ls_down], mapping[button::ls_up]);
pad->m_sticks[2] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_X, mapping[button::rs_left], mapping[button::rs_right]);
pad->m_sticks[3] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_Y, mapping[button::rs_down], mapping[button::rs_up]);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_X, 0, 0, 0, DEFAULT_MOTION_X);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_Y, 0, 0, 0, DEFAULT_MOTION_Y);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_Z, 0, 0, 0, DEFAULT_MOTION_Z);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_G, 0, 0, 0, DEFAULT_MOTION_G);
pad->m_sensors[0] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_X, 0, 0, 0, DEFAULT_MOTION_X);
pad->m_sensors[1] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_Y, 0, 0, 0, DEFAULT_MOTION_Y);
pad->m_sensors[2] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_Z, 0, 0, 0, DEFAULT_MOTION_Z);
pad->m_sensors[3] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_G, 0, 0, 0, DEFAULT_MOTION_G);
pad->m_vibrateMotors.emplace_back(true, 0);
pad->m_vibrateMotors.emplace_back(false, 0);
pad->m_vibrateMotors[0] = VibrateMotor(true, 0);
pad->m_vibrateMotors[1] = VibrateMotor(false, 0);
m_bindings.emplace_back(pad, pad_device, nullptr);
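The pad refactor above swaps growable vectors for fixed std::array members, replacing emplace_back with indexed assignment; that is why the header hunk that follows adds default constructors to AnalogStick and VibrateMotor. A minimal sketch of the trade-off (hypothetical types):

#include <array>
#include <cstdint>

struct stick_t
{
    uint32_t offset = 0;
    stick_t() = default; // required: std::array default-constructs elements
    explicit stick_t(uint32_t o) : offset(o) {}
};

struct pad_t
{
    std::array<stick_t, 4> sticks{}; // fixed capacity, no heap allocation
};

void bind(pad_t& pad)
{
    pad.sticks[0] = stick_t(0x40); // indexed assignment, not push_back
}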


@ -417,6 +417,7 @@ struct AnalogStick
std::map<u32, u16> m_pressed_keys_min{}; // only used in keyboard_pad_handler
std::map<u32, u16> m_pressed_keys_max{}; // only used in keyboard_pad_handler
AnalogStick() {}
AnalogStick(u32 offset, std::set<u32> key_codes_min, std::set<u32> key_codes_max)
: m_offset(offset)
, m_key_codes_min(std::move(key_codes_min))
@ -447,6 +448,7 @@ struct VibrateMotor
bool m_is_large_motor = false;
u8 m_value = 0;
VibrateMotor() {}
VibrateMotor(bool is_large_motor, u8 value)
: m_is_large_motor(is_large_motor)
, m_value(value)
@ -489,9 +491,9 @@ struct Pad
u8 m_battery_level{0};
std::vector<Button> m_buttons;
std::vector<AnalogStick> m_sticks;
std::vector<AnalogSensor> m_sensors;
std::vector<VibrateMotor> m_vibrateMotors;
std::array<AnalogStick, 4> m_sticks{};
std::array<AnalogSensor, 4> m_sensors{};
std::array<VibrateMotor, 2> m_vibrateMotors{};
// These hold bits for their respective buttons
u16 m_digital_1{0};


@ -945,7 +945,7 @@ namespace vm
return true;
}
static u32 _page_unmap(u32 addr, u32 max_size, u64 bflags, utils::shm* shm)
static u32 _page_unmap(u32 addr, u32 max_size, u64 bflags, utils::shm* shm, std::vector<std::pair<u64, u64>>& unmap_events)
{
perf_meter<"PAGE_UNm"_u64> perf0;
@ -1009,7 +1009,7 @@ namespace vm
// the RSX might try to call VirtualProtect on memory that is already unmapped
if (auto rsxthr = g_fxo->try_get<rsx::thread>())
{
rsxthr->on_notify_memory_unmapped(addr, size);
rsxthr->on_notify_pre_memory_unmapped(addr, size, unmap_events);
}
// Deregister PPU related data
@ -1309,7 +1309,7 @@ namespace vm
}
}
bool block_t::unmap()
bool block_t::unmap(std::vector<std::pair<u64, u64>>* unmapped)
{
auto& m_map = (m.*block_map)();
@ -1320,7 +1320,10 @@ namespace vm
{
const auto next = std::next(it);
const auto size = it->second.first;
_page_unmap(it->first, size, this->flags, it->second.second.get());
std::vector<std::pair<u64, u64>> event_data;
ensure(size == _page_unmap(it->first, size, this->flags, it->second.second.get(), unmapped ? *unmapped : event_data));
it = next;
}
@ -1480,6 +1483,22 @@ namespace vm
{
auto& m_map = (m.*block_map)();
{
struct notify_t
{
std::vector<std::pair<u64, u64>> event_data;
~notify_t() noexcept
{
if (auto rsxthr = g_fxo->try_get<rsx::thread>())
{
for (const auto [event_data1, event_data2] : event_data)
{
rsxthr->on_notify_post_memory_unmapped(event_data1, event_data2);
}
}
}
} unmap_notification;
vm::writer_lock lock;
const auto found = m_map.find(addr - (flags & stack_guarded ? 0x1000 : 0));
@ -1505,7 +1524,7 @@ namespace vm
}
// Unmap "real" memory pages
ensure(size == _page_unmap(addr, size, this->flags, found->second.second.get()));
ensure(size == _page_unmap(addr, size, this->flags, found->second.second.get(), unmap_notification.event_data));
// Clear stack guards
if (flags & stack_guarded)
@ -1815,9 +1834,9 @@ namespace vm
}
}
bool _unmap_block(const std::shared_ptr<block_t>& block)
bool _unmap_block(const std::shared_ptr<block_t>& block, std::vector<std::pair<u64, u64>>* unmapped = nullptr)
{
return block->unmap();
return block->unmap(unmapped);
}
static bool _test_map(u32 addr, u32 size)
@ -1964,6 +1983,22 @@ namespace vm
std::pair<std::shared_ptr<block_t>, bool> result{};
struct notify_t
{
std::vector<std::pair<u64, u64>> unmap_data;
~notify_t() noexcept
{
if (auto rsxthr = g_fxo->try_get<rsx::thread>())
{
for (const auto [event_data1, event_data2] : unmap_data)
{
rsxthr->on_notify_post_memory_unmapped(event_data1, event_data2);
}
}
}
} unmap_notifications;
vm::writer_lock lock;
for (auto it = g_locations.begin() + memory_location_max; it != g_locations.end(); it++)
@ -1993,7 +2028,7 @@ namespace vm
result.first = std::move(*it);
g_locations.erase(it);
ensure(_unmap_block(result.first));
ensure(_unmap_block(result.first, &unmap_notifications.unmap_data));
result.second = true;
return result;
}
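Pattern note: both unmap paths above gather (event, bitmask) pairs into a vector while vm::writer_lock is held, and deliver them from the destructor of a notify_t declared before the lock, so the notifications fire only after the lock is released. A generic sketch of that scope-exit ordering, with hypothetical names:

#include <cstdint>
#include <utility>
#include <vector>

// Declared *before* the lock and destroyed *after* it: C++ destroys
// locals in reverse order, so the events are sent once the lock is gone.
struct deferred_notify_t
{
    void (*send)(uint64_t, uint64_t) = nullptr;
    std::vector<std::pair<uint64_t, uint64_t>> events;

    ~deferred_notify_t() noexcept
    {
        for (const auto& [a, b] : events)
            if (send)
                send(a, b);
    }
};

void unmap_with_deferred_events(void (*send)(uint64_t, uint64_t))
{
    deferred_notify_t notify{send, {}};
    // std::lock_guard lock(m);          // acquired after 'notify'
    notify.events.emplace_back(1, 0xff); // gathered while locked
}   // the lock would release first, then ~deferred_notify_t() sends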


@ -133,8 +133,8 @@ namespace vm
bool try_alloc(u32 addr, u64 bflags, u32 size, std::shared_ptr<utils::shm>&&) const;
// Unmap block
bool unmap();
friend bool _unmap_block(const std::shared_ptr<block_t>&);
bool unmap(std::vector<std::pair<u64, u64>>* unmapped = nullptr);
friend bool _unmap_block(const std::shared_ptr<block_t>&, std::vector<std::pair<u64, u64>>* unmapped);
public:
block_t(u32 addr, u32 size, u64 flags);


@ -16,7 +16,7 @@ namespace rsx
{
if (enabled) [[unlikely]]
{
last = rsx::uclock();
last = get_system_time();
}
}
@ -28,7 +28,7 @@ namespace rsx
}
auto old = last;
last = rsx::uclock();
last = get_system_time();
return static_cast<s64>(last - old);
}
};


@ -4,20 +4,3 @@
#include <util/sysinfo.hpp>
#include "Emu/Cell/timers.hpp"
namespace rsx
{
static inline u64 uclock()
{
static const ullong s_tsc_scaled_freq = (utils::get_tsc_freq() / 1000000);
if (s_tsc_scaled_freq)
{
return utils::get_tsc() / s_tsc_scaled_freq;
}
else
{
return get_system_time();
}
}
}


@ -300,7 +300,7 @@ namespace gl
m_src = fmt::replace_all(m_src, syntax_replace);
param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
}
~cs_deswizzle_3d()

rpcs3/Emu/RSX/GL/GLDMA.cpp (new file, 126 lines)

@ -0,0 +1,126 @@
#include "stdafx.h"
#include "GLDMA.h"
#include "Emu/Memory/vm.h"
namespace gl
{
static constexpr u32 s_dma_block_size = 0x10000;
static constexpr u32 s_dma_block_mask = ~(s_dma_block_size - 1);
std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
void dma_block::allocate(u32 base_address, u32 block_size)
{
// Since this is a userptr block, we don't need to move data around on resize. Just "claim" a different chunk and move on.
if (m_data)
{
m_data->remove();
}
void* userptr = vm::get_super_ptr(base_address);
m_data = std::make_unique<gl::buffer>();
m_data->create(buffer::target::array, block_size, userptr, buffer::memory_type::userptr, 0);
m_base_address = base_address;
// Some drivers may reject userptr input for whatever reason. Check that the state is still valid.
gl::check_state();
}
void* dma_block::map(const utils::address_range& range) const
{
ensure(range.inside(this->range()));
return vm::get_super_ptr(range.start);
}
void dma_block::resize(u32 new_length)
{
if (new_length <= length())
{
return;
}
allocate(m_base_address, new_length);
}
void dma_block::set_parent(const dma_block* other)
{
ensure(this->range().inside(other->range()));
ensure(other != this);
m_parent = other;
if (m_data)
{
m_data->remove();
m_data.reset();
}
}
bool dma_block::can_map(const utils::address_range& range) const
{
if (m_parent)
{
return m_parent->can_map(range);
}
return range.inside(this->range());
}
void clear_dma_resources()
{
g_dma_pool.clear();
}
utils::address_range to_dma_block_range(u32 start, u32 length)
{
const auto start_block_address = start & s_dma_block_mask;
const auto end_block_address = (start + length + s_dma_block_size - 1) & s_dma_block_mask;
return utils::address_range::start_end(start_block_address, end_block_address);
}
const dma_block& get_block(u32 start, u32 length)
{
const auto block_range = to_dma_block_range(start, length);
auto& block = g_dma_pool[block_range.start];
if (!block)
{
block = std::make_unique<dma_block>();
block->allocate(block_range.start, block_range.length());
return *block;
}
const auto range = utils::address_range::start_length(start, length);
if (block->can_map(range)) [[ likely ]]
{
return *block;
}
const auto owner = block->head();
const auto new_length = (block_range.end + 1) - owner->base_addr();
const auto search_end = (block_range.end + 1);
// 1. Resize to new length
ensure((new_length & ~s_dma_block_mask) == 0);
auto new_owner = std::make_unique<dma_block>();
new_owner->allocate(owner->base_addr(), new_length);
// 2. Acquire all the extras
for (u32 id = owner->base_addr() + s_dma_block_size;
id < search_end;
id += s_dma_block_size)
{
ensure((id % s_dma_block_size) == 0);
g_dma_pool[id]->set_parent(new_owner.get());
}
block = std::move(new_owner);
return *block;
}
dma_mapping_handle map_dma(u32 guest_address, u32 length)
{
auto& block = get_block(guest_address, length);
return { guest_address - block.base_addr(), block.get() };
}
}
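The pool above keys blocks by their 64 KiB-aligned base address; to_dma_block_range rounds any request outward to block granularity before the lookup. The rounding in isolation (same constants, hypothetical names):

#include <cstdint>

constexpr uint32_t block_size = 0x10000;          // 64 KiB, as above
constexpr uint32_t block_mask = ~(block_size - 1);

struct block_range_t { uint32_t start, end; };

// Round [start, start + length) outward to whole DMA blocks.
constexpr block_range_t to_block_range(uint32_t start, uint32_t length)
{
    const uint32_t lo = start & block_mask;
    const uint32_t hi = (start + length + block_size - 1) & block_mask;
    return {lo, hi};
}

static_assert(to_block_range(0x12345, 0x10).start == 0x10000);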

rpcs3/Emu/RSX/GL/GLDMA.h (new file, 41 lines)

@ -0,0 +1,41 @@
#pragma once
#include <util/types.hpp>
#include "Utilities/address_range.h"
#include "glutils/buffer_object.h"
// TODO: Unify the DMA implementation across backends as part of RSX restructuring.
namespace gl
{
using dma_mapping_handle = std::pair<u32, gl::buffer*>;
dma_mapping_handle map_dma(u32 guest_addr, u32 length);
void clear_dma_resources();
// GL does not currently support mixed block types...
class dma_block
{
public:
dma_block() = default;
void allocate(u32 base_address, u32 block_size);
void resize(u32 new_length);
void* map(const utils::address_range& range) const;
void set_parent(const dma_block* other);
const dma_block* head() const { return m_parent ? m_parent : this; }
bool can_map(const utils::address_range& range) const;
u32 base_addr() const { return m_base_address; }
u32 length() const { return m_data ? static_cast<u32>(m_data->size()) : 0; }
bool empty() const { return length() == 0; }
buffer* get() const { return m_data.get(); }
utils::address_range range() const { return utils::address_range::start_length(m_base_address, length()); }
protected:
u32 m_base_address = 0;
const dma_block* m_parent = nullptr;
std::unique_ptr<gl::buffer> m_data;
};
}


@ -3,9 +3,11 @@
#include "../Overlays/Shaders/shader_loading_dialog_native.h"
#include "GLGSRender.h"
#include "GLCompute.h"
#include "GLDMA.h"
#include "Emu/Memory/vm_locking.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/Host/RSXDMAWriter.h"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
[[noreturn]] extern void report_fatal_error(std::string_view _text, bool is_html = false, bool include_help_text = true);
@ -180,6 +182,20 @@ void GLGSRender::on_init_thread()
backend_config.supports_normalized_barycentrics = false;
}
if (gl_caps.AMD_pinned_memory && g_cfg.video.host_label_synchronization)
{
backend_config.supports_host_gpu_labels = true;
m_host_gpu_context_data = std::make_unique<gl::buffer>();
m_host_gpu_context_data->create(gl::buffer::target::array, 4096, nullptr, gl::buffer::memory_type::host_visible,
gl::buffer::usage::host_read | gl::buffer::usage::host_write | gl::buffer::usage::persistent_map);
auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::persistent_rw));
m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
m_enqueued_host_write_buffer = std::make_unique<gl::scratch_ring_buffer>();
m_enqueued_host_write_buffer->create(gl::buffer::target::array, 64 * 0x100000, gl::buffer::usage::dynamic_update);
}
// Use industry standard resource alignment values as defaults
m_uniform_buffer_offset_align = 256;
m_min_texbuffer_alignment = 256;
@ -397,6 +413,7 @@ void GLGSRender::on_exit()
// TODO: Move these
gl::destroy_compute_tasks();
gl::destroy_overlay_passes();
gl::clear_dma_resources();
gl::destroy_global_texture_resources();
@ -407,6 +424,10 @@ void GLGSRender::on_exit()
m_prog_buffer.clear();
m_rtts.destroy();
m_host_dma_ctrl.reset();
m_host_gpu_context_data.reset();
m_enqueued_host_write_buffer.reset();
for (auto &fbo : m_framebuffer_cache)
{
fbo.remove();
@ -1193,7 +1214,7 @@ void GLGSRender::notify_tile_unbound(u32 tile)
if (false)
{
u32 addr = rsx::get_address(tiles[tile].offset, tiles[tile].location);
on_notify_memory_unmapped(addr, tiles[tile].size);
on_notify_pre_memory_unmapped(addr, tiles[tile].size, *std::make_unique<std::vector<std::pair<u64, u64>>>());
m_rtts.invalidate_surface_address(addr, false);
}
@ -1203,6 +1224,66 @@ void GLGSRender::notify_tile_unbound(u32 tile)
}
}
bool GLGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
{
return false;
}
auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
if (host_ctx->texture_loads_completed())
{
// We're about to poll waiting for GPU state, ensure the context is still valid.
gl::check_state();
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
m_host_dma_ctrl->drain_label_queue();
return false;
}
const auto mapping = gl::map_dma(address, 4);
const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
const auto release_event_id = host_ctx->on_label_acquire();
// We don't have async texture loads yet, so just release both the label and the commands-complete event
u64 write_buf[2] = { write_data, release_event_id };
const auto host_read_offset = m_enqueued_host_write_buffer->alloc(16, 16);
m_enqueued_host_write_buffer->get().sub_data(host_read_offset, 16, write_buf);
// Now write to DMA and then to host context
m_enqueued_host_write_buffer->get().copy_to(mapping.second, host_read_offset, mapping.first, 4);
m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, ::offset32(&rsx::host_gpu_context_t::commands_complete_event), 8);
m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
host_ctx->on_label_release();
return true;
}
void GLGSRender::enqueue_host_context_write(u32 offset, u32 size, const void* data)
{
ensure(size <= 8);
const u32 host_read_offset = m_enqueued_host_write_buffer->alloc(8, 16);
m_enqueued_host_write_buffer->get().sub_data(host_read_offset, size, data);
m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset, offset, size);
m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
}
void GLGSRender::on_guest_texture_read()
{
if (!backend_config.supports_host_gpu_labels)
{
return;
}
// Tag the read as being in progress
u64 event_id = m_host_dma_ctrl->host_ctx()->inc_counter();
m_host_dma_ctrl->host_ctx()->texture_load_request_event = event_id;
enqueue_host_context_write(::offset32(&rsx::host_gpu_context_t::texture_load_complete_event), 8, &event_id);
}
void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
{
query->result = 0;


@ -128,7 +128,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
GLProgramBuffer m_prog_buffer;
//buffer
// Draw Buffers
gl::fbo* m_draw_fbo = nullptr;
std::list<gl::framebuffer_holder> m_framebuffer_cache;
std::unique_ptr<gl::texture> m_flip_tex_color[2];
@ -137,7 +137,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
std::unique_ptr<gl::upscaler> m_upscaler;
output_scaling_mode m_output_scaling = output_scaling_mode::bilinear;
//vaos are mandatory for core profile
// VAOs are mandatory for core profile
gl::vao m_vao;
shared_mutex m_sampler_mutex;
@ -150,6 +150,10 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
// Occlusion query type, can be SAMPLES_PASSED or ANY_SAMPLES_PASSED
GLenum m_occlusion_type = GL_ANY_SAMPLES_PASSED;
// Host context for GPU-driven work
std::unique_ptr<gl::buffer> m_host_gpu_context_data;
std::unique_ptr<gl::scratch_ring_buffer> m_enqueued_host_write_buffer;
public:
u64 get_cycles() final;
@ -193,6 +197,11 @@ public:
void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
// DMA
bool release_GCM_label(u32 address, u32 data) override;
void enqueue_host_context_write(u32 offset, u32 size, const void* data);
void on_guest_texture_read();
// GRAPH backend
void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;


@ -3,6 +3,7 @@
#include "GLCompute.h"
#include "GLRenderTargets.h"
#include "GLOverlays.h"
#include "GLGSRender.h"
#include "glutils/blitter.h"
#include "glutils/ring_buffer.h"
@ -285,7 +286,7 @@ namespace gl
if (!(*dst) || max_mem > static_cast<u64>(dst->size()))
{
if (*dst) dst->remove();
dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, 0);
}
if (auto as_vi = dynamic_cast<const gl::viewable_image*>(src);
@ -400,7 +401,7 @@ namespace gl
return;
}
scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, 0);
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
src->copy_to(&scratch_mem, in_offset, 0, mem_info->image_size_in_bytes);
@ -835,6 +836,10 @@ namespace gl
const GLenum gl_format = std::get<0>(format_type);
const GLenum gl_type = std::get<1>(format_type);
fill_texture(cmd, dst, gcm_format, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf);
// Notify the renderer of the upload
auto renderer = static_cast<GLGSRender*>(rsx::get_current_renderer());
renderer->on_guest_texture_read();
}
u32 get_format_texel_width(GLenum format)


@ -59,7 +59,7 @@ namespace gl
pbo.remove();
}
pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, GL_STREAM_READ);
pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, buffer::usage::host_read);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
}


@ -3,38 +3,35 @@
namespace gl
{
void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_flags)
{
m_memory_type = type;
if (const auto& caps = get_driver_caps();
caps.ARB_buffer_storage_supported)
type != memory_type::userptr && caps.ARB_buffer_storage_supported)
{
GLenum flags = 0;
if (type == memory_type::host_visible)
if (usage_flags & usage::host_write)
{
switch (usage)
{
case GL_STREAM_DRAW:
case GL_STATIC_DRAW:
case GL_DYNAMIC_DRAW:
flags |= GL_MAP_WRITE_BIT;
break;
case GL_STREAM_READ:
case GL_STATIC_READ:
case GL_DYNAMIC_READ:
flags |= GL_MAP_READ_BIT;
break;
default:
fmt::throw_exception("Unsupported buffer usage 0x%x", usage);
}
flags |= GL_MAP_WRITE_BIT;
}
else
if (usage_flags & usage::host_read)
{
// Local memory hints
if (usage == GL_DYNAMIC_COPY)
{
flags |= GL_DYNAMIC_STORAGE_BIT;
}
flags |= GL_MAP_READ_BIT;
}
if (usage_flags & usage::persistent_map)
{
flags |= GL_MAP_PERSISTENT_BIT;
}
if (usage_flags & usage::dynamic_update)
{
flags |= GL_DYNAMIC_STORAGE_BIT;
}
ensure((flags & (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT)) != (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT),
"Mutually exclusive usage flags set!");
ensure(type == memory_type::local || flags != 0, "Host-visible memory must have usage flags set!");
if ((flags & GL_MAP_READ_BIT) && !caps.vendor_AMD)
{
@ -51,10 +48,8 @@ namespace gl
}
else
{
data(size, data_, usage);
data(size, data_, GL_STREAM_COPY);
}
m_memory_type = type;
}
buffer::~buffer()
@ -89,18 +84,18 @@ namespace gl
save_binding_state save(current_target(), *this);
}
void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
{
create();
allocate(size, data_, type, usage);
allocate(size, data_, type, usage_bits);
}
void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
{
m_target = target_;
create();
allocate(size, data_, type, usage);
allocate(size, data_, type, usage_bits);
}
void buffer::remove()
@ -117,11 +112,19 @@ namespace gl
{
ensure(m_memory_type != memory_type::local);
DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
m_size = size;
if (m_memory_type == memory_type::userptr)
{
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_id);
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, size, data_, usage);
return;
}
DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
}
void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data)
void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data)
{
ensure(m_memory_type == memory_type::local);
DSA_CALL2(NamedBufferSubData, m_id, offset, length, data);


@ -22,20 +22,30 @@ namespace gl
{
read = GL_MAP_READ_BIT,
write = GL_MAP_WRITE_BIT,
read_write = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT
rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
persistent_rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT
};
enum class memory_type
{
undefined = 0,
local = 1,
host_visible = 2
host_visible = 2,
userptr = 4
};
enum usage
{
host_write = (1 << 0),
host_read = (1 << 1),
persistent_map = (1 << 2),
dynamic_update = (1 << 3),
};
class save_binding_state
{
GLint m_last_binding;
GLenum m_target;
GLint m_last_binding = GL_ZERO;
GLenum m_target = GL_NONE;
public:
save_binding_state(target target_, const buffer& new_state) : save_binding_state(target_)
@ -64,6 +74,11 @@ namespace gl
~save_binding_state()
{
if (!m_target)
{
return;
}
glBindBuffer(m_target, m_last_binding);
}
};
@ -77,7 +92,7 @@ namespace gl
// Metadata
mutable std::pair<u32, u32> m_bound_range{};
void allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage);
void allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits);
public:
buffer() = default;
@ -88,8 +103,8 @@ namespace gl
void recreate(GLsizeiptr size, const void* data = nullptr);
void create();
void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
void remove();
@ -97,7 +112,7 @@ namespace gl
void bind() const { bind(current_target()); }
void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW);
void sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data);
void sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data);
GLubyte* map(GLsizeiptr offset, GLsizeiptr length, access access_);
void unmap();
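The header change above replaces the old GLenum usage hints with an explicit usage bitfield that allocate() translates into glBufferStorage flags. A minimal sketch of that translation, written header-free with the numeric bit values from the GL spec:

#include <cstdint>

enum usage_bits : uint32_t
{
    host_write     = 1 << 0,
    host_read      = 1 << 1,
    persistent_map = 1 << 2,
    dynamic_update = 1 << 3,
};

// Map engine-level usage bits onto GL buffer-storage flags.
uint32_t to_gl_storage_flags(uint32_t usage)
{
    uint32_t flags = 0;
    if (usage & host_write)     flags |= 0x0002; // GL_MAP_WRITE_BIT
    if (usage & host_read)      flags |= 0x0001; // GL_MAP_READ_BIT
    if (usage & persistent_map) flags |= 0x0040; // GL_MAP_PERSISTENT_BIT
    if (usage & dynamic_update) flags |= 0x0100; // GL_DYNAMIC_STORAGE_BIT
    return flags;
}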


@ -33,7 +33,7 @@ namespace gl
void capabilities::initialize()
{
int find_count = 16;
int find_count = 17;
int ext_count = 0;
glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count);
@ -164,6 +164,13 @@ namespace gl
find_count--;
continue;
}
if (check(ext_name, "GL_AMD_pinned_memory"))
{
AMD_pinned_memory = true;
find_count--;
continue;
}
}
// Set GLSL version


@ -25,6 +25,7 @@ namespace gl
bool EXT_dsa_supported = false;
bool EXT_depth_bounds_test = false;
bool AMD_pinned_memory = false;
bool ARB_dsa_supported = false;
bool ARB_bindless_texture_supported = false;
bool ARB_buffer_storage_supported = false;


@ -79,4 +79,12 @@ namespace gl
{
glInsertEventMarkerEXT(static_cast<GLsizei>(strlen(label)), label);
}
// Checks if GL state is still valid
static inline void check_state()
{
// GL_OUT_OF_MEMORY invalidates the OpenGL context and is actually the GL version of DEVICE_LOST.
// This spec workaround allows it to be abused by ISVs to indicate a broken GL context.
ensure(glGetError() != GL_OUT_OF_MEMORY);
}
}


@ -242,14 +242,14 @@ namespace gl
}
}
void scratch_ring_buffer::create(buffer::target target_, u64 size)
void scratch_ring_buffer::create(buffer::target target_, u64 size, u32 usage_flags)
{
if (m_storage)
{
remove();
}
m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, GL_STATIC_COPY);
m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, usage_flags);
}
void scratch_ring_buffer::remove()


@ -103,7 +103,7 @@ namespace gl
scratch_ring_buffer(const scratch_ring_buffer&) = delete;
~scratch_ring_buffer();
void create(buffer::target _target, u64 size);
void create(buffer::target _target, u64 size, u32 usage_flags = 0);
void remove();
u32 alloc(u32 size, u32 alignment);


@ -80,7 +80,7 @@ namespace gl
if (!m_ubo)
{
ensure(compiled);
m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
// Statically bind the image sources
m_program.uniforms["InputTexture"] = GL_TEMP_IMAGE_SLOT(0);


@ -0,0 +1,67 @@
#include "stdafx.h"
#include "RSXDMAWriter.h"
#include "Utilities//Thread.h"
#include <util/asm.hpp>
namespace rsx
{
void RSXDMAWriter::update()
{
if (m_dispatch_handlers.empty())
{
m_job_queue.clear();
return;
}
while (!m_job_queue.empty())
{
const auto job = m_job_queue.front();
if (const auto dispatch = m_dispatch_handlers.find(job.dispatch_class);
dispatch == m_dispatch_handlers.end() || dispatch->second.handler(m_host_context_ptr, &job))
{
// No handler registered, or callback consumed the job
m_job_queue.pop_front();
continue;
}
// Dispatcher found and rejected the job. Stop, we'll try again later.
break;
}
}
void RSXDMAWriter::register_handler(host_dispatch_handler_t handler)
{
m_dispatch_handlers[handler.dispatch_class] = handler;
}
void RSXDMAWriter::deregister_handler(int dispatch_class)
{
m_dispatch_handlers.erase(dispatch_class);
}
void RSXDMAWriter::enqueue(const host_gpu_write_op_t& request)
{
m_job_queue.push_back(request);
}
void RSXDMAWriter::drain_label_queue()
{
if (!m_host_context_ptr)
{
return;
}
// FIXME: This is a busy wait, consider yield to improve responsiveness on weak devices.
while (!m_host_context_ptr->in_flight_commands_completed())
{
utils::pause();
if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
}
}
}
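Usage note for the dispatcher above: update() walks the queue in order and stops at the first job whose handler returns false, so a per-class handler can apply back-pressure without reordering writes. A hypothetical registration (the dispatch_class value is an assumption; the types come from RSXDMAWriter.h below):

#include "Emu/RSX/Host/RSXDMAWriter.h" // path as added by this commit

void install_label_handler(rsx::RSXDMAWriter& dma)
{
    rsx::host_dispatch_handler_t h;
    h.dispatch_class = 1; // assumed class id, for illustration only
    h.handler = [](const volatile rsx::host_gpu_context_t* ctx,
                   const rsx::host_gpu_write_op_t* /*job*/) -> bool
    {
        // true = job consumed and popped; false = stop here and retry
        // later, keeping everything behind the stalled job in order.
        return ctx->in_flight_commands_completed();
    };
    dma.register_handler(h);
}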


@ -0,0 +1,115 @@
#pragma once
#include <util/types.hpp>
#include <unordered_map>
#include <functional>
#include <deque>
namespace rsx
{
struct host_gpu_context_t
{
u64 magic = 0xCAFEBABE;
u64 event_counter = 0;
u64 texture_load_request_event = 0;
u64 texture_load_complete_event = 0;
u64 last_label_acquire_event = 0;
u64 last_label_release2_event = 0;
u64 commands_complete_event = 0;
inline u64 inc_counter() volatile
{
// Workaround for volatile increment warning. GPU can see this value directly, but currently we do not modify it on the device.
event_counter = event_counter + 1;
return event_counter;
}
inline bool in_flight_commands_completed() const volatile
{
return last_label_release2_event <= commands_complete_event;
}
inline bool texture_loads_completed() const volatile
{
// Return true if all texture load requests are done.
return texture_load_complete_event == texture_load_request_event;
}
inline bool has_unflushed_texture_loads() const volatile
{
return texture_load_request_event > last_label_release2_event;
}
inline u64 on_texture_load_acquire() volatile
{
texture_load_request_event = inc_counter();
return texture_load_request_event;
}
inline void on_texture_load_release() volatile
{
// Normally released by the host device, but implemented nonetheless for software fallback
texture_load_complete_event = texture_load_request_event;
}
inline u64 on_label_acquire() volatile
{
last_label_acquire_event = inc_counter();
return last_label_acquire_event;
}
inline void on_label_release() volatile
{
last_label_release2_event = last_label_acquire_event;
}
inline bool needs_label_release() const volatile
{
return last_label_acquire_event > last_label_release2_event;
}
};
struct host_gpu_write_op_t
{
int dispatch_class = 0;
void* userdata = nullptr;
};
struct host_dispatch_handler_t
{
int dispatch_class = 0;
std::function<bool(const volatile host_gpu_context_t*, const host_gpu_write_op_t*)> handler;
};
class RSXDMAWriter
{
public:
RSXDMAWriter(void* mem)
: m_host_context_ptr(new (mem)host_gpu_context_t)
{}
RSXDMAWriter(host_gpu_context_t* pctx)
: m_host_context_ptr(pctx)
{}
void update();
void register_handler(host_dispatch_handler_t handler);
void deregister_handler(int dispatch_class);
void enqueue(const host_gpu_write_op_t& request);
void drain_label_queue();
volatile host_gpu_context_t* host_ctx() const
{
return m_host_context_ptr;
}
private:
std::unordered_map<int, host_dispatch_handler_t> m_dispatch_handlers;
volatile host_gpu_context_t* m_host_context_ptr = nullptr;
std::deque<host_gpu_write_op_t> m_job_queue;
};
}
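The context above is a set of monotonically increasing event ids shared with the GPU: acquire bumps the counter and records the id, release publishes it, and the GPU writes commands_complete_event back when it catches up, so released <= completed means the queue is drained. A host-only sketch of the pairing (hypothetical type, no GPU writer):

#include <cstdint>

struct event_tracker_t
{
    uint64_t counter = 0;
    uint64_t acquired = 0;
    uint64_t released = 0;
    uint64_t completed = 0; // written back by the GPU in the real design

    uint64_t acquire()   { return acquired = ++counter; }
    void release()       { released = acquired; }
    bool drained() const { return released <= completed; }
};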


@ -44,7 +44,7 @@ namespace rsx
RSX(ctx)->flush_fifo();
}
u64 start = rsx::uclock();
u64 start = get_system_time();
u64 last_check_val = start;
while (sema != arg)
@ -57,7 +57,7 @@ namespace rsx
if (const auto tdr = static_cast<u64>(g_cfg.video.driver_recovery_timeout))
{
const u64 current = rsx::uclock();
const u64 current = get_system_time();
if (current - last_check_val > 20'000)
{
@ -81,7 +81,7 @@ namespace rsx
}
RSX(ctx)->fifo_wake_delay();
RSX(ctx)->performance_counters.idle_time += (rsx::uclock() - start);
RSX(ctx)->performance_counters.idle_time += (get_system_time() - start);
}
void semaphore_release(context* ctx, u32 /*reg*/, u32 arg)


@ -25,11 +25,11 @@ namespace rsx
{
if (m_last_update_timestamp_us == 0)
{
m_last_update_timestamp_us = rsx::uclock();
m_last_update_timestamp_us = get_system_time();
}
else
{
const auto now = rsx::uclock();
const auto now = get_system_time();
m_current_frame_duration_us += (now - m_last_update_timestamp_us);
m_last_update_timestamp_us = now;
}


@ -13,7 +13,7 @@ namespace rsx
return duration;
}
return rsx::uclock() + duration;
return get_system_time() + duration;
}
template <typename T>
@ -168,7 +168,7 @@ namespace rsx
void message::update_queue(std::deque<message_item>& vis_set, std::deque<message_item>& ready_set, message_pin_location origin)
{
const u64 cur_time = rsx::uclock();
const u64 cur_time = get_system_time();
for (auto it = vis_set.begin(); it != vis_set.end();)
{


@ -499,7 +499,7 @@ namespace rsx
}
if (auto rsxthr = rsx::get_current_renderer(); rsxthr &&
(min_refresh_duration_us + rsxthr->last_host_flip_timestamp) < rsx::uclock())
(min_refresh_duration_us + rsxthr->last_host_flip_timestamp) < get_system_time())
{
rsxthr->async_flip_requested |= rsx::thread::flip_request::native_ui;
}


@ -173,10 +173,10 @@ namespace rsx
break;
}
start_time = rsx::uclock();
start_time = get_system_time();
}
auto now = rsx::uclock();
auto now = get_system_time();
if (now - start_time >= 50u)
{
if (m_thread->is_stopped())
@ -186,7 +186,7 @@ namespace rsx
m_thread->cpu_wait({});
const auto then = std::exchange(now, rsx::uclock());
const auto then = std::exchange(now, get_system_time());
start_time = now;
m_thread->performance_counters.idle_time += now - then;
}
@ -623,7 +623,7 @@ namespace rsx
{
if (performance_counters.state == FIFO::state::running)
{
performance_counters.FIFO_idle_timestamp = rsx::uclock();
performance_counters.FIFO_idle_timestamp = get_system_time();
performance_counters.state = FIFO::state::nop;
}
@ -633,7 +633,7 @@ namespace rsx
{
if (performance_counters.state == FIFO::state::running)
{
performance_counters.FIFO_idle_timestamp = rsx::uclock();
performance_counters.FIFO_idle_timestamp = get_system_time();
performance_counters.state = FIFO::state::empty;
}
else
@ -668,7 +668,7 @@ namespace rsx
//Jump to self. Often preceded by NOP
if (performance_counters.state == FIFO::state::running)
{
performance_counters.FIFO_idle_timestamp = rsx::uclock();
performance_counters.FIFO_idle_timestamp = get_system_time();
sync_point_request.release(true);
}
@ -749,7 +749,7 @@ namespace rsx
}
// Update performance counters with time spent in idle mode
performance_counters.idle_time += (rsx::uclock() - performance_counters.FIFO_idle_timestamp);
performance_counters.idle_time += (get_system_time() - performance_counters.FIFO_idle_timestamp);
}
do


@ -1,10 +1,6 @@
#include "stdafx.h"
#include "RSXThread.h"
#include "Emu/Cell/PPUCallback.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/timers.hpp"
#include "Capture/rsx_capture.h"
#include "Common/BufferUtils.h"
#include "Common/buffer_stream.hpp"
@ -13,9 +9,17 @@
#include "Common/time.hpp"
#include "Core/RSXReservationLock.hpp"
#include "Core/RSXEngLock.hpp"
#include "Host/RSXDMAWriter.h"
#include "NV47/HW/context.h"
#include "Program/GLSLCommon.h"
#include "rsx_methods.h"
#include "gcm_printing.h"
#include "RSXDisAsm.h"
#include "Emu/Cell/PPUCallback.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/timers.hpp"
#include "Emu/Cell/lv2/sys_event.h"
#include "Emu/Cell/lv2/sys_time.h"
#include "Emu/Cell/Modules/cellGcmSys.h"
@ -23,11 +27,10 @@
#include "Overlays/overlay_perf_metrics.h"
#include "Overlays/overlay_debug_overlay.h"
#include "Overlays/overlay_message.h"
#include "Program/GLSLCommon.h"
#include "Utilities/date_time.h"
#include "Utilities/StrUtil.h"
#include "Crypto/unzip.h"
#include "NV47/HW/context.h"
#include "util/asm.hpp"
@ -1021,7 +1024,7 @@ namespace rsx
fifo_ctrl = std::make_unique<::rsx::FIFO::FIFO_control>(this);
fifo_ctrl->set_get(ctrl->get);
last_guest_flip_timestamp = rsx::uclock() - 1000000;
last_guest_flip_timestamp = get_system_time() - 1000000;
vblank_count = 0;
@ -1101,7 +1104,7 @@ namespace rsx
if (Emu.IsPaused())
{
// Save the difference before pause
start_time = rsx::uclock() - start_time;
start_time = get_system_time() - start_time;
while (Emu.IsPaused() && !is_stopped())
{
@ -1109,7 +1112,7 @@ namespace rsx
}
// Restore difference
start_time = rsx::uclock() - start_time;
start_time = get_system_time() - start_time;
}
}
})));
@ -1162,6 +1165,11 @@ namespace rsx
// Update other sub-units
zcull_ctrl->update(this);
if (m_host_dma_ctrl)
{
m_host_dma_ctrl->update();
}
}
// Execute FIFO queue
@ -3049,7 +3057,7 @@ namespace rsx
}
}
last_host_flip_timestamp = rsx::uclock();
last_host_flip_timestamp = get_system_time();
}
void thread::check_zcull_status(bool framebuffer_swap)
@ -3291,7 +3299,7 @@ namespace rsx
{
bool kill_itself = g_cfg.core.rsx_fifo_accuracy == rsx_fifo_mode::as_ps3;
const u64 current_time = rsx::uclock();
const u64 current_time = get_system_time();
if (recovered_fifo_cmds_history.size() == 20u)
{
@ -3373,7 +3381,7 @@ namespace rsx
// Some cases do not need full delay
remaining = utils::aligned_div(remaining, div);
const u64 until = rsx::uclock() + remaining;
const u64 until = get_system_time() + remaining;
while (true)
{
@ -3404,7 +3412,7 @@ namespace rsx
busy_wait(100);
}
const u64 current = rsx::uclock();
const u64 current = get_system_time();
if (current >= until)
{
@ -3500,58 +3508,71 @@ namespace rsx
}
}
void thread::on_notify_memory_unmapped(u32 address, u32 size)
void thread::on_notify_pre_memory_unmapped(u32 address, u32 size, std::vector<std::pair<u64, u64>>& event_data)
{
if (rsx_thread_running && address < rsx::constants::local_mem_base)
{
if (!isHLE)
// Each bit represents io entry to be unmapped
u64 unmap_status[512 / 64]{};
for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++)
{
// Each bit represents io entry to be unmapped
u64 unmap_status[512 / 64]{};
for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++)
{
const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20);
if (io + 1)
{
unmap_status[io / 64] |= 1ull << (io & 63);
iomap_table.ea[io].release(-1);
iomap_table.io[ea].release(-1);
}
}
for (u32 i = 0; i < std::size(unmap_status); i++)
{
// TODO: Check order when sending multiple events
if (u64 to_unmap = unmap_status[i])
{
// Each 64 entries are grouped by a bit
const u64 io_event = SYS_RSX_EVENT_UNMAPPED_BASE << i;
send_event(0, io_event, to_unmap);
}
}
}
else
{
// TODO: Fix this
u32 ea = address >> 20, io = iomap_table.io[ea];
const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20);
if (io + 1)
{
io >>= 20;
auto& cfg = g_fxo->get<gcm_config>();
std::lock_guard lock(cfg.gcmio_mutex);
for (const u32 end = ea + (size >> 20); ea < end;)
{
cfg.offsetTable.ioAddress[ea++] = 0xFFFF;
cfg.offsetTable.eaAddress[io++] = 0xFFFF;
}
unmap_status[io / 64] |= 1ull << (io & 63);
iomap_table.io[ea].release(-1);
iomap_table.ea[io].release(-1);
}
}
auto& cfg = g_fxo->get<gcm_config>();
std::unique_lock<shared_mutex> hle_lock;
for (u32 i = 0; i < std::size(unmap_status); i++)
{
// TODO: Check order when sending multiple events
if (u64 to_unmap = unmap_status[i])
{
if (isHLE)
{
if (!hle_lock)
{
hle_lock = std::unique_lock{cfg.gcmio_mutex};
}
int bit = 0;
while (to_unmap)
{
bit = (std::countr_zero<u64>(utils::rol64(to_unmap, 0 - bit)) + bit);
to_unmap &= ~(1ull << bit);
constexpr u16 null_entry = 0xFFFF;
const u32 ea = std::exchange(cfg.offsetTable.eaAddress[(i * 64 + bit)], null_entry);
if (ea < (rsx::constants::local_mem_base >> 20))
{
cfg.offsetTable.eaAddress[ea] = null_entry;
}
}
continue;
}
// Each 64 entries are grouped by a bit
const u64 io_event = SYS_RSX_EVENT_UNMAPPED_BASE << i;
event_data.emplace_back(io_event, to_unmap);
}
}
if (hle_lock)
{
hle_lock.unlock();
}
// Pause RSX thread momentarily to handle unmapping
eng_lock elock(this);
@ -3581,6 +3602,14 @@ namespace rsx
}
}
void thread::on_notify_post_memory_unmapped(u64 event_data1, u64 event_data2)
{
if (!isHLE)
{
send_event(0, event_data1, event_data2);
}
}
// NOTE: m_mtx_task lock must be acquired before calling this method
void thread::handle_invalidated_memory_range()
{
@ -3646,7 +3675,7 @@ namespace rsx
//Average load over around 30 frames
if (!performance_counters.last_update_timestamp || performance_counters.sampled_frames > 30)
{
const auto timestamp = rsx::uclock();
const auto timestamp = get_system_time();
const auto idle = performance_counters.idle_time.load();
const auto elapsed = timestamp - performance_counters.last_update_timestamp;
@ -3930,7 +3959,7 @@ namespace rsx
flip(m_queued_flip);
last_guest_flip_timestamp = rsx::uclock() - 1000000;
last_guest_flip_timestamp = get_system_time() - 1000000;
flip_status = CELL_GCM_DISPLAY_FLIP_STATUS_DONE;
m_queued_flip.in_progress = false;
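Implementation note on the reworked unmap path above: io entries are first gathered into unmap_status, one bit per entry, and the HLE branch then visits each set bit via std::countr_zero. The same traversal in isolation (C++20 <bit>, hypothetical helper):

#include <bit>
#include <cstdint>

// Visit every set bit of 'mask', lowest first.
template <typename F>
void for_each_set_bit(uint64_t mask, F&& visit)
{
    while (mask)
    {
        visit(std::countr_zero(mask)); // bit index, e.g. entry i * 64 + bit
        mask &= mask - 1;              // clear the lowest set bit
    }
}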


@ -42,6 +42,8 @@ extern rsx::frame_capture_data frame_capture;
namespace rsx
{
class RSXDMAWriter;
struct context;
namespace overlays
@ -212,6 +214,9 @@ namespace rsx
// Context
context* m_ctx = nullptr;
// Host DMA
std::unique_ptr<RSXDMAWriter> m_host_dma_ctrl;
public:
atomic_t<u64> new_get_put = u64{umax};
u32 restore_point = 0;
@ -494,11 +499,18 @@ namespace rsx
*/
void on_notify_memory_mapped(u32 address_base, u32 size);
/**
* Notify that a section of memory is to be unmapped
* Any data held in the defined range is discarded
* Sets optional unmap event data
*/
void on_notify_pre_memory_unmapped(u32 address_base, u32 size, std::vector<std::pair<u64, u64>>& event_data);
/**
* Notify that a section of memory has been unmapped
* Any data held in the defined range is discarded
*/
void on_notify_memory_unmapped(u32 address_base, u32 size);
void on_notify_post_memory_unmapped(u64 event_data1, u64 event_data2);
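Taken together, the pre/post pair splits unmapping into two phases: the pre hook collects event payloads while the range is still mapped, and the post hook fires them once the unmap is done. A minimal caller-side sketch (the surrounding memory-manager code is assumed, not shown in this commit):

std::vector<std::pair<u64, u64>> event_data;
rsx->on_notify_pre_memory_unmapped(addr, size, event_data); // phase 1: gather SYS_RSX_EVENT_UNMAPPED_BASE payloads
// ... the memory manager unmaps the range here ...
for (const auto& [data1, data2] : event_data)
{
    rsx->on_notify_post_memory_unmapped(data1, data2); // phase 2: send_event for each queued payload
}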
/**
* Notify to check internal state during semaphore wait

View file

@ -542,7 +542,7 @@ namespace rsx
}
}
if (m_tsc = rsx::uclock(); m_tsc < m_next_tsc)
if (m_tsc = get_system_time(); m_tsc < m_next_tsc)
{
return;
}

View file

@ -15,6 +15,7 @@
#include "vkutils/scratch.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/Host/RSXDMAWriter.h"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
#include "Emu/Memory/vm_locking.h"
@ -867,8 +868,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
VMM_ALLOCATION_POOL_SYSTEM);
m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
ensure(m_host_data_ptr->magic == 0xCAFEBABE);
m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(m_host_object_data->map(0, 0x10000));
}
else
{
@ -1257,7 +1257,7 @@ void VKGSRender::notify_tile_unbound(u32 tile)
if (false)
{
u32 addr = rsx::get_address(tiles[tile].offset, tiles[tile].location);
on_notify_memory_unmapped(addr, tiles[tile].size);
on_notify_pre_memory_unmapped(addr, tiles[tile].size, *std::make_unique<std::vector<std::pair<u64, u64>>>());
m_rtts.invalidate_surface_address(addr, false);
}
@ -1784,6 +1784,11 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
m_current_command_buffer->begin();
}
std::pair<volatile vk::host_data_t*, VkBuffer> VKGSRender::map_host_object_data() const
{
return { m_host_dma_ctrl->host_ctx(), m_host_object_data->value };
}
bool VKGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
@ -1791,25 +1796,13 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
return false;
}
auto drain_label_queue = [this]()
{
while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
utils::pause();
auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
}
};
ensure(m_host_data_ptr);
if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
if (host_ctx->texture_loads_completed())
{
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
drain_label_queue();
m_host_dma_ctrl->drain_label_queue();
return false;
}
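The removed lambda documents the waiting behavior the new helper inherits; a sketch of what RSXDMAWriter::drain_label_queue() presumably does (member names are guesses, see Emu/RSX/Host/RSXDMAWriter.h for the real definition):

void rsx::RSXDMAWriter::drain_label_queue()
{
    // Spin until the host GPU has consumed every submitted label
    while (m_host_ctx->last_label_acquire_event > m_host_ctx->commands_complete_event)
    {
        utils::pause();

        if (thread_ctrl::state() == thread_state::aborting)
        {
            break;
        }
    }
}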
@ -1821,13 +1814,13 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
// Take the L and try the fallback.
rsx_log.warning("Host label update at 0x%x was not possible.", address);
drain_label_queue();
m_host_dma_ctrl->drain_label_queue();
return false;
}
m_host_data_ptr->last_label_release_event = m_host_data_ptr->inc_counter();
const auto release_event_id = host_ctx->on_label_acquire();
if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
if (host_ctx->has_unflushed_texture_loads())
{
if (vk::is_renderpass_open(*m_current_command_buffer))
{
@ -1842,17 +1835,31 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
auto cmd = m_secondary_cb_list.next();
cmd->begin();
vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, &release_event_id);
cmd->end();
vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
cmd->submit(submit_info);
m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
host_ctx->on_label_release();
}
return true;
}
void VKGSRender::on_guest_texture_read(const vk::command_buffer& cmd)
{
if (!backend_config.supports_host_gpu_labels)
{
return;
}
// Queue a sync update on the CB doing the load
auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
const auto event_id = host_ctx->on_texture_load_acquire();
vkCmdUpdateBuffer(cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
}
void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload)
{
rsx::thread::sync_hint(hint, payload);
@ -1885,7 +1892,7 @@ void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hi
// OK, cell will be accessing the results, probably.
// Try to avoid flush spam, it is more costly to flush the CB than it is to just upload the vertex data
// This is supposed to be an optimization afterall.
const auto now = rsx::uclock();
const auto now = get_system_time();
if ((now - m_last_cond_render_eval_hint) > 50)
{
// Schedule a sync on the next loop iteration
@ -2516,15 +2523,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
if (m_host_dma_ctrl && m_host_dma_ctrl->host_ctx()->needs_label_release())
{
vkCmdUpdateBuffer(*m_current_command_buffer,
m_host_object_data->value,
::offset32(&vk::host_data_t::commands_complete_event),
sizeof(u64),
const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
const_cast<u64*>(&m_host_dma_ctrl->host_ctx()->last_label_acquire_event));
m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
m_host_dma_ctrl->host_ctx()->on_label_release();
}
m_current_command_buffer->end();

View file

@ -1,6 +1,4 @@
#pragma once
#include "Emu/RSX/GSRender.h"
#include "Emu/Cell/timers.hpp"
#include "upscalers/upscaling.h"
@ -19,15 +17,23 @@
#include "VKFramebuffer.h"
#include "VKShaderInterpreter.h"
#include "VKQueryPool.h"
#include "../GCM.h"
#include "util/asm.hpp"
#include "Emu/RSX/GCM.h"
#include "Emu/RSX/GSRender.h"
#include "Emu/RSX/Host/RSXDMAWriter.h"
#include <thread>
#include <optional>
using namespace vk::vmm_allocation_pool_; // clang workaround.
using namespace vk::upscaling_flags_; // ditto
namespace vk
{
using host_data_t = rsx::host_gpu_context_t;
}
class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
{
private:
@ -118,7 +124,6 @@ private:
vk::command_buffer_chain<VK_MAX_ASYNC_CB_COUNT> m_primary_cb_list;
vk::command_buffer_chunk* m_current_command_buffer = nullptr;
volatile vk::host_data_t* m_host_data_ptr = nullptr;
std::unique_ptr<vk::buffer> m_host_object_data;
vk::descriptor_pool m_descriptor_pool;
@ -274,7 +279,8 @@ public:
void end_conditional_rendering() override;
// Host sync object
inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() const;
void on_guest_texture_read(const vk::command_buffer& cmd);
// GRAPH backend
void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;

View file

@ -6,10 +6,12 @@
#include "Emu/RSX/Common/simple_array.hpp"
#include "Emu/RSX/rsx_utils.h"
#include "Emu/RSX/rsx_cache.h"
#include "Utilities/mutex.h"
#include "util/asm.hpp"
#include <optional>
#include <thread>
// Initial heap allocation values. The heaps are growable and will automatically increase in size to accommodate demands
#define VK_ATTRIB_RING_BUFFER_SIZE_M 64

View file

@ -1240,15 +1240,9 @@ namespace vk
dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
}
if (auto rsxthr = rsx::get_current_renderer();
rsxthr->get_backend_config().supports_host_gpu_labels)
if (auto rsxthr = static_cast<VKGSRender*>(rsx::get_current_renderer()))
{
// Queue a sync update on the CB doing the load
auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
ensure(host_data);
const auto event_id = host_data->inc_counter();
host_data->texture_load_request_event = event_id;
vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
rsxthr->on_guest_texture_read(cmd2);
}
}

View file

@ -18,25 +18,6 @@ namespace vk
gpu = 1
};
struct host_data_t // Pick a better name
{
u64 magic = 0xCAFEBABE;
u64 event_counter = 0;
u64 texture_load_request_event = 0;
u64 texture_load_complete_event = 0;
u64 last_label_release_event = 0;
u64 last_label_submit_event = 0;
u64 commands_complete_event = 0;
u64 last_label_request_timestamp = 0;
inline u64 inc_counter() volatile
{
// Workaround for volatile increment warning. GPU can see this value directly, but currently we do not modify it on the device.
event_counter = event_counter + 1;
return event_counter;
}
};
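For orientation: the comparisons this struct used to back now live behind predicates on rsx::host_gpu_context_t. A plausible mapping, assuming the new struct keeps equivalent counters (sketch only; see Emu/RSX/Host/RSXDMAWriter.h):

bool texture_loads_completed() const { return texture_load_complete_event == texture_load_request_event; }
bool has_unflushed_texture_loads() const { return texture_load_request_event > last_label_submit_event; }
bool needs_label_release() const { return last_label_acquire_event > last_label_submit_event; }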
struct fence
{
atomic_t<bool> flushed = false;

View file

@ -52,6 +52,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\GL\GLCompute.h" />
<ClInclude Include="Emu\RSX\GL\GLDMA.h" />
<ClInclude Include="Emu\RSX\GL\GLOverlays.h" />
<ClInclude Include="Emu\RSX\GL\GLPipelineCompiler.h" />
<ClInclude Include="Emu\RSX\GL\GLCommonDecompiler.h" />
@ -88,6 +89,7 @@
<ItemGroup>
<ClCompile Include="Emu\RSX\GL\GLCommonDecompiler.cpp" />
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
<ClCompile Include="Emu\RSX\GL\GLDMA.cpp" />
<ClCompile Include="Emu\RSX\GL\GLDraw.cpp" />
<ClCompile Include="Emu\RSX\GL\GLFragmentProgram.cpp" />
<ClCompile Include="Emu\RSX\GL\GLGSRender.cpp" />

View file

@ -47,6 +47,7 @@
<ClCompile Include="Emu\RSX\GL\upscalers\fsr1\fsr_pass.cpp">
<Filter>upscalers\fsr1</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\GL\GLDMA.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\GL\GLTexture.h" />
@ -118,6 +119,7 @@
<ClInclude Include="Emu\RSX\GL\upscalers\fsr_pass.h">
<Filter>upscalers</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\GL\GLDMA.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="glutils">

View file

@ -1219,7 +1219,7 @@ void evdev_joystick_handler::apply_input_events(const std::shared_ptr<Pad>& pad)
s32 stick_val[4]{};
// Translate any corresponding keycodes to our two sticks. (ignoring thresholds for now)
for (int i = 0; i < static_cast<int>(pad->m_sticks.size()); i++)
for (usz i = 0; i < pad->m_sticks.size(); i++)
{
bool pressed{}; // unused
u16 val_min{};
@ -1424,23 +1424,23 @@ bool evdev_joystick_handler::bindPadToDevice(std::shared_ptr<Pad> pad)
m_dev->axis_right[2] = find_buttons(cfg->rs_up);
m_dev->axis_right[3] = find_buttons(cfg->rs_down);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_X, m_dev->axis_left[1], m_dev->axis_left[0]);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_Y, m_dev->axis_left[3], m_dev->axis_left[2]);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_X, m_dev->axis_right[1], m_dev->axis_right[0]);
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_Y, m_dev->axis_right[3], m_dev->axis_right[2]);
pad->m_sticks[0] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_X, m_dev->axis_left[1], m_dev->axis_left[0]);
pad->m_sticks[1] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_Y, m_dev->axis_left[3], m_dev->axis_left[2]);
pad->m_sticks[2] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_X, m_dev->axis_right[1], m_dev->axis_right[0]);
pad->m_sticks[3] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_Y, m_dev->axis_right[3], m_dev->axis_right[2]);
m_dev->axis_motion[0] = find_motion_button(cfg->motion_sensor_x);
m_dev->axis_motion[1] = find_motion_button(cfg->motion_sensor_y);
m_dev->axis_motion[2] = find_motion_button(cfg->motion_sensor_z);
m_dev->axis_motion[3] = find_motion_button(cfg->motion_sensor_g);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_X, m_dev->axis_motion[0].code, m_dev->axis_motion[0].mirrored, m_dev->axis_motion[0].shift, DEFAULT_MOTION_X);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_Y, m_dev->axis_motion[1].code, m_dev->axis_motion[1].mirrored, m_dev->axis_motion[1].shift, DEFAULT_MOTION_Y);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_Z, m_dev->axis_motion[2].code, m_dev->axis_motion[2].mirrored, m_dev->axis_motion[2].shift, DEFAULT_MOTION_Z);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_G, m_dev->axis_motion[3].code, m_dev->axis_motion[3].mirrored, m_dev->axis_motion[3].shift, DEFAULT_MOTION_G);
pad->m_sensors[0] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_X, m_dev->axis_motion[0].code, m_dev->axis_motion[0].mirrored, m_dev->axis_motion[0].shift, DEFAULT_MOTION_X);
pad->m_sensors[1] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_Y, m_dev->axis_motion[1].code, m_dev->axis_motion[1].mirrored, m_dev->axis_motion[1].shift, DEFAULT_MOTION_Y);
pad->m_sensors[2] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_Z, m_dev->axis_motion[2].code, m_dev->axis_motion[2].mirrored, m_dev->axis_motion[2].shift, DEFAULT_MOTION_Z);
pad->m_sensors[3] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_G, m_dev->axis_motion[3].code, m_dev->axis_motion[3].mirrored, m_dev->axis_motion[3].shift, DEFAULT_MOTION_G);
pad->m_vibrateMotors.emplace_back(true, 0);
pad->m_vibrateMotors.emplace_back(false, 0);
pad->m_vibrateMotors[0] = VibrateMotor(true, 0);
pad->m_vibrateMotors[1] = VibrateMotor(false, 0);
if (std::shared_ptr<EvdevDevice> evdev_device = add_device(player_config->device, false))
{

View file

@ -314,11 +314,6 @@ void keyboard_pad_handler::release_all_keys()
for (usz i = 0; i < pad.m_sticks.size(); i++)
{
if (i >= max_sticks)
{
input_log.fatal("Too many sticks (%d vs %d)", pad.m_sticks.size(), max_sticks);
break;
}
m_stick_min[i] = 0;
m_stick_max[i] = 128;
m_stick_val[i] = 128;
@ -1067,18 +1062,18 @@ bool keyboard_pad_handler::bindPadToDevice(std::shared_ptr<Pad> pad)
pad->m_buttons.emplace_back(CELL_PAD_BTN_OFFSET_PRESS_PIGGYBACK, find_keys(cfg->tilt_right), CELL_PAD_CTRL_PRESS_R1);
}
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_X, find_keys(cfg->ls_left), find_keys(cfg->ls_right));
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_Y, find_keys(cfg->ls_up), find_keys(cfg->ls_down));
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_X, find_keys(cfg->rs_left), find_keys(cfg->rs_right));
pad->m_sticks.emplace_back(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_Y, find_keys(cfg->rs_up), find_keys(cfg->rs_down));
pad->m_sticks[0] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_X, find_keys(cfg->ls_left), find_keys(cfg->ls_right));
pad->m_sticks[1] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_LEFT_Y, find_keys(cfg->ls_up), find_keys(cfg->ls_down));
pad->m_sticks[2] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_X, find_keys(cfg->rs_left), find_keys(cfg->rs_right));
pad->m_sticks[3] = AnalogStick(CELL_PAD_BTN_OFFSET_ANALOG_RIGHT_Y, find_keys(cfg->rs_up), find_keys(cfg->rs_down));
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_X, 0, 0, 0, DEFAULT_MOTION_X);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_Y, 0, 0, 0, DEFAULT_MOTION_Y);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_Z, 0, 0, 0, DEFAULT_MOTION_Z);
pad->m_sensors.emplace_back(CELL_PAD_BTN_OFFSET_SENSOR_G, 0, 0, 0, DEFAULT_MOTION_G);
pad->m_sensors[0] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_X, 0, 0, 0, DEFAULT_MOTION_X);
pad->m_sensors[1] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_Y, 0, 0, 0, DEFAULT_MOTION_Y);
pad->m_sensors[2] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_Z, 0, 0, 0, DEFAULT_MOTION_Z);
pad->m_sensors[3] = AnalogSensor(CELL_PAD_BTN_OFFSET_SENSOR_G, 0, 0, 0, DEFAULT_MOTION_G);
pad->m_vibrateMotors.emplace_back(true, 0);
pad->m_vibrateMotors.emplace_back(false, 0);
pad->m_vibrateMotors[0] = VibrateMotor(true, 0);
pad->m_vibrateMotors[1] = VibrateMotor(false, 0);
m_bindings.emplace_back(pad, nullptr, nullptr);
m_pads_internal.push_back(*pad);
@ -1258,7 +1253,7 @@ void keyboard_pad_handler::process()
// Normalize and apply pad squircling
// Copy sticks first. We don't want to modify the raw internal values
std::vector<AnalogStick> squircled_sticks = pad_internal.m_sticks;
std::array<AnalogStick, 4> squircled_sticks = pad_internal.m_sticks;
// Apply squircling
if (cfg->lpadsquircling != 0)
@ -1278,6 +1273,6 @@ void keyboard_pad_handler::process()
}
pad->m_buttons = pad_internal.m_buttons;
pad->m_sticks = std::move(squircled_sticks);
pad->m_sticks = squircled_sticks; // Don't use std::move here. We assign values locklessly, so std::move can lead to segfaults.
}
}

View file

@ -212,11 +212,8 @@ void pad_thread::SetRumble(const u32 pad, u8 large_motor, bool small_motor)
if (pad >= m_pads.size())
return;
if (m_pads[pad]->m_vibrateMotors.size() >= 2)
{
m_pads[pad]->m_vibrateMotors[0].m_value = large_motor;
m_pads[pad]->m_vibrateMotors[1].m_value = small_motor ? 255 : 0;
}
m_pads[pad]->m_vibrateMotors[0].m_value = large_motor;
m_pads[pad]->m_vibrateMotors[1].m_value = small_motor ? 255 : 0;
}
void pad_thread::SetIntercepted(bool intercepted)

View file

@ -104,6 +104,7 @@
<ClCompile Include="Emu\perf_monitor.cpp" />
<ClCompile Include="Emu\RSX\Common\texture_cache.cpp" />
<ClCompile Include="Emu\RSX\Core\RSXContext.cpp" />
<ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
<ClCompile Include="Emu\RSX\NV47\FW\draw_call.cpp" />
<ClCompile Include="Emu\RSX\NV47\FW\reg_context.cpp" />
<ClCompile Include="Emu\RSX\NV47\HW\common.cpp" />
@ -617,6 +618,7 @@
<ClInclude Include="Emu\RSX\Core\RSXDisplay.h" />
<ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp" />
<ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h" />
<ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h" />
<ClInclude Include="Emu\RSX\NV47\FW\draw_call.hpp" />
<ClInclude Include="Emu\RSX\NV47\FW\draw_call.inc.h" />
<ClInclude Include="Emu\RSX\NV47\FW\GRAPH_backend.h" />

View file

@ -1300,6 +1300,9 @@
<ClCompile Include="Emu\RSX\gcm_enums.cpp">
<Filter>Emu\GPU\RSX\NV47\FW</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Crypto\aes.h">
@ -2620,6 +2623,9 @@
<ClInclude Include="Emu\RSX\color_utils.h">
<Filter>Emu\GPU\RSX\Utils</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">

View file

@ -252,11 +252,12 @@ LOG_CHANNEL(q_debug, "QDEBUG");
struct fatal_error_listener final : logs::listener
{
public:
~fatal_error_listener() override = default;
void log(u64 /*stamp*/, const logs::message& msg, const std::string& prefix, const std::string& text) override
{
if (msg <= logs::level::fatal)
if (msg == logs::level::fatal || (msg == logs::level::always && m_log_always))
{
std::string _msg = "RPCS3: ";
@ -276,10 +277,17 @@ struct fatal_error_listener final : logs::listener
_msg += '\n';
// If launched from CMD
utils::attach_console(utils::console_stream::std_err, false);
utils::attach_console(msg == logs::level::fatal ? utils::console_stream::std_err : utils::console_stream::std_out, false);
// Output to error stream as is
utils::output_stderr(_msg);
if (msg == logs::level::fatal)
{
utils::output_stderr(_msg);
}
else
{
std::cout << _msg;
}
#ifdef _WIN32
if (IsDebuggerPresent())
@ -295,6 +303,14 @@ struct fatal_error_listener final : logs::listener
}
}
}
void log_always(bool enabled)
{
m_log_always = enabled;
}
private:
bool m_log_always = false;
};
// Arguments that force a headless application (need to be checked in create_application)
@ -494,6 +510,7 @@ int main(int argc, char** argv)
}
const std::string lock_name = fs::get_cache_dir() + "RPCS3.buf";
const std::string log_name = fs::get_cache_dir() + "RPCS3.log";
static fs::file instance_lock;
@ -512,19 +529,19 @@ int main(int argc, char** argv)
{
if (fs::exists(lock_name))
{
report_fatal_error("Another instance of RPCS3 is running.\nClose it or kill its process, if necessary.");
report_fatal_error(fmt::format("Another instance of RPCS3 is running.\nClose it or kill its process, if necessary.\n'%s' still exists.", lock_name));
}
report_fatal_error("Cannot create RPCS3.log (access denied)."
report_fatal_error(fmt::format("Cannot create '%s' or '%s' (access denied).\n"
#ifdef _WIN32
"\nNote that RPCS3 cannot be installed in Program Files or similar directories with limited permissions."
"Note that RPCS3 cannot be installed in Program Files or similar directories with limited permissions."
#else
"\nPlease, check RPCS3 permissions in '~/.config/rpcs3'."
"Please, check RPCS3 permissions."
#endif
);
, log_name, lock_name));
}
report_fatal_error(fmt::format("Cannot create RPCS3.log (error %s)", fs::g_tls_error));
report_fatal_error(fmt::format("Cannot create'%s' or '%s' (error=%s)", log_name, lock_name, fs::g_tls_error));
}
#ifdef _WIN32
@ -552,9 +569,6 @@ int main(int argc, char** argv)
ensure(thread_ctrl::is_main(), "Not main thread");
// Initialize TSC freq (in case it isn't)
static_cast<void>(utils::get_tsc_freq());
// Initialize thread pool finalizer (on first use)
static_cast<void>(named_thread("", [](int) {}));
@ -568,10 +582,10 @@ int main(int argc, char** argv)
}
// Limit log size to ~25% of free space
log_file = logs::make_file_listener(fs::get_cache_dir() + "RPCS3.log", stats.avail_free / 4);
log_file = logs::make_file_listener(log_name, stats.avail_free / 4);
}
static std::unique_ptr<logs::listener> fatal_listener = std::make_unique<fatal_error_listener>();
static std::unique_ptr<fatal_error_listener> fatal_listener = std::make_unique<fatal_error_listener>();
logs::listener::add(fatal_listener.get());
{
@ -999,6 +1013,10 @@ int main(int argc, char** argv)
return 0;
}
// Enable console output of "always" log messages.
// Do this after parsing any Qt cli args that might open a window.
fatal_listener->log_always(true);
// Log unique ID
gui::utils::log_uuid();

View file

@ -28,7 +28,7 @@ namespace rpcs3
// Currently accessible by Windows and Linux build scripts, see implementations when doing MACOSX
const utils::version& get_version()
{
static constexpr utils::version version{ 0, 0, 33, utils::version_type::alpha, 1, RPCS3_GIT_VERSION };
static constexpr utils::version version{ 0, 0, 34, utils::version_type::alpha, 1, RPCS3_GIT_VERSION };
return version;
}

View file

@ -441,9 +441,9 @@ void emu_settings::EnhanceCheckBox(QCheckBox* checkbox, emu_settings_type type)
m_broken_types.insert(type);
}
connect(checkbox, &QCheckBox::stateChanged, this, [type, this](int val)
connect(checkbox, &QCheckBox::checkStateChanged, this, [type, this](Qt::CheckState val)
{
const std::string str = val != 0 ? "true" : "false";
const std::string str = val != Qt::Unchecked ? "true" : "false";
SetSetting(type, str);
});
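This is one instance of a pattern applied throughout the commit: QCheckBox::stateChanged(int) is replaced by checkStateChanged(Qt::CheckState), which Qt introduced in 6.7. The generic shape (illustrative):

connect(checkbox, &QCheckBox::checkStateChanged, this, [](Qt::CheckState state)
{
    const bool checked = state != Qt::Unchecked; // was: val != 0
    // ... apply `checked` to the setting ...
});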

View file

@ -14,6 +14,30 @@ game_list::game_list() : QTableWidget(), game_list_base()
};
}
void game_list::sync_header_actions(QList<QAction*>& actions, std::function<bool(int)> get_visibility)
{
ensure(get_visibility);
bool is_dirty = false;
for (int col = 0; col < actions.count(); ++col)
{
const bool is_hidden = !get_visibility(col);
actions[col]->setChecked(!is_hidden);
if (isColumnHidden(col) != is_hidden)
{
setColumnHidden(col, is_hidden);
is_dirty = true;
}
}
if (is_dirty)
{
fix_narrow_columns();
}
}
void game_list::create_header_actions(QList<QAction*>& actions, std::function<bool(int)> get_visibility, std::function<void(int, bool)> set_visibility)
{
ensure(get_visibility);
@ -48,6 +72,7 @@ void game_list::create_header_actions(QList<QAction*>& actions, std::function<bo
return;
}
}
setColumnHidden(col, !checked); // Negate: the menu action means "show" while the API sets "hidden".
set_visibility(col, checked);
@ -56,11 +81,9 @@ void game_list::create_header_actions(QList<QAction*>& actions, std::function<bo
fix_narrow_columns();
}
});
const bool vis = get_visibility(col);
actions[col]->setChecked(vis);
setColumnHidden(col, !vis);
}
sync_header_actions(actions, get_visibility);
}
void game_list::clear_list()

View file

@ -24,6 +24,7 @@ class game_list : public QTableWidget, public game_list_base
public:
game_list();
void sync_header_actions(QList<QAction*>& actions, std::function<bool(int)> get_visibility);
void create_header_actions(QList<QAction*>& actions, std::function<bool(int)> get_visibility, std::function<void(int, bool)> set_visibility);
void clear_list() override; // Use this instead of clearContents

View file

@ -230,12 +230,7 @@ void game_list_frame::LoadSettings()
m_show_custom_icons = m_gui_settings->GetValue(gui::gl_custom_icon).toBool();
m_play_hover_movies = m_gui_settings->GetValue(gui::gl_hover_gifs).toBool();
for (int col = 0; col < m_columnActs.count(); ++col)
{
const bool vis = m_gui_settings->GetGamelistColVisibility(static_cast<gui::game_list_columns>(col));
m_columnActs[col]->setChecked(vis);
m_game_list->setColumnHidden(col, !vis);
}
m_game_list->sync_header_actions(m_columnActs, [this](int col) { return m_gui_settings->GetGamelistColVisibility(static_cast<gui::game_list_columns>(col)); });
}
game_list_frame::~game_list_frame()
@ -915,6 +910,7 @@ void game_list_frame::OnRefreshFinished()
if (!std::exchange(m_initial_refresh_done, true))
{
m_game_list->restore_layout(m_gui_settings->GetValue(gui::gl_state).toByteArray());
m_game_list->sync_header_actions(m_columnActs, [this](int col) { return m_gui_settings->GetGamelistColVisibility(static_cast<gui::game_list_columns>(col)); });
}
// Emit signal and remove slots

View file

@ -160,9 +160,17 @@ bool gui_application::Init()
if (m_gui_settings->GetValue(gui::ib_show_welcome).toBool())
{
welcome_dialog* welcome = new welcome_dialog(m_gui_settings, false);
bool use_dark_theme = false;
connect(welcome, &QDialog::accepted, this, [&]()
{
use_dark_theme = welcome->does_user_want_dark_theme();
});
welcome->exec();
if (welcome->does_user_want_dark_theme())
if (use_dark_theme)
{
m_gui_settings->SetValue(gui::m_currentStylesheet, "Darker Style by TheMitoSan");
}

View file

@ -99,7 +99,7 @@ pad_motion_settings_dialog::pad_motion_settings_dialog(QDialog* parent, std::sha
m_shifts[i]->setRange(config->shift.min, config->shift.max);
m_shifts[i]->setValue(config->shift.get());
connect(m_mirrors[i], &QCheckBox::stateChanged, this, [this, i](int state)
connect(m_mirrors[i], &QCheckBox::checkStateChanged, this, [this, i](Qt::CheckState state)
{
std::lock_guard lock(m_config_mutex);
m_config_entries[i]->mirrored.set(state != Qt::Unchecked);

View file

@ -93,7 +93,7 @@ patch_manager_dialog::patch_manager_dialog(std::shared_ptr<gui_settings> gui_set
connect(ui->patch_tree, &QTreeWidget::currentItemChanged, this, &patch_manager_dialog::handle_item_selected);
connect(ui->patch_tree, &QTreeWidget::itemChanged, this, &patch_manager_dialog::handle_item_changed);
connect(ui->patch_tree, &QTreeWidget::customContextMenuRequested, this, &patch_manager_dialog::handle_custom_context_menu_requested);
connect(ui->cb_owned_games_only, &QCheckBox::stateChanged, this, &patch_manager_dialog::handle_show_owned_games_only);
connect(ui->cb_owned_games_only, &QCheckBox::checkStateChanged, this, &patch_manager_dialog::handle_show_owned_games_only);
connect(ui->configurable_selector, QOverload<int>::of(&QComboBox::currentIndexChanged), this, [this](int index)
{
if (index >= 0)
@ -1087,7 +1087,7 @@ void patch_manager_dialog::dropEvent(QDropEvent* event)
}
}
void patch_manager_dialog::handle_show_owned_games_only(int state)
void patch_manager_dialog::handle_show_owned_games_only(Qt::CheckState state)
{
m_show_owned_games_only = state == Qt::CheckState::Checked;
m_gui_settings->SetValue(gui::pm_show_owned, m_show_owned_games_only);

View file

@ -50,7 +50,7 @@ private Q_SLOTS:
void handle_item_changed(QTreeWidgetItem* item, int column);
void handle_config_value_changed(double value);
void handle_custom_context_menu_requested(const QPoint& pos);
void handle_show_owned_games_only(int state);
void handle_show_owned_games_only(Qt::CheckState state);
private:
void refresh(bool restore_layout = false);

View file

@ -1433,7 +1433,7 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
if (game)
ui->gb_DiskCacheClearing->setDisabled(true);
else
connect(ui->enableCacheClearing, &QCheckBox::stateChanged, ui->maximumCacheSize, &QSlider::setEnabled);
connect(ui->enableCacheClearing, &QCheckBox::checkStateChanged, ui->maximumCacheSize, &QSlider::setEnabled);
// Date Time Edit Box
m_emu_settings->EnhanceDateTimeEdit(ui->console_time_edit, emu_settings_type::ConsoleTimeOffset, tr("dd MMM yyyy HH:mm"), true, true, 15000);
@ -1580,7 +1580,7 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
ui->mfcDelayCommand->setChecked(m_emu_settings->GetSetting(emu_settings_type::MFCCommandsShuffling) == "1");
SubscribeTooltip(ui->mfcDelayCommand, tooltips.settings.mfc_delay_command);
connect(ui->mfcDelayCommand, &QCheckBox::stateChanged, [&](int val)
connect(ui->mfcDelayCommand, &QCheckBox::checkStateChanged, [&](Qt::CheckState val)
{
const std::string str = val != Qt::Unchecked ? "1" : "0";
m_emu_settings->SetSetting(emu_settings_type::MFCCommandsShuffling, str);

View file

@ -33,6 +33,7 @@
#include <QWheelEvent>
#include <QGuiApplication>
#include <QScreen>
#include <QTimeZone>
LOG_CHANNEL(gui_log, "GUI");
@ -522,6 +523,13 @@ void trophy_manager_dialog::RepaintUI(bool restore_layout)
//m_trophy_table->horizontalHeader()->resizeSections(QHeaderView::ResizeMode::ResizeToContents);
}
if (restore_layout)
{
// Make sure the actions and the headers are synced
m_game_table->sync_header_actions(m_game_column_acts, [this](int col) { return m_gui_settings->GetTrophyGamelistColVisibility(static_cast<gui::trophy_game_list_columns>(col)); });
m_trophy_table->sync_header_actions(m_trophy_column_acts, [this](int col) { return m_gui_settings->GetTrophylistColVisibility(static_cast<gui::trophy_list_columns>(col)); });
}
ApplyFilter();
// Show dialog and then paint gui in order to adjust headers correctly
@ -543,6 +551,10 @@ void trophy_manager_dialog::HandleRepaintUiRequest()
m_game_table->horizontalHeader()->restoreState(game_table_state);
m_trophy_table->horizontalHeader()->restoreState(trophy_table_state);
// Make sure the actions and the headers are synced
m_game_table->sync_header_actions(m_game_column_acts, [this](int col) { return m_gui_settings->GetTrophyGamelistColVisibility(static_cast<gui::trophy_game_list_columns>(col)); });
m_trophy_table->sync_header_actions(m_trophy_column_acts, [this](int col) { return m_gui_settings->GetTrophylistColVisibility(static_cast<gui::trophy_list_columns>(col)); });
resize(window_size);
}
@ -1331,7 +1343,7 @@ QDateTime trophy_manager_dialog::TickToDateTime(u64 tick)
const QDateTime datetime(
QDate(rtc_date.year, rtc_date.month, rtc_date.day),
QTime(rtc_date.hour, rtc_date.minute, rtc_date.second, rtc_date.microsecond / 1000),
Qt::TimeSpec::UTC);
QTimeZone::UTC);
return datetime.toLocalTime();
}

View file

@ -410,10 +410,12 @@ namespace utils
return static_cast<T>(value * u64{numerator} / u64{denominator});
}
#if is_u128_emulated
if constexpr (sizeof(T) <= sizeof(u128) / 2)
{
return static_cast<T>(value * u128{numerator} / u64{denominator});
return static_cast<T>(u128_from_mul(value, numerator) / u64{denominator});
}
#endif
return static_cast<T>(value / denominator * numerator + (value % denominator) * numerator / denominator);
}
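The final fallback avoids the wide multiply by splitting the value: since value == (value / d) * d + (value % d), it follows that value * n / d == (value / d) * n + (value % d) * n / d, which stays within 64 bits as long as the remainder-times-numerator product does.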
@ -464,3 +466,7 @@ namespace utils
} // namespace utils
using utils::busy_wait;
#ifdef _MSC_VER
using utils::operator/;
#endif

View file

@ -434,7 +434,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
});
// Set lowest clear bit
const u64 bits = s_cond_bits[level3].fetch_op(FN(x |= x + 1, void()));
const u64 bits = s_cond_bits[level3].fetch_op(AOFN(x |= x + 1, void()));
// Find lowest clear bit (before it was set in fetch_op)
const u32 id = level3 * 64 + std::countr_one(bits);
@ -503,9 +503,9 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
// Release the semaphore tree in the reverse order
s_cond_bits[cond_id / 64] &= ~(1ull << (cond_id % 64));
s_cond_sem3[level2].atomic_op(FN(x -= u128{1} << (level3 * 7)));
s_cond_sem2[level1].atomic_op(FN(x -= u128{1} << (level2 * 11)));
s_cond_sem1.atomic_op(FN(x -= u128{1} << (level1 * 14)));
s_cond_sem3[level2].atomic_op(AOFN(x -= u128{1} << (level3 * 7)));
s_cond_sem2[level1].atomic_op(AOFN(x -= u128{1} << (level2 * 11)));
s_cond_sem1.atomic_op(AOFN(x -= u128{1} << (level1 * 14)));
}
static cond_handle* cond_id_lock(u32 cond_id, uptr iptr = 0)
@ -674,19 +674,28 @@ u64 utils::get_unique_tsc()
{
const u64 stamp0 = utils::get_tsc();
return s_min_tsc.atomic_op([&](u64& tsc)
if (!s_min_tsc.fetch_op([=](u64& tsc)
{
if (stamp0 <= s_min_tsc)
if (stamp0 <= tsc)
{
// Add 1 if new stamp is too old
return ++tsc;
return false;
}
else
{
// Update last tsc with new stamp otherwise
return ((tsc = stamp0));
tsc = stamp0;
return true;
}
});
}).second)
{
// Add 1 if new stamp is too old
// Avoid doing it inside the atomic operation because reaching this point means there is already heavy contention
// So break the race (at least on x86)
return s_min_tsc.add_fetch(1);
}
return stamp0;
}
atomic_t<u16>* root_info::slot_alloc(uptr ptr) noexcept

View file

@ -1233,6 +1233,7 @@ public:
// Atomic operation; returns old value, or pair of old value and return value (cancel op if evaluates to false)
template <typename F, typename RT = std::invoke_result_t<F, T&>>
requires (!std::is_invocable_v<F, const T> && !std::is_invocable_v<F, volatile T>)
std::conditional_t<std::is_void_v<RT>, type, std::pair<type, RT>> fetch_op(F func)
{
type _new, old = atomic_storage<type>::load(m_data);
@ -1264,6 +1265,7 @@ public:
// Atomic operation; returns function result value, function is the lambda
template <typename F, typename RT = std::invoke_result_t<F, T&>>
requires (!std::is_invocable_v<F, const T> && !std::is_invocable_v<F, volatile T>)
RT atomic_op(F func)
{
type _new, old = atomic_storage<type>::load(m_data);
@ -1798,3 +1800,31 @@ struct std::common_type<T, atomic_t<T2, Align2>> : std::common_type<std::common_
#pragma GCC diagnostic pop
#pragma GCC diagnostic pop
#endif
namespace utils
{
template <typename F>
struct aofn_helper
{
F f;
aofn_helper(F&& f) noexcept
: f(std::forward<F>(f))
{
}
template <typename Arg> requires (std::is_same_v<std::remove_reference_t<Arg>, std::remove_cvref_t<Arg>> && !std::is_rvalue_reference_v<Arg>)
auto operator()(Arg& arg) const noexcept
{
return f(std::forward<Arg&>(arg));
}
};
template <typename F>
aofn_helper(F&& f) -> aofn_helper<F>;
}
// Shorter lambda for non-cv qualified L-values
// For use with atomic operations
#define AOFN(...) \
::utils::aofn_helper([&](auto& x) { return (__VA_ARGS__); })
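Usage mirrors the call sites converted in sync.cpp above:

// Set the lowest clear bit, returning the previous value
const u64 bits = s_cond_bits[level3].fetch_op(AOFN(x |= x + 1, void()));
// Release one slot in the semaphore tree
s_cond_sem1.atomic_op(AOFN(x -= u128{1} << (level1 * 14)));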

View file

@ -451,47 +451,47 @@ void logs::message::broadcast(const char* fmt, const fmt_type_info* sup, ...) co
logs::file_writer::file_writer(const std::string& name, u64 max_size)
: m_max_size(max_size)
{
if (!name.empty() && max_size)
if (name.empty() || !max_size)
{
// Initialize ringbuffer
m_fptr = std::make_unique<uchar[]>(s_log_size);
return;
}
// Actual log file (allowed to fail)
if (!m_fout.open(name, fs::rewrite))
{
fprintf(stderr, "Log file open failed: %s (error %d)\n", name.c_str(), errno);
}
// Initialize ringbuffer
m_fptr = std::make_unique<uchar[]>(s_log_size);
// Compressed log, make it inaccessible (foolproof)
if (m_fout2.open(name + ".gz", fs::rewrite + fs::unread))
{
// Actual log file (allowed to fail)
if (!m_fout.open(name, fs::rewrite))
{
fprintf(stderr, "Log file open failed: %s (error %d)\n", name.c_str(), errno);
}
// Compressed log, make it inaccessible (foolproof)
if (m_fout2.open(name + ".gz", fs::rewrite + fs::unread))
{
#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
if (deflateInit2(&m_zs, 9, Z_DEFLATED, 16 + 15, 9, Z_DEFAULT_STRATEGY) != Z_OK)
if (deflateInit2(&m_zs, 9, Z_DEFLATED, 16 + 15, 9, Z_DEFAULT_STRATEGY) != Z_OK)
#ifndef _MSC_VER
#pragma GCC diagnostic pop
#endif
m_fout2.close();
}
if (!m_fout2)
{
fprintf(stderr, "Log file open failed: %s.gz (error %d)\n", name.c_str(), errno);
m_fout2.close();
}
}
if (!m_fout2)
{
fprintf(stderr, "Log file open failed: %s.gz (error %d)\n", name.c_str(), errno);
}
#ifdef _WIN32
// Autodelete compressed log file
FILE_DISPOSITION_INFO disp;
disp.DeleteFileW = true;
SetFileInformationByHandle(m_fout2.get_handle(), FileDispositionInfo, &disp, sizeof(disp));
// Autodelete compressed log file
FILE_DISPOSITION_INFO disp{};
disp.DeleteFileW = true;
SetFileInformationByHandle(m_fout2.get_handle(), FileDispositionInfo, &disp, sizeof(disp));
#endif
}
else
{
return;
}
m_writer = std::thread([this]()
{

View file

@ -22,6 +22,8 @@
#endif
#endif
#include <thread>
#include "util/asm.hpp"
#include "util/fence.hpp"
@ -734,12 +736,32 @@ bool utils::get_low_power_mode()
#endif
}
static constexpr ullong round_tsc(ullong val)
static constexpr ullong round_tsc(ullong val, ullong known_error)
{
return utils::rounded_div(val, 1'000'000) * 1'000'000;
if (known_error >= 500'000)
{
// Do not accept large errors
return 0;
}
ullong by = 1000;
known_error /= 1000;
while (known_error && by < 100'000)
{
by *= 10;
known_error /= 10;
}
return utils::rounded_div(val, by) * by;
}
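A worked example of the new rounding: for a measured 3'999'997'000 Hz with known_error = 30'000 Hz, known_error / 1000 == 30, so the loop scales `by` from 1'000 up to 100'000; rounded_div then snaps the result to the nearest 100 kHz, yielding exactly 4'000'000'000 Hz. Errors of 500 kHz or more reject the measurement entirely (return 0).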
ullong utils::get_tsc_freq()
namespace utils
{
u64 s_tsc_freq = 0;
}
static const bool s_tsc_freq_evaluated = []() -> bool
{
static const ullong cal_tsc = []() -> ullong
{
@ -749,7 +771,7 @@ ullong utils::get_tsc_freq()
return r;
#endif
if (!has_invariant_tsc())
if (!utils::has_invariant_tsc())
return 0;
#ifdef _WIN32
@ -758,64 +780,109 @@ ullong utils::get_tsc_freq()
return 0;
if (freq.QuadPart <= 9'999'999)
return round_tsc(freq.QuadPart * 1024);
return 0;
const ullong timer_freq = freq.QuadPart;
#else
const ullong timer_freq = 1'000'000'000;
constexpr ullong timer_freq = 1'000'000'000;
#endif
// Calibrate TSC
constexpr int samples = 40;
ullong rdtsc_data[samples];
ullong timer_data[samples];
[[maybe_unused]] ullong error_data[samples];
constexpr u64 retry_count = 1024;
// Narrow thread affinity to a single core
const u64 old_aff = thread_ctrl::get_thread_affinity_mask();
thread_ctrl::set_thread_affinity_mask(old_aff & (0 - old_aff));
// First entry is for the onset measurement, last is for the end measurement
constexpr usz sample_count = 2;
std::array<u64, sample_count> rdtsc_data{};
std::array<u64, sample_count> rdtsc_diff{};
std::array<u64, sample_count> timer_data{};
#ifndef _WIN32
#ifdef _WIN32
LARGE_INTEGER ctr0;
QueryPerformanceCounter(&ctr0);
const ullong time_base = ctr0.QuadPart;
#else
struct timespec ts0;
clock_gettime(CLOCK_MONOTONIC, &ts0);
ullong sec_base = ts0.tv_sec;
const ullong sec_base = ts0.tv_sec;
#endif
for (int i = 0; i < samples; i++)
constexpr usz sleep_time_ms = 40;
for (usz sample = 0; sample < sample_count; sample++)
{
for (usz i = 0; i < retry_count; i++)
{
const u64 rdtsc_read = (utils::lfence(), utils::get_tsc());
#ifdef _WIN32
Sleep(1);
error_data[i] = (utils::lfence(), utils::get_tsc());
LARGE_INTEGER ctr;
QueryPerformanceCounter(&ctr);
rdtsc_data[i] = (utils::lfence(), utils::get_tsc());
timer_data[i] = ctr.QuadPart;
LARGE_INTEGER ctr;
QueryPerformanceCounter(&ctr);
#else
usleep(200);
error_data[i] = (utils::lfence(), utils::get_tsc());
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
rdtsc_data[i] = (utils::lfence(), utils::get_tsc());
timer_data[i] = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000;
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
#endif
const u64 rdtsc_read2 = (utils::lfence(), utils::get_tsc());
#ifdef _WIN32
const u64 timer_read = ctr.QuadPart - time_base;
#else
const u64 timer_read = ts.tv_nsec + (ts.tv_sec - sec_base) * 1'000'000'000;
#endif
if (i == 0 || (rdtsc_read2 >= rdtsc_read && rdtsc_read2 - rdtsc_read < rdtsc_diff[sample]))
{
rdtsc_data[sample] = rdtsc_read; // Note: rdtsc_read2 would also be valid here, given the accuracy assumption below
timer_data[sample] = timer_read;
rdtsc_diff[sample] = rdtsc_read2 >= rdtsc_read ? rdtsc_read2 - rdtsc_read : u64{umax};
}
// 80 results in an error range of 4000 hertz (0.00025% of 4GHz CPU, quite acceptable)
// Error of 2.5 seconds per month
if (rdtsc_read2 - rdtsc_read < 80 && rdtsc_read2 >= rdtsc_read)
{
break;
}
// 8 yields seem to reduce thread contention significantly, improving accuracy
// Even 3 seem to do the job, but yield more just in case
if (i % 128 == 64)
{
std::this_thread::yield();
}
// Yield 50% more often during the last sample: the extra elapsed time there improves accuracy further
if (sample == sample_count - 1 && i % 256 == 128)
{
std::this_thread::yield();
}
}
if (sample < sample_count - 1)
{
// Sleep between first and last sample
#ifdef _WIN32
Sleep(sleep_time_ms);
#else
usleep(sleep_time_ms * 1000);
#endif
}
}
// Restore main thread affinity
thread_ctrl::set_thread_affinity_mask(old_aff);
// Compute average TSC
ullong acc = 0;
for (int i = 0; i < samples - 1; i++)
if (timer_data[1] == timer_data[0])
{
acc += (rdtsc_data[i + 1] - rdtsc_data[i]) * timer_freq / (timer_data[i + 1] - timer_data[i]);
// Division by zero
return 0;
}
const u128 data = u128_from_mul(rdtsc_data[1] - rdtsc_data[0], timer_freq);
const u64 res = utils::udiv128(static_cast<u64>(data >> 64), static_cast<u64>(data), (timer_data[1] - timer_data[0]));
// Rounding
return round_tsc(acc / (samples - 1));
return round_tsc(res, utils::mul_saturate<u64>(utils::add_saturate<u64>(rdtsc_diff[0], rdtsc_diff[1]), utils::aligned_div(timer_freq, timer_data[1] - timer_data[0])));
}();
return cal_tsc;
}
atomic_storage<u64>::release(utils::s_tsc_freq, cal_tsc);
return true;
}();
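Net effect: the calibration computes freq = (rdtsc_data[1] - rdtsc_data[0]) * timer_freq / (timer_data[1] - timer_data[0]) across the ~40 ms window, and folds the per-sample read uncertainty into an approximate Hz error bound, (rdtsc_diff[0] + rdtsc_diff[1]) * timer_freq / elapsed, which round_tsc() uses to pick its rounding granularity.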
u64 utils::get_total_memory()
{

View file

@ -73,8 +73,6 @@ namespace utils
bool get_low_power_mode();
ullong get_tsc_freq();
u64 get_total_memory();
u32 get_thread_count();
@ -89,4 +87,11 @@ namespace utils
u64 _get_main_tid();
inline const u64 main_tid = _get_main_tid();
extern u64 s_tsc_freq;
inline ullong get_tsc_freq()
{
return s_tsc_freq;
}
}

View file

@ -566,6 +566,22 @@ struct s128 : u128
};
#endif
// Optimization for u64*u64=u128
constexpr u128 u128_from_mul(u64 a, u64 b)
{
#ifdef _MSC_VER
if (!std::is_constant_evaluated())
{
u64 hi;
u128 result = _umul128(a, b, &hi);
result.hi = hi;
return result;
}
#endif
return u128{a} * b;
}
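The TSC calibration above is its first consumer; the general pattern (variable names here are illustrative):

// Widen the multiply to 128 bits, then do a 128-by-64 divide
const u128 product = u128_from_mul(tsc_delta, timer_freq);
const u64 freq = utils::udiv128(static_cast<u64>(product >> 64), // high half
                                static_cast<u64>(product),       // low half
                                timer_delta);                     // 64-bit divisor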
template <>
struct get_int_impl<16>
{