build(externals): move to MbedTLS upstream 2.28.1

The latest LTS release includes the changes from the yuzu-emu fork, so we can use the upstream version directly instead.
build: use system MbedTLS when available
2023-01-07 18:07:56 +01:00 · 2023-01-07 18:07:42 +01:00 · 2023-01-07 10:41:37 -06:00 · 2023-01-07 10:40:21 -06:00 · 2023-01-06 21:23:21 -05:00 · 2023-01-06 19:07:47 -05:00
146 changed files with 4417 additions and 1231 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -30,7 +30,7 @@
    url = https://github.com/yuzu-emu/sirit
 [submodule "mbedtls"]
    path = externals/mbedtls
-    url = https://github.com/yuzu-emu/mbedtls
+    url = https://github.com/Mbed-TLS/mbedtls
 [submodule "xbyak"]
    path = externals/xbyak
    url = https://github.com/herumi/xbyak.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,13 +3,8 @@

 cmake_minimum_required(VERSION 3.22)

-# Dynarmic has cmake_minimum_required(3.12) and we may want to override
-# some of its variables, which is only possible in 3.13+
-set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/find-modules")
 include(DownloadExternals)
 include(CMakeDependentOption)

--- a/externals/find-modules/FindDiscordRPC.cmake
+++ b/externals/find-modules/FindDiscordRPC.cmake
--- a/externals/find-modules/FindFFmpeg.cmake
+++ b/externals/find-modules/FindFFmpeg.cmake
--- a/CMakeModules/FindMbedTLS.cmake
+++ b/CMakeModules/FindMbedTLS.cmake
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: 2021 Andrea Pappacoda <andrea@pappacoda.it>
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+# Find module for the mbedcrypto component of MbedTLS.
+#
+# Defines the imported target MbedTLS::mbedcrypto when a suitable library is
+# found. The library is only accepted if it provides mbedtls_cipher_cmac,
+# i.e. if it was built with CMAC support.
+#
+# MbedTLS 3.0.0 will ship with a CMake package config file,
+# see ARMmbed/mbedtls@d259e347e6e3a630acfc1a811709ca05e5d3b92e,
+# so once yuzu switches to that version this module won't be required anymore.
+#
+# yuzu only uses mbedcrypto, so searching for mbedtls and mbedx509 is not
+# needed.
+
+find_path(MbedTLS_INCLUDE_DIR mbedtls/cipher.h)
+
+find_library(MbedTLS_LIBRARY mbedcrypto)
+
+if (MbedTLS_INCLUDE_DIR AND MbedTLS_LIBRARY)
+    # Check for CMAC support. The check state is pushed and popped so that
+    # CMAKE_REQUIRED_* values set by the including project are preserved, and the
+    # include directory is passed along so that the headers included by cmac.h
+    # resolve even when MbedTLS lives outside of the default search paths.
+    include(CheckSymbolExists)
+    include(CMakePushCheckState)
+    cmake_push_check_state(RESET)
+    set(CMAKE_REQUIRED_INCLUDES "${MbedTLS_INCLUDE_DIR}")
+    set(CMAKE_REQUIRED_LIBRARIES "${MbedTLS_LIBRARY}")
+    check_symbol_exists(mbedtls_cipher_cmac "${MbedTLS_INCLUDE_DIR}/mbedtls/cmac.h" mbedcrypto_HAS_CMAC)
+    cmake_pop_check_state()
+
+    # Extract the version so that find_package() can enforce the requested one.
+    # A missing header leaves MbedTLS_VERSION empty instead of aborting the
+    # configure step.
+    set(MbedTLS_VERSION "")
+    if (EXISTS "${MbedTLS_INCLUDE_DIR}/mbedtls/version.h")
+        file(READ "${MbedTLS_INCLUDE_DIR}/mbedtls/version.h" MbedTLS_VERSION_FILE)
+        # The contents are quoted so that they are passed as a single argument
+        string(REGEX MATCH "#define[ ]+MBEDTLS_VERSION_STRING[ ]+\"([0-9.]+)\"" _ "${MbedTLS_VERSION_FILE}")
+        set(MbedTLS_VERSION "${CMAKE_MATCH_1}")
+    endif()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MbedTLS
+    REQUIRED_VARS
+        MbedTLS_LIBRARY
+        MbedTLS_INCLUDE_DIR
+        mbedcrypto_HAS_CMAC
+    VERSION_VAR MbedTLS_VERSION
+)
+
+# Only create the imported target once the package has been accepted. Creating
+# it for a rejected library (no CMAC, version too old) would clash with the
+# MbedTLS::mbedcrypto alias that the bundled fallback defines.
+if (MbedTLS_FOUND AND NOT TARGET MbedTLS::mbedcrypto)
+    add_library(MbedTLS::mbedcrypto UNKNOWN IMPORTED GLOBAL)
+    set_target_properties(MbedTLS::mbedcrypto PROPERTIES
+        IMPORTED_LOCATION "${MbedTLS_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${MbedTLS_INCLUDE_DIR}"
+    )
+endif()
--- a/externals/find-modules/FindOpus.cmake
+++ b/externals/find-modules/FindOpus.cmake
--- a/externals/find-modules/Findenet.cmake
+++ b/externals/find-modules/Findenet.cmake
--- a/externals/find-modules/Findhttplib.cmake
+++ b/externals/find-modules/Findhttplib.cmake
--- a/externals/find-modules/Findinih.cmake
+++ b/externals/find-modules/Findinih.cmake
--- a/externals/find-modules/Findlibusb.cmake
+++ b/externals/find-modules/Findlibusb.cmake
--- a/externals/find-modules/Findlz4.cmake
+++ b/externals/find-modules/Findlz4.cmake
--- a/externals/find-modules/Findzstd.cmake
+++ b/externals/find-modules/Findzstd.cmake
--- a/externals/cmake-modules/WindowsCopyFiles.cmake
+++ b/externals/cmake-modules/WindowsCopyFiles.cmake
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -1,9 +1,9 @@
 # SPDX-FileCopyrightText: 2016 Citra Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later

-list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")
-list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")
-include(DownloadExternals)
+# Dynarmic has cmake_minimum_required(3.12) and we may want to override
+# some of its variables, which is only possible in 3.13+
+set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

 # xbyak
 if ((ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) AND NOT TARGET xbyak::xbyak)
@@ -12,8 +12,7 @@ endif()

 # Dynarmic
 if ((ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64) AND NOT TARGET dynarmic::dynarmic)
-    set(DYNARMIC_NO_BUNDLED_FMT ON)
-    set(DYNARMIC_IGNORE_ASSERTS ON CACHE BOOL "" FORCE)
+    set(DYNARMIC_IGNORE_ASSERTS ON)
    add_subdirectory(dynarmic EXCLUDE_FROM_ALL)
    add_library(dynarmic::dynarmic ALIAS dynarmic)
 endif()
@@ -31,9 +30,23 @@ if (NOT TARGET inih::INIReader)
    add_subdirectory(inih)
 endif()

-# mbedtls
-add_subdirectory(mbedtls EXCLUDE_FROM_ALL)
-target_include_directories(mbedtls PUBLIC ./mbedtls/include)
+# MbedTLS
+find_package(MbedTLS 2.16)
+if(NOT MbedTLS_FOUND)
+    message(STATUS "MbedTLS not found, falling back to externals")
+
+    set(ENABLE_PROGRAMS OFF CACHE BOOL "")
+    set(ENABLE_TESTING OFF CACHE BOOL "")
+
+    # Edit MbedTLS config header to enable CMAC
+    file(READ "./mbedtls/include/mbedtls/config.h" MbedTLS_CONFIG_FILE)
+    string(REPLACE "//#define MBEDTLS_CMAC_C" "#define MBEDTLS_CMAC_C" MbedTLS_CONFIG_FILE_CMAC "${MbedTLS_CONFIG_FILE}")
+    file(WRITE "./mbedtls/include/mbedtls/config.h" "${MbedTLS_CONFIG_FILE_CMAC}")
+
+    add_subdirectory(mbedtls EXCLUDE_FROM_ALL)
+    target_include_directories(mbedcrypto PUBLIC ./mbedtls/include)
+    add_library(MbedTLS::mbedcrypto ALIAS mbedcrypto)
+endif()

 # MicroProfile
 add_library(microprofile INTERFACE)
@@ -60,10 +73,10 @@ if (YUZU_USE_EXTERNAL_SDL2)
            Locale Power Render)
        foreach(_SUB ${SDL_UNUSED_SUBSYSTEMS})
          string(TOUPPER ${_SUB} _OPT)
-          option(SDL_${_OPT} "" OFF)
+          set(SDL_${_OPT} OFF)
        endforeach()

-        option(HIDAPI "" ON)
+        set(HIDAPI ON)
    endif()
    set(SDL_STATIC ON)
    set(SDL_SHARED OFF)
@@ -83,7 +96,7 @@ endif()

 # Cubeb
 if (ENABLE_CUBEB AND NOT TARGET cubeb::cubeb)
-    set(BUILD_TESTS OFF CACHE BOOL "")
+    set(BUILD_TESTS OFF)
    add_subdirectory(cubeb EXCLUDE_FROM_ALL)
    add_library(cubeb::cubeb ALIAS cubeb)
 endif()
@@ -98,6 +111,7 @@ endif()
 # Sirit
 add_subdirectory(sirit EXCLUDE_FROM_ALL)

+# httplib
 if (ENABLE_WEB_SERVICE AND NOT TARGET httplib::httplib)
    if (NOT WIN32)
        find_package(OpenSSL 1.1)
@@ -108,7 +122,7 @@ if (ENABLE_WEB_SERVICE AND NOT TARGET httplib::httplib)

    if (WIN32 OR NOT OPENSSL_FOUND)
        # LibreSSL
-        set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
+        set(LIBRESSL_SKIP_INSTALL ON)
        set(OPENSSLDIR "/etc/ssl/")
        add_subdirectory(libressl EXCLUDE_FROM_ALL)
        target_include_directories(ssl INTERFACE ./libressl/include)
@@ -118,7 +132,6 @@ if (ENABLE_WEB_SERVICE AND NOT TARGET httplib::httplib)
            DEFINITION OPENSSL_LIBS)
    endif()

-    # httplib
    add_library(httplib INTERFACE)
    target_include_directories(httplib INTERFACE ./cpp-httplib)
    target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
--- a/externals/dynarmic
+++ b/externals/dynarmic
--- a/externals/mbedtls
+++ b/externals/mbedtls
--- a/externals/sirit
+++ b/externals/sirit
--- a/externals/xbyak
+++ b/externals/xbyak
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -97,6 +97,7 @@ add_library(common STATIC
    point.h
    precompiled_headers.h
    quaternion.h
+    range_map.h
    reader_writer_queue.h
    ring_buffer.h
    ${CMAKE_CURRENT_BINARY_DIR}/scm_rev.cpp
--- a/src/common/range_map.h
+++ b/src/common/range_map.h
@@ -0,0 +1,180 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <cstddef>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <type_traits>
+
+#include "common/common_types.h"
+
+namespace Common {
+
+/**
+ * Associates half-open key ranges [address, address_end) with values.
+ *
+ * The container stores range boundaries: each entry marks the key at which a value starts,
+ * and that value stays in effect until the key of the next entry. Keys are handled
+ * internally as the signed counterpart of KeyTBase; keys that are negative after that
+ * conversion are clamped or rejected by the public methods.
+ */
+template <typename KeyTBase, typename ValueT>
+class RangeMap {
+private:
+    using KeyT =
+        std::conditional_t<std::is_signed_v<KeyTBase>, KeyTBase, std::make_signed_t<KeyTBase>>;
+
+public:
+    /// Creates a map in which every key is associated with null_value_.
+    explicit RangeMap(ValueT null_value_) : null_value{null_value_} {
+        // Sentinel boundary at the lowest key, so lookups always find a preceding entry.
+        container.emplace(std::numeric_limits<KeyT>::min(), null_value);
+    }
+    ~RangeMap() = default;
+
+    /**
+     * Associates value with every key in [address, address_end).
+     * Bounds that are negative after conversion to the signed key type are clamped to zero;
+     * ranges that end up empty or inverted are ignored.
+     */
+    void Map(KeyTBase address, KeyTBase address_end, ValueT value) {
+        KeyT new_address = static_cast<KeyT>(address);
+        KeyT new_address_end = static_cast<KeyT>(address_end);
+        if (new_address < 0) {
+            new_address = 0;
+        }
+        if (new_address_end < 0) {
+            new_address_end = 0;
+        }
+        if (new_address >= new_address_end) {
+            // An empty range would insert a start boundary without a matching end, and an
+            // inverted range could make InternalMap erase past the end of the container.
+            return;
+        }
+        InternalMap(new_address, new_address_end, value);
+    }
+
+    /// Resets every key in [address, address_end) to the null value.
+    void Unmap(KeyTBase address, KeyTBase address_end) {
+        Map(address, address_end, null_value);
+    }
+
+    /**
+     * Returns the distance from address to the next range boundary, or 0 when address is
+     * negative in the signed key type or is associated with the null value.
+     */
+    [[nodiscard]] size_t GetContinousSizeFrom(KeyTBase address) const {
+        const KeyT new_address = static_cast<KeyT>(address);
+        if (new_address < 0) {
+            return 0;
+        }
+        return ContinousSizeInternal(new_address);
+    }
+
+    /// Returns the value associated with address, or the null value for negative addresses.
+    [[nodiscard]] ValueT GetValueAt(KeyT address) const {
+        if (address < 0) {
+            return null_value;
+        }
+        return GetValueInternal(address);
+    }
+
+private:
+    using MapType = std::map<KeyT, ValueT>;
+    using IteratorType = typename MapType::iterator;
+    using ConstIteratorType = typename MapType::const_iterator;
+
+    size_t ContinousSizeInternal(KeyT address) const {
+        const auto it = GetFirstElementBeforeOrOn(address);
+        if (it == container.end() || it->second == null_value) {
+            return 0;
+        }
+        const auto it_end = std::next(it);
+        if (it_end == container.end()) {
+            // No further boundary: the range extends to the largest representable key.
+            return static_cast<size_t>(std::numeric_limits<KeyT>::max() - address);
+        }
+        return static_cast<size_t>(it_end->first - address);
+    }
+
+    ValueT GetValueInternal(KeyT address) const {
+        const auto it = GetFirstElementBeforeOrOn(address);
+        if (it == container.end()) {
+            return null_value;
+        }
+        return it->second;
+    }
+
+    /// Returns the entry whose key is the greatest key <= address (the first entry if none is).
+    ConstIteratorType GetFirstElementBeforeOrOn(KeyT address) const {
+        auto it = container.lower_bound(address);
+        if (it == container.begin()) {
+            return it;
+        }
+        if (it != container.end() && (it->first == address)) {
+            return it;
+        }
+        --it;
+        return it;
+    }
+
+    /**
+     * Returns the value of the entry preceding the first boundary at or after address (or of
+     * the first entry itself), or the null value when no such boundary exists.
+     */
+    ValueT GetFirstValueWithin(KeyT address) const {
+        auto it = container.lower_bound(address);
+        if (it == container.begin()) {
+            return it->second;
+        }
+        if (it == container.end()) [[unlikely]] { // every boundary lies below address
+            return null_value;
+        }
+        --it;
+        return it->second;
+    }
+
+    /**
+     * Returns the value of the entry preceding the first boundary after address, or the null
+     * value when no such boundary exists.
+     */
+    ValueT GetLastValueWithin(KeyT address) const {
+        auto it = container.upper_bound(address);
+        if (it == container.end()) {
+            return null_value;
+        }
+        if (it == container.begin()) [[unlikely]] { // this would be a bug
+            return it->second;
+        }
+        --it;
+        return it->second;
+    }
+
+    /**
+     * Replaces all boundaries within [address, address_end] by at most two new ones: a start
+     * boundary carrying value and an end boundary restoring the value that was previously in
+     * effect at address_end. Boundaries that would not change the value are left out.
+     * Requires address < address_end.
+     */
+    void InternalMap(KeyT address, KeyT address_end, ValueT value) {
+        const bool must_add_start = GetFirstValueWithin(address) != value;
+        const ValueT last_value = GetLastValueWithin(address_end);
+        const bool must_add_end = last_value != value;
+        // The values above must be sampled before the old boundaries are dropped.
+        container.erase(container.lower_bound(address), container.upper_bound(address_end));
+        if (must_add_start) {
+            container.emplace(address, value);
+        }
+        if (must_add_end) {
+            container.emplace(address_end, last_value);
+        }
+    }
+
+    ValueT null_value;
+    MapType container;
+};
+
+} // namespace Common
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@@ -185,6 +185,7 @@ void RestoreGlobalState(bool is_powered_on) {
    // Renderer
    values.fsr_sharpening_slider.SetGlobal(true);
    values.renderer_backend.SetGlobal(true);
+    values.renderer_force_max_clock.SetGlobal(true);
    values.vulkan_device.SetGlobal(true);
    values.aspect_ratio.SetGlobal(true);
    values.max_anisotropy.SetGlobal(true);
@@ -200,6 +201,7 @@ void RestoreGlobalState(bool is_powered_on) {
    values.use_asynchronous_shaders.SetGlobal(true);
    values.use_fast_gpu_time.SetGlobal(true);
    values.use_pessimistic_flushes.SetGlobal(true);
+    values.use_vulkan_driver_pipeline_cache.SetGlobal(true);
    values.bg_red.SetGlobal(true);
    values.bg_green.SetGlobal(true);
    values.bg_blue.SetGlobal(true);
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -415,6 +415,7 @@ struct Values {
    // Renderer
    SwitchableSetting<RendererBackend, true> renderer_backend{
        RendererBackend::Vulkan, RendererBackend::OpenGL, RendererBackend::Null, "backend"};
+    SwitchableSetting<bool> renderer_force_max_clock{true, "force_max_clock"};
    Setting<bool> renderer_debug{false, "debug"};
    Setting<bool> renderer_shader_feedback{false, "shader_feedback"};
    Setting<bool> enable_nsight_aftermath{false, "nsight_aftermath"};
@@ -451,6 +452,8 @@ struct Values {
    SwitchableSetting<bool> use_asynchronous_shaders{false, "use_asynchronous_shaders"};
    SwitchableSetting<bool> use_fast_gpu_time{true, "use_fast_gpu_time"};
    SwitchableSetting<bool> use_pessimistic_flushes{false, "use_pessimistic_flushes"};
+    SwitchableSetting<bool> use_vulkan_driver_pipeline_cache{true,
+                                                             "use_vulkan_driver_pipeline_cache"};

    SwitchableSetting<u8> bg_red{0, "bg_red"};
    SwitchableSetting<u8> bg_green{0, "bg_green"};
@@ -531,6 +534,7 @@ struct Values {
    Setting<bool> reporting_services{false, "reporting_services"};
    Setting<bool> quest_flag{false, "quest_flag"};
    Setting<bool> disable_macro_jit{false, "disable_macro_jit"};
+    Setting<bool> disable_macro_hle{false, "disable_macro_hle"};
    Setting<bool> extended_logging{false, "extended_logging"};
    Setting<bool> use_debug_asserts{false, "use_debug_asserts"};
    Setting<bool> use_auto_stub{false, "use_auto_stub"};
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -226,7 +226,6 @@ add_library(core STATIC
    hle/kernel/k_page_buffer.h
    hle/kernel/k_page_heap.cpp
    hle/kernel/k_page_heap.h
-    hle/kernel/k_page_group.cpp
    hle/kernel/k_page_group.h
    hle/kernel/k_page_table.cpp
    hle/kernel/k_page_table.h
@@ -806,7 +805,7 @@ endif()
 create_target_directory_groups(core)

 target_link_libraries(core PUBLIC common PRIVATE audio_core network video_core)
-target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::opus)
+target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json MbedTLS::mbedcrypto Opus::opus)
 if (MINGW)
    target_link_libraries(core PRIVATE ${MSWSOCK_LIBRARY})
 endif()
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -229,7 +229,11 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
    config.enable_cycle_counting = true;

    // Code cache size
+#ifdef ARCHITECTURE_arm64
+    config.code_cache_size = 128_MiB;
+#else
    config.code_cache_size = 512_MiB;
+#endif

    // Allow memory fault handling to work
    if (system.DebuggerEnabled()) {
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -288,7 +288,11 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
    config.enable_cycle_counting = true;

    // Code cache size
+#ifdef ARCHITECTURE_arm64
+    config.code_cache_size = 128_MiB;
+#else
    config.code_cache_size = 512_MiB;
+#endif

    // Allow memory fault handling to work
    if (system.DebuggerEnabled()) {
--- a/src/core/hid/emulated_controller.cpp
+++ b/src/core/hid/emulated_controller.cpp
@@ -11,6 +11,11 @@
 namespace Core::HID {
 constexpr s32 HID_JOYSTICK_MAX = 0x7fff;
 constexpr s32 HID_TRIGGER_MAX = 0x7fff;
+// Use a common UUID for TAS and Virtual Gamepad
+constexpr Common::UUID TAS_UUID =
+    Common::UUID{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7, 0xA5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
+constexpr Common::UUID VIRTUAL_UUID =
+    Common::UUID{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};

 EmulatedController::EmulatedController(NpadIdType npad_id_type_) : npad_id_type(npad_id_type_) {}

@@ -348,10 +353,6 @@ void EmulatedController::ReloadInput() {
        }
    }

-    // Use a common UUID for TAS
-    static constexpr Common::UUID TAS_UUID = Common::UUID{
-        {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7, 0xA5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
-
    // Register TAS devices. No need to force update
    for (std::size_t index = 0; index < tas_button_devices.size(); ++index) {
        if (!tas_button_devices[index]) {
@@ -377,10 +378,6 @@ void EmulatedController::ReloadInput() {
        });
    }

-    // Use a common UUID for Virtual Gamepad
-    static constexpr Common::UUID VIRTUAL_UUID = Common::UUID{
-        {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
-
    // Register virtual devices. No need to force update
    for (std::size_t index = 0; index < virtual_button_devices.size(); ++index) {
        if (!virtual_button_devices[index]) {
@@ -780,7 +777,12 @@ void EmulatedController::SetStick(const Common::Input::CallbackStatus& callback,

    // Only read stick values that have the same uuid or are over the threshold to avoid flapping
    if (controller.stick_values[index].uuid != uuid) {
-        if (!stick_value.down && !stick_value.up && !stick_value.left && !stick_value.right) {
+        const bool is_tas = uuid == TAS_UUID;
+        if (is_tas && stick_value.x.value == 0 && stick_value.y.value == 0) {
+            return;
+        }
+        if (!is_tas && !stick_value.down && !stick_value.up && !stick_value.left &&
+            !stick_value.right) {
            return;
        }
    }
--- a/src/core/hle/kernel/k_code_memory.cpp
+++ b/src/core/hle/kernel/k_code_memory.cpp
@@ -27,13 +27,13 @@ Result KCodeMemory::Initialize(Core::DeviceMemory& device_memory, VAddr addr, si
    auto& page_table = m_owner->PageTable();

    // Construct the page group.
-    m_page_group.emplace(kernel, page_table.GetBlockInfoManager());
+    m_page_group = {};

    // Lock the memory.
-    R_TRY(page_table.LockForCodeMemory(std::addressof(*m_page_group), addr, size))
+    R_TRY(page_table.LockForCodeMemory(&m_page_group, addr, size))

    // Clear the memory.
-    for (const auto& block : *m_page_group) {
+    for (const auto& block : m_page_group.Nodes()) {
        std::memset(device_memory.GetPointer<void>(block.GetAddress()), 0xFF, block.GetSize());
    }

@@ -51,13 +51,12 @@ Result KCodeMemory::Initialize(Core::DeviceMemory& device_memory, VAddr addr, si
 void KCodeMemory::Finalize() {
    // Unlock.
    if (!m_is_mapped && !m_is_owner_mapped) {
-        const size_t size = m_page_group->GetNumPages() * PageSize;
-        m_owner->PageTable().UnlockForCodeMemory(m_address, size, *m_page_group);
+        const size_t size = m_page_group.GetNumPages() * PageSize;
+        m_owner->PageTable().UnlockForCodeMemory(m_address, size, m_page_group);
    }

    // Close the page group.
-    m_page_group->Close();
-    m_page_group->Finalize();
+    m_page_group = {};

    // Close our reference to our owner.
    m_owner->Close();
@@ -65,7 +64,7 @@ void KCodeMemory::Finalize() {

 Result KCodeMemory::Map(VAddr address, size_t size) {
    // Validate the size.
-    R_UNLESS(m_page_group->GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);
+    R_UNLESS(m_page_group.GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);

    // Lock ourselves.
    KScopedLightLock lk(m_lock);
@@ -75,7 +74,7 @@ Result KCodeMemory::Map(VAddr address, size_t size) {

    // Map the memory.
    R_TRY(kernel.CurrentProcess()->PageTable().MapPages(
-        address, *m_page_group, KMemoryState::CodeOut, KMemoryPermission::UserReadWrite));
+        address, m_page_group, KMemoryState::CodeOut, KMemoryPermission::UserReadWrite));

    // Mark ourselves as mapped.
    m_is_mapped = true;
@@ -85,13 +84,13 @@ Result KCodeMemory::Map(VAddr address, size_t size) {

 Result KCodeMemory::Unmap(VAddr address, size_t size) {
    // Validate the size.
-    R_UNLESS(m_page_group->GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);
+    R_UNLESS(m_page_group.GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);

    // Lock ourselves.
    KScopedLightLock lk(m_lock);

    // Unmap the memory.
-    R_TRY(kernel.CurrentProcess()->PageTable().UnmapPages(address, *m_page_group,
+    R_TRY(kernel.CurrentProcess()->PageTable().UnmapPages(address, m_page_group,
                                                          KMemoryState::CodeOut));

    // Mark ourselves as unmapped.
@@ -102,7 +101,7 @@ Result KCodeMemory::Unmap(VAddr address, size_t size) {

 Result KCodeMemory::MapToOwner(VAddr address, size_t size, Svc::MemoryPermission perm) {
    // Validate the size.
-    R_UNLESS(m_page_group->GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);
+    R_UNLESS(m_page_group.GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);

    // Lock ourselves.
    KScopedLightLock lk(m_lock);
@@ -126,7 +125,7 @@ Result KCodeMemory::MapToOwner(VAddr address, size_t size, Svc::MemoryPermission

    // Map the memory.
    R_TRY(
-        m_owner->PageTable().MapPages(address, *m_page_group, KMemoryState::GeneratedCode, k_perm));
+        m_owner->PageTable().MapPages(address, m_page_group, KMemoryState::GeneratedCode, k_perm));

    // Mark ourselves as mapped.
    m_is_owner_mapped = true;
@@ -136,13 +135,13 @@ Result KCodeMemory::MapToOwner(VAddr address, size_t size, Svc::MemoryPermission

 Result KCodeMemory::UnmapFromOwner(VAddr address, size_t size) {
    // Validate the size.
-    R_UNLESS(m_page_group->GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);
+    R_UNLESS(m_page_group.GetNumPages() == Common::DivideUp(size, PageSize), ResultInvalidSize);

    // Lock ourselves.
    KScopedLightLock lk(m_lock);

    // Unmap the memory.
-    R_TRY(m_owner->PageTable().UnmapPages(address, *m_page_group, KMemoryState::GeneratedCode));
+    R_TRY(m_owner->PageTable().UnmapPages(address, m_page_group, KMemoryState::GeneratedCode));

    // Mark ourselves as unmapped.
    m_is_owner_mapped = false;
--- a/src/core/hle/kernel/k_code_memory.h
+++ b/src/core/hle/kernel/k_code_memory.h
@@ -3,8 +3,6 @@

 #pragma once

-#include <optional>
-
 #include "common/common_types.h"
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_auto_object.h"
@@ -51,11 +49,11 @@ public:
        return m_address;
    }
    size_t GetSize() const {
-        return m_is_initialized ? m_page_group->GetNumPages() * PageSize : 0;
+        return m_is_initialized ? m_page_group.GetNumPages() * PageSize : 0;
    }

 private:
-    std::optional<KPageGroup> m_page_group{};
+    KPageGroup m_page_group{};
    KProcess* m_owner{};
    VAddr m_address{};
    KLightLock m_lock;
--- a/src/core/hle/kernel/k_memory_manager.cpp
+++ b/src/core/hle/kernel/k_memory_manager.cpp
@@ -223,7 +223,7 @@ Result KMemoryManager::AllocatePageGroupImpl(KPageGroup* out, size_t num_pages,

    // Ensure that we don't leave anything un-freed.
    ON_RESULT_FAILURE {
-        for (const auto& it : *out) {
+        for (const auto& it : out->Nodes()) {
            auto& manager = this->GetManager(it.GetAddress());
            const size_t node_num_pages = std::min<u64>(
                it.GetNumPages(), (manager.GetEndAddress() - it.GetAddress()) / PageSize);
@@ -285,7 +285,7 @@ Result KMemoryManager::AllocateAndOpen(KPageGroup* out, size_t num_pages, u32 op
                                      m_has_optimized_process[static_cast<size_t>(pool)], true));

    // Open the first reference to the pages.
-    for (const auto& block : *out) {
+    for (const auto& block : out->Nodes()) {
        PAddr cur_address = block.GetAddress();
        size_t remaining_pages = block.GetNumPages();
        while (remaining_pages > 0) {
@@ -335,7 +335,7 @@ Result KMemoryManager::AllocateForProcess(KPageGroup* out, size_t num_pages, u32
    // Perform optimized memory tracking, if we should.
    if (optimized) {
        // Iterate over the allocated blocks.
-        for (const auto& block : *out) {
+        for (const auto& block : out->Nodes()) {
            // Get the block extents.
            const PAddr block_address = block.GetAddress();
            const size_t block_pages = block.GetNumPages();
@@ -391,7 +391,7 @@ Result KMemoryManager::AllocateForProcess(KPageGroup* out, size_t num_pages, u32
        }
    } else {
        // Set all the allocated memory.
-        for (const auto& block : *out) {
+        for (const auto& block : out->Nodes()) {
            std::memset(m_system.DeviceMemory().GetPointer<void>(block.GetAddress()), fill_pattern,
                        block.GetSize());
        }
--- a/src/core/hle/kernel/k_page_group.cpp
+++ b/src/core/hle/kernel/k_page_group.cpp
@@ -1,121 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include "core/hle/kernel/k_dynamic_resource_manager.h"
-#include "core/hle/kernel/k_memory_manager.h"
-#include "core/hle/kernel/k_page_group.h"
-#include "core/hle/kernel/kernel.h"
-#include "core/hle/kernel/svc_results.h"
-
-namespace Kernel {
-
-void KPageGroup::Finalize() {
-    KBlockInfo* cur = m_first_block;
-    while (cur != nullptr) {
-        KBlockInfo* next = cur->GetNext();
-        m_manager->Free(cur);
-        cur = next;
-    }
-
-    m_first_block = nullptr;
-    m_last_block = nullptr;
-}
-
-void KPageGroup::CloseAndReset() {
-    auto& mm = m_kernel.MemoryManager();
-
-    KBlockInfo* cur = m_first_block;
-    while (cur != nullptr) {
-        KBlockInfo* next = cur->GetNext();
-        mm.Close(cur->GetAddress(), cur->GetNumPages());
-        m_manager->Free(cur);
-        cur = next;
-    }
-
-    m_first_block = nullptr;
-    m_last_block = nullptr;
-}
-
-size_t KPageGroup::GetNumPages() const {
-    size_t num_pages = 0;
-
-    for (const auto& it : *this) {
-        num_pages += it.GetNumPages();
-    }
-
-    return num_pages;
-}
-
-Result KPageGroup::AddBlock(KPhysicalAddress addr, size_t num_pages) {
-    // Succeed immediately if we're adding no pages.
-    R_SUCCEED_IF(num_pages == 0);
-
-    // Check for overflow.
-    ASSERT(addr < addr + num_pages * PageSize);
-
-    // Try to just append to the last block.
-    if (m_last_block != nullptr) {
-        R_SUCCEED_IF(m_last_block->TryConcatenate(addr, num_pages));
-    }
-
-    // Allocate a new block.
-    KBlockInfo* new_block = m_manager->Allocate();
-    R_UNLESS(new_block != nullptr, ResultOutOfResource);
-
-    // Initialize the block.
-    new_block->Initialize(addr, num_pages);
-
-    // Add the block to our list.
-    if (m_last_block != nullptr) {
-        m_last_block->SetNext(new_block);
-    } else {
-        m_first_block = new_block;
-    }
-    m_last_block = new_block;
-
-    R_SUCCEED();
-}
-
-void KPageGroup::Open() const {
-    auto& mm = m_kernel.MemoryManager();
-
-    for (const auto& it : *this) {
-        mm.Open(it.GetAddress(), it.GetNumPages());
-    }
-}
-
-void KPageGroup::OpenFirst() const {
-    auto& mm = m_kernel.MemoryManager();
-
-    for (const auto& it : *this) {
-        mm.OpenFirst(it.GetAddress(), it.GetNumPages());
-    }
-}
-
-void KPageGroup::Close() const {
-    auto& mm = m_kernel.MemoryManager();
-
-    for (const auto& it : *this) {
-        mm.Close(it.GetAddress(), it.GetNumPages());
-    }
-}
-
-bool KPageGroup::IsEquivalentTo(const KPageGroup& rhs) const {
-    auto lit = this->begin();
-    auto rit = rhs.begin();
-    auto lend = this->end();
-    auto rend = rhs.end();
-
-    while (lit != lend && rit != rend) {
-        if (*lit != *rit) {
-            return false;
-        }
-
-        ++lit;
-        ++rit;
-    }
-
-    return lit == lend && rit == rend;
-}
-
-} // namespace Kernel
--- a/src/core/hle/kernel/k_page_group.h
+++ b/src/core/hle/kernel/k_page_group.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

 #pragma once
@@ -13,23 +13,24 @@

 namespace Kernel {

-class KBlockInfoManager;
-class KernelCore;
 class KPageGroup;

 class KBlockInfo {
-public:
-    constexpr explicit KBlockInfo() : m_next(nullptr) {}
+private:
+    friend class KPageGroup;

-    constexpr void Initialize(KPhysicalAddress addr, size_t np) {
+public:
+    constexpr KBlockInfo() = default;
+
+    constexpr void Initialize(PAddr addr, size_t np) {
         ASSERT(Common::IsAligned(addr, PageSize));
         ASSERT(static_cast<u32>(np) == np);

-        m_page_index = static_cast<u32>(addr / PageSize);
+        m_page_index = static_cast<u32>(addr / PageSize); // divide before narrowing to u32
         m_num_pages = static_cast<u32>(np);
     }

-    constexpr KPhysicalAddress GetAddress() const {
+    constexpr PAddr GetAddress() const {
        return m_page_index * PageSize;
    }
    constexpr size_t GetNumPages() const {
@@ -38,10 +39,10 @@ public:
    constexpr size_t GetSize() const {
        return this->GetNumPages() * PageSize;
    }
-    constexpr KPhysicalAddress GetEndAddress() const {
+    constexpr PAddr GetEndAddress() const {
        return (m_page_index + m_num_pages) * PageSize;
    }
-    constexpr KPhysicalAddress GetLastAddress() const {
+    constexpr PAddr GetLastAddress() const {
        return this->GetEndAddress() - 1;
    }

@@ -61,8 +62,8 @@ public:
        return !(*this == rhs);
    }

-    constexpr bool IsStrictlyBefore(KPhysicalAddress addr) const {
-        const KPhysicalAddress end = this->GetEndAddress();
+    constexpr bool IsStrictlyBefore(PAddr addr) const {
+        const PAddr end = this->GetEndAddress();

        if (m_page_index != 0 && end == 0) {
            return false;
@@ -71,11 +72,11 @@ public:
        return end < addr;
    }

-    constexpr bool operator<(KPhysicalAddress addr) const {
+    constexpr bool operator<(PAddr addr) const {
        return this->IsStrictlyBefore(addr);
    }

-    constexpr bool TryConcatenate(KPhysicalAddress addr, size_t np) {
+    constexpr bool TryConcatenate(PAddr addr, size_t np) {
        if (addr != 0 && addr == this->GetEndAddress()) {
            m_num_pages += static_cast<u32>(np);
            return true;
@@ -89,118 +90,96 @@ private:
    }

 private:
-    friend class KPageGroup;
-
    KBlockInfo* m_next{};
    u32 m_page_index{};
    u32 m_num_pages{};
 };
 static_assert(sizeof(KBlockInfo) <= 0x10);

-class KPageGroup {
+class KPageGroup final {
 public:
-    class Iterator {
+    // One run of pages inside a page group: a start address plus a page count.
+    // KPageTable passes GetAddress() on as the physical address to map.
+    class Node final {
    public:
-        using iterator_category = std::forward_iterator_tag;
-        using value_type = const KBlockInfo;
-        using difference_type = std::ptrdiff_t;
-        using pointer = value_type*;
-        using reference = value_type&;
+        // Stores the start address and page count as given; no validation is performed.
+        constexpr Node(u64 addr_, std::size_t num_pages_) : addr{addr_}, num_pages{num_pages_} {}

-        constexpr explicit Iterator(pointer n) : m_node(n) {}
-
-        constexpr bool operator==(const Iterator& rhs) const {
-            return m_node == rhs.m_node;
-        }
-        constexpr bool operator!=(const Iterator& rhs) const {
-            return !(*this == rhs);
+        // Start address of the run.
+        constexpr u64 GetAddress() const {
+            return addr;
        }

-        constexpr pointer operator->() const {
-            return m_node;
-        }
-        constexpr reference operator*() const {
-            return *m_node;
+        // Number of pages in the run.
+        constexpr std::size_t GetNumPages() const {
+            return num_pages;
        }

-        constexpr Iterator& operator++() {
-            m_node = m_node->GetNext();
-            return *this;
-        }
-
-        constexpr Iterator operator++(int) {
-            const Iterator it{*this};
-            ++(*this);
-            return it;
+        // Length of the run in bytes (page count times PageSize).
+        constexpr std::size_t GetSize() const {
+            return GetNumPages() * PageSize;
        }

    private:
-        pointer m_node{};
+        u64 addr{};
+        std::size_t num_pages{};
    };

-    explicit KPageGroup(KernelCore& kernel, KBlockInfoManager* m)
-        : m_kernel{kernel}, m_manager{m} {}
-    ~KPageGroup() {
-        this->Finalize();
-    }
-
-    void CloseAndReset();
-    void Finalize();
-
-    Iterator begin() const {
-        return Iterator{m_first_block};
-    }
-    Iterator end() const {
-        return Iterator{nullptr};
-    }
-    bool empty() const {
-        return m_first_block == nullptr;
-    }
-
-    Result AddBlock(KPhysicalAddress addr, size_t num_pages);
-    void Open() const;
-    void OpenFirst() const;
-    void Close() const;
-
-    size_t GetNumPages() const;
-
-    bool IsEquivalentTo(const KPageGroup& rhs) const;
-
-    bool operator==(const KPageGroup& rhs) const {
-        return this->IsEquivalentTo(rhs);
-    }
-
-    bool operator!=(const KPageGroup& rhs) const {
-        return !(*this == rhs);
-    }
-
-private:
-    KernelCore& m_kernel;
-    KBlockInfo* m_first_block{};
-    KBlockInfo* m_last_block{};
-    KBlockInfoManager* m_manager{};
-};
-
-class KScopedPageGroup {
 public:
-    explicit KScopedPageGroup(const KPageGroup* gp) : m_pg(gp) {
-        if (m_pg) {
-            m_pg->Open();
-        }
-    }
-    explicit KScopedPageGroup(const KPageGroup& gp) : KScopedPageGroup(std::addressof(gp)) {}
-    ~KScopedPageGroup() {
-        if (m_pg) {
-            m_pg->Close();
-        }
+    KPageGroup() = default;
+    KPageGroup(u64 address, u64 num_pages) {
+        ASSERT(AddBlock(address, num_pages).IsSuccess());
    }

-    void CancelClose() {
-        m_pg = nullptr;
+    constexpr std::list<Node>& Nodes() {
+        return nodes;
    }

+    constexpr const std::list<Node>& Nodes() const {
+        return nodes;
+    }
+
+    // Returns the sum of the page counts of every node in the group (0 when it is empty).
+    std::size_t GetNumPages() const {
+        std::size_t total = 0;
+        for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+            total += it->GetNumPages();
+        }
+        return total;
+    }
+
+    // Returns true when both groups hold the same sequence of nodes: the same number of nodes,
+    // with identical address and page count at every position. Neither group is modified, so
+    // the argument is taken by const reference (non-const callers are unaffected).
+    bool IsEqual(const KPageGroup& other) const {
+        auto this_node = nodes.begin();
+        auto other_node = other.nodes.begin();
+        while (this_node != nodes.end() && other_node != other.nodes.end()) {
+            if (this_node->GetAddress() != other_node->GetAddress() ||
+                this_node->GetNumPages() != other_node->GetNumPages()) {
+                return false;
+            }
+            ++this_node;
+            ++other_node;
+        }
+
+        // Equal only if both lists ran out at the same time, i.e. they have the same length.
+        return this_node == nodes.end() && other_node == other.nodes.end();
+    }
+
+    // Appends a block of num_pages pages starting at address to the end of the group.
+    // A zero-page block is ignored. When the block begins exactly where the last node ends,
+    // the two are coalesced into a single node. Always returns ResultSuccess.
+    Result AddBlock(u64 address, u64 num_pages) {
+        if (num_pages == 0) {
+            return ResultSuccess;
+        }
+
+        if (!nodes.empty()) {
+            // Work on a copy of the tail node, since it may be popped just below.
+            const Node tail = nodes.back();
+            const bool adjacent = tail.GetAddress() + tail.GetNumPages() * PageSize == address;
+            if (adjacent) {
+                // Grow the new block backwards so it also covers the tail, then drop the tail.
+                num_pages += tail.GetNumPages();
+                address = tail.GetAddress();
+                nodes.pop_back();
+            }
+        }
+
+        nodes.push_back({address, num_pages});
+        return ResultSuccess;
+    }
+
+    bool Empty() const {
+        return nodes.empty();
+    }
+
+    void Finalize() {}
+
 private:
-    const KPageGroup* m_pg{};
+    std::list<Node> nodes;
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/k_page_table.cpp
+++ b/src/core/hle/kernel/k_page_table.cpp
@@ -100,7 +100,7 @@ constexpr size_t GetAddressSpaceWidthFromType(FileSys::ProgramAddressSpaceType a

 KPageTable::KPageTable(Core::System& system_)
    : m_general_lock{system_.Kernel()},
-      m_map_physical_memory_lock{system_.Kernel()}, m_system{system_}, m_kernel{system_.Kernel()} {}
+      m_map_physical_memory_lock{system_.Kernel()}, m_system{system_} {}

 KPageTable::~KPageTable() = default;

@@ -373,7 +373,7 @@ Result KPageTable::MapProcessCode(VAddr addr, size_t num_pages, KMemoryState sta
                                                 m_memory_block_slab_manager);

    // Allocate and open.
-    KPageGroup pg{m_kernel, m_block_info_manager};
+    KPageGroup pg;
    R_TRY(m_system.Kernel().MemoryManager().AllocateAndOpen(
        &pg, num_pages,
        KMemoryManager::EncodeOption(KMemoryManager::Pool::Application, m_allocation_option)));
@@ -432,7 +432,7 @@ Result KPageTable::MapCodeMemory(VAddr dst_address, VAddr src_address, size_t si
        const size_t num_pages = size / PageSize;

        // Create page groups for the memory being mapped.
-        KPageGroup pg{m_kernel, m_block_info_manager};
+        KPageGroup pg;
        AddRegionToPages(src_address, num_pages, pg);

        // Reprotect the source as kernel-read/not mapped.
@@ -593,7 +593,7 @@ Result KPageTable::MakePageGroup(KPageGroup& pg, VAddr addr, size_t num_pages) {
    const size_t size = num_pages * PageSize;

    // We're making a new group, not adding to an existing one.
-    R_UNLESS(pg.empty(), ResultInvalidCurrentMemory);
+    R_UNLESS(pg.Empty(), ResultInvalidCurrentMemory);

    // Begin traversal.
    Common::PageTable::TraversalContext context;
@@ -640,10 +640,11 @@ Result KPageTable::MakePageGroup(KPageGroup& pg, VAddr addr, size_t num_pages) {
    R_SUCCEED();
 }

-bool KPageTable::IsValidPageGroup(const KPageGroup& pg, VAddr addr, size_t num_pages) {
+bool KPageTable::IsValidPageGroup(const KPageGroup& pg_ll, VAddr addr, size_t num_pages) {
    ASSERT(this->IsLockedByCurrentThread());

    const size_t size = num_pages * PageSize;
+    const auto& pg = pg_ll.Nodes();
    const auto& memory_layout = m_system.Kernel().MemoryLayout();

    // Empty groups are necessarily invalid.
@@ -941,6 +942,9 @@ Result KPageTable::SetupForIpcServer(VAddr* out_addr, size_t size, VAddr src_add

    ON_RESULT_FAILURE {
        if (cur_mapped_addr != dst_addr) {
+            // HACK: Manually close the pages.
+            HACK_ClosePages(dst_addr, (cur_mapped_addr - dst_addr) / PageSize);
+
            ASSERT(Operate(dst_addr, (cur_mapped_addr - dst_addr) / PageSize,
                           KMemoryPermission::None, OperationType::Unmap)
                       .IsSuccess());
@@ -1016,6 +1020,9 @@ Result KPageTable::SetupForIpcServer(VAddr* out_addr, size_t size, VAddr src_add
        // Map the page.
        R_TRY(Operate(cur_mapped_addr, 1, test_perm, OperationType::Map, start_partial_page));

+        // HACK: Manually open the pages.
+        HACK_OpenPages(start_partial_page, 1);
+
        // Update tracking extents.
        cur_mapped_addr += PageSize;
        cur_block_addr += PageSize;
@@ -1044,6 +1051,9 @@ Result KPageTable::SetupForIpcServer(VAddr* out_addr, size_t size, VAddr src_add
            R_TRY(Operate(cur_mapped_addr, cur_block_size / PageSize, test_perm, OperationType::Map,
                          cur_block_addr));

+            // HACK: Manually open the pages.
+            HACK_OpenPages(cur_block_addr, cur_block_size / PageSize);
+
            // Update tracking extents.
            cur_mapped_addr += cur_block_size;
            cur_block_addr = next_entry.phys_addr;
@@ -1063,6 +1073,9 @@ Result KPageTable::SetupForIpcServer(VAddr* out_addr, size_t size, VAddr src_add
        R_TRY(Operate(cur_mapped_addr, last_block_size / PageSize, test_perm, OperationType::Map,
                      cur_block_addr));

+        // HACK: Manually open the pages.
+        HACK_OpenPages(cur_block_addr, last_block_size / PageSize);
+
        // Update tracking extents.
        cur_mapped_addr += last_block_size;
        cur_block_addr += last_block_size;
@@ -1094,6 +1107,9 @@ Result KPageTable::SetupForIpcServer(VAddr* out_addr, size_t size, VAddr src_add

        // Map the page.
        R_TRY(Operate(cur_mapped_addr, 1, test_perm, OperationType::Map, end_partial_page));
+
+        // HACK: Manually open the pages.
+        HACK_OpenPages(end_partial_page, 1);
    }

    // Update memory blocks to reflect our changes
@@ -1195,6 +1211,9 @@ Result KPageTable::CleanupForIpcServer(VAddr address, size_t size, KMemoryState
    const size_t aligned_size = aligned_end - aligned_start;
    const size_t aligned_num_pages = aligned_size / PageSize;

+    // HACK: Manually close the pages.
+    HACK_ClosePages(aligned_start, aligned_num_pages);
+
    // Unmap the pages.
    R_TRY(Operate(aligned_start, aligned_num_pages, KMemoryPermission::None, OperationType::Unmap));

@@ -1482,6 +1501,17 @@ void KPageTable::CleanupForIpcClientOnServerSetupFailure([[maybe_unused]] PageLi
    }
 }

+// HACK: Forwards to the kernel memory manager's OpenFirst() for num_pages pages starting at
+// the physical address phys_addr. Callers in this file invoke it right after mapping pages.
+void KPageTable::HACK_OpenPages(PAddr phys_addr, size_t num_pages) {
+    m_system.Kernel().MemoryManager().OpenFirst(phys_addr, num_pages);
+}
+
+// HACK: Walks num_pages pages of the virtual range starting at virt_addr, looks up the physical
+// address of each page via GetPhysicalAddr(), and calls the memory manager's Close() on it one
+// page at a time. Callers in this file invoke it before unmapping the range, presumably because
+// the lookup relies on the mapping still being present.
+void KPageTable::HACK_ClosePages(VAddr virt_addr, size_t num_pages) {
+    VAddr page_vaddr = virt_addr;
+    for (size_t remaining = num_pages; remaining > 0; --remaining) {
+        const auto page_paddr = GetPhysicalAddr(page_vaddr);
+        m_system.Kernel().MemoryManager().Close(page_paddr, 1);
+        page_vaddr += PageSize;
+    }
+}
+
 Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
    // Lock the physical memory lock.
    KScopedLightLock phys_lk(m_map_physical_memory_lock);
@@ -1542,7 +1572,7 @@ Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
            R_UNLESS(memory_reservation.Succeeded(), ResultLimitReached);

            // Allocate pages for the new memory.
-            KPageGroup pg{m_kernel, m_block_info_manager};
+            KPageGroup pg;
            R_TRY(m_system.Kernel().MemoryManager().AllocateForProcess(
                &pg, (size - mapped_size) / PageSize, m_allocate_option, 0, 0));

@@ -1620,7 +1650,7 @@ Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
                KScopedPageTableUpdater updater(this);

                // Prepare to iterate over the memory.
-                auto pg_it = pg.begin();
+                auto pg_it = pg.Nodes().begin();
                PAddr pg_phys_addr = pg_it->GetAddress();
                size_t pg_pages = pg_it->GetNumPages();

@@ -1650,6 +1680,9 @@ Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
                                             last_unmap_address + 1 - cur_address) /
                                    PageSize;

+                                // HACK: Manually close the pages.
+                                HACK_ClosePages(cur_address, cur_pages);
+
                                // Unmap.
                                ASSERT(Operate(cur_address, cur_pages, KMemoryPermission::None,
                                               OperationType::Unmap)
@@ -1670,7 +1703,7 @@ Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
                    // Release any remaining unmapped memory.
                    m_system.Kernel().MemoryManager().OpenFirst(pg_phys_addr, pg_pages);
                    m_system.Kernel().MemoryManager().Close(pg_phys_addr, pg_pages);
-                    for (++pg_it; pg_it != pg.end(); ++pg_it) {
+                    for (++pg_it; pg_it != pg.Nodes().end(); ++pg_it) {
                        m_system.Kernel().MemoryManager().OpenFirst(pg_it->GetAddress(),
                                                                    pg_it->GetNumPages());
                        m_system.Kernel().MemoryManager().Close(pg_it->GetAddress(),
@@ -1698,7 +1731,7 @@ Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
                            // Check if we're at the end of the physical block.
                            if (pg_pages == 0) {
                                // Ensure there are more pages to map.
-                                ASSERT(pg_it != pg.end());
+                                ASSERT(pg_it != pg.Nodes().end());

                                // Advance our physical block.
                                ++pg_it;
@@ -1709,7 +1742,10 @@ Result KPageTable::MapPhysicalMemory(VAddr address, size_t size) {
                            // Map whatever we can.
                            const size_t cur_pages = std::min(pg_pages, map_pages);
                            R_TRY(Operate(cur_address, cur_pages, KMemoryPermission::UserReadWrite,
-                                          OperationType::MapFirst, pg_phys_addr));
+                                          OperationType::Map, pg_phys_addr));
+
+                            // HACK: Manually open the pages.
+                            HACK_OpenPages(pg_phys_addr, cur_pages);

                            // Advance.
                            cur_address += cur_pages * PageSize;
@@ -1852,6 +1888,9 @@ Result KPageTable::UnmapPhysicalMemory(VAddr address, size_t size) {
                                              last_address + 1 - cur_address) /
                                     PageSize;

+            // HACK: Manually close the pages.
+            HACK_ClosePages(cur_address, cur_pages);
+
            // Unmap.
            ASSERT(Operate(cur_address, cur_pages, KMemoryPermission::None, OperationType::Unmap)
                       .IsSuccess());
@@ -1916,7 +1955,7 @@ Result KPageTable::MapMemory(VAddr dst_address, VAddr src_address, size_t size)
    R_TRY(dst_allocator_result);

    // Map the memory.
-    KPageGroup page_linked_list{m_kernel, m_block_info_manager};
+    KPageGroup page_linked_list;
    const size_t num_pages{size / PageSize};
    const KMemoryPermission new_src_perm = static_cast<KMemoryPermission>(
        KMemoryPermission::KernelRead | KMemoryPermission::NotMapped);
@@ -1983,14 +2022,14 @@ Result KPageTable::UnmapMemory(VAddr dst_address, VAddr src_address, size_t size
                                                     num_dst_allocator_blocks);
    R_TRY(dst_allocator_result);

-    KPageGroup src_pages{m_kernel, m_block_info_manager};
-    KPageGroup dst_pages{m_kernel, m_block_info_manager};
+    KPageGroup src_pages;
+    KPageGroup dst_pages;
    const size_t num_pages{size / PageSize};

    AddRegionToPages(src_address, num_pages, src_pages);
    AddRegionToPages(dst_address, num_pages, dst_pages);

-    R_UNLESS(dst_pages.IsEquivalentTo(src_pages), ResultInvalidMemoryRegion);
+    R_UNLESS(dst_pages.IsEqual(src_pages), ResultInvalidMemoryRegion);

    {
        auto block_guard = detail::ScopeExit([&] { MapPages(dst_address, dst_pages, dst_perm); });
@@ -2021,7 +2060,7 @@ Result KPageTable::MapPages(VAddr addr, const KPageGroup& page_linked_list,

    VAddr cur_addr{addr};

-    for (const auto& node : page_linked_list) {
+    for (const auto& node : page_linked_list.Nodes()) {
        if (const auto result{
                Operate(cur_addr, node.GetNumPages(), perm, OperationType::Map, node.GetAddress())};
            result.IsError()) {
@@ -2121,7 +2160,7 @@ Result KPageTable::UnmapPages(VAddr addr, const KPageGroup& page_linked_list) {

    VAddr cur_addr{addr};

-    for (const auto& node : page_linked_list) {
+    for (const auto& node : page_linked_list.Nodes()) {
        if (const auto result{Operate(cur_addr, node.GetNumPages(), KMemoryPermission::None,
                                      OperationType::Unmap)};
            result.IsError()) {
@@ -2488,13 +2527,13 @@ Result KPageTable::SetHeapSize(VAddr* out, size_t size) {
    R_UNLESS(memory_reservation.Succeeded(), ResultLimitReached);

    // Allocate pages for the heap extension.
-    KPageGroup pg{m_kernel, m_block_info_manager};
+    KPageGroup pg;
    R_TRY(m_system.Kernel().MemoryManager().AllocateAndOpen(
        &pg, allocation_size / PageSize,
        KMemoryManager::EncodeOption(m_memory_pool, m_allocation_option)));

    // Clear all the newly allocated pages.
-    for (const auto& it : pg) {
+    for (const auto& it : pg.Nodes()) {
        std::memset(m_system.DeviceMemory().GetPointer<void>(it.GetAddress()), m_heap_fill_value,
                    it.GetSize());
    }
@@ -2571,23 +2610,11 @@ ResultVal<VAddr> KPageTable::AllocateAndMapMemory(size_t needed_num_pages, size_
    if (is_map_only) {
        R_TRY(Operate(addr, needed_num_pages, perm, OperationType::Map, map_addr));
    } else {
-        // Create a page group tohold the pages we allocate.
-        KPageGroup pg{m_kernel, m_block_info_manager};
-
-        R_TRY(m_system.Kernel().MemoryManager().AllocateAndOpen(
-            &pg, needed_num_pages,
-            KMemoryManager::EncodeOption(m_memory_pool, m_allocation_option)));
-
-        // Ensure that the page group is closed when we're done working with it.
-        SCOPE_EXIT({ pg.Close(); });
-
-        // Clear all pages.
-        for (const auto& it : pg) {
-            std::memset(m_system.DeviceMemory().GetPointer<void>(it.GetAddress()),
-                        m_heap_fill_value, it.GetSize());
-        }
-
-        R_TRY(Operate(addr, needed_num_pages, pg, OperationType::MapGroup));
+        KPageGroup page_group;
+        R_TRY(m_system.Kernel().MemoryManager().AllocateForProcess(
+            &page_group, needed_num_pages,
+            KMemoryManager::EncodeOption(m_memory_pool, m_allocation_option), 0, 0));
+        R_TRY(Operate(addr, needed_num_pages, page_group, OperationType::MapGroup));
    }

    // Update the blocks.
@@ -2768,28 +2795,19 @@ Result KPageTable::Operate(VAddr addr, size_t num_pages, const KPageGroup& page_
    ASSERT(num_pages > 0);
    ASSERT(num_pages == page_group.GetNumPages());

-    switch (operation) {
-    case OperationType::MapGroup: {
-        // We want to maintain a new reference to every page in the group.
-        KScopedPageGroup spg(page_group);
+    for (const auto& node : page_group.Nodes()) {
+        const size_t size{node.GetNumPages() * PageSize};

-        for (const auto& node : page_group) {
-            const size_t size{node.GetNumPages() * PageSize};
-
-            // Map the pages.
+        switch (operation) {
+        case OperationType::MapGroup:
            m_system.Memory().MapMemoryRegion(*m_page_table_impl, addr, size, node.GetAddress());
-
-            addr += size;
+            break;
+        default:
+            ASSERT(false);
+            break;
        }

-        // We succeeded! We want to persist the reference to the pages.
-        spg.CancelClose();
-
-        break;
-    }
-    default:
-        ASSERT(false);
-        break;
+        addr += size;
    }

    R_SUCCEED();
@@ -2804,29 +2822,13 @@ Result KPageTable::Operate(VAddr addr, size_t num_pages, KMemoryPermission perm,
    ASSERT(ContainsPages(addr, num_pages));

    switch (operation) {
-    case OperationType::Unmap: {
-        // Ensure that any pages we track close on exit.
-        KPageGroup pages_to_close{m_kernel, this->GetBlockInfoManager()};
-        SCOPE_EXIT({ pages_to_close.CloseAndReset(); });
-
-        this->AddRegionToPages(addr, num_pages, pages_to_close);
+    case OperationType::Unmap:
        m_system.Memory().UnmapRegion(*m_page_table_impl, addr, num_pages * PageSize);
        break;
-    }
-    case OperationType::MapFirst:
    case OperationType::Map: {
        ASSERT(map_addr);
        ASSERT(Common::IsAligned(map_addr, PageSize));
        m_system.Memory().MapMemoryRegion(*m_page_table_impl, addr, num_pages * PageSize, map_addr);
-
-        // Open references to pages, if we should.
-        if (IsHeapPhysicalAddress(m_kernel.MemoryLayout(), map_addr)) {
-            if (operation == OperationType::MapFirst) {
-                m_kernel.MemoryManager().OpenFirst(map_addr, num_pages);
-            } else {
-                m_kernel.MemoryManager().Open(map_addr, num_pages);
-            }
-        }
        break;
    }
    case OperationType::Separate: {
--- a/src/core/hle/kernel/k_page_table.h
+++ b/src/core/hle/kernel/k_page_table.h
@@ -107,10 +107,6 @@ public:
        return *m_page_table_impl;
    }

-    KBlockInfoManager* GetBlockInfoManager() {
-        return m_block_info_manager;
-    }
-
    bool CanContain(VAddr addr, size_t size, KMemoryState state) const;

 protected:
@@ -265,6 +261,10 @@ private:
    void CleanupForIpcClientOnServerSetupFailure(PageLinkedList* page_list, VAddr address,
                                                 size_t size, KMemoryPermission prot_perm);

+    // HACK: These will be removed once we automatically manage page reference counts.
+    void HACK_OpenPages(PAddr phys_addr, size_t num_pages);
+    void HACK_ClosePages(VAddr virt_addr, size_t num_pages);
+
    mutable KLightLock m_general_lock;
    mutable KLightLock m_map_physical_memory_lock;

@@ -488,7 +488,6 @@ private:
    std::unique_ptr<Common::PageTable> m_page_table_impl;

    Core::System& m_system;
-    KernelCore& m_kernel;
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/k_shared_memory.cpp
+++ b/src/core/hle/kernel/k_shared_memory.cpp
@@ -13,7 +13,10 @@
 namespace Kernel {

 KSharedMemory::KSharedMemory(KernelCore& kernel_) : KAutoObjectWithSlabHeapAndContainer{kernel_} {}
-KSharedMemory::~KSharedMemory() = default;
+
+// Releases `size` of the PhysicalMemoryMax resource on the system resource limit.
+// NOTE(review): Finalize() also releases `size` of PhysicalMemoryMax through resource_limit;
+// confirm the two releases cannot both hit the same limit for one reservation.
+KSharedMemory::~KSharedMemory() {
+    kernel.GetSystemResourceLimit()->Release(LimitableResource::PhysicalMemoryMax, size);
+}

 Result KSharedMemory::Initialize(Core::DeviceMemory& device_memory_, KProcess* owner_process_,
                                 Svc::MemoryPermission owner_permission_,
@@ -46,8 +49,7 @@ Result KSharedMemory::Initialize(Core::DeviceMemory& device_memory_, KProcess* o
    R_UNLESS(physical_address != 0, ResultOutOfMemory);

    //! Insert the result into our page group.
-    page_group.emplace(kernel, &kernel.GetSystemSystemResource().GetBlockInfoManager());
-    page_group->AddBlock(physical_address, num_pages);
+    page_group.emplace(physical_address, num_pages);

    // Commit our reservation.
    memory_reservation.Commit();
@@ -60,7 +62,7 @@ Result KSharedMemory::Initialize(Core::DeviceMemory& device_memory_, KProcess* o
    is_initialized = true;

    // Clear all pages in the memory.
-    for (const auto& block : *page_group) {
+    for (const auto& block : page_group->Nodes()) {
        std::memset(device_memory_.GetPointer<void>(block.GetAddress()), 0, block.GetSize());
    }

@@ -69,8 +71,13 @@ Result KSharedMemory::Initialize(Core::DeviceMemory& device_memory_, KProcess* o

 void KSharedMemory::Finalize() {
    // Close and finalize the page group.
-    page_group->Close();
-    page_group->Finalize();
+    // page_group->Close();
+    // page_group->Finalize();
+
+    //! HACK: Manually close.
+    for (const auto& block : page_group->Nodes()) {
+        kernel.MemoryManager().Close(block.GetAddress(), block.GetNumPages());
+    }

    // Release the memory reservation.
    resource_limit->Release(LimitableResource::PhysicalMemoryMax, size);
--- a/src/core/hle/kernel/memory_types.h
+++ b/src/core/hle/kernel/memory_types.h
@@ -14,7 +14,4 @@ constexpr std::size_t PageSize{1 << PageBits};

 using Page = std::array<u8, PageSize>;

-using KPhysicalAddress = PAddr;
-using KProcessAddress = VAddr;
-
 } // namespace Kernel
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1485,7 +1485,7 @@ static Result MapProcessMemory(Core::System& system, VAddr dst_address, Handle p
             ResultInvalidMemoryRegion);

    // Create a new page group.
-    KPageGroup pg{system.Kernel(), dst_pt.GetBlockInfoManager()};
+    KPageGroup pg;
    R_TRY(src_pt.MakeAndOpenPageGroup(
        std::addressof(pg), src_address, size / PageSize, KMemoryState::FlagCanMapProcess,
        KMemoryState::FlagCanMapProcess, KMemoryPermission::None, KMemoryPermission::None,
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -22,15 +22,19 @@ namespace {

 namespace Service::NIFM {

+// This is nn::nifm::RequestState
+// IRequest in this file starts in NotSubmitted, moves NotSubmitted -> OnHold, and then moves
+// OnHold -> Accepted when a connection is available or OnHold -> Invalid when it is not.
 enum class RequestState : u32 {
    NotSubmitted = 1,
-    Error = 1, ///< The duplicate 1 is intentional; it means both not submitted and error on HW.
-    Pending = 2,
-    Connected = 3,
+    Invalid = 1, ///< The duplicate 1 is intentional; it means both not submitted and error on HW.
+    OnHold = 2,
+    Accepted = 3,
+    Blocking = 4,
 };

-enum class InternetConnectionType : u8 {
-    WiFi = 1,
+// This is nn::nifm::NetworkInterfaceType
+// Declared with a u32 underlying type. GetInternetConnectionStatus stores the value in a u8
+// field of its Output struct, so it casts the enumerator down explicitly.
+enum class NetworkInterfaceType : u32 {
+    Invalid = 0,
+    WiFi_Ieee80211 = 1,
    Ethernet = 2,
 };

@@ -42,14 +46,23 @@ enum class InternetConnectionStatus : u8 {
    Connected,
 };

+// This is nn::nifm::NetworkProfileType
+// Enumerators take the implicit values 0, 1 and 2, in declaration order.
+// Used as the type of NifmNetworkProfileData::network_profile_type.
+enum class NetworkProfileType : u32 {
+    User,
+    SsidList,
+    Temporary,
+};
+
+// This is nn::nifm::IpAddressSetting
 struct IpAddressSetting {
    bool is_automatic{};
-    Network::IPv4Address current_address{};
+    Network::IPv4Address ip_address{};
    Network::IPv4Address subnet_mask{};
-    Network::IPv4Address gateway{};
+    Network::IPv4Address default_gateway{};
 };
 static_assert(sizeof(IpAddressSetting) == 0xD, "IpAddressSetting has incorrect size.");

+// This is nn::nifm::DnsSetting
 struct DnsSetting {
    bool is_automatic{};
    Network::IPv4Address primary_dns{};
@@ -57,18 +70,26 @@ struct DnsSetting {
 };
 static_assert(sizeof(DnsSetting) == 0x9, "DnsSetting has incorrect size.");

+// This is nn::nifm::AuthenticationSetting
+// Embedded in ProxySetting as its `authentication` member. The layout is a 1-byte flag followed
+// by two 0x20-byte character buffers, 0x41 bytes in total with no padding (checked below).
+struct AuthenticationSetting {
+    bool is_enabled{};
+    std::array<char, 0x20> user{};
+    std::array<char, 0x20> password{};
+};
+static_assert(sizeof(AuthenticationSetting) == 0x41, "AuthenticationSetting has incorrect size.");
+
+// This is nn::nifm::ProxySetting
 struct ProxySetting {
-    bool enabled{};
+    bool is_enabled{};
    INSERT_PADDING_BYTES(1);
    u16 port{};
    std::array<char, 0x64> proxy_server{};
-    bool automatic_auth_enabled{};
-    std::array<char, 0x20> user{};
-    std::array<char, 0x20> password{};
+    AuthenticationSetting authentication{};
    INSERT_PADDING_BYTES(1);
 };
 static_assert(sizeof(ProxySetting) == 0xAA, "ProxySetting has incorrect size.");

+// This is nn::nifm::IpSettingData
 struct IpSettingData {
    IpAddressSetting ip_address_setting{};
    DnsSetting dns_setting{};
@@ -101,6 +122,7 @@ static_assert(sizeof(NifmWirelessSettingData) == 0x70,
              "NifmWirelessSettingData has incorrect size.");

 #pragma pack(push, 1)
+// This is nn::nifm::detail::sf::NetworkProfileData
 struct SfNetworkProfileData {
    IpSettingData ip_setting_data{};
    u128 uuid{};
@@ -114,13 +136,14 @@ struct SfNetworkProfileData {
 };
 static_assert(sizeof(SfNetworkProfileData) == 0x17C, "SfNetworkProfileData has incorrect size.");

+// This is nn::nifm::NetworkProfileData
 struct NifmNetworkProfileData {
    u128 uuid{};
    std::array<char, 0x40> network_name{};
-    u32 unknown_1{};
-    u32 unknown_2{};
-    u8 unknown_3{};
-    u8 unknown_4{};
+    NetworkProfileType network_profile_type{};
+    NetworkInterfaceType network_interface_type{};
+    bool is_auto_connect{};
+    bool is_large_capacity{};
    INSERT_PADDING_BYTES(2);
    NifmWirelessSettingData wireless_setting_data{};
    IpSettingData ip_setting_data{};
@@ -184,6 +207,7 @@ public:

        event1 = CreateKEvent(service_context, "IRequest:Event1");
        event2 = CreateKEvent(service_context, "IRequest:Event2");
+        state = RequestState::NotSubmitted;
    }

    ~IRequest() override {
@@ -196,7 +220,7 @@ private:
        LOG_WARNING(Service_NIFM, "(STUBBED) called");

        if (state == RequestState::NotSubmitted) {
-            UpdateState(RequestState::Pending);
+            UpdateState(RequestState::OnHold);
        }

        IPC::ResponseBuilder rb{ctx, 2};
@@ -219,14 +243,14 @@ private:
            switch (state) {
            case RequestState::NotSubmitted:
                return has_connection ? ResultSuccess : ResultNetworkCommunicationDisabled;
-            case RequestState::Pending:
+            case RequestState::OnHold:
                if (has_connection) {
-                    UpdateState(RequestState::Connected);
+                    UpdateState(RequestState::Accepted);
                } else {
-                    UpdateState(RequestState::Error);
+                    UpdateState(RequestState::Invalid);
                }
                return ResultPendingConnection;
-            case RequestState::Connected:
+            case RequestState::Accepted:
            default:
                return ResultSuccess;
            }
@@ -338,9 +362,9 @@ void IGeneralService::GetCurrentNetworkProfile(Kernel::HLERequestContext& ctx) {
            .ip_setting_data{
                .ip_address_setting{
                    .is_automatic{true},
-                    .current_address{Network::TranslateIPv4(net_iface->ip_address)},
+                    .ip_address{Network::TranslateIPv4(net_iface->ip_address)},
                    .subnet_mask{Network::TranslateIPv4(net_iface->subnet_mask)},
-                    .gateway{Network::TranslateIPv4(net_iface->gateway)},
+                    .default_gateway{Network::TranslateIPv4(net_iface->gateway)},
                },
                .dns_setting{
                    .is_automatic{true},
@@ -348,12 +372,14 @@ void IGeneralService::GetCurrentNetworkProfile(Kernel::HLERequestContext& ctx) {
                    .secondary_dns{1, 0, 0, 1},
                },
                .proxy_setting{
-                    .enabled{false},
+                    .is_enabled{false},
                    .port{},
                    .proxy_server{},
-                    .automatic_auth_enabled{},
-                    .user{},
-                    .password{},
+                    .authentication{
+                        .is_enabled{},
+                        .user{},
+                        .password{},
+                    },
                },
                .mtu{1500},
            },
@@ -370,7 +396,7 @@ void IGeneralService::GetCurrentNetworkProfile(Kernel::HLERequestContext& ctx) {
    // When we're connected to a room, spoof the hosts IP address
    if (auto room_member = network.GetRoomMember().lock()) {
        if (room_member->IsConnected()) {
-            network_profile_data.ip_setting_data.ip_address_setting.current_address =
+            network_profile_data.ip_setting_data.ip_address_setting.ip_address =
                room_member->GetFakeIpAddress();
        }
    }
@@ -444,9 +470,9 @@ void IGeneralService::GetCurrentIpConfigInfo(Kernel::HLERequestContext& ctx) {
        return IpConfigInfo{
            .ip_address_setting{
                .is_automatic{true},
-                .current_address{Network::TranslateIPv4(net_iface->ip_address)},
+                .ip_address{Network::TranslateIPv4(net_iface->ip_address)},
                .subnet_mask{Network::TranslateIPv4(net_iface->subnet_mask)},
-                .gateway{Network::TranslateIPv4(net_iface->gateway)},
+                .default_gateway{Network::TranslateIPv4(net_iface->gateway)},
            },
            .dns_setting{
                .is_automatic{true},
@@ -459,7 +485,7 @@ void IGeneralService::GetCurrentIpConfigInfo(Kernel::HLERequestContext& ctx) {
    // When we're connected to a room, spoof the hosts IP address
    if (auto room_member = network.GetRoomMember().lock()) {
        if (room_member->IsConnected()) {
-            ip_config_info.ip_address_setting.current_address = room_member->GetFakeIpAddress();
+            ip_config_info.ip_address_setting.ip_address = room_member->GetFakeIpAddress();
        }
    }

@@ -480,7 +506,7 @@ void IGeneralService::GetInternetConnectionStatus(Kernel::HLERequestContext& ctx
    LOG_WARNING(Service_NIFM, "(STUBBED) called");

    struct Output {
-        InternetConnectionType type{InternetConnectionType::WiFi};
+        u8 type{static_cast<u8>(NetworkInterfaceType::WiFi_Ieee80211)};
        u8 wifi_strength{3};
        InternetConnectionStatus state{InternetConnectionStatus::Connected};
    };
--- a/src/core/internal_network/network.cpp
+++ b/src/core/internal_network/network.cpp
@@ -117,6 +117,8 @@ Errno TranslateNativeError(int e) {
        return Errno::NETUNREACH;
    case WSAEMSGSIZE:
        return Errno::MSGSIZE;
+    case WSAETIMEDOUT:
+        return Errno::TIMEDOUT;
    default:
        UNIMPLEMENTED_MSG("Unimplemented errno={}", e);
        return Errno::OTHER;
@@ -211,6 +213,8 @@ Errno TranslateNativeError(int e) {
        return Errno::NETUNREACH;
    case EMSGSIZE:
        return Errno::MSGSIZE;
+    case ETIMEDOUT:
+        return Errno::TIMEDOUT;
    default:
        UNIMPLEMENTED_MSG("Unimplemented errno={}", e);
        return Errno::OTHER;
@@ -226,7 +230,7 @@ Errno GetAndLogLastError() {
    int e = errno;
 #endif
    const Errno err = TranslateNativeError(e);
-    if (err == Errno::AGAIN) {
+    if (err == Errno::AGAIN || err == Errno::TIMEDOUT) {
        return err;
    }
    LOG_ERROR(Network, "Socket operation error: {}", Common::NativeErrorToString(e));
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -436,7 +436,7 @@ struct Memory::Impl {
        }

        if (Settings::IsFastmemEnabled()) {
-            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
+            const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
            system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
        }

--- a/src/dedicated_room/CMakeLists.txt
+++ b/src/dedicated_room/CMakeLists.txt
@@ -1,8 +1,6 @@
 # SPDX-FileCopyrightText: 2017 Citra Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later

-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/CMakeModules)
-
 add_executable(yuzu-room
    precompiled_headers.h
    yuzu_room.cpp
@@ -17,7 +15,7 @@ if (ENABLE_WEB_SERVICE)
    target_link_libraries(yuzu-room PRIVATE web_service)
 endif()

-target_link_libraries(yuzu-room PRIVATE mbedtls mbedcrypto)
+target_link_libraries(yuzu-room PRIVATE MbedTLS::mbedcrypto)
 if (MSVC)
    target_link_libraries(yuzu-room PRIVATE getopt)
 endif()
--- a/src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_context_get_set.cpp
@@ -137,6 +137,15 @@ void EmitGetAttribute(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, Scal
    case IR::Attribute::VertexId:
        ctx.Add("MOV.F {}.x,{}.id;", inst, ctx.attrib_name);
        break;
+    case IR::Attribute::BaseInstance:
+        ctx.Add("MOV.F {}.x,{}.baseInstance;", inst, ctx.attrib_name);
+        break;
+    case IR::Attribute::BaseVertex:
+        ctx.Add("MOV.F {}.x,{}.baseVertex;", inst, ctx.attrib_name);
+        break;
+    case IR::Attribute::DrawID:
+        ctx.Add("MOV.F {}.x,{}.draw.id;", inst, ctx.attrib_name);
+        break;
    case IR::Attribute::FrontFace:
        ctx.Add("CMP.F {}.x,{}.facing.x,0,-1;", inst, ctx.attrib_name);
        break;
@@ -156,6 +165,15 @@ void EmitGetAttributeU32(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, S
    case IR::Attribute::VertexId:
        ctx.Add("MOV.S {}.x,{}.id;", inst, ctx.attrib_name);
        break;
+    case IR::Attribute::BaseInstance:
+        ctx.Add("MOV.S {}.x,{}.baseInstance;", inst, ctx.attrib_name);
+        break;
+    case IR::Attribute::BaseVertex:
+        ctx.Add("MOV.S {}.x,{}.baseVertex;", inst, ctx.attrib_name);
+        break;
+    case IR::Attribute::DrawID:
+        ctx.Add("MOV.S {}.x,{}.draw.id;", inst, ctx.attrib_name);
+        break;
    default:
        throw NotImplementedException("Get U32 attribute {}", attr);
    }
--- a/src/shader_recompiler/backend/glsl/emit_glsl.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl.cpp
@@ -219,7 +219,7 @@ std::string EmitGLSL(const Profile& profile, const RuntimeInfo& runtime_info, IR
    EmitContext ctx{program, bindings, profile, runtime_info};
    Precolor(program);
    EmitCode(ctx, program);
-    const std::string version{fmt::format("#version 450{}\n", GlslVersionSpecifier(ctx))};
+    const std::string version{fmt::format("#version 460{}\n", GlslVersionSpecifier(ctx))};
    ctx.header.insert(0, version);
    if (program.shared_memory_size > 0) {
        const auto requested_size{program.shared_memory_size};
--- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
@@ -234,6 +234,15 @@ void EmitGetAttribute(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr,
    case IR::Attribute::FrontFace:
        ctx.AddF32("{}=itof(gl_FrontFacing?-1:0);", inst);
        break;
+    case IR::Attribute::BaseInstance:
+        ctx.AddF32("{}=itof(gl_BaseInstance);", inst);
+        break;
+    case IR::Attribute::BaseVertex:
+        ctx.AddF32("{}=itof(gl_BaseVertex);", inst);
+        break;
+    case IR::Attribute::DrawID:
+        ctx.AddF32("{}=itof(gl_DrawID);", inst);
+        break;
    default:
        throw NotImplementedException("Get attribute {}", attr);
    }
@@ -250,6 +259,15 @@ void EmitGetAttributeU32(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, s
    case IR::Attribute::VertexId:
        ctx.AddU32("{}=uint(gl_VertexID);", inst);
        break;
+    case IR::Attribute::BaseInstance:
+        ctx.AddU32("{}=uint(gl_BaseInstance);", inst);
+        break;
+    case IR::Attribute::BaseVertex:
+        ctx.AddU32("{}=uint(gl_BaseVertex);", inst);
+        break;
+    case IR::Attribute::DrawID:
+        ctx.AddU32("{}=uint(gl_DrawID);", inst);
+        break;
    default:
        throw NotImplementedException("Get U32 attribute {}", attr);
    }
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -321,8 +321,12 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
    case IR::Attribute::PositionY:
    case IR::Attribute::PositionZ:
    case IR::Attribute::PositionW:
-        return ctx.OpLoad(ctx.F32[1], AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position,
-                                                  ctx.Const(element)));
+        return ctx.OpLoad(
+            ctx.F32[1],
+            ctx.need_input_position_indirect
+                ? AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, ctx.u32_zero_value,
+                              ctx.Const(element))
+                : AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position, ctx.Const(element)));
    case IR::Attribute::InstanceId:
        if (ctx.profile.support_vertex_instance_id) {
            return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id));
@@ -339,6 +343,12 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
            const Id base{ctx.OpLoad(ctx.U32[1], ctx.base_vertex)};
            return ctx.OpBitcast(ctx.F32[1], ctx.OpISub(ctx.U32[1], index, base));
        }
+    case IR::Attribute::BaseInstance:
+        return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.base_instance));
+    case IR::Attribute::BaseVertex:
+        return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.base_vertex));
+    case IR::Attribute::DrawID:
+        return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.draw_index));
    case IR::Attribute::FrontFace:
        return ctx.OpSelect(ctx.F32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
                            ctx.OpBitcast(ctx.F32[1], ctx.Const(std::numeric_limits<u32>::max())),
@@ -380,6 +390,12 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, Id) {
            const Id base{ctx.OpLoad(ctx.U32[1], ctx.base_vertex)};
            return ctx.OpISub(ctx.U32[1], index, base);
        }
+    case IR::Attribute::BaseInstance:
+        return ctx.OpLoad(ctx.U32[1], ctx.base_instance);
+    case IR::Attribute::BaseVertex:
+        return ctx.OpLoad(ctx.U32[1], ctx.base_vertex);
+    case IR::Attribute::DrawID:
+        return ctx.OpLoad(ctx.U32[1], ctx.draw_index);
    default:
        throw NotImplementedException("Read U32 attribute {}", attr);
    }
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -58,11 +58,10 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) {
        ctx.OpGroupNonUniformShuffle(ctx.U32[1], SubgroupScope(ctx), value, src_thread_id), value);
 }

-Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) {
-    const Id thirty_two{ctx.Const(32u)};
-    const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)};
-    const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
-    return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+Id AddPartitionBase(EmitContext& ctx, Id thread_id) { // Returns thread_id + partition base
+    const Id partition_idx{ctx.OpShiftRightLogical(ctx.U32[1], GetThreadId(ctx), ctx.Const(5u))};
+    const Id partition_base{ctx.OpShiftLeftLogical(ctx.U32[1], partition_idx, ctx.Const(5u))};
+    return ctx.OpIAdd(ctx.U32[1], thread_id, partition_base); // base = (GetThreadId >> 5) << 5
 }
 } // Anonymous namespace

@@ -145,64 +144,63 @@ Id EmitSubgroupGeMask(EmitContext& ctx) {
 Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                    Id segmentation_mask) {
    const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)};
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        const Id thirty_two{ctx.Const(32u)};
-        const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)};
-        const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)};
-        const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
-        index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index);
-        clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
-    }
+    const Id thread_id{EmitLaneId(ctx)};
    const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)};
    const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)};

    const Id lhs{ctx.OpBitwiseAnd(ctx.U32[1], index, not_seg_mask)};
-    const Id src_thread_id{ctx.OpBitwiseOr(ctx.U32[1], lhs, min_thread_id)};
+    Id src_thread_id{ctx.OpBitwiseOr(ctx.U32[1], lhs, min_thread_id)};
    const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};

+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        src_thread_id = AddPartitionBase(ctx, src_thread_id);
+    }
+
    SetInBoundsFlag(inst, in_range);
    return SelectValue(ctx, in_range, value, src_thread_id);
 }

 Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                 Id segmentation_mask) {
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        clamp = GetUpperClamp(ctx, thread_id, clamp);
-    }
+    const Id thread_id{EmitLaneId(ctx)};
    const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
-    const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
+    Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
    const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)};

+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        src_thread_id = AddPartitionBase(ctx, src_thread_id);
+    }
+
    SetInBoundsFlag(inst, in_range);
    return SelectValue(ctx, in_range, value, src_thread_id);
 }

 Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                   Id segmentation_mask) {
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        clamp = GetUpperClamp(ctx, thread_id, clamp);
-    }
+    const Id thread_id{EmitLaneId(ctx)};
    const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
-    const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
+    Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
    const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};

+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        src_thread_id = AddPartitionBase(ctx, src_thread_id);
+    }
+
    SetInBoundsFlag(inst, in_range);
    return SelectValue(ctx, in_range, value, src_thread_id);
 }

 Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                        Id segmentation_mask) {
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        clamp = GetUpperClamp(ctx, thread_id, clamp);
-    }
+    const Id thread_id{EmitLaneId(ctx)};
    const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
-    const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
+    Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
    const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};

+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        src_thread_id = AddPartitionBase(ctx, src_thread_id);
+    }
+
    SetInBoundsFlag(inst, in_range);
    return SelectValue(ctx, in_range, value, src_thread_id);
 }
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -544,7 +544,7 @@ void EmitContext::DefineCommonTypes(const Info& info) {
        U16 = Name(TypeInt(16, false), "u16");
        S16 = Name(TypeInt(16, true), "s16");
    }
-    if (info.uses_int64) {
+    if (info.uses_int64 && profile.support_int64) {
        AddCapability(spv::Capability::Int64);
        U64 = Name(TypeInt(64, false), "u64");
    }
@@ -721,9 +721,21 @@ void EmitContext::DefineAttributeMemAccess(const Info& info) {
        size_t label_index{0};
        if (info.loads.AnyComponent(IR::Attribute::PositionX)) {
            AddLabel(labels[label_index]);
-            const Id pointer{is_array
-                                 ? OpAccessChain(input_f32, input_position, vertex, masked_index)
-                                 : OpAccessChain(input_f32, input_position, masked_index)};
+            const Id pointer{[&]() {
+                if (need_input_position_indirect) {
+                    if (is_array)
+                        return OpAccessChain(input_f32, input_position, vertex, u32_zero_value,
+                                             masked_index);
+                    else
+                        return OpAccessChain(input_f32, input_position, u32_zero_value,
+                                             masked_index);
+                } else {
+                    if (is_array)
+                        return OpAccessChain(input_f32, input_position, vertex, masked_index);
+                    else
+                        return OpAccessChain(input_f32, input_position, masked_index);
+                }
+            }()};
            const Id result{OpLoad(F32[1], pointer)};
            OpReturnValue(result);
            ++label_index;
@@ -1367,30 +1379,56 @@ void EmitContext::DefineInputs(const IR::Program& program) {
        Decorate(layer, spv::Decoration::Flat);
    }
    if (loads.AnyComponent(IR::Attribute::PositionX)) {
-        const bool is_fragment{stage != Stage::Fragment};
-        const spv::BuiltIn built_in{is_fragment ? spv::BuiltIn::Position : spv::BuiltIn::FragCoord};
-        input_position = DefineInput(*this, F32[4], true, built_in);
-        if (profile.support_geometry_shader_passthrough) {
-            if (info.passthrough.AnyComponent(IR::Attribute::PositionX)) {
-                Decorate(input_position, spv::Decoration::PassthroughNV);
+        const bool is_fragment{stage == Stage::Fragment};
+        if (!is_fragment && profile.has_broken_spirv_position_input) {
+            need_input_position_indirect = true;
+
+            const Id input_position_struct = TypeStruct(F32[4]);
+            input_position = DefineInput(*this, input_position_struct, true);
+
+            MemberDecorate(input_position_struct, 0, spv::Decoration::BuiltIn,
+                           static_cast<unsigned>(spv::BuiltIn::Position));
+            Decorate(input_position_struct, spv::Decoration::Block);
+        } else {
+            const spv::BuiltIn built_in{is_fragment ? spv::BuiltIn::FragCoord
+                                                    : spv::BuiltIn::Position};
+            input_position = DefineInput(*this, F32[4], true, built_in);
+
+            if (profile.support_geometry_shader_passthrough) {
+                if (info.passthrough.AnyComponent(IR::Attribute::PositionX)) {
+                    Decorate(input_position, spv::Decoration::PassthroughNV);
+                }
            }
        }
    }
    if (loads[IR::Attribute::InstanceId]) {
        if (profile.support_vertex_instance_id) {
            instance_id = DefineInput(*this, U32[1], true, spv::BuiltIn::InstanceId);
+            if (loads[IR::Attribute::BaseInstance]) { // base_instance must use BaseInstance
+                base_instance = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseInstance);
+            }
        } else {
            instance_index = DefineInput(*this, U32[1], true, spv::BuiltIn::InstanceIndex);
            base_instance = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseInstance);
        }
+    } else if (loads[IR::Attribute::BaseInstance]) {
+        base_instance = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseInstance);
    }
    if (loads[IR::Attribute::VertexId]) {
        if (profile.support_vertex_instance_id) {
            vertex_id = DefineInput(*this, U32[1], true, spv::BuiltIn::VertexId);
+            if (loads[IR::Attribute::BaseVertex]) {
+                base_vertex = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
+            }
        } else {
            vertex_index = DefineInput(*this, U32[1], true, spv::BuiltIn::VertexIndex);
            base_vertex = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
        }
+    } else if (loads[IR::Attribute::BaseVertex]) {
+        base_vertex = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
+    }
+    if (loads[IR::Attribute::DrawID]) {
+        draw_index = DefineInput(*this, U32[1], true, spv::BuiltIn::DrawIndex);
    }
    if (loads[IR::Attribute::FrontFace]) {
        front_face = DefineInput(*this, U1, true, spv::BuiltIn::FrontFacing);
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -218,6 +218,7 @@ public:
    Id base_instance{};
    Id vertex_id{};
    Id vertex_index{};
+    Id draw_index{};
    Id base_vertex{};
    Id front_face{};
    Id point_coord{};
@@ -279,6 +280,7 @@ public:
    Id write_global_func_u32x2{};
    Id write_global_func_u32x4{};

+    bool need_input_position_indirect{};
    Id input_position{};
    std::array<Id, 32> input_generics{};

--- a/src/shader_recompiler/environment.h
+++ b/src/shader_recompiler/environment.h
@@ -34,6 +34,11 @@ public:

    [[nodiscard]] virtual std::array<u32, 3> WorkgroupSize() const = 0;

+    [[nodiscard]] virtual bool HasHLEMacroState() const = 0;
+
+    [[nodiscard]] virtual std::optional<ReplaceConstant> GetReplaceConstBuffer(u32 bank,
+                                                                               u32 offset) = 0;
+
    virtual void Dump(u64 hash) = 0;

    [[nodiscard]] const ProgramHeader& SPH() const noexcept {
@@ -52,11 +57,16 @@ public:
        return start_address;
    }

+    [[nodiscard]] bool IsPropietaryDriver() const noexcept {
+        return is_propietary_driver;
+    }
+
 protected:
    ProgramHeader sph{};
    std::array<u32, 8> gp_passthrough_mask{};
    Stage stage{};
    u32 start_address{};
+    bool is_propietary_driver{};
 };

 } // namespace Shader
--- a/src/shader_recompiler/frontend/ir/attribute.cpp
+++ b/src/shader_recompiler/frontend/ir/attribute.cpp
@@ -446,6 +446,12 @@ std::string NameOf(Attribute attribute) {
        return "ViewportMask";
    case Attribute::FrontFace:
        return "FrontFace";
+    case Attribute::BaseInstance:
+        return "BaseInstance";
+    case Attribute::BaseVertex:
+        return "BaseVertex";
+    case Attribute::DrawID:
+        return "DrawID";
    }
    return fmt::format("<reserved attribute {}>", static_cast<int>(attribute));
 }
--- a/src/shader_recompiler/frontend/ir/attribute.h
+++ b/src/shader_recompiler/frontend/ir/attribute.h
@@ -219,6 +219,11 @@ enum class Attribute : u64 {
    FixedFncTexture9Q = 231,
    ViewportMask = 232,
    FrontFace = 255,
+
+    // Implementation attributes
+    BaseInstance = 256,
+    BaseVertex = 257,
+    DrawID = 258,
 };

 constexpr size_t NUM_GENERICS = 32;
--- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
@@ -294,6 +294,14 @@ F32 IREmitter::GetAttribute(IR::Attribute attribute, const U32& vertex) {
    return Inst<F32>(Opcode::GetAttribute, attribute, vertex);
 }

+U32 IREmitter::GetAttributeU32(IR::Attribute attribute) { // Overload without a vertex operand
+    return GetAttributeU32(attribute, Imm32(0)); // Forwards with an immediate 0 as the vertex
+}
+
+U32 IREmitter::GetAttributeU32(IR::Attribute attribute, const U32& vertex) {
+    return Inst<U32>(Opcode::GetAttributeU32, attribute, vertex); // U32-typed attribute read
+}
+
 void IREmitter::SetAttribute(IR::Attribute attribute, const F32& value, const U32& vertex) {
    Inst(Opcode::SetAttribute, attribute, value, vertex);
 }
--- a/src/shader_recompiler/frontend/ir/ir_emitter.h
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.h
@@ -74,6 +74,8 @@ public:

    [[nodiscard]] F32 GetAttribute(IR::Attribute attribute);
    [[nodiscard]] F32 GetAttribute(IR::Attribute attribute, const U32& vertex);
+    [[nodiscard]] U32 GetAttributeU32(IR::Attribute attribute);
+    [[nodiscard]] U32 GetAttributeU32(IR::Attribute attribute, const U32& vertex);
    void SetAttribute(IR::Attribute attribute, const F32& value, const U32& vertex);

    [[nodiscard]] F32 GetAttributeIndexed(const U32& phys_address);
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -171,6 +171,70 @@ std::map<IR::Attribute, IR::Attribute> GenerateLegacyToGenericMappings(
    }
    return mapping;
 }
+
+void EmitGeometryPassthrough(IR::IREmitter& ir, const IR::Program& program,
+                             const Shader::VaryingState& passthrough_mask,
+                             bool passthrough_position,
+                             std::optional<IR::Attribute> passthrough_layer_attr) {
+    for (u32 i = 0; i < program.output_vertices; i++) {
+        // Assign generics from input
+        for (u32 j = 0; j < 32; j++) {
+            if (!passthrough_mask.Generic(j)) {
+                continue;
+            }
+
+            const IR::Attribute attr = IR::Attribute::Generic0X + (j * 4);
+            ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0));
+        }
+
+        if (passthrough_position) {
+            // Assign position from input
+            const IR::Attribute attr = IR::Attribute::PositionX;
+            ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0));
+            ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0));
+        }
+
+        if (passthrough_layer_attr) {
+            // Assign layer
+            ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(*passthrough_layer_attr),
+                            ir.Imm32(0));
+        }
+
+        // Emit vertex
+        ir.EmitVertex(ir.Imm32(0));
+    }
+    ir.EndPrimitive(ir.Imm32(0));
+}
+
+u32 GetOutputTopologyVertices(OutputTopology output_topology) { // Value for output_vertices
+    switch (output_topology) {
+    case OutputTopology::PointList:
+        return 1;
+    case OutputTopology::LineStrip:
+        return 2;
+    default: // Any topology other than PointList and LineStrip
+        return 3;
+    }
+}
+
+void LowerGeometryPassthrough(const IR::Program& program, const HostTranslateInfo& host_info) {
+    for (IR::Block* const block : program.blocks) { // Visit every instruction of every block
+        for (IR::Inst& inst : block->Instructions()) {
+            if (inst.GetOpcode() == IR::Opcode::Epilogue) { // Only Epilogue is handled
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                EmitGeometryPassthrough( // Empty last argument: no layer attribute is passed
+                    ir, program, program.info.passthrough,
+                    program.info.passthrough.AnyComponent(IR::Attribute::PositionX), {});
+            }
+        }
+    }
+} // NOTE(review): host_info is not read in this body - confirm it is intentionally unused
+
 } // Anonymous namespace

 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
@@ -198,6 +262,11 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
            for (size_t i = 0; i < program.info.passthrough.mask.size(); ++i) {
                program.info.passthrough.mask[i] = ((mask[i / 32] >> (i % 32)) & 1) == 0;
            }
+
+            if (!host_info.support_geometry_shader_passthrough) {
+                program.output_vertices = GetOutputTopologyVertices(program.output_topology);
+                LowerGeometryPassthrough(program, host_info);
+            }
        }
        break;
    }
@@ -219,11 +288,11 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
    }
    Optimization::SsaRewritePass(program);

-    Optimization::ConstantPropagationPass(program);
+    Optimization::ConstantPropagationPass(env, program);

    Optimization::PositionPass(env, program);

-    Optimization::GlobalMemoryToStorageBufferPass(program);
+    Optimization::GlobalMemoryToStorageBufferPass(program, host_info);
    Optimization::TexturePass(env, program, host_info);

    if (Settings::values.resolution_info.active) {
@@ -342,17 +411,7 @@ IR::Program GenerateGeometryPassthrough(ObjectPool<IR::Inst>& inst_pool,
    IR::Program program;
    program.stage = Stage::Geometry;
    program.output_topology = output_topology;
-    switch (output_topology) {
-    case OutputTopology::PointList:
-        program.output_vertices = 1;
-        break;
-    case OutputTopology::LineStrip:
-        program.output_vertices = 2;
-        break;
-    default:
-        program.output_vertices = 3;
-        break;
-    }
+    program.output_vertices = GetOutputTopologyVertices(output_topology);

    program.is_geometry_passthrough = false;
    program.info.loads.mask = source_program.info.stores.mask;
@@ -366,35 +425,8 @@ IR::Program GenerateGeometryPassthrough(ObjectPool<IR::Inst>& inst_pool,
    node.data.block = current_block;

    IR::IREmitter ir{*current_block};
-    for (u32 i = 0; i < program.output_vertices; i++) {
-        // Assign generics from input
-        for (u32 j = 0; j < 32; j++) {
-            if (!program.info.stores.Generic(j)) {
-                continue;
-            }
-
-            const IR::Attribute attr = IR::Attribute::Generic0X + (j * 4);
-            ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0));
-            ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0));
-            ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0));
-            ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0));
-        }
-
-        // Assign position from input
-        const IR::Attribute attr = IR::Attribute::PositionX;
-        ir.SetAttribute(attr + 0, ir.GetAttribute(attr + 0, ir.Imm32(i)), ir.Imm32(0));
-        ir.SetAttribute(attr + 1, ir.GetAttribute(attr + 1, ir.Imm32(i)), ir.Imm32(0));
-        ir.SetAttribute(attr + 2, ir.GetAttribute(attr + 2, ir.Imm32(i)), ir.Imm32(0));
-        ir.SetAttribute(attr + 3, ir.GetAttribute(attr + 3, ir.Imm32(i)), ir.Imm32(0));
-
-        // Assign layer
-        ir.SetAttribute(IR::Attribute::Layer, ir.GetAttribute(source_program.info.emulated_layer),
-                        ir.Imm32(0));
-
-        // Emit vertex
-        ir.EmitVertex(ir.Imm32(0));
-    }
-    ir.EndPrimitive(ir.Imm32(0));
+    EmitGeometryPassthrough(ir, program, program.info.stores, true,
+                            source_program.info.emulated_layer);

    IR::Block* return_block{block_pool.Create(inst_pool)};
    IR::IREmitter{*return_block}.Epilogue();
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -15,6 +15,9 @@ struct HostTranslateInfo {
    bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered
    bool support_snorm_render_buffer{};  ///< True when the device supports SNORM render buffers
    bool support_viewport_index_layer{}; ///< True when the device supports gl_Layer in VS
+    u32 min_ssbo_alignment{};            ///< Minimum alignment supported by the device for SSBOs
+    bool support_geometry_shader_passthrough{}; ///< True when the device supports geometry
+                                                ///< passthrough shaders
 };

 } // namespace Shader
--- a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -7,6 +7,7 @@
 #include <type_traits>

 #include "common/bit_cast.h"
+#include "shader_recompiler/environment.h"
 #include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/ir/ir_emitter.h"
 #include "shader_recompiler/frontend/ir/value.h"
@@ -515,6 +516,9 @@ void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
            case IR::Attribute::PrimitiveId:
            case IR::Attribute::InstanceId:
            case IR::Attribute::VertexId:
+            case IR::Attribute::BaseVertex:
+            case IR::Attribute::BaseInstance:
+            case IR::Attribute::DrawID:
                break;
            default:
                return;
@@ -644,7 +648,63 @@ void FoldFSwizzleAdd(IR::Block& block, IR::Inst& inst) {
    }
 }

-void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
+void FoldConstBuffer(Environment& env, IR::Block& block, IR::Inst& inst) { // Fold a cbuf read into an attribute read
+    const IR::Value bank{inst.Arg(0)}; // Operand 0 is treated as the constant buffer bank
+    const IR::Value offset{inst.Arg(1)}; // Operand 1 is treated as the offset inside that bank
+    if (!bank.IsImmediate() || !offset.IsImmediate()) {
+        return; // Only immediate bank/offset pairs can be looked up in the environment
+    }
+    const auto bank_value = bank.U32();
+    const auto offset_value = offset.U32();
+    auto replacement = env.GetReplaceConstBuffer(bank_value, offset_value);
+    if (!replacement) {
+        return; // The environment has no replacement for this location: keep the cbuf read
+    }
+    const auto new_attribute = [replacement]() { // Translate ReplaceConstant into IR::Attribute
+        switch (*replacement) {
+        case ReplaceConstant::BaseInstance:
+            return IR::Attribute::BaseInstance;
+        case ReplaceConstant::BaseVertex:
+            return IR::Attribute::BaseVertex;
+        case ReplaceConstant::DrawID:
+            return IR::Attribute::DrawID;
+        default: // Any other ReplaceConstant value is rejected instead of being guessed
+            throw NotImplementedException("Not implemented replacement variable {}", *replacement);
+        }
+    }();
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    if (inst.GetOpcode() == IR::Opcode::GetCbufU32) {
+        inst.ReplaceUsesWith(ir.GetAttributeU32(new_attribute)); // U32 read uses the U32 getter
+    } else {
+        inst.ReplaceUsesWith(ir.GetAttribute(new_attribute)); // Every other opcode reaching here
+    }
+}
+
+void FoldDriverConstBuffer(Environment& env, IR::Block& block, IR::Inst& inst, u32 which_bank,
+                           u32 offset_start = 0, u32 offset_end = std::numeric_limits<u16>::max()) {
+    const IR::Value bank{inst.Arg(0)}; // Folds reads of bank which_bank into the value env reports
+    const IR::Value offset{inst.Arg(1)};
+    if (!bank.IsImmediate() || !offset.IsImmediate()) {
+        return; // Non-immediate bank or offset: nothing can be folded
+    }
+    const auto bank_value = bank.U32();
+    if (bank_value != which_bank) {
+        return; // Only the requested bank is folded
+    }
+    const auto offset_value = offset.U32();
+    if (offset_value < offset_start || offset_value >= offset_end) {
+        return; // Accepted offsets form the half-open range [offset_start, offset_end)
+    }
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; // NOTE(review): unused below
+    if (inst.GetOpcode() == IR::Opcode::GetCbufU32) {
+        inst.ReplaceUsesWith(IR::Value{env.ReadCbufValue(bank_value, offset_value)}); // Value as read
+    } else {
+        inst.ReplaceUsesWith(
+            IR::Value{Common::BitCast<f32>(env.ReadCbufValue(bank_value, offset_value))}); // As f32
+    }
+}
+
+void ConstantPropagation(Environment& env, IR::Block& block, IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::GetRegister:
        return FoldGetRegister(inst);
@@ -789,18 +849,28 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
                                    IR::Opcode::CompositeInsertF16x4);
    case IR::Opcode::FSwizzleAdd:
        return FoldFSwizzleAdd(block, inst);
+    case IR::Opcode::GetCbufF32:
+    case IR::Opcode::GetCbufU32:
+        if (env.HasHLEMacroState()) {
+            FoldConstBuffer(env, block, inst);
+        }
+        if (env.IsPropietaryDriver()) {
+            FoldDriverConstBuffer(env, block, inst, 1);
+        }
+        break;
    default:
        break;
    }
 }
+
 } // Anonymous namespace

-void ConstantPropagationPass(IR::Program& program) {
+void ConstantPropagationPass(Environment& env, IR::Program& program) {
    const auto end{program.post_order_blocks.rend()};
    for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) {
        IR::Block* const block{*it};
        for (IR::Inst& inst : block->Instructions()) {
-            ConstantPropagation(*block, inst);
+            ConstantPropagation(env, *block, inst);
        }
    }
 }
--- a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
+++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
@@ -11,6 +11,7 @@
 #include "shader_recompiler/frontend/ir/breadth_first_search.h"
 #include "shader_recompiler/frontend/ir/ir_emitter.h"
 #include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/host_translate_info.h"
 #include "shader_recompiler/ir_opt/passes.h"

 namespace Shader::Optimization {
@@ -402,7 +403,7 @@ void CollectStorageBuffers(IR::Block& block, IR::Inst& inst, StorageInfo& info)
 }

 /// Returns the offset in indices (not bytes) for an equivalent storage instruction
-IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer) {
+IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer, u32 alignment) {
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
    IR::U32 offset;
    if (const std::optional<LowAddrInfo> low_addr{TrackLowAddress(&inst)}) {
@@ -415,7 +416,10 @@ IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer
    }
    // Subtract the least significant 32 bits from the guest offset. The result is the storage
    // buffer offset in bytes.
-    const IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))};
+    IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))};
+
+    // Align the offset base to match the host alignment requirements
+    low_cbuf = ir.BitwiseAnd(low_cbuf, ir.Imm32(~(alignment - 1U)));
    return ir.ISub(offset, low_cbuf);
 }

@@ -510,7 +514,7 @@ void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
 }
 } // Anonymous namespace

-void GlobalMemoryToStorageBufferPass(IR::Program& program) {
+void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateInfo& host_info) {
    StorageInfo info;
    for (IR::Block* const block : program.post_order_blocks) {
        for (IR::Inst& inst : block->Instructions()) {
@@ -534,7 +538,8 @@ void GlobalMemoryToStorageBufferPass(IR::Program& program) {
        const IR::U32 index{IR::Value{static_cast<u32>(info.set.index_of(it))}};
        IR::Block* const block{storage_inst.block};
        IR::Inst* const inst{storage_inst.inst};
-        const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer)};
+        const IR::U32 offset{
+            StorageOffset(*block, *inst, storage_buffer, host_info.min_ssbo_alignment)};
        Replace(*block, *inst, index, offset);
    }
 }
--- a/src/shader_recompiler/ir_opt/passes.h
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -13,9 +13,9 @@ struct HostTranslateInfo;
 namespace Shader::Optimization {

 void CollectShaderInfoPass(Environment& env, IR::Program& program);
-void ConstantPropagationPass(IR::Program& program);
+void ConstantPropagationPass(Environment& env, IR::Program& program);
 void DeadCodeEliminationPass(IR::Program& program);
-void GlobalMemoryToStorageBufferPass(IR::Program& program);
+void GlobalMemoryToStorageBufferPass(IR::Program& program, const HostTranslateInfo& host_info);
 void IdentityRemovalPass(IR::Program& program);
 void LowerFp16ToFp32(IR::Program& program);
 void LowerInt64ToInt32(IR::Program& program);
--- a/src/shader_recompiler/profile.h
+++ b/src/shader_recompiler/profile.h
@@ -55,6 +55,8 @@ struct Profile {

    /// OpFClamp is broken and OpFMax + OpFMin should be used instead
    bool has_broken_spirv_clamp{};
+    /// The Position builtin needs to be wrapped in a struct when used as an input
+    bool has_broken_spirv_position_input{};
    /// Offset image operands with an unsigned type do not work
    bool has_broken_unsigned_image_offsets{};
    /// Signed instructions with unsigned data types are misinterpreted
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -16,6 +16,12 @@

 namespace Shader {

+enum class ReplaceConstant : u32 { // Kinds of value a constant buffer read can be replaced with
+    BaseInstance,
+    BaseVertex,
+    DrawID,
+};
+
 enum class TextureType : u32 {
    Color1D,
    ColorArray1D,
@@ -59,6 +65,8 @@ enum class Interpolation {
 struct ConstantBufferDescriptor {
    u32 index;
    u32 count;
+
+    auto operator<=>(const ConstantBufferDescriptor&) const = default;
 };

 struct StorageBufferDescriptor {
@@ -66,6 +74,8 @@ struct StorageBufferDescriptor {
    u32 cbuf_offset;
    u32 count;
    bool is_written;
+
+    auto operator<=>(const StorageBufferDescriptor&) const = default;
 };

 struct TextureBufferDescriptor {
@@ -78,6 +88,8 @@ struct TextureBufferDescriptor {
    u32 secondary_shift_left;
    u32 count;
    u32 size_shift;
+
+    auto operator<=>(const TextureBufferDescriptor&) const = default;
 };
 using TextureBufferDescriptors = boost::container::small_vector<TextureBufferDescriptor, 6>;

@@ -89,6 +101,8 @@ struct ImageBufferDescriptor {
    u32 cbuf_offset;
    u32 count;
    u32 size_shift;
+
+    auto operator<=>(const ImageBufferDescriptor&) const = default;
 };
 using ImageBufferDescriptors = boost::container::small_vector<ImageBufferDescriptor, 2>;

@@ -104,6 +118,8 @@ struct TextureDescriptor {
    u32 secondary_shift_left;
    u32 count;
    u32 size_shift;
+
+    auto operator<=>(const TextureDescriptor&) const = default;
 };
 using TextureDescriptors = boost::container::small_vector<TextureDescriptor, 12>;

@@ -116,6 +132,8 @@ struct ImageDescriptor {
    u32 cbuf_offset;
    u32 count;
    u32 size_shift;
+
+    auto operator<=>(const ImageDescriptor&) const = default;
 };
 using ImageDescriptors = boost::container::small_vector<ImageDescriptor, 4>;

--- a/src/shader_recompiler/varying_state.h
+++ b/src/shader_recompiler/varying_state.h
@@ -11,7 +11,7 @@
 namespace Shader {

 struct VaryingState {
-    std::bitset<256> mask{};
+    std::bitset<512> mask{};

    void Set(IR::Attribute attribute, bool state = true) {
        mask[static_cast<size_t>(attribute)] = state;
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -7,6 +7,7 @@ add_executable(tests
    common/fibers.cpp
    common/host_memory.cpp
    common/param_package.cpp
+    common/range_map.cpp
    common/ring_buffer.cpp
    common/scratch_buffer.cpp
    common/unique_function.cpp
--- a/src/tests/common/range_map.cpp
+++ b/src/tests/common/range_map.cpp
@@ -0,0 +1,70 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include <stdexcept>
+
+#include <catch2/catch.hpp>
+
+#include "common/range_map.h"
+
+enum class MappedEnum : u32 { // Values stored in the range map under test
+    Invalid = 0, // Passed to the RangeMap constructor in the test below
+    Valid_1 = 1,
+    Valid_2 = 2,
+    Valid_3 = 3,
+};
+
+TEST_CASE("Range Map: Setup", "[video_core]") { // NOTE(review): tag is [video_core] but the file is in tests/common
+    Common::RangeMap<u64, MappedEnum> my_map(MappedEnum::Invalid); // Invalid is expected for unmapped addresses
+    my_map.Map(3000, 3500, MappedEnum::Valid_1);
+    my_map.Unmap(3200, 3600); // Expected to leave only [3000, 3200) mapped
+    my_map.Map(4000, 4500, MappedEnum::Valid_2);
+    my_map.Map(4200, 4400, MappedEnum::Valid_2); // Same value inside an existing range
+    my_map.Map(4200, 4400, MappedEnum::Valid_1); // Different value: expected to split the range
+    REQUIRE(my_map.GetContinousSizeFrom(4200) == 200);
+    REQUIRE(my_map.GetContinousSizeFrom(3000) == 200);
+    REQUIRE(my_map.GetContinousSizeFrom(2900) == 0); // Query starts on an unmapped address
+
+    REQUIRE(my_map.GetValueAt(2900) == MappedEnum::Invalid);
+    REQUIRE(my_map.GetValueAt(3100) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(3000) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(3200) == MappedEnum::Invalid); // First address removed by the Unmap
+
+    REQUIRE(my_map.GetValueAt(4199) == MappedEnum::Valid_2);
+    REQUIRE(my_map.GetValueAt(4200) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(4400) == MappedEnum::Valid_2); // End bound of Map is expected to be exclusive
+    REQUIRE(my_map.GetValueAt(4500) == MappedEnum::Invalid);
+    REQUIRE(my_map.GetValueAt(4600) == MappedEnum::Invalid);
+
+    my_map.Unmap(0, 6000); // Covers every range mapped so far
+    for (u64 address = 0; address < 10000; address += 1000) {
+        REQUIRE(my_map.GetContinousSizeFrom(address) == 0);
+    }
+
+    my_map.Map(1000, 3000, MappedEnum::Valid_1);
+    my_map.Map(4000, 5000, MappedEnum::Valid_1);
+    my_map.Map(2500, 4100, MappedEnum::Valid_1); // Bridges the two ranges above with the same value
+    REQUIRE(my_map.GetContinousSizeFrom(1000) == 4000); // Expected to behave as one range [1000, 5000)
+
+    my_map.Map(1000, 3000, MappedEnum::Valid_1);
+    my_map.Map(4000, 5000, MappedEnum::Valid_2);
+    my_map.Map(2500, 4100, MappedEnum::Valid_3); // Overlaps both neighbours with a different value
+    REQUIRE(my_map.GetContinousSizeFrom(1000) == 1500);
+    REQUIRE(my_map.GetContinousSizeFrom(2500) == 1600);
+    REQUIRE(my_map.GetContinousSizeFrom(4100) == 900);
+    REQUIRE(my_map.GetValueAt(900) == MappedEnum::Invalid);
+    REQUIRE(my_map.GetValueAt(1000) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(2500) == MappedEnum::Valid_3);
+    REQUIRE(my_map.GetValueAt(4100) == MappedEnum::Valid_2);
+    REQUIRE(my_map.GetValueAt(5000) == MappedEnum::Invalid);
+
+    my_map.Map(2000, 6000, MappedEnum::Valid_3); // Overwrites everything from 2000 up to 6000
+    REQUIRE(my_map.GetContinousSizeFrom(1000) == 1000);
+    REQUIRE(my_map.GetContinousSizeFrom(3000) == 3000);
+    REQUIRE(my_map.GetValueAt(1000) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(1999) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(1500) == MappedEnum::Valid_1);
+    REQUIRE(my_map.GetValueAt(2001) == MappedEnum::Valid_3);
+    REQUIRE(my_map.GetValueAt(5999) == MappedEnum::Valid_3);
+    REQUIRE(my_map.GetValueAt(6000) == MappedEnum::Invalid);
+}
--- a/src/tests/video_core/buffer_base.cpp
+++ b/src/tests/video_core/buffer_base.cpp
@@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
    int num = 0;
    buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
    buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
-    REQUIRE(num == 0);
+    REQUIRE(num == 1);
    REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
    REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
    buffer.FlushCachedWrites();
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(video_core STATIC
    buffer_cache/buffer_base.h
    buffer_cache/buffer_cache.cpp
    buffer_cache/buffer_cache.h
+    cache_types.h
    cdma_pusher.cpp
    cdma_pusher.h
    compatible_formats.cpp
@@ -84,6 +85,7 @@ add_library(video_core STATIC
    gpu.h
    gpu_thread.cpp
    gpu_thread.h
+    invalidation_accumulator.h
    memory_manager.cpp
    memory_manager.h
    precompiled_headers.h
@@ -189,6 +191,8 @@ add_library(video_core STATIC
    renderer_vulkan/vk_texture_cache.cpp
    renderer_vulkan/vk_texture_cache.h
    renderer_vulkan/vk_texture_cache_base.cpp
+    renderer_vulkan/vk_turbo_mode.cpp
+    renderer_vulkan/vk_turbo_mode.h
    renderer_vulkan/vk_update_descriptor.cpp
    renderer_vulkan/vk_update_descriptor.h
    shader_cache.cpp
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -430,7 +430,7 @@ private:
        if (query_begin >= SizeBytes() || size < 0) {
            return;
        }
-        u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
        u64* const state_words = Array<type>();
        const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
        u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@@ -483,7 +483,7 @@ private:
                NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
            }
            // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
+            const u64 word = current_word;
            u64 page = page_begin;
            page_begin = 0;

@@ -531,7 +531,7 @@ private:
    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);

-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@@ -539,8 +539,7 @@ private:
        const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
        u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
            if (word == 0) {
                continue;
            }
@@ -564,7 +563,7 @@ private:
    [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
        static_assert(type != Type::Untracked);

-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
        const u64* const state_words = Array<type>();
        const u64 num_query_words = size / BYTES_PER_WORD + 1;
        const u64 word_begin = offset / BYTES_PER_WORD;
@@ -574,8 +573,7 @@ private:
        u64 begin = std::numeric_limits<u64>::max();
        u64 end = 0;
        for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
            if (word == 0) {
                continue;
            }
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -200,7 +200,16 @@ public:
    /// Return true when a CPU region is modified from the CPU
    [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);

-    std::mutex mutex;
+    void SetDrawIndirect(
+        const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
+        current_draw_indirect = current_draw_indirect_; // Non-null enables the indirect update/bind paths
+    }
+
+    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount();
+
+    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
+
+    std::recursive_mutex mutex;
    Runtime& runtime;

 private:
@@ -272,6 +281,8 @@ private:

    void BindHostVertexBuffers();

+    void BindHostDrawIndirectBuffers();
+
    void BindHostGraphicsUniformBuffers(size_t stage);

    void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
@@ -298,6 +309,8 @@ private:

    void UpdateVertexBuffer(u32 index);

+    void UpdateDrawIndirect();
+
    void UpdateUniformBuffers(size_t stage);

    void UpdateStorageBuffers(size_t stage);
@@ -372,6 +385,8 @@ private:
    SlotVector<Buffer> slot_buffers;
    DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;

+    const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};
+
    u32 last_index_count = 0;

    Binding index_buffer;
@@ -380,6 +395,8 @@ private:
    std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
    std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
    std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
+    Binding count_buffer_binding;
+    Binding indirect_buffer_binding;

    std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
    std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
@@ -674,6 +691,9 @@ void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
    }
    BindHostVertexBuffers();
    BindHostTransformFeedbackBuffers();
+    if (current_draw_indirect) {
+        BindHostDrawIndirectBuffers();
+    }
 }

 template <class P>
@@ -823,6 +843,7 @@ bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
 template <class P>
 void BufferCache<P>::CommitAsyncFlushesHigh() {
    AccumulateFlushes();
+
    if (committed_ranges.empty()) {
        return;
    }
@@ -869,7 +890,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                                buffer_id,
                            });
                            // Align up to avoid cache conflicts
-                            constexpr u64 align = 256ULL;
+                            constexpr u64 align = 8ULL;
                            constexpr u64 mask = ~(align - 1ULL);
                            total_size_bytes += (new_size + align - 1) & mask;
                            largest_copy = std::max(largest_copy, new_size);
@@ -1041,6 +1062,19 @@ void BufferCache<P>::BindHostVertexBuffers() {
    }
 }

+template <class P>
+void BufferCache<P>::BindHostDrawIndirectBuffers() { // Touches and synchronizes the indirect draw buffers
+    const auto bind_buffer = [this](const Binding& binding) { // Handles a single binding
+        Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer, binding.buffer_id);
+        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
+    };
+    if (current_draw_indirect->include_count) { // Caller only invokes this with a non-null pointer
+        bind_buffer(count_buffer_binding);
+    }
+    bind_buffer(indirect_buffer_binding); // The indirect buffer is processed unconditionally
+}
+
 template <class P>
 void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
    u32 dirty = ~0U;
@@ -1272,6 +1306,9 @@ void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
            UpdateStorageBuffers(stage);
            UpdateTextureBuffers(stage);
        }
+        if (current_draw_indirect) {
+            UpdateDrawIndirect();
+        }
    } while (has_deleted_buffers);
 }

@@ -1289,7 +1326,7 @@ void BufferCache<P>::UpdateIndexBuffer() {
    const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
    const auto& index_array = draw_state.index_buffer;
    auto& flags = maxwell3d->dirty.flags;
-    if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) {
+    if (!flags[Dirty::IndexBuffer]) {
        return;
    }
    flags[Dirty::IndexBuffer] = false;
@@ -1361,6 +1398,27 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
    };
 }

+template <class P>
+void BufferCache<P>::UpdateDrawIndirect() { // Rebuilds the count/indirect bindings from current_draw_indirect
+    const auto update = [this](GPUVAddr gpu_addr, size_t size, Binding& binding) { // One binding
+        const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+        if (!cpu_addr) {
+            binding = NULL_BINDING; // GPU address has no CPU translation
+            return;
+        }
+        binding = Binding{
+            .cpu_addr = *cpu_addr,
+            .size = static_cast<u32>(size), // size_t narrowed to u32
+            .buffer_id = FindBuffer(*cpu_addr, static_cast<u32>(size)),
+        };
+    };
+    if (current_draw_indirect->include_count) { // Count binding is left untouched otherwise
+        update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding);
+    }
+    update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size,
+           indirect_buffer_binding);
+}
+
 template <class P>
 void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
    ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
@@ -1880,14 +1938,21 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
                                                                      bool is_written) const {
    const GPUVAddr gpu_addr = gpu_memory->Read<u64>(ssbo_addr);
    const u32 size = gpu_memory->Read<u32>(ssbo_addr + 8);
-    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+    const u32 alignment = runtime.GetStorageBufferAlignment();
+
+    const GPUVAddr aligned_gpu_addr = Common::AlignDown(gpu_addr, alignment);
+    const u32 aligned_size =
+        Common::AlignUp(static_cast<u32>(gpu_addr - aligned_gpu_addr) + size, alignment);
+
+    const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(aligned_gpu_addr);
    if (!cpu_addr || size == 0) {
        return NULL_BINDING;
    }
-    const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE);
+
+    const VAddr cpu_end = Common::AlignUp(*cpu_addr + aligned_size, Core::Memory::YUZU_PAGESIZE);
    const Binding binding{
        .cpu_addr = *cpu_addr,
-        .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr),
+        .size = is_written ? aligned_size : static_cast<u32>(cpu_end - *cpu_addr),
        .buffer_id = BufferId{},
    };
    return binding;
@@ -1941,4 +2006,16 @@ bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index)
    }
 }

+template <class P>
+std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectCount() {
+    auto& buffer = slot_buffers[count_buffer_binding.buffer_id]; // NOTE(review): binding only refreshed when include_count is set
+    return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr)); // {buffer, Offset() of binding address}
+}
+
+template <class P>
+std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectBuffer() {
+    auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id]; // Binding is set by UpdateDrawIndirect
+    return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr)); // {buffer, Offset() of binding address}
+}
+
 } // namespace VideoCommon
--- a/src/video_core/cache_types.h
+++ b/src/video_core/cache_types.h
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+enum class CacheType : u32 { // One bit per cache, plus named combinations
+    None = 0,
+    TextureCache = 1 << 0,
+    QueryCache = 1 << 1,
+    BufferCache = 1 << 2,
+    ShaderCache = 1 << 3,
+    NoTextureCache = QueryCache | BufferCache | ShaderCache, // Every cache except TextureCache
+    NoBufferCache = TextureCache | QueryCache | ShaderCache, // Every cache except BufferCache
+    NoQueryCache = TextureCache | BufferCache | ShaderCache, // Every cache except QueryCache
+    All = TextureCache | QueryCache | BufferCache | ShaderCache, // All four caches
+};
+DECLARE_ENUM_FLAG_OPERATORS(CacheType) // Presumably defines the bitwise operators; verify in common/common_funcs.h
+
+} // namespace VideoCommon
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -61,7 +61,7 @@ bool DmaPusher::Step() {
    } else {
        const CommandListHeader command_list_header{
            command_list.command_lists[dma_pushbuffer_subindex++]};
-        const GPUVAddr dma_get = command_list_header.addr;
+        dma_state.dma_get = command_list_header.addr;

        if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
            // We've gone through the current list, remove it from the queue
@@ -75,12 +75,22 @@ bool DmaPusher::Step() {

        // Push buffer non-empty, read a word
        command_headers.resize_destructive(command_list_header.size);
-        if (Settings::IsGPULevelHigh()) {
-            memory_manager.ReadBlock(dma_get, command_headers.data(),
-                                     command_list_header.size * sizeof(u32));
+        constexpr u32 MacroRegistersStart = 0xE00;
+        if (dma_state.method < MacroRegistersStart) {
+            if (Settings::IsGPULevelHigh()) {
+                memory_manager.ReadBlock(dma_state.dma_get, command_headers.data(),
+                                         command_list_header.size * sizeof(u32));
+            } else {
+                memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(),
+                                               command_list_header.size * sizeof(u32));
+            }
        } else {
-            memory_manager.ReadBlockUnsafe(dma_get, command_headers.data(),
-                                           command_list_header.size * sizeof(u32));
+            const size_t copy_size = command_list_header.size * sizeof(u32);
+            if (subchannels[dma_state.subchannel]) {
+                subchannels[dma_state.subchannel]->current_dirty =
+                    memory_manager.IsMemoryDirty(dma_state.dma_get, copy_size);
+            }
+            memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(), copy_size);
        }
        ProcessCommands(command_headers);
    }
@@ -94,6 +104,7 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {

        if (dma_state.method_count) {
            // Data word of methods command
+            dma_state.dma_word_offset = static_cast<u32>(index * sizeof(u32));
            if (dma_state.non_incrementing) {
                const u32 max_write = static_cast<u32>(
                    std::min<std::size_t>(index + dma_state.method_count, commands.size()) - index);
@@ -132,6 +143,8 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
            case SubmissionMode::Inline:
                dma_state.method = command_header.method;
                dma_state.subchannel = command_header.subchannel;
+                dma_state.dma_word_offset = static_cast<u64>(
+                    -static_cast<s64>(dma_state.dma_get)); // negate to set address as 0
                CallMethod(command_header.arg_count);
                dma_state.non_incrementing = true;
                dma_increment_once = false;
@@ -164,8 +177,14 @@ void DmaPusher::CallMethod(u32 argument) const {
            dma_state.method_count,
        });
    } else {
-        subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument,
-                                                      dma_state.is_last_call);
+        auto subchannel = subchannels[dma_state.subchannel];
+        if (!subchannel->execution_mask[dma_state.method]) [[likely]] {
+            subchannel->method_sink.emplace_back(dma_state.method, argument);
+            return;
+        }
+        subchannel->ConsumeSink();
+        subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
+        subchannel->CallMethod(dma_state.method, argument, dma_state.is_last_call);
    }
 }

@@ -174,8 +193,11 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
        puller.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods,
                               dma_state.method_count);
    } else {
-        subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start,
-                                                           num_methods, dma_state.method_count);
+        auto subchannel = subchannels[dma_state.subchannel];
+        subchannel->ConsumeSink();
+        subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
+        subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
+                                    dma_state.method_count);
    }
 }

--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -156,6 +156,8 @@ private:
        u32 subchannel;        ///< Current subchannel
        u32 method_count;      ///< Current method count
        u32 length_pending;    ///< Large NI command length pending
+        GPUVAddr dma_get;      ///< Currently read segment
+        u64 dma_word_offset;   ///< Current word offset from address
        bool non_incrementing; ///< Current command's NI flag
        bool is_last_call;
    };
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -91,6 +91,23 @@ void DrawManager::DrawIndex(PrimitiveTopology topology, u32 index_first, u32 ind
    ProcessDraw(true, num_instances);
 }

+void DrawManager::DrawArrayIndirect(PrimitiveTopology topology) {
+    draw_state.topology = topology; // Only the topology is recorded before dispatching
+
+    ProcessDrawIndirect();
+}
+
+void DrawManager::DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first,
+                                      u32 index_count) {
+    const auto& regs{maxwell3d->regs};
+    draw_state.topology = topology;
+    draw_state.index_buffer = regs.index_buffer; // Start from the current register state
+    draw_state.index_buffer.first = index_first; // Then override first/count with the arguments
+    draw_state.index_buffer.count = index_count;
+
+    ProcessDrawIndirect();
+}
+
 void DrawManager::SetInlineIndexBuffer(u32 index) {
    draw_state.inline_index_draw_indexes.push_back(static_cast<u8>(index & 0x000000ff));
    draw_state.inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x0000ff00) >> 8));
@@ -198,4 +215,18 @@ void DrawManager::ProcessDraw(bool draw_indexed, u32 instance_count) {
        maxwell3d->rasterizer->Draw(draw_indexed, instance_count);
    }
 }
+
+void DrawManager::ProcessDrawIndirect() { // Logs the indirect state, updates topology, then draws
+    LOG_TRACE(
+        HW_GPU,
+        "called, topology={}, is_indexed={}, includes_count={}, buffer_size={}, max_draw_count={}",
+        draw_state.topology, indirect_state.is_indexed, indirect_state.include_count,
+        indirect_state.buffer_size, indirect_state.max_draw_counts);
+
+    UpdateTopology();
+
+    if (maxwell3d->ShouldExecute()) { // The draw is skipped when ShouldExecute() is false
+        maxwell3d->rasterizer->DrawIndirect();
+    }
+}
 } // namespace Tegra::Engines
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -32,6 +32,16 @@ public:
        std::vector<u8> inline_index_draw_indexes;
    };

+    /// Parameters of an indirect draw. Returned by GetIndirectParams() and logged by
+    /// ProcessDrawIndirect(). The per-field notes are inferred from the names only; confirm
+    /// them against the code that fills the struct in.
+    struct IndirectParams {
+        bool is_indexed;                 ///< Presumably: the draw uses the index buffer
+        bool include_count;              ///< Presumably: the draw count is read from memory
+        GPUVAddr count_start_address;    ///< Presumably: GPU address of that draw count
+        GPUVAddr indirect_start_address; ///< Presumably: GPU address of the indirect commands
+        size_t buffer_size;              ///< Presumably: size of the indirect buffer (unit TBD)
+        size_t max_draw_counts;          ///< Presumably: upper bound on the number of draws
+        size_t stride;                   ///< Presumably: distance between indirect commands
+    };
+
    explicit DrawManager(Maxwell3D* maxwell_3d);

    void ProcessMethodCall(u32 method, u32 argument);
@@ -46,10 +56,22 @@ public:
    void DrawIndex(PrimitiveTopology topology, u32 index_first, u32 index_count, u32 base_index,
                   u32 base_instance, u32 num_instances);

+    void DrawArrayIndirect(PrimitiveTopology topology);
+
+    void DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first, u32 index_count);
+
    const State& GetDrawState() const {
        return draw_state;
    }

+    /// Mutable access to the indirect draw parameters (presumably so callers can fill them in
+    /// before issuing DrawArrayIndirect()/DrawIndexedIndirect() - confirm against callers).
+    IndirectParams& GetIndirectParams() {
+        return indirect_state;
+    }
+
+    /// Read-only access to the indirect draw parameters.
+    const IndirectParams& GetIndirectParams() const {
+        return indirect_state;
+    }
+
 private:
    void SetInlineIndexBuffer(u32 index);

@@ -63,7 +85,10 @@ private:

    void ProcessDraw(bool draw_indexed, u32 instance_count);

+    void ProcessDrawIndirect();
+
    Maxwell3D* maxwell3d{};
    State draw_state{};
+    IndirectParams indirect_state{};
 };
 } // namespace Tegra::Engines
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -3,6 +3,10 @@

 #pragma once

+#include <bitset>
+#include <limits>
+#include <vector>
+
 #include "common/common_types.h"

 namespace Tegra::Engines {
@@ -17,6 +21,26 @@ public:
    /// Write multiple values to the register identified by method.
    virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                 u32 methods_pending) = 0;
+
+    /// Applies the methods queued in method_sink through ConsumeSinkImpl().
+    /// An empty sink is a no-op, so the virtual call only happens when there is work to do.
+    void ConsumeSink() {
+        if (!method_sink.empty()) {
+            ConsumeSinkImpl();
+        }
+    }
+
+    /// One bit per method index; each engine sets the bits of its "executable" methods
+    /// (Maxwell3D fills it from IsMethodExecutable()).
+    std::bitset<std::numeric_limits<u16>::max()> execution_mask{};
+    /// Queued (method, value) pairs waiting to be applied by ConsumeSink().
+    std::vector<std::pair<u32, u32>> method_sink{};
+    /// Dirty flag that Maxwell3D::ProcessMacro folds into its macro dirty state and clears.
+    bool current_dirty{};
+    /// GPU address used by Maxwell3D::ProcessMacro as the base of the incoming parameters.
+    /// Value-initialized like its siblings so it is never read while indeterminate.
+    GPUVAddr current_dma_segment{};
+
+protected:
+    /// Default sink handler: replays every queued (method, value) pair through CallMethod()
+    /// with is_last_call set to true, then empties the sink. Engines may override this.
+    virtual void ConsumeSinkImpl() {
+        for (const std::pair<u32, u32> entry : method_sink) {
+            CallMethod(entry.first, entry.second, true);
+        }
+        method_sink.clear();
+    }
 };

 } // namespace Tegra::Engines
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
                                       regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
                                       x_elements, regs.line_count, regs.dest.BlockHeight(),
                                       regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
    }
 }

--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
 #include "video_core/textures/decoders.h"
@@ -20,11 +21,14 @@ namespace Tegra::Engines {

 using namespace Texture;

-Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
-    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
    // Nvidia's OpenGL driver seems to assume these values
    regs.src.depth = 1;
    regs.dst.depth = 1;
+
+    // Start from an empty execution mask and flag a single method: the register slot
+    // immediately after pixels_from_memory.src_y0.
+    execution_mask.reset();
+    execution_mask[FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1] = true;
 }

 Fermi2D::~Fermi2D() = default;
@@ -49,6 +53,13 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
    }
 }

+/// Applies the queued (method, value) pairs by writing each value straight into the register
+/// array (no CallMethod dispatch), then empties the sink.
+void Fermi2D::ConsumeSinkImpl() {
+    for (const auto& entry : method_sink) {
+        regs.reg_array[entry.first] = entry.second;
+    }
+    method_sink.clear();
+}
+
 void Fermi2D::Blit() {
    MICROPROFILE_SCOPE(GPU_BlitEngine);
    LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
@@ -94,6 +105,7 @@ void Fermi2D::Blit() {
        config.src_x0 = 0;
    }

+    memory_manager.FlushCaching();
    if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
        sw_blitter->Blit(src, regs.dst, config);
    }
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -305,10 +305,13 @@ public:
 private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;
    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
+    MemoryManager& memory_manager;

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
    void Blit();
+
+    void ConsumeSinkImpl() override;
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -14,7 +14,12 @@
 namespace Tegra::Engines {

 KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_)
-    : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {}
+    : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {
+    // Only exec_upload, data_upload and launch are flagged in the execution mask; every
+    // other bit stays cleared.
+    execution_mask.reset();
+    execution_mask[KEPLER_COMPUTE_REG_INDEX(exec_upload)] = true;
+    execution_mask[KEPLER_COMPUTE_REG_INDEX(data_upload)] = true;
+    execution_mask[KEPLER_COMPUTE_REG_INDEX(launch)] = true;
+}

 KeplerCompute::~KeplerCompute() = default;

@@ -23,6 +28,13 @@ void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_)
    upload_state.BindRasterizer(rasterizer);
 }

+/// Stores every queued (method, value) pair directly in the register array, bypassing
+/// CallMethod(), and then clears the sink.
+void KeplerCompute::ConsumeSinkImpl() {
+    for (const auto& queued : method_sink) {
+        const u32 reg_index = queued.first;
+        regs.reg_array[reg_index] = queued.second;
+    }
+    method_sink.clear();
+}
+
 void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid KeplerCompute register, increase the size of the Regs structure");
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -204,6 +204,8 @@ public:
 private:
    void ProcessLaunch();

+    void ConsumeSinkImpl() override;
+
    /// Retrieves information about a specific TIC entry from the TIC buffer.
    Texture::TICEntry GetTICEntry(u32 tic_index) const;

--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -18,6 +18,17 @@ KeplerMemory::~KeplerMemory() = default;

 void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
    upload_state.BindRasterizer(rasterizer_);
+
+    // Rebuild the execution mask with only the exec and data methods flagged.
+    // NOTE(review): the other engines in this change set up their mask in the constructor;
+    // here it is only populated once a rasterizer is bound - confirm this is intended.
+    execution_mask.reset();
+    execution_mask[KEPLERMEMORY_REG_INDEX(exec)] = true;
+    execution_mask[KEPLERMEMORY_REG_INDEX(data)] = true;
+}
+
+/// Writes each queued (method, value) pair into the register array without going through
+/// CallMethod(), then empties the sink.
+void KeplerMemory::ConsumeSinkImpl() {
+    for (const auto& pending : method_sink) {
+        regs.reg_array[pending.first] = pending.second;
+    }
+    method_sink.clear();
 }

 void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -73,6 +73,8 @@ public:
    } regs{};

 private:
+    void ConsumeSinkImpl() override;
+
    Core::System& system;
    Upload::State upload_state;
 };
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,6 +4,8 @@
 #include <cstring>
 #include <optional>
 #include "common/assert.h"
+#include "common/scope_exit.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "video_core/dirty_flags.h"
@@ -28,6 +30,10 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
                                                                                regs.upload} {
    dirty.flags.flip();
    InitializeRegisterDefaults();
+    execution_mask.reset();
+    for (size_t i = 0; i < execution_mask.size(); i++) {
+        execution_mask[i] = IsMethodExecutable(static_cast<u32>(i));
+    }
 }

 Maxwell3D::~Maxwell3D() = default;
@@ -121,6 +127,71 @@ void Maxwell3D::InitializeRegisterDefaults() {
    shadow_state = regs;
 }

+/// Classifies a method index as "executable"; the constructor uses the result to fill
+/// execution_mask.
+///
+/// Every method at or above MacroRegistersStart qualifies, as do the sixteen
+/// const_buffer.buffer words and the individual registers listed in the switch below.
+/// All other methods return false.
+bool Maxwell3D::IsMethodExecutable(u32 method) {
+    if (method >= MacroRegistersStart) {
+        return true;
+    }
+    // const_buffer.buffer + 0 .. + 15 form one contiguous range.
+    constexpr auto cb_data_first = MAXWELL3D_REG_INDEX(const_buffer.buffer);
+    constexpr auto cb_data_last = cb_data_first + 15;
+    if (method >= cb_data_first && method <= cb_data_last) {
+        return true;
+    }
+    switch (method) {
+    case MAXWELL3D_REG_INDEX(draw.end):
+    case MAXWELL3D_REG_INDEX(draw.begin):
+    case MAXWELL3D_REG_INDEX(vertex_buffer.first):
+    case MAXWELL3D_REG_INDEX(vertex_buffer.count):
+    case MAXWELL3D_REG_INDEX(index_buffer.first):
+    case MAXWELL3D_REG_INDEX(index_buffer.count):
+    case MAXWELL3D_REG_INDEX(draw_inline_index):
+    case MAXWELL3D_REG_INDEX(index_buffer32_subsequent):
+    case MAXWELL3D_REG_INDEX(index_buffer16_subsequent):
+    case MAXWELL3D_REG_INDEX(index_buffer8_subsequent):
+    case MAXWELL3D_REG_INDEX(index_buffer32_first):
+    case MAXWELL3D_REG_INDEX(index_buffer16_first):
+    case MAXWELL3D_REG_INDEX(index_buffer8_first):
+    case MAXWELL3D_REG_INDEX(inline_index_2x16.even):
+    case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
+    case MAXWELL3D_REG_INDEX(vertex_array_instance_first):
+    case MAXWELL3D_REG_INDEX(vertex_array_instance_subsequent):
+    case MAXWELL3D_REG_INDEX(wait_for_idle):
+    case MAXWELL3D_REG_INDEX(shadow_ram_control):
+    case MAXWELL3D_REG_INDEX(load_mme.instruction_ptr):
+    case MAXWELL3D_REG_INDEX(load_mme.instruction):
+    case MAXWELL3D_REG_INDEX(load_mme.start_address):
+    case MAXWELL3D_REG_INDEX(falcon[4]):
+    case MAXWELL3D_REG_INDEX(bind_groups[0].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[1].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[2].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[3].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[4].raw_config):
+    case MAXWELL3D_REG_INDEX(topology_override):
+    case MAXWELL3D_REG_INDEX(clear_surface):
+    case MAXWELL3D_REG_INDEX(report_semaphore.query):
+    case MAXWELL3D_REG_INDEX(render_enable.mode):
+    case MAXWELL3D_REG_INDEX(clear_report_value):
+    case MAXWELL3D_REG_INDEX(sync_info):
+    case MAXWELL3D_REG_INDEX(launch_dma):
+    case MAXWELL3D_REG_INDEX(inline_data):
+    case MAXWELL3D_REG_INDEX(fragment_barrier):
+    case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
+        return true;
+    default:
+        return false;
+    }
+}
+
 void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
    if (executing_macro == 0) {
        // A macro call must begin by writing the macro method's register, not its argument.
@@ -130,14 +201,72 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool
    }

    macro_params.insert(macro_params.end(), base_start, base_start + amount);
+    for (size_t i = 0; i < amount; i++) {
+        macro_addresses.push_back(current_dma_segment + i * sizeof(u32));
+    }
+    macro_segments.emplace_back(current_dma_segment, amount);
+    current_macro_dirty |= current_dirty;
+    current_dirty = false;

    // Call the macro when there are no more parameters in the command buffer
    if (is_last_call) {
+        ConsumeSink();
        CallMacroMethod(executing_macro, macro_params);
        macro_params.clear();
+        macro_addresses.clear();
+        macro_segments.clear();
+        current_macro_dirty = false;
    }
 }

+/// Re-reads the macro parameters from guest memory, one recorded segment at a time.
+/// Segments recorded with address 0 are not re-read; their slots in macro_params are
+/// skipped over unchanged.
+void Maxwell3D::RefreshParametersImpl() {
+    size_t param_offset = 0;
+    for (const auto& [segment_address, word_count] : macro_segments) {
+        if (segment_address != 0) {
+            memory_manager.ReadBlock(segment_address, &macro_params[param_offset],
+                                     sizeof(u32) * word_count);
+        }
+        param_offset += word_count;
+    }
+}
+
+/// Estimates an upper bound on the number of vertices reachable through the enabled vertex
+/// streams.
+///
+/// For every enabled stream the byte span from array.Address() up to and including
+/// limit.Address() is divided by the larger of the attribute size and the stream stride.
+/// A constant attribute counts as a single vertex. Returns the largest per-stream result,
+/// or 0 when no stream contributes.
+u32 Maxwell3D::GetMaxCurrentVertices() {
+    u32 num_vertices = 0;
+    for (size_t index = 0; index < Regs::NumVertexArrays; ++index) {
+        const auto& array = regs.vertex_streams[index];
+        if (array.enable == 0) {
+            continue;
+        }
+        const auto& attribute = regs.vertex_attrib_format[index];
+        if (attribute.constant) {
+            num_vertices = std::max(num_vertices, 1U);
+            continue;
+        }
+        const auto bytes_per_vertex = std::max(attribute.SizeInBytes(), array.stride.Value());
+        if (bytes_per_vertex == 0) {
+            // A zero stride combined with a zero-sized attribute would make the division
+            // below divide by zero; such a stream cannot contribute any vertices.
+            continue;
+        }
+        const auto& limit = regs.vertex_stream_limits[index];
+        const GPUVAddr gpu_addr_begin = array.Address();
+        const GPUVAddr gpu_addr_end = limit.Address() + 1;
+        const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
+        num_vertices = std::max(num_vertices, address_size / bytes_per_vertex);
+    }
+    return num_vertices;
+}
+
+/// Estimates the size of the current index buffer.
+///
+/// Returns the smaller of
+///  - memory_manager.GetMemoryLayoutSize(start, byte_size * largest index value of the
+///    format) divided by the index width, and
+///  - the distance between the index buffer's start and end addresses.
+size_t Maxwell3D::EstimateIndexBufferSize() {
+    GPUVAddr start_address = regs.index_buffer.StartAddress();
+    GPUVAddr end_address = regs.index_buffer.EndAddress();
+    // Largest index value per index width, keyed by log2(bytes per index): u8, u16, u32, u32.
+    constexpr std::array<size_t, 4> max_sizes = {
+        std::numeric_limits<u8>::max(), std::numeric_limits<u16>::max(),
+        std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()};
+    const size_t byte_size = regs.index_buffer.FormatSizeInBytes();
+    // Convert the width in bytes to its log2 before indexing the table. Using the raw byte
+    // count selected the wrong limit for 1- and 2-byte indices and, assuming
+    // FormatSizeInBytes() can return 4, read past the end of the 4-entry array.
+    size_t log2_byte_size = 0;
+    while (log2_byte_size + 1 < max_sizes.size() && (byte_size >> (log2_byte_size + 1)) != 0) {
+        ++log2_byte_size;
+    }
+    return std::min<size_t>(
+        memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[log2_byte_size]) /
+            byte_size,
+        static_cast<size_t>(end_address - start_address));
+}
+
 u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
    // Keep track of the register value in shadow_state when requested.
    const auto control = shadow_state.shadow_ram_control;
@@ -152,6 +281,29 @@ u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
    return argument;
 }

+/// Applies the queued (method, value) pairs according to the shadow RAM control mode that is
+/// active when the function is entered:
+///  - Track / TrackWithFilter: store the value in shadow_state, then process it.
+///  - Replay: ignore the queued value and process the one held in shadow_state.
+///  - anything else: process the queued value as-is.
+/// The sink is emptied on every exit path via SCOPE_EXIT.
+void Maxwell3D::ConsumeSinkImpl() {
+    SCOPE_EXIT({ method_sink.clear(); });
+    const auto control = shadow_state.shadow_ram_control;
+    switch (control) {
+    case Regs::ShadowRamControl::Track:
+    case Regs::ShadowRamControl::TrackWithFilter:
+        for (auto [method, value] : method_sink) {
+            shadow_state.reg_array[method] = value;
+            ProcessDirtyRegisters(method, value);
+        }
+        break;
+    case Regs::ShadowRamControl::Replay:
+        for (auto [method, value] : method_sink) {
+            ProcessDirtyRegisters(method, shadow_state.reg_array[method]);
+        }
+        break;
+    default:
+        for (auto [method, value] : method_sink) {
+            ProcessDirtyRegisters(method, value);
+        }
+        break;
+    }
+}
+
 void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
    if (regs.reg_array[method] == argument) {
        return;
@@ -263,7 +415,6 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {

    const u32 argument = ProcessShadowRam(method, method_argument);
    ProcessDirtyRegisters(method, argument);
-
    ProcessMethodCall(method, argument, method_argument, is_last_call);
 }

@@ -294,9 +445,11 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 15:
        ProcessCBMultiData(base_start, amount);
        break;
-    case MAXWELL3D_REG_INDEX(inline_data):
+    case MAXWELL3D_REG_INDEX(inline_data): {
+        ASSERT(methods_pending == amount);
        upload_state.ProcessData(base_start, amount);
        return;
+    }
    default:
        for (u32 i = 0; i < amount; i++) {
            CallMethod(method, base_start[i], methods_pending - i <= 1);
@@ -332,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }

 void Maxwell3D::ProcessQueryGet() {
-    // TODO(Subv): Support the other query units.
-    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
-        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
-    }
-
    switch (regs.report_semaphore.query.operation) {
    case Regs::ReportSemaphore::Operation::Release:
        if (regs.report_semaphore.query.short_query != 0) {
@@ -389,7 +537,11 @@ void Maxwell3D::ProcessQueryCondition() {
    case Regs::RenderEnable::Override::NeverRender:
        execute_on = false;
        break;
-    case Regs::RenderEnable::Override::UseRenderEnable:
+    case Regs::RenderEnable::Override::UseRenderEnable: {
+        if (rasterizer->AccelerateConditionalRendering()) {
+            execute_on = true;
+            return;
+        }
        switch (regs.render_enable.mode) {
        case Regs::RenderEnable::Mode::True: {
            execute_on = true;
@@ -427,6 +579,7 @@ void Maxwell3D::ProcessQueryCondition() {
        }
        break;
    }
+    }
 }

 void Maxwell3D::ProcessCounterReset() {
@@ -463,7 +616,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
 }

 void Maxwell3D::ProcessCBBind(size_t stage_index) {
-    // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
+    // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader
+    // stage.
    const auto& bind_data = regs.bind_groups[stage_index];
    auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.shader_slot];
    buffer.enabled = bind_data.valid.Value() != 0;
@@ -490,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {

    const GPUVAddr address{buffer_address + regs.const_buffer.offset};
    const size_t copy_size = amount * sizeof(u32);
-    memory_manager.WriteBlock(address, start_base, copy_size);
+    memory_manager.WriteBlockCached(address, start_base, copy_size);

    // Increment the current buffer position.
    regs.const_buffer.offset += static_cast<u32>(copy_size);
@@ -524,4 +678,10 @@ u32 Maxwell3D::GetRegisterValue(u32 method) const {
    return regs.reg_array[method];
 }

+/// Records the HLE replacement attribute type for a (bank, offset) pair in replace_table.
+/// The key packs `bank` into the upper 32 bits and `offset` into the lower 32 bits.
+/// emplace() is used, so an entry that already exists for the key is left untouched.
+void Maxwell3D::SetHLEReplacementAttributeType(u32 bank, u32 offset,
+                                               HLEReplacementAttributeType name) {
+    u64 key = bank;
+    key <<= 32;
+    key |= offset;
+    replace_table.emplace(key, name);
+}
+
 } // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -272,6 +272,7 @@ public:
            };

            union {
+                u32 raw;
                BitField<0, 1, Mode> mode;
                BitField<4, 8, u32> pad;
            };
@@ -1217,10 +1218,12 @@ public:

        struct Window {
            union {
+                u32 raw_x;
                BitField<0, 16, u32> x_min;
                BitField<16, 16, u32> x_max;
            };
            union {
+                u32 raw_y;
                BitField<0, 16, u32> y_min;
                BitField<16, 16, u32> y_max;
            };
@@ -2708,7 +2711,7 @@ public:
                u32 post_z_pixel_imask;                                                ///< 0x0F1C
                INSERT_PADDING_BYTES_NOINIT(0x20);
                ConstantColorRendering const_color_rendering;                          ///< 0x0F40
-                s32 stencil_back_ref;                                                  ///< 0x0F54
+                u32 stencil_back_ref;                                                  ///< 0x0F54
                u32 stencil_back_mask;                                                 ///< 0x0F58
                u32 stencil_back_func_mask;                                            ///< 0x0F5C
                INSERT_PADDING_BYTES_NOINIT(0x14);
@@ -2832,9 +2835,9 @@ public:
                Blend blend;                                                           ///< 0x133C
                u32 stencil_enable;                                                    ///< 0x1380
                StencilOp stencil_front_op;                                            ///< 0x1384
-                s32 stencil_front_ref;                                                 ///< 0x1394
-                s32 stencil_front_func_mask;                                           ///< 0x1398
-                s32 stencil_front_mask;                                                ///< 0x139C
+                u32 stencil_front_ref;                                                 ///< 0x1394
+                u32 stencil_front_func_mask;                                           ///< 0x1398
+                u32 stencil_front_mask;                                                ///< 0x139C
                INSERT_PADDING_BYTES_NOINIT(0x4);
                u32 draw_auto_start_byte_count;                                        ///< 0x13A4
                PsSaturate frag_color_clamp;                                           ///< 0x13A8
@@ -3020,6 +3023,24 @@ public:
    /// Store temporary hw register values, used by some calls to restore state after a operation
    Regs shadow_state;

+    /// Hint about what the engine is currently doing. None is the default state;
+    /// OnHLEMacro presumably marks that an HLE macro is running - confirm against the code
+    /// that assigns engine_state.
+    enum class EngineHint : u32 {
+        None = 0x0,
+        OnHLEMacro = 0x1,
+    };
+
+    /// Current engine hint; starts out as EngineHint::None.
+    EngineHint engine_state{EngineHint::None};
+
+    /// Kinds of values stored in replace_table by SetHLEReplacementAttributeType().
+    /// NOTE(review): presumably the draw parameter that replaces a constant buffer read
+    /// during HLE macros - confirm against the consumers of replace_table.
+    enum class HLEReplacementAttributeType : u32 {
+        BaseVertex = 0x0,
+        BaseInstance = 0x1,
+        DrawID = 0x2,
+    };
+
+    void SetHLEReplacementAttributeType(u32 bank, u32 offset, HLEReplacementAttributeType name);
+
+    std::unordered_map<u64, HLEReplacementAttributeType> replace_table;
+
    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");
    static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");

@@ -3067,6 +3088,35 @@ public:
    std::unique_ptr<DrawManager> draw_manager;
    friend class DrawManager;

+    /// Returns the GPU address recorded for macro parameter `index` (ProcessMacro records
+    /// current_dma_segment + i * sizeof(u32) for each parameter). No bounds check is done.
+    GPUVAddr GetMacroAddress(size_t index) const {
+        return macro_addresses[index];
+    }
+
+    /// Re-reads the macro parameters through RefreshParametersImpl(), but only when the
+    /// current macro has been flagged dirty.
+    void RefreshParameters() {
+        if (current_macro_dirty) {
+            RefreshParametersImpl();
+        }
+    }
+
+    /// Returns whether the current macro has been flagged dirty (current_macro_dirty).
+    bool AnyParametersDirty() const {
+        return current_macro_dirty;
+    }
+
+    u32 GetMaxCurrentVertices();
+
+    size_t EstimateIndexBufferSize();
+
+    /// Handles a write to the CLEAR_BUFFERS register.
+    void ProcessClearBuffers(u32 layer_count);
+
+    /// Handles a write to the CB_BIND register.
+    void ProcessCBBind(size_t stage_index);
+
+    /// Handles a write to the CB_DATA[i] register.
+    void ProcessCBData(u32 value);
+    void ProcessCBMultiData(const u32* start_base, u32 amount);
+
 private:
    void InitializeRegisterDefaults();

@@ -3076,6 +3126,8 @@ private:

    void ProcessDirtyRegisters(u32 method, u32 argument);

+    void ConsumeSinkImpl() override;
+
    void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call);

    /// Retrieves information about a specific TIC entry from the TIC buffer.
@@ -3116,16 +3168,13 @@ private:
    /// Handles writes to syncing register.
    void ProcessSyncPoint();

-    /// Handles a write to the CB_DATA[i] register.
-    void ProcessCBData(u32 value);
-    void ProcessCBMultiData(const u32* start_base, u32 amount);
-
-    /// Handles a write to the CB_BIND register.
-    void ProcessCBBind(size_t stage_index);
-
    /// Returns a query's value or an empty object if the value will be deferred through a cache.
    std::optional<u64> GetQueryResult();

+    void RefreshParametersImpl();
+
+    bool IsMethodExecutable(u32 method);
+
    Core::System& system;
    MemoryManager& memory_manager;

@@ -3145,6 +3194,10 @@ private:
    Upload::State upload_state;

    bool execute_on{true};
+
+    std::vector<std::pair<GPUVAddr, size_t>> macro_segments;
+    std::vector<GPUVAddr> macro_addresses;
+    bool current_macro_dirty{};
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -21,7 +21,10 @@ namespace Tegra::Engines {
 using namespace Texture;

 MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
-    : system{system_}, memory_manager{memory_manager_} {}
+    : system{system_}, memory_manager{memory_manager_} {
+    // Only launch_dma is flagged in the execution mask. Its method index is derived from
+    // the register's byte offset inside Regs divided by the register width.
+    execution_mask.reset();
+    execution_mask[offsetof(Regs, launch_dma) / sizeof(u32)] = true;
+}

 MaxwellDMA::~MaxwellDMA() = default;

@@ -29,6 +32,13 @@ void MaxwellDMA::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
    rasterizer = rasterizer_;
 }

+/// Copies each queued (method, value) pair into the register array without dispatching
+/// through CallMethod(), then clears the sink.
+void MaxwellDMA::ConsumeSinkImpl() {
+    for (const auto& write : method_sink) {
+        const u32 reg_index = write.first;
+        const u32 reg_value = write.second;
+        regs.reg_array[reg_index] = reg_value;
+    }
+    method_sink.clear();
+}
+
 void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
    ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");

@@ -59,7 +69,7 @@ void MaxwellDMA::Launch() {
    if (launch.multi_line_enable) {
        const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
        const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+        memory_manager.FlushCaching();
        if (!is_src_pitch && !is_dst_pitch) {
            // If both the source and the destination are in block layout, assert.
            CopyBlockLinearToBlockLinear();
@@ -94,6 +104,7 @@ void MaxwellDMA::Launch() {
                                            reinterpret_cast<u8*>(tmp_buffer.data()),
                                            regs.line_length_in * sizeof(u32));
        } else {
+            memory_manager.FlushCaching();
            const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                       ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@@ -111,8 +122,8 @@ void MaxwellDMA::Launch() {
                    memory_manager.ReadBlockUnsafe(
                        convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                        tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
-                                              tmp_buffer.size());
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
+                                                    tmp_buffer.size());
                }
            } else if (is_src_pitch && !is_dst_pitch) {
                UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@@ -122,7 +133,7 @@ void MaxwellDMA::Launch() {
                for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                    memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
                                                   tmp_buffer.size());
-                    memory_manager.WriteBlock(
+                    memory_manager.WriteBlockCached(
                        convert_linear_2_blocklinear_addr(regs.offset_out + offset),
                        tmp_buffer.data(), tmp_buffer.size());
                }
@@ -131,8 +142,8 @@ void MaxwellDMA::Launch() {
                    std::vector<u8> tmp_buffer(regs.line_length_in);
                    memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                   regs.line_length_in);
-                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
-                                              regs.line_length_in);
+                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
+                                                    regs.line_length_in);
                }
            }
        }
@@ -194,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
                     src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyPitchToBlockLinear() {
@@ -246,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
                   dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                   regs.pitch_in);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::FastCopyBlockLinearToPitch() {
@@ -277,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
                     regs.src_params.block_size.height, regs.src_params.block_size.depth,
                     regs.pitch_out);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@@ -337,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
                   dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                   dst.block_size.height, dst.block_size.depth, pitch);

-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }

 void MaxwellDMA::ReleaseSemaphore() {
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -231,6 +231,8 @@ private:

    void ReleaseSemaphore();

+    void ConsumeSinkImpl() override;
+
    Core::System& system;

    MemoryManager& memory_manager;
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -47,6 +47,7 @@ set(SHADER_FILES
    vulkan_present_scaleforce_fp16.frag
    vulkan_present_scaleforce_fp32.frag
    vulkan_quad_indexed.comp
+    vulkan_turbo_mode.comp
    vulkan_uint8.comp
 )

--- a/src/video_core/host_shaders/vulkan_turbo_mode.comp
+++ b/src/video_core/host_shaders/vulkan_turbo_mode.comp
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 460 core
+
+layout (local_size_x = 16, local_size_y = 8, local_size_z = 1) in;
+
+layout (binding = 0) buffer ThreadData {
+    uint data[];
+};
+
+// One round of 32-bit xorshift using the shift triple (13, 17, 5).
+// Maps 0 to 0; any other input is scrambled into a different 32-bit value.
+uint xorshift32(uint x) {
+    x = x ^ (x << 13);
+    x = x ^ (x >> 17);
+    return x ^ (x << 5);
+}
+
+// Flattens the 2D global invocation ID into a row-major linear index.
+// The row pitch is the dispatch width in invocations: the x work group size times the
+// number of work groups along x. Using the y extents instead made distinct invocations
+// map to the same index.
+uint getGlobalIndex() {
+    return gl_GlobalInvocationID.x + gl_GlobalInvocationID.y * gl_WorkGroupSize.x * gl_NumWorkGroups.x;
+}
+
+// Each invocation derives two pseudo-random slots in `data` from its global index, reads
+// one of them atomically and adds that value plus one to the other.
+void main() {
+    uint myIndex = xorshift32(getGlobalIndex());
+    uint otherIndex = xorshift32(myIndex);
+
+    // atomicAdd with 0 acts as an atomic read of the other slot.
+    uint otherValue = atomicAdd(data[otherIndex % data.length()], 0) + 1;
+    atomicAdd(data[myIndex % data.length()], otherValue);
+}
--- a/src/video_core/invalidation_accumulator.h
+++ b/src/video_core/invalidation_accumulator.h
@@ -0,0 +1,94 @@
+// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+/// Collects address ranges and merges back-to-back ones, so that a run of contiguous writes is
+/// reported as a single range. Every range is widened to 32-byte boundaries before merging.
+class InvalidationAccumulator {
+public:
+    InvalidationAccumulator() = default;
+    ~InvalidationAccumulator() = default;
+
+    /// Records [address, address + size). A range that starts exactly where the pending one
+    /// ends extends it; any other range closes the pending one and starts a new one.
+    void Add(GPUVAddr address, size_t size) {
+        const auto reset_values = [&]() {
+            if (has_collected) {
+                buffer.emplace_back(start_address, accumulated_size);
+            }
+            start_address = address;
+            accumulated_size = size;
+            last_collection = start_address + size;
+        };
+        // Fast path: the request already lies inside the pending range.
+        if (address >= start_address && address + size <= last_collection) [[likely]] {
+            return;
+        }
+        // Widen the request to the atomicity granularity. The size has to be measured from the
+        // aligned start: measuring it from the unaligned address would leave the tail of the
+        // request uncovered once the start is rounded down.
+        const GPUVAddr aligned_end = (address + size + atomicity_size_mask) & atomicity_mask;
+        address = address & atomicity_mask;
+        size = aligned_end - address;
+        if (!has_collected) [[unlikely]] {
+            reset_values();
+            has_collected = true;
+            return;
+        }
+        if (address != last_collection) [[unlikely]] {
+            reset_values();
+            return;
+        }
+        accumulated_size += size;
+        last_collection += size;
+    }
+
+    /// Drops every recorded range and returns to the empty state.
+    void Clear() {
+        buffer.clear();
+        start_address = 0;
+        last_collection = 0;
+        accumulated_size = 0;
+        has_collected = false;
+    }
+
+    /// Reports whether any range is currently held.
+    bool AnyAccumulated() const {
+        return has_collected;
+    }
+
+    /// Invokes func(address, size) for every closed range in insertion order, then for the
+    /// pending range. The accumulator is left unchanged, so calling this again without Clear()
+    /// reports the same ranges instead of duplicating the pending one.
+    template <typename Func>
+    void Callback(Func&& func) {
+        if (!has_collected) {
+            return;
+        }
+        for (auto& [address, size] : buffer) {
+            func(address, size);
+        }
+        func(start_address, accumulated_size);
+    }
+
+private:
+    static constexpr size_t atomicity_bits = 5;
+    static constexpr size_t atomicity_size = 1ULL << atomicity_bits;
+    static constexpr size_t atomicity_size_mask = atomicity_size - 1;
+    static constexpr size_t atomicity_mask = ~atomicity_size_mask;
+    GPUVAddr start_address{};   // Aligned start of the pending range.
+    GPUVAddr last_collection{}; // One past the end of the pending range.
+    size_t accumulated_size{};  // Size in bytes of the pending range.
+    bool has_collected{};
+    std::vector<std::pair<GPUVAddr, size_t>> buffer; // Closed ranges awaiting Callback().
+};
+
+} // namespace VideoCommon
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -12,7 +12,9 @@
 #include "common/assert.h"
 #include "common/fs/fs.h"
 #include "common/fs/path_util.h"
+#include "common/microprofile.h"
 #include "common/settings.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro.h"
 #include "video_core/macro/macro_hle.h"
 #include "video_core/macro/macro_interpreter.h"
@@ -21,6 +23,8 @@
 #include "video_core/macro/macro_jit_x64.h"
 #endif

+MICROPROFILE_DEFINE(MacroHLE, "GPU", "Execute macro HLE", MP_RGB(128, 192, 192));
+
 namespace Tegra {

 static void Dump(u64 hash, std::span<const u32> code) {
@@ -40,8 +44,8 @@ static void Dump(u64 hash, std::span<const u32> code) {
    macro_file.write(reinterpret_cast<const char*>(code.data()), code.size_bytes());
 }

-MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
-    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d_)
+    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d_)}, maxwell3d{maxwell3d_} {}

 MacroEngine::~MacroEngine() = default;

@@ -59,8 +63,10 @@ void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
    if (compiled_macro != macro_cache.end()) {
        const auto& cache_info = compiled_macro->second;
        if (cache_info.has_hle_program) {
+            MICROPROFILE_SCOPE(MacroHLE);
            cache_info.hle_program->Execute(parameters, method);
        } else {
+            maxwell3d.RefreshParameters();
            cache_info.lle_program->Execute(parameters, method);
        }
    } else {
@@ -101,12 +107,15 @@ void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
            }
        }

-        if (auto hle_program = hle_macros->GetHLEProgram(cache_info.hash)) {
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (!hle_program || Settings::values.disable_macro_hle) {
+            maxwell3d.RefreshParameters();
+            cache_info.lle_program->Execute(parameters, method);
+        } else {
            cache_info.has_hle_program = true;
            cache_info.hle_program = std::move(hle_program);
+            MICROPROFILE_SCOPE(MacroHLE);
            cache_info.hle_program->Execute(parameters, method);
-        } else {
-            cache_info.lle_program->Execute(parameters, method);
        }
    }
 }
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -137,6 +137,7 @@ private:
    std::unordered_map<u32, CacheInfo> macro_cache;
    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
    std::unique_ptr<HLEMacro> hle_macros;
+    Engines::Maxwell3D& maxwell3d;
 };

 std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -1,143 +1,551 @@
-// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later

 #include <array>
 #include <vector>
+#include "common/assert.h"
 #include "common/scope_exit.h"
 #include "video_core/dirty_flags.h"
 #include "video_core/engines/draw_manager.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro.h"
 #include "video_core/macro/macro_hle.h"
+#include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"

 namespace Tegra {
+
+using Maxwell3D = Engines::Maxwell3D;
+
 namespace {

-using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
-
-// HLE'd functions
-void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
-    maxwell3d.draw_manager->DrawIndex(
-        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0x3ffffff),
-        parameters[4], parameters[1], parameters[3], parameters[5], instance_count);
-}
-
-void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
-    maxwell3d.draw_manager->DrawArray(
-        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
-        parameters[3], parameters[1], parameters[4], instance_count);
-}
-
-void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
-    const u32 element_base = parameters[4];
-    const u32 base_instance = parameters[5];
-    maxwell3d.regs.vertex_id_base = element_base;
-    maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-    maxwell3d.CallMethod(0x8e3, 0x640, true);
-    maxwell3d.CallMethod(0x8e4, element_base, true);
-    maxwell3d.CallMethod(0x8e5, base_instance, true);
-
-    maxwell3d.draw_manager->DrawIndex(
-        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
-        parameters[3], parameters[1], element_base, base_instance, instance_count);
-
-    maxwell3d.regs.vertex_id_base = 0x0;
-    maxwell3d.CallMethod(0x8e3, 0x640, true);
-    maxwell3d.CallMethod(0x8e4, 0x0, true);
-    maxwell3d.CallMethod(0x8e5, 0x0, true);
-}
-
-// Multidraw Indirect
-void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    SCOPE_EXIT({
-        // Clean everything.
-        maxwell3d.regs.vertex_id_base = 0x0;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, 0x0, true);
-        maxwell3d.CallMethod(0x8e5, 0x0, true);
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-    });
-    const u32 start_indirect = parameters[0];
-    const u32 end_indirect = parameters[1];
-    if (start_indirect >= end_indirect) {
-        // Nothing to do.
-        return;
-    }
-    const u32 padding = parameters[3];
-    const std::size_t max_draws = parameters[4];
-
-    const u32 indirect_words = 5 + padding;
-    const std::size_t first_draw = start_indirect;
-    const std::size_t effective_draws = end_indirect - start_indirect;
-    const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws);
-
-    for (std::size_t index = first_draw; index < last_draw; index++) {
-        const std::size_t base = index * indirect_words + 5;
-        const u32 base_vertex = parameters[base + 3];
-        const u32 base_instance = parameters[base + 4];
-        maxwell3d.regs.vertex_id_base = base_vertex;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, base_vertex, true);
-        maxwell3d.CallMethod(0x8e5, base_instance, true);
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.draw_manager->DrawIndex(
-            static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[2]),
-            parameters[base + 2], parameters[base], base_vertex, base_instance,
-            parameters[base + 1]);
+// Reports whether a topology can use the indirect draw path. Quads, QuadStrip, Polygon and any
+// unrecognized value fall through to the default label and are rejected.
+bool IsTopologySafe(Maxwell3D::Regs::PrimitiveTopology topology) {
+    using Topology = Maxwell3D::Regs::PrimitiveTopology;
+    switch (topology) {
+    case Topology::Points:
+    case Topology::Lines:
+    case Topology::LineLoop:
+    case Topology::LineStrip:
+    case Topology::Triangles:
+    case Topology::TriangleStrip:
+    case Topology::TriangleFan:
+    case Topology::LinesAdjacency:
+    case Topology::LineStripAdjacency:
+    case Topology::TrianglesAdjacency:
+    case Topology::TriangleStripAdjacency:
+    case Topology::Patches:
+        return true;
+    default:
+        return false;
    }
 }

-// Multi-layer Clear
-void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    ASSERT(parameters.size() == 1);
-
-    const Engines::Maxwell3D::Regs::ClearSurface clear_params{parameters[0]};
-    const u32 rt_index = clear_params.RT;
-    const u32 num_layers = maxwell3d.regs.rt[rt_index].depth;
-    ASSERT(clear_params.layer == 0);
-
-    maxwell3d.regs.clear_surface.raw = clear_params.raw;
-    maxwell3d.draw_manager->Clear(num_layers);
-}
-
-constexpr std::array<std::pair<u64, HLEFunction>, 5> hle_funcs{{
-    {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
-    {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
-    {0x0217920100488FF7, &HLE_0217920100488FF7},
-    {0x3F5E74B9C9A50164, &HLE_3F5E74B9C9A50164},
-    {0xEAD26C3E2109B06B, &HLE_EAD26C3E2109B06B},
-}};
-
-class HLEMacroImpl final : public CachedMacro {
+class HLEMacroImpl : public CachedMacro {
 public:
-    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
-        : maxwell3d{maxwell3d_}, func{func_} {}
+    explicit HLEMacroImpl(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}

-    void Execute(const std::vector<u32>& parameters, u32 method) override {
-        func(maxwell3d, parameters);
+protected:
+    Maxwell3D& maxwell3d;
+};
+
+/*
+ * Note: this macro has two versions, normal and extended. The extended version additionally
+ * registers the base instance as an HLE replacement attribute around the draw.
+ */
+template <bool extended>
+class HLE_DrawArraysIndirect final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawArraysIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
+            Fallback(parameters);
+            return;
+        }
+
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_indexed = false;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(1);
+        params.buffer_size = 4 * sizeof(u32);
+        params.max_draw_counts = 1;
+        params.stride = 0;
+
+        if constexpr (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+
+        maxwell3d.draw_manager->DrawArrayIndirect(topology);
+
+        if constexpr (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
    }

 private:
-    Engines::Maxwell3D& maxwell3d;
-    HLEFunction func;
+    void Fallback(const std::vector<u32>& parameters) {
+        SCOPE_EXIT({
+            if (extended) {
+                maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+                maxwell3d.replace_table.clear();
+            }
+        });
+        maxwell3d.RefreshParameters();
+        const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        const u32 vertex_first = parameters[3];
+        const u32 vertex_count = parameters[1];
+
+        if (!IsTopologySafe(topology) &&
+            static_cast<size_t>(maxwell3d.GetMaxCurrentVertices()) <
+                static_cast<size_t>(vertex_first) + static_cast<size_t>(vertex_count)) {
+            ASSERT_MSG(false, "Faulty draw!");
+            return;
+        }
+
+        const u32 base_instance = parameters[4];
+        if constexpr (extended) {
+            maxwell3d.regs.global_base_instance_index = base_instance;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+
+        maxwell3d.draw_manager->DrawArray(topology, vertex_first, vertex_count, base_instance,
+                                          instance_count);
+
+        if constexpr (extended) {
+            maxwell3d.regs.global_base_instance_index = 0;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+};
+
+/*
+ * Note: this macro has two versions, normal and extended. The extended version additionally
+ * registers the base vertex and base instance as HLE replacement attributes around the draw.
+ */
+template <bool extended>
+class HLE_DrawIndexedIndirect final : public HLEMacroImpl {
+public:
+    explicit HLE_DrawIndexedIndirect(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
+            Fallback(parameters);
+            return;
+        }
+
+        const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
+        const u32 element_base = parameters[4];
+        const u32 base_instance = parameters[5];
+        maxwell3d.regs.vertex_id_base = element_base;
+        maxwell3d.regs.global_base_vertex_index = element_base;
+        maxwell3d.regs.global_base_instance_index = base_instance;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        if constexpr (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_indexed = true;
+        params.include_count = false;
+        params.count_start_address = 0;
+        params.indirect_start_address = maxwell3d.GetMacroAddress(1);
+        params.buffer_size = 5 * sizeof(u32);
+        params.max_draw_counts = 1;
+        params.stride = 0;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
+        maxwell3d.regs.vertex_id_base = 0x0;
+        maxwell3d.regs.global_base_vertex_index = 0x0;
+        maxwell3d.regs.global_base_instance_index = 0x0;
+        if constexpr (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) {
+        maxwell3d.RefreshParameters();
+        const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+        const u32 element_base = parameters[4];
+        const u32 base_instance = parameters[5];
+        maxwell3d.regs.vertex_id_base = element_base;
+        maxwell3d.regs.global_base_vertex_index = element_base;
+        maxwell3d.regs.global_base_instance_index = base_instance;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        if constexpr (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        }
+
+        maxwell3d.draw_manager->DrawIndex(
+            static_cast<Tegra::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]), parameters[3],
+            parameters[1], element_base, base_instance, instance_count);
+
+        maxwell3d.regs.vertex_id_base = 0x0;
+        maxwell3d.regs.global_base_vertex_index = 0x0;
+        maxwell3d.regs.global_base_instance_index = 0x0;
+        if constexpr (extended) {
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
+    }
+};
+
+class HLE_MultiLayerClear final : public HLEMacroImpl {
+public:
+    explicit HLE_MultiLayerClear(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        ASSERT(parameters.size() == 1);
+
+        const Maxwell3D::Regs::ClearSurface clear_params{parameters[0]};
+        const u32 rt_index = clear_params.RT;
+        const u32 num_layers = maxwell3d.regs.rt[rt_index].depth;
+        ASSERT(clear_params.layer == 0);
+
+        maxwell3d.regs.clear_surface.raw = clear_params.raw;
+        maxwell3d.draw_manager->Clear(num_layers);
+    }
+};
+
+class HLE_MultiDrawIndexedIndirectCount final : public HLEMacroImpl {
+public:
+    explicit HLE_MultiDrawIndexedIndirectCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        const auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
+        if (!IsTopologySafe(topology)) {
+            Fallback(parameters);
+            return;
+        }
+
+        const u32 start_indirect = parameters[0];
+        const u32 end_indirect = parameters[1];
+        if (start_indirect >= end_indirect) {
+            // Nothing to do.
+            return;
+        }
+
+        const u32 padding = parameters[3]; // padding is in words
+
+        // size of each indirect segment
+        const u32 indirect_words = 5 + padding;
+        const u32 stride = indirect_words * sizeof(u32);
+        const std::size_t draw_count = end_indirect - start_indirect;
+        const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        auto& params = maxwell3d.draw_manager->GetIndirectParams();
+        params.is_indexed = true;
+        params.include_count = true;
+        params.count_start_address = maxwell3d.GetMacroAddress(4);
+        params.indirect_start_address = maxwell3d.GetMacroAddress(5);
+        params.buffer_size = stride * draw_count;
+        params.max_draw_counts = draw_count;
+        params.stride = stride;
+        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+        maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+        maxwell3d.SetHLEReplacementAttributeType(
+            0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+        maxwell3d.SetHLEReplacementAttributeType(
+            0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+        maxwell3d.SetHLEReplacementAttributeType(0, 0x648,
+                                                 Maxwell3D::HLEReplacementAttributeType::DrawID);
+        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, estimate);
+        maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+        maxwell3d.replace_table.clear();
+    }
+
+private:
+    void Fallback(const std::vector<u32>& parameters) {
+        SCOPE_EXIT({
+            // Clean everything.
+            maxwell3d.regs.vertex_id_base = 0x0;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        });
+        maxwell3d.RefreshParameters();
+        const u32 start_indirect = parameters[0];
+        const u32 end_indirect = parameters[1];
+        if (start_indirect >= end_indirect) {
+            // Nothing to do.
+            return;
+        }
+        const auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
+        const u32 padding = parameters[3];
+        const std::size_t max_draws = parameters[4];
+
+        const u32 indirect_words = 5 + padding;
+        const std::size_t first_draw = start_indirect;
+        const std::size_t effective_draws = end_indirect - start_indirect;
+        const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws);
+
+        for (std::size_t index = first_draw; index < last_draw; index++) {
+            const std::size_t base = index * indirect_words + 5;
+            const u32 base_vertex = parameters[base + 3];
+            const u32 base_instance = parameters[base + 4];
+            maxwell3d.regs.vertex_id_base = base_vertex;
+            maxwell3d.engine_state = Maxwell3D::EngineHint::OnHLEMacro;
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x640, Maxwell3D::HLEReplacementAttributeType::BaseVertex);
+            maxwell3d.SetHLEReplacementAttributeType(
+                0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
+            maxwell3d.CallMethod(0x8e3, 0x648, true);
+            maxwell3d.CallMethod(0x8e4, static_cast<u32>(index), true);
+            maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+            maxwell3d.draw_manager->DrawIndex(topology, parameters[base + 2], parameters[base],
+                                              base_vertex, base_instance, parameters[base + 1]);
+        }
+    }
+};
+
+// Selects a 0x7000-byte const buffer: base comes from shadow scratch 24, offset from parameter 0.
+class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl {
+public:
+    explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& cb = maxwell3d.regs.const_buffer;
+        const u32 scratch = maxwell3d.regs.shadow_scratch[24];
+        cb.size = 0x7000;
+        cb.address_high = (scratch >> 24) & 0xFF;
+        cb.address_low = scratch << 8;
+        cb.offset = (parameters[0] & 0x3FFFFFFF) << 2;
+    }
+};
+
+// Loads the const buffer size and address from shadow scratch slots 47 + N and 42 + N (N = param 0).
+class HLE_D7333D26E0A93EDE final : public HLEMacroImpl {
+public:
+    explicit HLE_D7333D26E0A93EDE(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        const size_t slot = parameters[0];
+        const u32 base = maxwell3d.regs.shadow_scratch[42 + slot];
+        auto& cb = maxwell3d.regs.const_buffer;
+        cb.size = maxwell3d.regs.shadow_scratch[47 + slot];
+        cb.address_high = (base >> 24) & 0xFF;
+        cb.address_low = base << 8;
+    }
+};
+
+class HLE_BindShader final : public HLEMacroImpl {
+public:
+    explicit HLE_BindShader(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& regs = maxwell3d.regs;
+        const u32 index = parameters[0];
+        if ((parameters[1] - regs.shadow_scratch[28 + index]) == 0) {
+            return;
+        }
+
+        regs.pipelines[index & 0xF].offset = parameters[2];
+        maxwell3d.dirty.flags[VideoCommon::Dirty::Shaders] = true;
+        regs.shadow_scratch[28 + index] = parameters[1];
+        regs.shadow_scratch[34 + index] = parameters[2];
+
+        const u32 address = parameters[4];
+        auto& const_buffer = regs.const_buffer;
+        const_buffer.size = 0x10000;
+        const_buffer.address_high = (address >> 24) & 0xFF;
+        const_buffer.address_low = address << 8;
+
+        const size_t bind_group_id = parameters[3] & 0x7F;
+        auto& bind_group = regs.bind_groups[bind_group_id];
+        bind_group.raw_config = 0x11;
+        maxwell3d.ProcessCBBind(bind_group_id);
+    }
+};
+
+// Sets raster_bounding_box from parameter 0 (masked with 0xFFFFF00F), then assigns its pad field.
+class HLE_SetRasterBoundingBox final : public HLEMacroImpl {
+public:
+    explicit HLE_SetRasterBoundingBox(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        auto& regs = maxwell3d.regs;
+        const u32 enable_bits = regs.conservative_raster_enable;
+        const u32 pad_bits = regs.shadow_scratch[52] & enable_bits;
+        regs.raster_bounding_box.raw = parameters[0] & 0xFFFFF00F;
+        regs.raster_bounding_box.pad.Assign(pad_bits);
+    }
+};
+
+template <size_t base_size>
+class HLE_ClearConstBuffer final : public HLEMacroImpl {
+public:
+    explicit HLE_ClearConstBuffer(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+        static constexpr std::array<u32, base_size> zeroes{};
+        auto& regs = maxwell3d.regs;
+        regs.const_buffer.size = static_cast<u32>(base_size);
+        regs.const_buffer.address_high = parameters[0];
+        regs.const_buffer.address_low = parameters[1];
+        regs.const_buffer.offset = 0;
+        maxwell3d.ProcessCBMultiData(zeroes.data(), parameters[2] * 4);
+    }
+};
+
+class HLE_ClearMemory final : public HLEMacroImpl {
+public:
+    explicit HLE_ClearMemory(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+
+        const u32 needed_memory = parameters[2] / sizeof(u32);
+        if (needed_memory > zero_memory.size()) {
+            zero_memory.resize(needed_memory, 0);
+        }
+        auto& regs = maxwell3d.regs;
+        regs.upload.line_length_in = parameters[2];
+        regs.upload.line_count = 1;
+        regs.upload.dest.address_high = parameters[0];
+        regs.upload.dest.address_low = parameters[1];
+        maxwell3d.CallMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true);
+        maxwell3d.CallMultiMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(inline_data)),
+                                  zero_memory.data(), needed_memory, needed_memory);
+    }
+
+private:
+    std::vector<u32> zero_memory;
+};
+
+class HLE_TransformFeedbackSetup final : public HLEMacroImpl {
+public:
+    explicit HLE_TransformFeedbackSetup(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
+
+    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
+        maxwell3d.RefreshParameters();
+
+        auto& regs = maxwell3d.regs;
+        regs.transform_feedback_enabled = 1;
+        regs.transform_feedback.buffers[0].start_offset = 0;
+        regs.transform_feedback.buffers[1].start_offset = 0;
+        regs.transform_feedback.buffers[2].start_offset = 0;
+        regs.transform_feedback.buffers[3].start_offset = 0;
+
+        regs.upload.line_length_in = 4;
+        regs.upload.line_count = 1;
+        regs.upload.dest.address_high = parameters[0];
+        regs.upload.dest.address_low = parameters[1];
+        maxwell3d.CallMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(launch_dma)), 0x1011, true);
+        maxwell3d.CallMethod(static_cast<size_t>(MAXWELL3D_REG_INDEX(inline_data)),
+                             regs.transform_feedback.controls[0].stride, true);
+    }
 };

 } // Anonymous namespace

-HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
+HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
+    builders.emplace(0x0D61FC9FAAC9FCADULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_DrawArraysIndirect<false>>(maxwell3d__);
+                         }));
+    builders.emplace(0x8A4D173EB99A8603ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_DrawArraysIndirect<true>>(maxwell3d__);
+                         }));
+    builders.emplace(0x771BB18C62444DA0ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_DrawIndexedIndirect<false>>(maxwell3d__);
+                         }));
+    builders.emplace(0x0217920100488FF7ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_DrawIndexedIndirect<true>>(maxwell3d__);
+                         }));
+    builders.emplace(0x3F5E74B9C9A50164ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_MultiDrawIndexedIndirectCount>(
+                                 maxwell3d__);
+                         }));
+    builders.emplace(0xEAD26C3E2109B06BULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_MultiLayerClear>(maxwell3d__);
+                         }));
+    builders.emplace(0xC713C83D8F63CCF3ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_C713C83D8F63CCF3>(maxwell3d__);
+                         }));
+    builders.emplace(0xD7333D26E0A93EDEULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_D7333D26E0A93EDE>(maxwell3d__);
+                         }));
+    builders.emplace(0xEB29B2A09AA06D38ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_BindShader>(maxwell3d__);
+                         }));
+    builders.emplace(0xDB1341DBEB4C8AF7ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_SetRasterBoundingBox>(maxwell3d__);
+                         }));
+    builders.emplace(0x6C97861D891EDf7EULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_ClearConstBuffer<0x5F00>>(maxwell3d__);
+                         }));
+    builders.emplace(0xD246FDDF3A6173D7ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_ClearConstBuffer<0x7000>>(maxwell3d__);
+                         }));
+    builders.emplace(0xEE4D0004BEC8ECF4ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_ClearMemory>(maxwell3d__);
+                         }));
+    builders.emplace(0xFC0CF27F5FFAA661ULL,
+                     std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
+                         [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
+                             return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__);
+                         }));
+}
+
 HLEMacro::~HLEMacro() = default;

 std::unique_ptr<CachedMacro> HLEMacro::GetHLEProgram(u64 hash) const {
-    const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
-                                 [hash](const auto& pair) { return pair.first == hash; });
-    if (it == hle_funcs.end()) {
+    const auto it = builders.find(hash);
+    if (it == builders.end()) {
        return nullptr;
    }
-    return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
+    return it->second(maxwell3d);
 }

 } // namespace Tegra
--- a/src/video_core/macro/macro_hle.h
+++ b/src/video_core/macro/macro_hle.h
@@ -3,7 +3,10 @@

 #pragma once

+#include <functional>
 #include <memory>
+#include <unordered_map>
+
 #include "common/common_types.h"

 namespace Tegra {
@@ -23,6 +26,8 @@ public:

 private:
    Engines::Maxwell3D& maxwell3d;
+    std::unordered_map<u64, std::function<std::unique_ptr<CachedMacro>(Engines::Maxwell3D&)>>
+        builders;
 };

 } // namespace Tegra
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -6,11 +6,13 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/memory.h"
+#include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
@@ -25,7 +27,9 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
      address_space_bits{address_space_bits_}, page_bits{page_bits_}, big_page_bits{big_page_bits_},
      entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
                                           page_bits != big_page_bits ? page_bits : 0},
-      unique_identifier{unique_identifier_generator.fetch_add(1, std::memory_order_acq_rel)} {
+      kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
+                                      1, std::memory_order_acq_rel)},
+      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
    address_space_size = 1ULL << address_space_bits;
    page_size = 1ULL << page_bits;
    page_mask = page_size - 1ULL;
@@ -41,11 +45,12 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
    big_entries.resize(big_page_table_size / 32, 0);
    big_page_table_cpu.resize(big_page_table_size);
    big_page_continous.resize(big_page_table_size / continous_bits, 0);
-    std::array<PTEKind, 32> kind_valus;
-    kind_valus.fill(PTEKind::INVALID);
-    big_kinds.resize(big_page_table_size / 32, kind_valus);
    entries.resize(page_table_size / 32, 0);
-    kinds.resize(page_table_size / 32, kind_valus);
+    if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) {
+        fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
+    } else {
+        fastmem_arena = nullptr;
+    }
 }

 MemoryManager::~MemoryManager() = default;
@@ -83,38 +88,7 @@ void MemoryManager::SetEntry(size_t position, MemoryManager::EntryType entry) {
 }

 PTEKind MemoryManager::GetPageKind(GPUVAddr gpu_addr) const {
-    auto entry = GetEntry<true>(gpu_addr);
-    if (entry == EntryType::Mapped || entry == EntryType::Reserved) [[likely]] {
-        return GetKind<true>(gpu_addr);
-    } else {
-        return GetKind<false>(gpu_addr);
-    }
-}
-
-template <bool is_big_page>
-PTEKind MemoryManager::GetKind(size_t position) const {
-    if constexpr (is_big_page) {
-        position = position >> big_page_bits;
-        const size_t sub_index = position % 32;
-        return big_kinds[position / 32][sub_index];
-    } else {
-        position = position >> page_bits;
-        const size_t sub_index = position % 32;
-        return kinds[position / 32][sub_index];
-    }
-}
-
-template <bool is_big_page>
-void MemoryManager::SetKind(size_t position, PTEKind kind) {
-    if constexpr (is_big_page) {
-        position = position >> big_page_bits;
-        const size_t sub_index = position % 32;
-        big_kinds[position / 32][sub_index] = kind;
-    } else {
-        position = position >> page_bits;
-        const size_t sub_index = position % 32;
-        kinds[position / 32][sub_index] = kind;
-    }
+    return kind_map.GetValueAt(gpu_addr); // kind_map is filled by PageTableOp/BigPageTableOp
 }

 inline bool MemoryManager::IsBigPageContinous(size_t big_page_index) const {
@@ -141,7 +115,6 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cp
        const GPUVAddr current_gpu_addr = gpu_addr + offset;
        [[maybe_unused]] const auto current_entry_type = GetEntry<false>(current_gpu_addr);
        SetEntry<false>(current_gpu_addr, entry_type);
-        SetKind<false>(current_gpu_addr, kind);
        if (current_entry_type != entry_type) {
            rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, page_size);
        }
@@ -153,6 +126,7 @@ GPUVAddr MemoryManager::PageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr cp
        }
        remaining_size -= page_size;
    }
+    kind_map.Map(gpu_addr, gpu_addr + size, kind);
    return gpu_addr;
 }

@@ -164,7 +138,6 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr
        const GPUVAddr current_gpu_addr = gpu_addr + offset;
        [[maybe_unused]] const auto current_entry_type = GetEntry<true>(current_gpu_addr);
        SetEntry<true>(current_gpu_addr, entry_type);
-        SetKind<true>(current_gpu_addr, kind);
        if (current_entry_type != entry_type) {
            rasterizer->ModifyGPUMemory(unique_identifier, gpu_addr, big_page_size);
        }
@@ -193,6 +166,7 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr
        }
        remaining_size -= big_page_size;
    }
+    kind_map.Map(gpu_addr, gpu_addr + size, kind);
    return gpu_addr;
 }

@@ -219,15 +193,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
    if (size == 0) {
        return;
    }
-    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
+    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);

-    for (const auto& [map_addr, map_size] : submapped_ranges) {
-        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
-        ASSERT(cpu_addr);
-
-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+    for (const auto& [map_addr, map_size] : page_stash) {
+        rasterizer->UnmapMemory(map_addr, map_size);
    }
+    page_stash.clear();

    BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
    PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@@ -325,9 +296,15 @@ template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typenam
 inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size,
                                           FuncMapped&& func_mapped, FuncReserved&& func_reserved,
                                           FuncUnmapped&& func_unmapped) const {
-    static constexpr bool BOOL_BREAK_MAPPED = std::is_same_v<FuncMapped, bool>;
-    static constexpr bool BOOL_BREAK_RESERVED = std::is_same_v<FuncReserved, bool>;
-    static constexpr bool BOOL_BREAK_UNMAPPED = std::is_same_v<FuncUnmapped, bool>;
+    using FuncMappedReturn =
+        typename std::invoke_result<FuncMapped, std::size_t, std::size_t, std::size_t>::type;
+    using FuncReservedReturn =
+        typename std::invoke_result<FuncReserved, std::size_t, std::size_t, std::size_t>::type;
+    using FuncUnmappedReturn =
+        typename std::invoke_result<FuncUnmapped, std::size_t, std::size_t, std::size_t>::type;
+    static constexpr bool BOOL_BREAK_MAPPED = std::is_same_v<FuncMappedReturn, bool>;
+    static constexpr bool BOOL_BREAK_RESERVED = std::is_same_v<FuncReservedReturn, bool>;
+    static constexpr bool BOOL_BREAK_UNMAPPED = std::is_same_v<FuncUnmappedReturn, bool>;
    u64 used_page_size;
    u64 used_page_mask;
    u64 used_page_bits;
@@ -383,9 +360,9 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
    }
 }

-template <bool is_safe>
-void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer,
-                                  std::size_t size) const {
+template <bool is_safe, bool use_fastmem>
+void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
+                                  [[maybe_unused]] VideoCommon::CacheType which) const {
    auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index,
                           [[maybe_unused]] std::size_t offset, std::size_t copy_amount) {
        std::memset(dest_buffer, 0, copy_amount);
@@ -395,23 +372,31 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer,
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
        if constexpr (is_safe) {
-            rasterizer->FlushRegion(cpu_addr_base, copy_amount);
+            rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
+        }
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
+        } else {
+            u8* physical = memory.GetPointer(cpu_addr_base);
+            std::memcpy(dest_buffer, physical, copy_amount);
        }
-        u8* physical = memory.GetPointer(cpu_addr_base);
-        std::memcpy(dest_buffer, physical, copy_amount);
        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
    };
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
        if constexpr (is_safe) {
-            rasterizer->FlushRegion(cpu_addr_base, copy_amount);
+            rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
        }
-        if (!IsBigPageContinous(page_index)) [[unlikely]] {
-            memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
        } else {
-            u8* physical = memory.GetPointer(cpu_addr_base);
-            std::memcpy(dest_buffer, physical, copy_amount);
+            if (!IsBigPageContinous(page_index)) [[unlikely]] {
+                memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+            } else {
+                u8* physical = memory.GetPointer(cpu_addr_base);
+                std::memcpy(dest_buffer, physical, copy_amount);
+            }
        }
        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
    };
@@ -423,18 +408,27 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer,
    MemoryOperation<true>(gpu_src_addr, size, mapped_big, set_to_zero, read_short_pages);
 }

-void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const {
-    ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size);
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
+                              VideoCommon::CacheType which) const {
+    if (fastmem_arena) [[likely]] { // arena pointer set: copy directly out of it
+        ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which);
+    } else { // no arena: resolve pages through `memory`
+        ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which);
+    }
 }

 void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                    const std::size_t size) const {
-    ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size);
+    if (fastmem_arena) [[likely]] {
+        ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+    } else {
+        ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+    }
 }

 template <bool is_safe>
-void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer,
-                                   std::size_t size) {
+void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
+                                   [[maybe_unused]] VideoCommon::CacheType which) {
    auto just_advance = [&]([[maybe_unused]] std::size_t page_index,
                            [[maybe_unused]] std::size_t offset, std::size_t copy_amount) {
        src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
@@ -443,7 +437,7 @@ void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffe
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
        if constexpr (is_safe) {
-            rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
+            rasterizer->InvalidateRegion(cpu_addr_base, copy_amount, which);
        }
        u8* physical = memory.GetPointer(cpu_addr_base);
        std::memcpy(physical, src_buffer, copy_amount);
@@ -453,7 +447,7 @@ void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffe
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
        if constexpr (is_safe) {
-            rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
+            rasterizer->InvalidateRegion(cpu_addr_base, copy_amount, which);
        }
        if (!IsBigPageContinous(page_index)) [[unlikely]] {
            memory.WriteBlockUnsafe(cpu_addr_base, src_buffer, copy_amount);
@@ -471,16 +465,24 @@ void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffe
    MemoryOperation<true>(gpu_dest_addr, size, mapped_big, just_advance, write_short_pages);
 }

-void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) {
-    WriteBlockImpl<true>(gpu_dest_addr, src_buffer, size);
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
+                               VideoCommon::CacheType which) {
+    WriteBlockImpl<true>(gpu_dest_addr, src_buffer, size, which); // <true>: invalidate, then copy
 }

 void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                     std::size_t size) {
+    // <false> skips the per-page InvalidateRegion call, so the CacheType argument is not consulted.
+    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
-    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size);
 }

-void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
+void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) {
+    WriteBlockUnsafe(gpu_dest_addr, src_buffer, size); // plain copy, no invalidation here
+    accumulator->Add(gpu_dest_addr, size); // range is handed to the rasterizer in FlushCaching()
+}
+
+void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
+                                VideoCommon::CacheType which) const {
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
                          [[maybe_unused]] std::size_t offset,
                          [[maybe_unused]] std::size_t copy_amount) {};
@@ -488,12 +490,12 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
    auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
-        rasterizer->FlushRegion(cpu_addr_base, copy_amount);
+        rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
    };
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
-        rasterizer->FlushRegion(cpu_addr_base, copy_amount);
+        rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
    };
    auto flush_short_pages = [&](std::size_t page_index, std::size_t offset,
                                 std::size_t copy_amount) {
@@ -503,7 +505,8 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
    MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages);
 }

-bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const {
+bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size,
+                                  VideoCommon::CacheType which) const {
    bool result = false;
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
                          [[maybe_unused]] std::size_t offset,
@@ -512,13 +515,13 @@ bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const {
    auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
-        result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
+        result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount, which);
        return result;
    };
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
-        result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
+        result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount, which);
        return result;
    };
    auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
@@ -571,7 +574,12 @@ size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const {
    return range_so_far;
 }

-void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
+size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t max_size) const {
+    // Size reported by kind_map for the range starting at gpu_addr, capped at max_size.
+    // With the default max_size (the largest size_t) the cap never triggers.
+    const size_t layout_size = kind_map.GetContinousSizeFrom(gpu_addr);
+    return layout_size < max_size ? layout_size : max_size;
+}
+
+void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size,
+                                     VideoCommon::CacheType which) const {
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
                          [[maybe_unused]] std::size_t offset,
                          [[maybe_unused]] std::size_t copy_amount) {};
@@ -579,12 +587,12 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
    auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
-        rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
+        rasterizer->InvalidateRegion(cpu_addr_base, copy_amount, which);
    };
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
-        rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
+        rasterizer->InvalidateRegion(cpu_addr_base, copy_amount, which);
    };
    auto invalidate_short_pages = [&](std::size_t page_index, std::size_t offset,
                                      std::size_t copy_amount) {
@@ -594,14 +602,15 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
    MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, invalidate_short_pages);
 }

-void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size,
+                              VideoCommon::CacheType which) {
    std::vector<u8> tmp_buffer(size);
-    ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
+    ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which); // stage the source in tmp_buffer

    // The output block must be flushed in case it has data modified from the GPU.
    // Fixes NPC geometry in Zombie Panic in Wonderland DX
-    FlushRegion(gpu_dest_addr, size);
-    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
+    FlushRegion(gpu_dest_addr, size, which);
+    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size, which); // `which` goes to all three calls
 }

 bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) const {
@@ -681,7 +690,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
 std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    GPUVAddr gpu_addr, std::size_t size) const {
    std::vector<std::pair<GPUVAddr, std::size_t>> result{};
-    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
+    GetSubmappedRangeImpl<true>(gpu_addr, size, result); // <true>: segments start at GPU addresses
+    return result;
+}
+
+template <bool is_gpu_address>
+void MemoryManager::GetSubmappedRangeImpl(
+    GPUVAddr gpu_addr, std::size_t size,
+    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        result) const {
+    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
+        last_segment{};
    std::optional<VAddr> old_page_addr{};
    const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
                                                [[maybe_unused]] std::size_t offset,
@@ -703,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@@ -721,8 +744,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        }
        old_page_addr = {cpu_addr_base + copy_amount};
        if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
        } else {
            last_segment->second += copy_amount;
        }
@@ -733,7 +760,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
    };
    MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
    split(0, 0, 0);
-    return result;
+}
+
+void MemoryManager::FlushCaching() {
+    // Nothing to do unless the accumulator holds at least one pending range.
+    if (accumulator->AnyAccumulated()) {
+        accumulator->Callback([this](GPUVAddr range_addr, size_t range_size) {
+            GetSubmappedRangeImpl<false>(range_addr, range_size, page_stash);
+        });
+        rasterizer->InnerInvalidation(page_stash);
+        page_stash.clear();
+        accumulator->Clear();
+    }
 }

 } // namespace Tegra
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -10,13 +10,19 @@

 #include "common/common_types.h"
 #include "common/multi_level_page_table.h"
+#include "common/range_map.h"
 #include "common/virtual_buffer.h"
+#include "video_core/cache_types.h"
 #include "video_core/pte_kind.h"

 namespace VideoCore {
 class RasterizerInterface;
 }

+namespace VideoCommon {
+class InvalidationAccumulator;
+}
+
 namespace Core {
 class DeviceMemory;
 namespace Memory {
@@ -59,9 +65,12 @@ public:
     * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
     * Flushes and Invalidations, respectively to each operation.
     */
-    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
+    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
+                   VideoCommon::CacheType which = VideoCommon::CacheType::All) const;
+    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
+                    VideoCommon::CacheType which = VideoCommon::CacheType::All);
+    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size,
+                   VideoCommon::CacheType which = VideoCommon::CacheType::All);

    /**
     * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -75,6 +84,7 @@ public:
     */
    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);

    /**
     * Checks if a gpu region can be simply read with a pointer.
@@ -104,11 +114,14 @@ public:
    GPUVAddr MapSparse(GPUVAddr gpu_addr, std::size_t size, bool is_big_pages = true);
    void Unmap(GPUVAddr gpu_addr, std::size_t size);

-    void FlushRegion(GPUVAddr gpu_addr, size_t size) const;
+    void FlushRegion(GPUVAddr gpu_addr, size_t size,
+                     VideoCommon::CacheType which = VideoCommon::CacheType::All) const;

-    void InvalidateRegion(GPUVAddr gpu_addr, size_t size) const;
+    void InvalidateRegion(GPUVAddr gpu_addr, size_t size,
+                          VideoCommon::CacheType which = VideoCommon::CacheType::All) const;

-    bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const;
+    bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size,
+                       VideoCommon::CacheType which = VideoCommon::CacheType::All) const;

    size_t MaxContinousRange(GPUVAddr gpu_addr, size_t size) const;

@@ -118,16 +131,23 @@ public:

    PTEKind GetPageKind(GPUVAddr gpu_addr) const;

+    size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
+                               size_t max_size = std::numeric_limits<size_t>::max()) const;
+
+    void FlushCaching();
+
 private:
    template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
    inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
                                FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const;

-    template <bool is_safe>
-    void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    template <bool is_safe, bool use_fastmem>
+    void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
+                       VideoCommon::CacheType which) const;

    template <bool is_safe>
-    void WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
+                        VideoCommon::CacheType which);

    template <bool is_big_page>
    [[nodiscard]] std::size_t PageEntryIndex(GPUVAddr gpu_addr) const {
@@ -141,6 +161,12 @@ private:
    inline bool IsBigPageContinous(size_t big_page_index) const;
    inline void SetBigPageContinous(size_t big_page_index, bool value);

+    template <bool is_gpu_address>
+    void GetSubmappedRangeImpl(
+        GPUVAddr gpu_addr, std::size_t size,
+        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+            result) const;
+
    Core::System& system;
    Core::Memory::Memory& memory;
    Core::DeviceMemory& device_memory;
@@ -183,23 +209,18 @@ private:
    template <bool is_big_page>
    inline void SetEntry(size_t position, EntryType entry);

-    std::vector<std::array<PTEKind, 32>> kinds;
-    std::vector<std::array<PTEKind, 32>> big_kinds;
-
-    template <bool is_big_page>
-    inline PTEKind GetKind(size_t position) const;
-
-    template <bool is_big_page>
-    inline void SetKind(size_t position, PTEKind kind);
-
    Common::MultiLevelPageTable<u32> page_table;
+    Common::RangeMap<GPUVAddr, PTEKind> kind_map;
    Common::VirtualBuffer<u32> big_page_table_cpu;

    std::vector<u64> big_page_continous;
+    std::vector<std::pair<VAddr, std::size_t>> page_stash{};
+    u8* fastmem_arena{};

    constexpr static size_t continous_bits = 64;

    const size_t unique_identifier;
+    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;

    static std::atomic<size_t> unique_identifier_generator;
 };
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -6,8 +6,10 @@
 #include <functional>
 #include <optional>
 #include <span>
+#include <utility>
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
+#include "video_core/cache_types.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/gpu.h"

@@ -42,6 +44,9 @@ public:
    /// Dispatches a draw invocation
    virtual void Draw(bool is_indexed, u32 instance_count) = 0;

+    /// Dispatches an indirect draw invocation
+    virtual void DrawIndirect() {}
+
    /// Clear the current framebuffer
    virtual void Clear(u32 layer_count) = 0;

@@ -80,13 +85,22 @@ public:
    virtual void FlushAll() = 0;

    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+    virtual void FlushRegion(VAddr addr, u64 size,
+                             VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;

    /// Check if the the specified memory area requires flushing to CPU Memory.
-    virtual bool MustFlushRegion(VAddr addr, u64 size) = 0;
+    virtual bool MustFlushRegion(VAddr addr, u64 size,
+                                 VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;

    /// Notify rasterizer that any caches of the specified region should be invalidated
-    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+    virtual void InvalidateRegion(VAddr addr, u64 size,
+                                  VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;
+
+    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+        for (const auto& sequence : sequences) { // default: invalidate each (address, size) pair
+            InvalidateRegion(sequence.first, sequence.second);
+        }
+    }

    /// Notify rasterizer that any caches of the specified region are desync with guest
    virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
@@ -102,7 +116,8 @@ public:

    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
    /// and invalidated
-    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+    virtual void FlushAndInvalidateRegion(
+        VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;

    /// Notify the host renderer to wait for previous primitive and compute operations.
    virtual void WaitForIdle() = 0;
@@ -119,6 +134,10 @@ public:
    /// Notify rasterizer that a frame is about to finish
    virtual void TickFrame() = 0;

+    virtual bool AccelerateConditionalRendering() {
+        return false;
+    }
+
    /// Attempt to use a faster method to perform a surface copy
    [[nodiscard]] virtual bool AccelerateSurfaceCopy(
        const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst,
--- a/src/video_core/renderer_null/null_rasterizer.cpp
+++ b/src/video_core/renderer_null/null_rasterizer.cpp
@@ -39,11 +39,11 @@ void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr
                                               u32 size) {}
 void RasterizerNull::DisableGraphicsUniformBuffer(size_t stage, u32 index) {}
 void RasterizerNull::FlushAll() {}
-void RasterizerNull::FlushRegion(VAddr addr, u64 size) {}
-bool RasterizerNull::MustFlushRegion(VAddr addr, u64 size) {
+void RasterizerNull::FlushRegion(VAddr addr, u64 size, VideoCommon::CacheType) {}
+bool RasterizerNull::MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheType) {
    return false;
 }
-void RasterizerNull::InvalidateRegion(VAddr addr, u64 size) {}
+void RasterizerNull::InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {}
 void RasterizerNull::OnCPUWrite(VAddr addr, u64 size) {}
 void RasterizerNull::InvalidateGPUCache() {}
 void RasterizerNull::UnmapMemory(VAddr addr, u64 size) {}
@@ -61,7 +61,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) {
 }
 void RasterizerNull::SignalReference() {}
 void RasterizerNull::ReleaseFences() {}
-void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size) {}
+void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {}
 void RasterizerNull::WaitForIdle() {}
 void RasterizerNull::FragmentBarrier() {}
 void RasterizerNull::TiledCacheBarrier() {}
--- a/src/video_core/renderer_null/null_rasterizer.h
+++ b/src/video_core/renderer_null/null_rasterizer.h
@@ -38,9 +38,12 @@ public:
    void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
    void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
    void FlushAll() override;
-    void FlushRegion(VAddr addr, u64 size) override;
-    bool MustFlushRegion(VAddr addr, u64 size) override;
-    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushRegion(VAddr addr, u64 size,
+                     VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    bool MustFlushRegion(VAddr addr, u64 size,
+                         VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InvalidateRegion(VAddr addr, u64 size,
+                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
    void InvalidateGPUCache() override;
    void UnmapMemory(VAddr addr, u64 size) override;
@@ -50,7 +53,8 @@ public:
    void SignalSyncPoint(u32 value) override;
    void SignalReference() override;
    void ReleaseFences() override;
-    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(
+        VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void WaitForIdle() override;
    void FragmentBarrier() override;
    void TiledCacheBarrier() override;
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -160,6 +160,10 @@ public:
        return device.CanReportMemoryUsage();
    }

+    u32 GetStorageBufferAlignment() const {
+        return static_cast<u32>(device.GetShaderStorageBufferAlignment());
+    }
+
 private:
    static constexpr std::array PABO_LUT{
        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
@@ -40,6 +40,7 @@ struct GraphicsPipelineKey {
        BitField<6, 2, Maxwell::Tessellation::DomainType> tessellation_primitive;
        BitField<8, 2, Maxwell::Tessellation::Spacing> tessellation_spacing;
        BitField<10, 1, u32> tessellation_clockwise;
+        BitField<11, 3, Tegra::Engines::Maxwell3D::EngineHint> app_stage;
    };
    std::array<u32, 3> padding;
    VideoCommon::TransformFeedbackState xfb_state;
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -202,7 +202,8 @@ void RasterizerOpenGL::Clear(u32 layer_count) {
    ++num_queued_commands;
 }

-void RasterizerOpenGL::Draw(bool is_indexed, u32 instance_count) {
+template <typename Func>
+void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
    MICROPROFILE_SCOPE(OpenGL_Drawing);

    SCOPE_EXIT({ gpu.TickWork(); });
@@ -226,48 +227,97 @@ void RasterizerOpenGL::Draw(bool is_indexed, u32 instance_count) {
    const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(draw_state.topology);
    BeginTransformFeedback(pipeline, primitive_mode);

-    const GLuint base_instance = static_cast<GLuint>(draw_state.base_instance);
-    const GLsizei num_instances = static_cast<GLsizei>(instance_count);
-    if (is_indexed) {
-        const GLint base_vertex = static_cast<GLint>(draw_state.base_index);
-        const GLsizei num_vertices = static_cast<GLsizei>(draw_state.index_buffer.count);
-        const GLvoid* const offset = buffer_cache_runtime.IndexOffset();
-        const GLenum format = MaxwellToGL::IndexFormat(draw_state.index_buffer.format);
-        if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
-            glDrawElements(primitive_mode, num_vertices, format, offset);
-        } else if (num_instances == 1 && base_instance == 0) {
-            glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex);
-        } else if (base_vertex == 0 && base_instance == 0) {
-            glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances);
-        } else if (base_vertex == 0) {
-            glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset,
-                                                num_instances, base_instance);
-        } else if (base_instance == 0) {
-            glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset,
-                                              num_instances, base_vertex);
-        } else {
-            glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format,
-                                                          offset, num_instances, base_vertex,
-                                                          base_instance);
-        }
-    } else {
-        const GLint base_vertex = static_cast<GLint>(draw_state.vertex_buffer.first);
-        const GLsizei num_vertices = static_cast<GLsizei>(draw_state.vertex_buffer.count);
-        if (num_instances == 1 && base_instance == 0) {
-            glDrawArrays(primitive_mode, base_vertex, num_vertices);
-        } else if (base_instance == 0) {
-            glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances);
-        } else {
-            glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices,
-                                              num_instances, base_instance);
-        }
-    }
+    draw_func(primitive_mode);
+
    EndTransformFeedback();

    ++num_queued_commands;
    has_written_global_memory |= pipeline->WritesGlobalMemory();
 }

+void RasterizerOpenGL::Draw(bool is_indexed, u32 instance_count) {
+    PrepareDraw(is_indexed, [this, is_indexed, instance_count](GLenum primitive_mode) {
+        const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
+        const GLuint base_instance = static_cast<GLuint>(draw_state.base_instance);
+        const GLsizei num_instances = static_cast<GLsizei>(instance_count);
+        if (is_indexed) {
+            const GLint base_vertex = static_cast<GLint>(draw_state.base_index);
+            const GLsizei num_vertices = static_cast<GLsizei>(draw_state.index_buffer.count);
+            const GLvoid* const offset = buffer_cache_runtime.IndexOffset();
+            const GLenum format = MaxwellToGL::IndexFormat(draw_state.index_buffer.format);
+            if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
+                glDrawElements(primitive_mode, num_vertices, format, offset);
+            } else if (num_instances == 1 && base_instance == 0) {
+                glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex);
+            } else if (base_vertex == 0 && base_instance == 0) {
+                glDrawElementsInstanced(primitive_mode, num_vertices, format, offset,
+                                        num_instances);
+            } else if (base_vertex == 0) {
+                glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset,
+                                                    num_instances, base_instance);
+            } else if (base_instance == 0) {
+                glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset,
+                                                  num_instances, base_vertex);
+            } else {
+                glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format,
+                                                              offset, num_instances, base_vertex,
+                                                              base_instance);
+            }
+        } else {
+            const GLint base_vertex = static_cast<GLint>(draw_state.vertex_buffer.first);
+            const GLsizei num_vertices = static_cast<GLsizei>(draw_state.vertex_buffer.count);
+            if (num_instances == 1 && base_instance == 0) {
+                glDrawArrays(primitive_mode, base_vertex, num_vertices);
+            } else if (base_instance == 0) {
+                glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances);
+            } else {
+                glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices,
+                                                  num_instances, base_instance);
+            }
+        }
+    });
+}
+
+void RasterizerOpenGL::DrawIndirect() {
+    const auto& params = maxwell3d->draw_manager->GetIndirectParams();
+    buffer_cache.SetDrawIndirect(&params);
+    PrepareDraw(params.is_indexed, [this, &params](GLenum primitive_mode) {
+        const auto [buffer, offset] = buffer_cache.GetDrawIndirectBuffer();
+        const GLvoid* const gl_offset =
+            reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(offset));
+        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, buffer->Handle());
+        if (params.include_count) {
+            const auto [draw_buffer, offset_base] = buffer_cache.GetDrawIndirectCount();
+            glBindBuffer(GL_PARAMETER_BUFFER, draw_buffer->Handle());
+
+            if (params.is_indexed) {
+                const GLenum format = MaxwellToGL::IndexFormat(maxwell3d->regs.index_buffer.format);
+                glMultiDrawElementsIndirectCount(primitive_mode, format, gl_offset,
+                                                 static_cast<GLintptr>(offset_base),
+                                                 static_cast<GLsizei>(params.max_draw_counts),
+                                                 static_cast<GLsizei>(params.stride));
+            } else {
+                glMultiDrawArraysIndirectCount(primitive_mode, gl_offset,
+                                               static_cast<GLintptr>(offset_base),
+                                               static_cast<GLsizei>(params.max_draw_counts),
+                                               static_cast<GLsizei>(params.stride));
+            }
+            return;
+        }
+        if (params.is_indexed) {
+            const GLenum format = MaxwellToGL::IndexFormat(maxwell3d->regs.index_buffer.format);
+            glMultiDrawElementsIndirect(primitive_mode, format, gl_offset,
+                                        static_cast<GLsizei>(params.max_draw_counts),
+                                        static_cast<GLsizei>(params.stride));
+        } else {
+            glMultiDrawArraysIndirect(primitive_mode, gl_offset,
+                                      static_cast<GLsizei>(params.max_draw_counts),
+                                      static_cast<GLsizei>(params.stride));
+        }
+    });
+    buffer_cache.SetDrawIndirect(nullptr);
+}
+
 void RasterizerOpenGL::DispatchCompute() {
    ComputePipeline* const pipeline{shader_cache.CurrentComputePipeline()};
    if (!pipeline) {
@@ -302,46 +352,60 @@ void RasterizerOpenGL::DisableGraphicsUniformBuffer(size_t stage, u32 index) {

 void RasterizerOpenGL::FlushAll() {}

-void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size, VideoCommon::CacheType which) {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
    if (addr == 0 || size == 0) {
        return;
    }
-    {
+    if (True(which & VideoCommon::CacheType::TextureCache)) {
        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.DownloadMemory(addr, size);
    }
-    {
+    if ((True(which & VideoCommon::CacheType::BufferCache))) {
        std::scoped_lock lock{buffer_cache.mutex};
        buffer_cache.DownloadMemory(addr, size);
    }
-    query_cache.FlushRegion(addr, size);
-}
-
-bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
-    std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
-    if (!Settings::IsGPULevelHigh()) {
-        return buffer_cache.IsRegionGpuModified(addr, size);
+    if ((True(which & VideoCommon::CacheType::QueryCache))) {
+        query_cache.FlushRegion(addr, size);
    }
-    return texture_cache.IsRegionGpuModified(addr, size) ||
-           buffer_cache.IsRegionGpuModified(addr, size);
 }

-void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
+bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheType which) {
+    if ((True(which & VideoCommon::CacheType::BufferCache))) {
+        std::scoped_lock lock{buffer_cache.mutex};
+        if (buffer_cache.IsRegionGpuModified(addr, size)) {
+            return true;
+        }
+    }
+    if (!Settings::IsGPULevelHigh()) {
+        return false;
+    }
+    if (True(which & VideoCommon::CacheType::TextureCache)) {
+        std::scoped_lock lock{texture_cache.mutex};
+        return texture_cache.IsRegionGpuModified(addr, size);
+    }
+    return false;
+}
+
+void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which) {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
    if (addr == 0 || size == 0) {
        return;
    }
-    {
+    if (True(which & VideoCommon::CacheType::TextureCache)) {
        std::scoped_lock lock{texture_cache.mutex};
        texture_cache.WriteMemory(addr, size);
    }
-    {
+    if (True(which & VideoCommon::CacheType::BufferCache)) {
        std::scoped_lock lock{buffer_cache.mutex};
        buffer_cache.WriteMemory(addr, size);
    }
-    shader_cache.InvalidateRegion(addr, size);
-    query_cache.InvalidateRegion(addr, size);
+    if (True(which & VideoCommon::CacheType::ShaderCache)) {
+        shader_cache.InvalidateRegion(addr, size);
+    }
+    if (True(which & VideoCommon::CacheType::QueryCache)) {
+        query_cache.InvalidateRegion(addr, size);
+    }
 }

 void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
@@ -408,11 +472,12 @@ void RasterizerOpenGL::ReleaseFences() {
    fence_manager.WaitPendingFences();
 }

-void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size,
+                                                VideoCommon::CacheType which) {
    if (Settings::IsGPULevelExtreme()) {
-        FlushRegion(addr, size);
+        FlushRegion(addr, size, which);
    }
-    InvalidateRegion(addr, size);
+    InvalidateRegion(addr, size, which);
 }

 void RasterizerOpenGL::WaitForIdle() {
@@ -460,6 +525,21 @@ void RasterizerOpenGL::TickFrame() {
    }
 }

+bool RasterizerOpenGL::AccelerateConditionalRendering() {
+    if (Settings::IsGPULevelHigh()) {
+        // Reimplement Host conditional rendering.
+        return false;
+    }
+    // Medium / Low Hack: stub any checks on queries written into the buffer cache.
+    const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()};
+    Maxwell::ReportSemaphore::Compare cmp;
+    if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp),
+                                  VideoCommon::CacheType::BufferCache)) {
+        return true;
+    }
+    return false;
+}
+
 bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
                                             const Tegra::Engines::Fermi2D::Surface& dst,
                                             const Tegra::Engines::Fermi2D::Config& copy_config) {
@@ -481,7 +561,7 @@ void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_si
    }
    gpu_memory->WriteBlockUnsafe(address, memory.data(), copy_size);
    {
-        std::unique_lock<std::mutex> lock{buffer_cache.mutex};
+        std::unique_lock<std::recursive_mutex> lock{buffer_cache.mutex};
        if (!buffer_cache.InlineMemory(*cpu_addr, copy_size, memory)) {
            buffer_cache.WriteMemory(*cpu_addr, copy_size);
        }
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -69,6 +69,7 @@ public:
    ~RasterizerOpenGL() override;

    void Draw(bool is_indexed, u32 instance_count) override;
+    void DrawIndirect() override;
    void Clear(u32 layer_count) override;
    void DispatchCompute() override;
    void ResetCounter(VideoCore::QueryType type) override;
@@ -76,9 +77,12 @@ public:
    void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
    void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
    void FlushAll() override;
-    void FlushRegion(VAddr addr, u64 size) override;
-    bool MustFlushRegion(VAddr addr, u64 size) override;
-    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushRegion(VAddr addr, u64 size,
+                     VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    bool MustFlushRegion(VAddr addr, u64 size,
+                         VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InvalidateRegion(VAddr addr, u64 size,
+                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void OnCPUWrite(VAddr addr, u64 size) override;
    void InvalidateGPUCache() override;
    void UnmapMemory(VAddr addr, u64 size) override;
@@ -88,12 +92,14 @@ public:
    void SignalSyncPoint(u32 value) override;
    void SignalReference() override;
    void ReleaseFences() override;
-    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(
+        VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
    void WaitForIdle() override;
    void FragmentBarrier() override;
    void TiledCacheBarrier() override;
    void FlushCommands() override;
    void TickFrame() override;
+    bool AccelerateConditionalRendering() override;
    bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
                               const Tegra::Engines::Fermi2D::Surface& dst,
                               const Tegra::Engines::Fermi2D::Config& copy_config) override;
@@ -121,6 +127,9 @@ private:
    static constexpr size_t MAX_IMAGES = 48;
    static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES;

+    template <typename Func>
+    void PrepareDraw(bool is_indexed, Func&&);
+
    /// Syncs state to match guest's
    void SyncState();

--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -51,7 +51,7 @@ using VideoCommon::LoadPipelines;
 using VideoCommon::SerializePipeline;
 using Context = ShaderContext::Context;

-constexpr u32 CACHE_VERSION = 7;
+constexpr u32 CACHE_VERSION = 9;

 template <typename Container>
 auto MakeSpan(Container& container) {
@@ -236,6 +236,8 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
          .needs_demote_reorder = device.IsAmd(),
          .support_snorm_render_buffer = false,
          .support_viewport_index_layer = device.HasVertexViewportLayer(),
+          .min_ssbo_alignment = static_cast<u32>(device.GetShaderStorageBufferAlignment()),
+          .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(),
      } {
    if (use_asynchronous_shaders) {
        workers = CreateWorkers();
@@ -350,6 +352,7 @@ GraphicsPipeline* ShaderCache::CurrentGraphicsPipeline() {
        regs.tessellation.params.output_primitives.Value() ==
        Maxwell::Tessellation::OutputPrimitives::Triangles_CW);
    graphics_key.xfb_enabled.Assign(regs.transform_feedback_enabled != 0 ? 1 : 0);
+    graphics_key.app_stage.Assign(maxwell3d->engine_state);
    if (graphics_key.xfb_enabled) {
        SetXfbState(graphics_key.xfb_state, regs);
    }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Andrea Pappacoda	70f7ff3db5	build(externals): move to MbedTLS upstream 2.28.1 The latest LTS release now includes the changes of the yuzu-emu fork, so we can now directly use the upstream version instead.	2023-01-07 18:07:56 +01:00
Andrea Pappacoda	caa25146f2	build: use system MbedTLS when available Since MbedTLS pre-3.0.0 ships neither a pkg-config file nor a CMake package config file, a custom FindMbedTLS.cmake file is required. Since yuzu requires CMAC support, it is also necessary to check for the `mbedtls_cipher_cmac` symbol. I also changed src/core/CMakeLists.txt to only link against mbedcrypto, as yuzu doesn't use the full MbedTLS library.	2023-01-07 18:07:42 +01:00
Narr the Reg	432d48d9c8	Merge pull request #9570 from liamwhite/less-clock-boost renderer_vulkan: disable clock boost on unvalidated devices	2023-01-07 10:41:37 -06:00
Narr the Reg	cc92b7fd94	Merge pull request #9573 from liamwhite/optional vulkan_device: avoid attempt to access empty optional	2023-01-07 10:40:21 -06:00
Liam	444b25bae1	vulkan_device: avoid attempt to access empty optional	2023-01-06 21:23:21 -05:00
Liam	2e4dde12c7	renderer_vulkan: disable clock boost on unvalidated devices	2023-01-06 19:07:47 -05:00
liamwhite	c0f17e1b27	Merge pull request #9567 from german77/antialias opengl: Sanitize antialiasing config	2023-01-06 15:20:23 -05:00
Narr the Reg	4bda2b475f	opengl: Sanitize antialiasing config	2023-01-06 13:42:20 -06:00
Fernando S	7ef897a277	Merge pull request #9566 from Wollnashorn/vulkan-cache-header-fix video_core/vulkan: Fixed loading of Vulkan driver pipeline cache	2023-01-06 11:58:36 -05:00
Wollnashorn	457826a83b	video_core/vulkan: Fixed loading of Vulkan driver pipeline cache The header size of the Vulkan driver pipeline cache files was incorrect in PipelineCache::LoadVulkanPipelineCache, so the pipeline cache wasn't read correctly and got invalidated on each load.	2023-01-06 16:52:41 +01:00
Fernando S	8b251fc3f6	Merge pull request #9535 from bylaws/master Port over several shader-compiler fixes from skyline	2023-01-06 10:06:45 -05:00
liamwhite	3c05988df2	Merge pull request #9561 from liamwhite/update-dynarmic externals: update dynarmic, xbyak	2023-01-06 10:00:18 -05:00
liamwhite	6d74490139	Merge pull request #9558 from MonsterDruide1/network-timeout-noerror net: Silently translate ETIMEDOUT network error	2023-01-06 10:00:09 -05:00
liamwhite	020dbcdbc7	Merge pull request #9552 from liamwhite/turbo vulkan: implement 'turbo mode' clock booster	2023-01-06 09:59:59 -05:00
Fernando S	5bcbb8de45	Merge pull request #9559 from FernandoS27/cached-writes VideoCore: Implement Cached Writes, use fastmem for reading GPU memory and eliminate old stuffs	2023-01-06 07:31:39 -05:00
liamwhite	990fe2b3fc	Merge pull request #9564 from FernandoS27/oops-i-did-it-again MacroHLE: eliminate 2 rushed macros.	2023-01-05 22:14:27 -05:00
Fernando Sahmkow	f6245dc40a	MacroHLE: eliminate 2 rushed macros.	2023-01-05 20:53:31 -05:00
liamwhite	eaca61e073	Merge pull request #9528 from liamwhite/mvk-nulldesc renderer_vulkan: implement fallback path for null buffer descriptors	2023-01-05 18:31:55 -05:00
liamwhite	3e33a878dc	Merge pull request #9536 from liamwhite/debug-utils vulkan_common: unify VK_EXT_debug_utils and selection of validation layer	2023-01-05 18:31:45 -05:00
Liam	1ee0540f82	externals: update dynarmic, xbyak	2023-01-05 18:06:06 -05:00
Billy Laws	58fec43768	Run clang-format	2023-01-05 22:18:10 +00:00
Billy Laws	12b4c9c04c	externals: Update sirit	2023-01-05 22:13:07 +00:00
Billy Laws	68ed60cee4	shader_recompiler: Fix shuffle partitioning for >64 invoc-per-subgroup GPUs The existing implementation only supports 64 invoc-per-subgroup GPUs, and misbehaves on adreno when invocations need to be split into 4 emulated subgroups.	2023-01-05 22:13:07 +00:00
Billy Laws	6c812a0c84	Vulkan, OpenGL: Hook up geometry shader passthrough emulation	2023-01-05 22:13:07 +00:00
Billy Laws	625a4af73a	shader_recompiler: Add support for lowering geometry passthrough Reuses most of the existing code for generating the gl_Layer passthrough. Fixes geometry in Nier: Automata on GPUs without HW passthrough support.	2023-01-05 22:13:07 +00:00
Billy Laws	9e2997c4b6	Vulkan, OpenGL: Hook up storage buffer alignment code	2023-01-05 22:13:07 +00:00
Billy Laws	8804a4eb23	shader_recompiler: Align SSBO offsets to meet host requirements We can take advantage of SSBO addresses being passed in a constant buffer to account for the extra alignment requirements in the shader itself.	2023-01-05 22:13:07 +00:00
Billy Laws	3f0985c7b0	shader_recompiler: SPIRV: Only enable int64 feature when supported	2023-01-05 22:13:07 +00:00
Billy Laws	c1cc99584c	shader_recompiler: Add comparison operators to descriptor types	2023-01-05 22:13:07 +00:00
Billy Laws	bbfad79c89	Vulkan: Add a workaround for input_position on Adreno drivers Adreno drivers will crash compiling geometry shaders if the input position is not wrapped in a gl_in struct.	2023-01-05 22:13:07 +00:00
Fernando S	1428451722	Merge pull request #9527 from Wollnashorn/amd-cache-fix video_core/vulkan: Implemented `VkPipelineCache` to store Vulkan pipelines	2023-01-05 16:38:07 -05:00
Wollnashorn	e07976a22b	video_core/vulkan: Vulkan driver pipelines now contain cache version So that old cache can get deleted when the cache version changes and does not grow infinitely	2023-01-05 21:03:01 +01:00
Wollnashorn	9c9008ac81	video_core/vulkan: Driver pipeline cache will now be deleted with the shader cache	2023-01-05 21:03:01 +01:00
Wollnashorn	8945fafcc0	config: Set the Vulkan driver pipeline cache option to be global	2023-01-05 21:03:01 +01:00
Wollnashorn	f2aa816679	video_core/vulkan: Added check if Vulkan pipeline path has been set	2023-01-05 21:03:01 +01:00
Wollnashorn	f4626512ff	config: Better wording for VK pipeline cache option and enable by default	2023-01-05 21:03:01 +01:00
Wollnashorn	67d4f190f7	yuzu-cmd: Removed `use_vulkan_driver_pipeline_cache` from default_ini.h The addition of the use_vulkan_driver_pipeline_cache option into the default ini string literal caused the 16,384-byte limit of the MSVC compiler to be exceeded.	2023-01-05 21:03:01 +01:00
Wollnashorn	16809c1fa7	video_core/vulkan: Added `VkPipelineCache` to store Vulkan pipelines As an optional feature which can be enabled in the advanced graphics configuration, all pipelines that get built at the initial shader loading are stored in a VkPipelineCache object and are dumped to the disk. These vendor specific pipeline cache files are located at `/shader/GAME_ID/vulkan_pipelines.bin`. This feature was mainly added because of an issue with the AMD driver (see yuzu-emu#8507) causing invalidation of the cache files the driver builds automatically.	2023-01-05 21:02:44 +01:00
Fernando Sahmkow	b56ad93bbc	BufferBase: Don't ignore GPU pages.	2023-01-05 14:00:10 -05:00
Fernando Sahmkow	2d0c4f2b1d	Fermi2D: sync cache flushes	2023-01-05 06:43:28 -05:00
Fernando Sahmkow	af5ecb0b15	MemoryManager: use fastmem directly.	2023-01-05 06:06:33 -05:00
MonsterDruide1	688a9fbfa6	net: Silently translate ETIMEDOUT network error	2023-01-05 11:54:36 +01:00
Fernando Sahmkow	6c7eb81f7d	video_core: Cache GPU internal writes.	2023-01-05 05:23:39 -05:00
liamwhite	e82e3e06be	Merge pull request #9557 from FernandoS27/ooops-i-killed-the-shitty-drivers Vulkan: Fix drivers that don't support dynamic_state_2 up	2023-01-05 00:14:01 -05:00
Fernando Sahmkow	4d9af4a9d2	Vulkan: Fix drivers that don't support dynamic_state_2 up	2023-01-05 00:11:16 -05:00
Liam	a4269c285a	common: add setting for renderer clock workaround	2023-01-04 22:22:01 -05:00
Liam	301e9bbc03	vulkan: implement 'turbo mode' clock booster	2023-01-04 22:22:01 -05:00
Liam	66ae79de13	renderer_vulkan: implement fallback path for null descriptors	2023-01-04 22:14:01 -05:00
liamwhite	b78328f19a	Merge pull request #9501 from FernandoS27/yfc-rel-2 Yuzu Fried Chicken Part 1.5: MacroHLE Rework and Dynamic State	2023-01-04 21:20:00 -05:00
Fernando Sahmkow	3ecc03ec1b	yuzu-ui: Add setting for disabling macro HLE	2023-01-04 14:56:52 -05:00
Fernando Sahmkow	a0c697124c	Video_core: Address feedback	2023-01-04 14:39:42 -05:00
Fernando Sahmkow	03ccd8bf43	Texture Cache: Implement async texture downloads.	2023-01-03 22:52:15 -05:00
liamwhite	bbeb6e460c	Merge pull request #9518 from gidoly/revert-9504-pg2 Revert "k_page_group: synchronize"	2023-01-03 21:40:57 -05:00
Fernando Sahmkow	ddbf851ef6	Vulkan: Update blacklisting to latest driver versions.	2023-01-03 21:16:43 -05:00
Fernando Sahmkow	a045e860dd	ShaderCompiler: Inline driver specific constants.	2023-01-03 16:29:25 -05:00
Fernando Sahmkow	b62ffb612d	Vulkan: rework stencil tracking.	2023-01-03 16:29:16 -05:00
liamwhite	6f031f08fe	Merge pull request #9547 from MonsterDruide1/tas-doesnt-flap TAS: Immediately switch stick to TAS on input	2023-01-03 15:03:58 -05:00
MonsterDruide1	04cb05fce0	TAS: Immediately switch stick to TAS on input Co-Authored-By: Narr the Reg <5944268+german77@users.noreply.github.com>	2023-01-03 20:08:25 +01:00
liamwhite	a7e610403d	Merge pull request #9542 from abouvier/cmake-module-path cmake: move find-modules to root cmake dir	2023-01-02 13:45:36 -05:00
Alexandre Bouvier	eceee8c3d9	cmake: move find-modules to root cmake dir	2023-01-02 18:22:07 +01:00
liamwhite	2b110d61e7	Merge pull request #9541 from abouvier/cmake-option cmake: allow options shadowing with normal variables	2023-01-02 11:29:32 -05:00
liamwhite	6804a43f49	Merge pull request #9540 from MonsterDruide1/tas-sanitized-record TAS: Record sanitized instead of raw stick inputs	2023-01-02 09:51:29 -05:00
bunnei	48bcb91a2e	Merge pull request #9537 from abouvier/cmake-almost-quiet cmake: improve find_package failure messages	2023-01-02 01:08:20 -08:00
bunnei	09c9be3703	Merge pull request #9543 from german77/nifm service: nifm: Initialize request state	2023-01-01 21:38:43 -08:00
german77	80bcc18788	service: nifm: Initialize request state	2023-01-01 20:58:08 -06:00
german77	ebd811b535	service: nifm: Match documentation names	2023-01-01 20:58:01 -06:00
Alexandre Bouvier	306c791e67	cmake: allow options shadowing with normal variables	2023-01-02 02:43:38 +01:00
Liam	f9c6d39a6c	vulkan_common: blacklist radv from extended_dynamic_state2 on drivers before 22.3.1	2023-01-01 16:43:58 -05:00
Liam	4814d87385	video_core: fix build	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	d09aa0182f	MacroHLE: Final cleanup and fixes.	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	581a7d785b	Rasterizer: Setup skeleton for Host Conditional rendering	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	3630bfaef3	RasterizerMemory: Add filtering for flushing/invalidation operations.	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	2793304117	Vulkan: Allow staging buffer deferrals.	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	8d694701bc	MacroHLE: Add OpenGL Support	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	4c82e47edd	Vulkan: Add other additional pipeline specs	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	d33251db93	Vulkan: Implement Dynamic State 3	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	f800e485c9	Vulkan Implement Dynamic State 2 LogicOp and PatchVertices	2023-01-01 16:43:58 -05:00
Fernando Sahmkow	c897c55e3c	Vulkan: Implement Dynamic States 2	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	cb1497d0d7	DMAPusher: Improve collection of non executing methods	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	ce448ce770	Revert Buffer cache changes and setup additional macros.	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	18637766ef	MacroHLE: Reduce massive calculations on sizing estimation.	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	aad0cbf024	MacroHLE: Add HLE replacement for base vertex and base instance.	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	93ac5a6a6d	MacroHLE: Add Index Buffer size estimation.	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	c541559767	MacroHLE: Refactor MacroHLE system.	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	0f89828073	MacroHLE: Implement DrawIndexedIndirect & DrawArraysIndirect.	2023-01-01 16:43:57 -05:00
Fernando Sahmkow	a5a94f52ff	MacroHLE: Add MultidrawIndirect HLE Macro.	2023-01-01 16:43:57 -05:00
MonsterDruide1	d46c9c4659	TAS: Record sanitized instead of raw stick inputs Co-Authored-By: Narr the Reg <5944268+german77@users.noreply.github.com>	2023-01-01 22:39:18 +01:00
Liam	aa13ee5c4a	vulkan_common: unify VK_EXT_debug_utils and selection of validation layer	2023-01-01 11:59:47 -05:00
gidoly	10eaf31af3	Revert "k_page_group: synchronize"	2022-12-29 17:39:42 +09:00