buffer_cache: Avoid passing references of shared pointers and misc style changes

Instead of using as template argument a shared pointer, use the underlying type and manage shared pointers explicitly. This can make removing shared pointers from the cache more easy. While we are at it, make some misc style changes and general improvements (like insert_or_assign instead of operator[] + operator=).
Merge pull request #4040 from ReinUsesLisp/nv-transform-feedback
2020-06-09 18:30:49 -03:00 · 2020-06-08 16:18:33 -04:00 · 2020-06-08 10:16:41 -04:00 · 2020-06-07 18:43:24 -03:00 · 2020-06-06 02:37:24 -04:00 · 2020-06-06 02:56:42 -03:00
88 changed files with 3233 additions and 1124 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -28,3 +28,6 @@
 [submodule "libzip"]
    path = externals/libzip/libzip
    url = https://github.com/nih-at/libzip.git
+[submodule "xbyak"]
+    path = externals/xbyak
+    url = https://github.com/herumi/xbyak.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.11)
+cmake_minimum_required(VERSION 3.15)

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
@@ -13,7 +13,7 @@ project(yuzu)
 option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)

 option(ENABLE_QT "Enable the Qt frontend" ON)
-CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
+CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)

 option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)

--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -75,3 +75,11 @@ if (ENABLE_WEB_SERVICE)
    target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
    target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
 endif()
+
+if (NOT TARGET xbyak)
+    if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+        add_library(xbyak INTERFACE)
+        target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+    endif()
+endif()
--- a/externals/sirit
+++ b/externals/sirit
--- a/externals/xbyak
+++ b/externals/xbyak
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -123,6 +123,8 @@ add_library(common STATIC
    lz4_compression.cpp
    lz4_compression.h
    math_util.h
+    memory_detect.cpp
+    memory_detect.h
    memory_hook.cpp
    memory_hook.h
    microprofile.cpp
@@ -169,10 +171,12 @@ if(ARCHITECTURE_x86_64)
        PRIVATE
            x64/cpu_detect.cpp
            x64/cpu_detect.h
+            x64/xbyak_abi.h
+            x64/xbyak_util.h
    )
 endif()

 create_target_directory_groups(common)

 target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
-target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
+target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
--- a/src/common/memory_detect.cpp
+++ b/src/common/memory_detect.cpp
@@ -0,0 +1,60 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#ifdef _WIN32
+// clang-format off
+#include <windows.h>
+#include <sysinfoapi.h>
+// clang-format on
+#else
+#include <sys/types.h>
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+#include "common/memory_detect.h"
+
+namespace Common {
+
+// Detects the RAM and Swapfile sizes
+static MemoryInfo Detect() {
+    MemoryInfo mem_info{};
+
+#ifdef _WIN32
+    MEMORYSTATUSEX memorystatus;
+    memorystatus.dwLength = sizeof(memorystatus);
+    GlobalMemoryStatusEx(&memorystatus);
+    mem_info.TotalPhysicalMemory = memorystatus.ullTotalPhys;
+    mem_info.TotalSwapMemory = memorystatus.ullTotalPageFile - mem_info.TotalPhysicalMemory;
+#elif defined(__APPLE__)
+    u64 ramsize;
+    struct xsw_usage vmusage;
+    std::size_t sizeof_ramsize = sizeof(ramsize);
+    std::size_t sizeof_vmusage = sizeof(vmusage);
+    // hw and vm are defined in sysctl.h
+    // https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
+    // sysctlbyname(const char *, void *, size_t *, void *, size_t);
+    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
+    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
+    mem_info.TotalPhysicalMemory = ramsize;
+    mem_info.TotalSwapMemory = vmusage.xsu_total;
+#else
+    struct sysinfo meminfo;
+    sysinfo(&meminfo);
+    mem_info.TotalPhysicalMemory = meminfo.totalram;
+    mem_info.TotalSwapMemory = meminfo.totalswap;
+#endif
+
+    return mem_info;
+}
+
+const MemoryInfo& GetMemInfo() {
+    static MemoryInfo mem_info = Detect();
+    return mem_info;
+}
+
+} // namespace Common
--- a/src/common/memory_detect.h
+++ b/src/common/memory_detect.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Common {
+
+struct MemoryInfo {
+    u64 TotalPhysicalMemory{};
+    u64 TotalSwapMemory{};
+};
+
+/**
+ * Gets the memory info of the host system
+ * @return Reference to a MemoryInfo struct with the physical and swap memory sizes in bytes
+ */
+const MemoryInfo& GetMemInfo();
+
+} // namespace Common
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -0,0 +1,266 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <bitset>
+#include <initializer_list>
+#include <xbyak.h>
+#include "common/assert.h"
+
+namespace Common::X64 {
+
+inline int RegToIndex(const Xbyak::Reg& reg) {
+    using Kind = Xbyak::Reg::Kind;
+    ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
+               "RegSet only support GPRs and XMM registers.");
+    ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
+    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
+}
+
+inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+    ASSERT(reg_index < 16);
+    return Xbyak::Reg64(reg_index);
+}
+
+inline Xbyak::Xmm IndexToXmm(int reg_index) {
+    ASSERT(reg_index >= 16 && reg_index < 32);
+    return Xbyak::Xmm(reg_index - 16);
+}
+
+inline Xbyak::Reg IndexToReg(int reg_index) {
+    if (reg_index < 16) {
+        return IndexToReg64(reg_index);
+    } else {
+        return IndexToXmm(reg_index);
+    }
+}
+
+inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
+    std::bitset<32> bits;
+    for (const Xbyak::Reg& reg : regs) {
+        bits[RegToIndex(reg)] = true;
+    }
+    return bits;
+}
+
+const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
+const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
+
+#ifdef _WIN32
+
+// Microsoft x64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rsi,
+    Xbyak::util::rdi,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+    // XMMs
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0x20;
+
+#else
+
+// System V x86-64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::rdi,
+    Xbyak::util::rsi,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0;
+
+#endif
+
+inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                   size_t needed_frame_size, s32* out_subtraction,
+                                   s32* out_xmm_offset) {
+    const auto count = (regs & ABI_ALL_GPRS).count();
+    rsp_alignment -= count * 8;
+    size_t subtraction = 0;
+    const auto xmm_count = (regs & ABI_ALL_XMMS).count();
+    if (xmm_count) {
+        // If we have any XMMs to save, we must align the stack here.
+        subtraction = rsp_alignment & 0xF;
+    }
+    subtraction += 0x10 * xmm_count;
+    size_t xmm_base_subtraction = subtraction;
+    subtraction += needed_frame_size;
+    subtraction += ABI_SHADOW_SPACE;
+    // Final alignment.
+    rsp_alignment -= subtraction;
+    subtraction += rsp_alignment & 0xF;
+
+    *out_subtraction = (s32)subtraction;
+    *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+}
+
+inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(static_cast<int>(i)));
+        }
+    }
+    if (subtraction != 0) {
+        code.sub(code.rsp, subtraction);
+    }
+
+    for (int i = 0; i < regs.count(); i++) {
+        if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
+            code.push(IndexToReg64(i));
+        }
+    }
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
+            xmm_offset += 0x10;
+        }
+    }
+
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
+            xmm_offset += 0x10;
+        }
+    }
+
+    if (subtraction != 0) {
+        code.add(code.rsp, subtraction);
+    }
+
+    // GPRs need to be popped in reverse order
+    for (int i = 15; i >= 0; i--) {
+        if (regs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                                 size_t rsp_alignment,
+                                                 size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(static_cast<int>(i)));
+        }
+    }
+
+    if (subtraction != 0) {
+        code.sub(code.rsp, subtraction);
+    }
+
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    s32 subtraction, xmm_offset;
+    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+
+    if (subtraction != 0) {
+        code.add(code.rsp, subtraction);
+    }
+
+    // GPRs need to be popped in reverse order
+    for (int i = 15; i >= 0; i--) {
+        if (regs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+} // namespace Common::X64
--- a/src/common/x64/xbyak_util.h
+++ b/src/common/x64/xbyak_util.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include <xbyak.h>
+#include "common/x64/xbyak_abi.h"
+
+namespace Common::X64 {
+
+// Constants for use with cmpps/cmpss
+enum {
+    CMP_EQ = 0,
+    CMP_LT = 1,
+    CMP_LE = 2,
+    CMP_UNORD = 3,
+    CMP_NEQ = 4,
+    CMP_NLT = 5,
+    CMP_NLE = 6,
+    CMP_ORD = 7,
+};
+
+constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
+    const u64 distance = target - (ref + 5);
+    return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
+}
+
+inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
+    return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
+}
+
+template <typename T>
+inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
+    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
+    size_t addr = reinterpret_cast<size_t>(f);
+    if (IsWithin2G(code, addr)) {
+        code.call(f);
+    } else {
+        // ABI_RETURN is a safe temp register to use before a call
+        code.mov(ABI_RETURN, addr);
+        code.call(ABI_RETURN);
+    }
+}
+
+} // namespace Common::X64
--- a/src/core/file_sys/patch_manager.cpp
+++ b/src/core/file_sys/patch_manager.cpp
@@ -10,6 +10,7 @@
 #include "common/file_util.h"
 #include "common/hex_util.h"
 #include "common/logging/log.h"
+#include "common/string_util.h"
 #include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
@@ -48,6 +49,23 @@ std::string FormatTitleVersion(u32 version, TitleVersionFormat format) {
    return fmt::format("v{}.{}.{}", bytes[3], bytes[2], bytes[1]);
 }

+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name) {
+#ifdef _WIN32
+    return dir->GetSubdirectory(name);
+#else
+    const auto subdirs = dir->GetSubdirectories();
+    for (const auto& subdir : subdirs) {
+        std::string dir_name = Common::ToLower(subdir->GetName());
+        if (dir_name == name) {
+            return subdir;
+        }
+    }
+
+    return nullptr;
+#endif
+}
+
 PatchManager::PatchManager(u64 title_id) : title_id(title_id) {}

 PatchManager::~PatchManager() = default;
@@ -104,7 +122,7 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
            if (std::find(disabled.begin(), disabled.end(), subdir->GetName()) != disabled.end())
                continue;

-            auto exefs_dir = subdir->GetSubdirectory("exefs");
+            auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
            if (exefs_dir != nullptr)
                layers.push_back(std::move(exefs_dir));
        }
@@ -130,7 +148,7 @@ std::vector<VirtualFile> PatchManager::CollectPatches(const std::vector<VirtualD
        if (std::find(disabled.cbegin(), disabled.cend(), subdir->GetName()) != disabled.cend())
            continue;

-        auto exefs_dir = subdir->GetSubdirectory("exefs");
+        auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
        if (exefs_dir != nullptr) {
            for (const auto& file : exefs_dir->GetFiles()) {
                if (file->GetExtension() == "ips") {
@@ -295,7 +313,7 @@ std::vector<Core::Memory::CheatEntry> PatchManager::CreateCheatList(
            continue;
        }

-        auto cheats_dir = subdir->GetSubdirectory("cheats");
+        auto cheats_dir = FindSubdirectoryCaseless(subdir, "cheats");
        if (cheats_dir != nullptr) {
            auto res = ReadCheatFileFromFolder(system, title_id, build_id_, cheats_dir, true);
            if (res.has_value()) {
@@ -340,11 +358,11 @@ static void ApplyLayeredFS(VirtualFile& romfs, u64 title_id, ContentRecordType t
            continue;
        }

-        auto romfs_dir = subdir->GetSubdirectory("romfs");
+        auto romfs_dir = FindSubdirectoryCaseless(subdir, "romfs");
        if (romfs_dir != nullptr)
            layers.push_back(std::move(romfs_dir));

-        auto ext_dir = subdir->GetSubdirectory("romfs_ext");
+        auto ext_dir = FindSubdirectoryCaseless(subdir, "romfs_ext");
        if (ext_dir != nullptr)
            layers_ext.push_back(std::move(ext_dir));
    }
@@ -470,7 +488,7 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
        for (const auto& mod : mod_dir->GetSubdirectories()) {
            std::string types;

-            const auto exefs_dir = mod->GetSubdirectory("exefs");
+            const auto exefs_dir = FindSubdirectoryCaseless(mod, "exefs");
            if (IsDirValidAndNonEmpty(exefs_dir)) {
                bool ips = false;
                bool ipswitch = false;
@@ -494,9 +512,9 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
                if (layeredfs)
                    AppendCommaIfNotEmpty(types, "LayeredExeFS");
            }
-            if (IsDirValidAndNonEmpty(mod->GetSubdirectory("romfs")))
+            if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "romfs")))
                AppendCommaIfNotEmpty(types, "LayeredFS");
-            if (IsDirValidAndNonEmpty(mod->GetSubdirectory("cheats")))
+            if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "cheats")))
                AppendCommaIfNotEmpty(types, "Cheats");

            if (types.empty())
--- a/src/core/file_sys/patch_manager.h
+++ b/src/core/file_sys/patch_manager.h
@@ -29,6 +29,11 @@ enum class TitleVersionFormat : u8 {
 std::string FormatTitleVersion(u32 version,
                               TitleVersionFormat format = TitleVersionFormat::ThreeElements);

+// Returns a directory with name matching name case-insensitive. Returns nullptr if directory
+// doesn't have a directory with name.
+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name);
+
 // A centralized class to manage patches to games.
 class PatchManager {
 public:
--- a/src/core/file_sys/system_archive/system_version.cpp
+++ b/src/core/file_sys/system_archive/system_version.cpp
@@ -12,17 +12,17 @@ namespace SystemVersionData {
 // This section should reflect the best system version to describe yuzu's HLE api.
 // TODO(DarkLordZach): Update when HLE gets better.

-constexpr u8 VERSION_MAJOR = 5;
-constexpr u8 VERSION_MINOR = 1;
-constexpr u8 VERSION_MICRO = 0;
+constexpr u8 VERSION_MAJOR = 10;
+constexpr u8 VERSION_MINOR = 0;
+constexpr u8 VERSION_MICRO = 2;

-constexpr u8 REVISION_MAJOR = 3;
+constexpr u8 REVISION_MAJOR = 1;
 constexpr u8 REVISION_MINOR = 0;

 constexpr char PLATFORM_STRING[] = "NX";
-constexpr char VERSION_HASH[] = "23f9df53e25709d756e0c76effcb2473bd3447dd";
-constexpr char DISPLAY_VERSION[] = "5.1.0";
-constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 5.1.0-3.0";
+constexpr char VERSION_HASH[] = "f90143fa8bbc061d4f68c35f95f04f8080c0ecdc";
+constexpr char DISPLAY_VERSION[] = "10.0.2";
+constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 10.0.2-1.0";

 } // namespace SystemVersionData

--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {

 ResultCode ReadableEvent::Reset() {
    if (!is_signaled) {
-        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+        LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
                  GetObjectId(), GetTypeName(), GetName());
        return ERR_INVALID_STATE;
    }
--- a/src/core/hle/service/hid/controllers/keyboard.cpp
+++ b/src/core/hle/service/hid/controllers/keyboard.cpp
@@ -38,10 +38,11 @@ void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing,
    cur_entry.sampling_number = last_entry.sampling_number + 1;
    cur_entry.sampling_number2 = cur_entry.sampling_number;

+    cur_entry.key.fill(0);
+    cur_entry.modifier = 0;
+
    for (std::size_t i = 0; i < keyboard_keys.size(); ++i) {
-        for (std::size_t k = 0; k < KEYS_PER_BYTE; ++k) {
-            cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << k);
-        }
+        cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << (i % KEYS_PER_BYTE));
    }

    for (std::size_t i = 0; i < keyboard_mods.size(); ++i) {
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
        {40, nullptr, "AcquireXpadIdEventHandle"},
        {41, nullptr, "ReleaseXpadIdEventHandle"},
        {51, &Hid::ActivateXpad, "ActivateXpad"},
-        {55, nullptr, "GetXpadIds"},
+        {55, &Hid::GetXpadIDs, "GetXpadIds"},
        {56, nullptr, "ActivateJoyXpad"},
        {58, nullptr, "GetJoyXpadLifoHandle"},
        {59, nullptr, "GetJoyXpadIds"},
@@ -319,6 +319,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

+void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push(0);
+}
+
 void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto applet_resource_user_id{rp.Pop<u64>()};
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -86,6 +86,7 @@ public:
 private:
    void CreateAppletResource(Kernel::HLERequestContext& ctx);
    void ActivateXpad(Kernel::HLERequestContext& ctx);
+    void GetXpadIDs(Kernel::HLERequestContext& ctx);
    void ActivateDebugPad(Kernel::HLERequestContext& ctx);
    void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
    void ActivateMouse(Kernel::HLERequestContext& ctx);
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -177,7 +177,8 @@ private:
    void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
        LOG_DEBUG(Service_NIFM, "called");

-        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size");
+        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
+                   "SfNetworkProfileData is not the correct size");
        u128 uuid{};
        auto buffer = ctx.ReadBuffer();
        std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -112,6 +112,7 @@ void LogSettings() {
    LogSetting("Renderer_UseAsynchronousGpuEmulation",
               Settings::values.use_asynchronous_gpu_emulation);
    LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
+    LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
    LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
    LogSetting("Audio_OutputEngine", Settings::values.sink_id);
    LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -446,6 +446,7 @@ struct Values {
    GPUAccuracy gpu_accuracy;
    bool use_asynchronous_gpu_emulation;
    bool use_vsync;
+    bool use_assembly_shaders;
    bool force_30fps_mode;
    bool use_fast_gpu_time;

@@ -473,6 +474,7 @@ struct Values {
    bool reporting_services;
    bool quest_flag;
    bool disable_cpu_opt;
+    bool disable_macro_jit;

    // BCAT
    std::string bcat_backend;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
    AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
             Settings::values.use_asynchronous_gpu_emulation);
    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
+    AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
    AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
 }

--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_library(video_core STATIC
    buffer_cache/buffer_block.h
    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.cpp
    buffer_cache/map_interval.h
    dirty_flags.cpp
    dirty_flags.h
@@ -24,6 +25,12 @@ add_library(video_core STATIC
    engines/shader_bytecode.h
    engines/shader_header.h
    engines/shader_type.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
    fence_manager.h
    gpu.cpp
    gpu.h
@@ -35,8 +42,6 @@ add_library(video_core STATIC
    gpu_thread.h
    guest_driver.cpp
    guest_driver.h
-    macro_interpreter.cpp
-    macro_interpreter.h
    memory_manager.cpp
    memory_manager.h
    morton.cpp
@@ -228,7 +233,7 @@ endif()
 create_target_directory_groups(video_core)

 target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad)
+target_link_libraries(video_core PRIVATE glad xbyak)

 if (ENABLE_VULKAN)
    target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {

 class BufferBlock {
 public:
-    bool Overlaps(const VAddr start, const VAddr end) const {
+    bool Overlaps(VAddr start, VAddr end) const {
        return (cpu_addr < end) && (cpu_addr_end > start);
    }

-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
+    bool IsInside(VAddr other_start, VAddr other_end) const {
        return cpu_addr <= other_start && other_end <= cpu_addr_end;
    }

-    std::size_t GetOffset(const VAddr in_addr) {
+    std::size_t Offset(VAddr in_addr) const {
        return static_cast<std::size_t>(in_addr - cpu_addr);
    }

-    VAddr GetCpuAddr() const {
+    VAddr CpuAddr() const {
        return cpu_addr;
    }

-    VAddr GetCpuAddrEnd() const {
+    VAddr CpuAddrEnd() const {
        return cpu_addr_end;
    }

-    void SetCpuAddr(const VAddr new_addr) {
+    void SetCpuAddr(VAddr new_addr) {
        cpu_addr = new_addr;
        cpu_addr_end = new_addr + size;
    }

-    std::size_t GetSize() const {
+    std::size_t Size() const {
        return size;
    }

+    u64 Epoch() const {
+        return epoch;
+    }
+
    void SetEpoch(u64 new_epoch) {
        epoch = new_epoch;
    }

-    u64 GetEpoch() {
-        return epoch;
-    }
-
 protected:
-    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
-        SetCpuAddr(cpu_addr);
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
    }
-    ~BufferBlock() = default;

 private:
    VAddr cpu_addr{};
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -12,11 +12,12 @@
 #include <utility>
 #include <vector>

-#include <boost/icl/interval_map.hpp>
+#include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_set.hpp>
-#include <boost/range/iterator_range.hpp>
+#include <boost/intrusive/set.hpp>

 #include "common/alignment.h"
+#include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/core.h"
@@ -29,10 +30,16 @@

 namespace VideoCommon {

-using MapInterval = std::shared_ptr<MapIntervalBase>;
-
-template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
+template <typename Buffer, typename BufferType, typename StreamBuffer>
 class BufferCache {
+    using IntervalSet = boost::icl::interval_set<VAddr>;
+    using IntervalType = typename IntervalSet::interval_type;
+    using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+
+    static constexpr u64 WRITE_PAGE_BIT = 11;
+    static constexpr u64 BLOCK_PAGE_BITS = 21;
+    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
 public:
    using BufferInfo = std::pair<BufferType, u64>;

@@ -40,14 +47,12 @@ public:
                            bool is_written = false, bool use_fast_cbuf = false) {
        std::lock_guard lock{mutex};

-        const std::optional<VAddr> cpu_addr_opt =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
-
+        const auto& memory_manager = system.GPU().MemoryManager();
+        const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
        if (!cpu_addr_opt) {
            return {GetEmptyBuffer(size), 0};
        }
-
-        VAddr cpu_addr = *cpu_addr_opt;
+        const VAddr cpu_addr = *cpu_addr_opt;

        // Cache management is a big overhead, so only cache entries with a given size.
        // TODO: Figure out which size is the best for given games.
@@ -55,49 +60,58 @@ public:
        if (use_fast_cbuf || size < max_stream_size) {
            if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
                auto& memory_manager = system.GPU().MemoryManager();
+                const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
                if (use_fast_cbuf) {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return ConstBufferUpload(host_ptr, size);
+                    u8* dest;
+                    if (is_granular) {
+                        dest = memory_manager.GetPointer(gpu_addr);
                    } else {
                        staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return ConstBufferUpload(staging_buffer.data(), size);
+                        dest = staging_buffer.data();
+                        memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
                    }
+                    return ConstBufferUpload(dest, size);
+                }
+                if (is_granular) {
+                    u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
+                    return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
+                        std::memcpy(dest, host_ptr, size);
+                    });
                } else {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return StreamBufferUpload(host_ptr, size, alignment);
-                    } else {
-                        staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return StreamBufferUpload(staging_buffer.data(), size, alignment);
-                    }
+                    return StreamBufferUpload(
+                        size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
+                            memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
+                        });
                }
            }
        }

-        auto block = GetBlock(cpu_addr, size);
-        auto map = MapAddress(block, gpu_addr, cpu_addr, size);
+        Buffer* const block = GetBlock(cpu_addr, size);
+        MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
+        if (!map) {
+            return {GetEmptyBuffer(size), 0};
+        }
        if (is_written) {
            map->MarkAsModified(true, GetModifiedTicks());
            if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
                MarkForAsyncFlush(map);
            }
-            if (!map->IsWritten()) {
-                map->MarkAsWritten(true);
-                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+            if (!map->is_written) {
+                map->is_written = true;
+                MarkRegionAsWritten(map->start, map->end - 1);
            }
        }

-        return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
+        return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};
    }

    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
                                std::size_t alignment = 4) {
        std::lock_guard lock{mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
+        return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
+            std::memcpy(dest, raw_pointer, size);
+        });
    }

    void Map(std::size_t max_size) {
@@ -115,16 +129,18 @@ public:
        return std::exchange(invalidated, false);
    }

+    /// Function called at the end of each frame, inteded for deferred operations
    void TickFrame() {
        ++epoch;
+
        while (!pending_destruction.empty()) {
            // Delay at least 4 frames before destruction.
            // This is due to triple buffering happening on some drivers.
            static constexpr u64 epochs_to_destroy = 5;
-            if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
+            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
                break;
            }
-            pending_destruction.pop_front();
+            pending_destruction.pop();
        }
    }

@@ -132,12 +148,11 @@ public:
    void FlushRegion(VAddr addr, std::size_t size) {
        std::lock_guard lock{mutex};

-        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
-            return a->GetModificationTick() < b->GetModificationTick();
-        });
-        for (auto& object : objects) {
-            if (object->IsModified() && object->IsRegistered()) {
+        VectorMapInterval objects = GetMapsInRange(addr, size);
+        std::sort(objects.begin(), objects.end(),
+                  [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
+        for (MapInterval* object : objects) {
+            if (object->is_modified && object->is_registered) {
                mutex.unlock();
                FlushMap(object);
                mutex.lock();
@@ -148,9 +163,9 @@ public:
    bool MustFlushRegion(VAddr addr, std::size_t size) {
        std::lock_guard lock{mutex};

-        const std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) {
-            return map->IsModified() && map->IsRegistered();
+        const VectorMapInterval objects = GetMapsInRange(addr, size);
+        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
+            return map->is_modified && map->is_registered;
        });
    }

@@ -158,9 +173,8 @@ public:
    void InvalidateRegion(VAddr addr, u64 size) {
        std::lock_guard lock{mutex};

-        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        for (auto& object : objects) {
-            if (object->IsRegistered()) {
+        for (auto& object : GetMapsInRange(addr, size)) {
+            if (object->is_registered) {
                Unregister(object);
            }
        }
@@ -169,10 +183,10 @@ public:
    void OnCPUWrite(VAddr addr, std::size_t size) {
        std::lock_guard lock{mutex};

-        for (const auto& object : GetMapsInRange(addr, size)) {
-            if (object->IsMemoryMarked() && object->IsRegistered()) {
+        for (MapInterval* object : GetMapsInRange(addr, size)) {
+            if (object->is_memory_marked && object->is_registered) {
                UnmarkMemory(object);
-                object->SetSyncPending(true);
+                object->is_sync_pending = true;
                marked_for_unregister.emplace_back(object);
            }
        }
@@ -181,9 +195,9 @@ public:
    void SyncGuestHost() {
        std::lock_guard lock{mutex};

-        for (const auto& object : marked_for_unregister) {
-            if (object->IsRegistered()) {
-                object->SetSyncPending(false);
+        for (auto& object : marked_for_unregister) {
+            if (object->is_registered) {
+                object->is_sync_pending = false;
                Unregister(object);
            }
        }
@@ -192,9 +206,9 @@ public:

    void CommitAsyncFlushes() {
        if (uncommitted_flushes) {
-            auto commit_list = std::make_shared<std::list<MapInterval>>();
-            for (auto& map : *uncommitted_flushes) {
-                if (map->IsRegistered() && map->IsModified()) {
+            auto commit_list = std::make_shared<std::list<MapInterval*>>();
+            for (MapInterval* map : *uncommitted_flushes) {
+                if (map->is_registered && map->is_modified) {
                    // TODO(Blinkhawk): Implement backend asynchronous flushing
                    // AsyncFlushMap(map)
                    commit_list->push_back(map);
@@ -228,8 +242,8 @@ public:
            committed_flushes.pop_front();
            return;
        }
-        for (MapInterval& map : *flush_list) {
-            if (map->IsRegistered()) {
+        for (MapInterval* map : *flush_list) {
+            if (map->is_registered) {
                // TODO(Blinkhawk): Replace this for reading the asynchronous flush
                FlushMap(map);
            }
@@ -241,23 +255,21 @@ public:

 protected:
    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
-          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+                         std::unique_ptr<StreamBuffer> stream_buffer_)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)},
+          stream_buffer_handle{stream_buffer->Handle()} {}

    ~BufferCache() = default;

-    virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
+    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;

-    virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
-
-    virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
+    virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                                 const u8* data) = 0;

-    virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
+    virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                                   u8* data) = 0;

-    virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
+    virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                           std::size_t dst_offset, std::size_t size) = 0;

    virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
@@ -265,76 +277,74 @@ protected:
    }

    /// Register an object into the cache
-    void Register(const MapInterval& new_map, bool inherit_written = false) {
-        const VAddr cpu_addr = new_map->GetStart();
+    MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
+        const VAddr cpu_addr = new_map.start;
        if (!cpu_addr) {
            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
-                         new_map->GetGpuAddress());
-            return;
+                         new_map.gpu_addr);
+            return nullptr;
        }
-        const std::size_t size = new_map->GetEnd() - new_map->GetStart();
-        new_map->MarkAsRegistered(true);
-        const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
-        mapped_addresses.insert({interval, new_map});
+        const std::size_t size = new_map.end - new_map.start;
+        new_map.is_registered = true;
        rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
-        new_map->SetMemoryMarked(true);
+        new_map.is_memory_marked = true;
        if (inherit_written) {
-            MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
-            new_map->MarkAsWritten(true);
+            MarkRegionAsWritten(new_map.start, new_map.end - 1);
+            new_map.is_written = true;
        }
+        MapInterval* const storage = mapped_addresses_allocator.Allocate();
+        *storage = new_map;
+        mapped_addresses.insert(*storage);
+        return storage;
    }

-    void UnmarkMemory(const MapInterval& map) {
-        if (!map->IsMemoryMarked()) {
+    void UnmarkMemory(MapInterval* map) {
+        if (!map->is_memory_marked) {
            return;
        }
-        const std::size_t size = map->GetEnd() - map->GetStart();
-        rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1);
-        map->SetMemoryMarked(false);
+        const std::size_t size = map->end - map->start;
+        rasterizer.UpdatePagesCachedCount(map->start, size, -1);
+        map->is_memory_marked = false;
    }

    /// Unregisters an object from the cache
-    void Unregister(const MapInterval& map) {
+    void Unregister(MapInterval* map) {
        UnmarkMemory(map);
-        map->MarkAsRegistered(false);
-        if (map->IsSyncPending()) {
+        map->is_registered = false;
+        if (map->is_sync_pending) {
+            map->is_sync_pending = false;
            marked_for_unregister.remove(map);
-            map->SetSyncPending(false);
        }
-        if (map->IsWritten()) {
-            UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+        if (map->is_written) {
+            UnmarkRegionAsWritten(map->start, map->end - 1);
        }
-        const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
-        mapped_addresses.erase(delete_interval);
+        const auto it = mapped_addresses.find(*map);
+        ASSERT(it != mapped_addresses.end());
+        mapped_addresses.erase(it);
+        mapped_addresses_allocator.Release(map);
    }

 private:
-    MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) {
-        return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
-    }
-
-    MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
-                           const std::size_t size) {
-        std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
+    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
+                            std::size_t size) {
+        const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
        if (overlaps.empty()) {
            auto& memory_manager = system.GPU().MemoryManager();
            const VAddr cpu_addr_end = cpu_addr + size;
-            MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
            if (memory_manager.IsGranularRange(gpu_addr, size)) {
                u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
+                UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);
            } else {
                staging_buffer.resize(size);
                memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
+                UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());
            }
-            Register(new_map);
-            return new_map;
+            return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
        }

        const VAddr cpu_addr_end = cpu_addr + size;
        if (overlaps.size() == 1) {
-            MapInterval& current_map = overlaps[0];
+            MapInterval* const current_map = overlaps[0];
            if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
                return current_map;
            }
@@ -344,60 +354,70 @@ private:
        bool write_inheritance = false;
        bool modified_inheritance = false;
        // Calculate new buffer parameters
-        for (auto& overlap : overlaps) {
-            new_start = std::min(overlap->GetStart(), new_start);
-            new_end = std::max(overlap->GetEnd(), new_end);
-            write_inheritance |= overlap->IsWritten();
-            modified_inheritance |= overlap->IsModified();
+        for (MapInterval* overlap : overlaps) {
+            new_start = std::min(overlap->start, new_start);
+            new_end = std::max(overlap->end, new_end);
+            write_inheritance |= overlap->is_written;
+            modified_inheritance |= overlap->is_modified;
        }
        GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
        for (auto& overlap : overlaps) {
            Unregister(overlap);
        }
        UpdateBlock(block, new_start, new_end, overlaps);
-        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
+
+        const MapInterval new_map{new_start, new_end, new_gpu_addr};
+        MapInterval* const map = Register(new_map, write_inheritance);
+        if (!map) {
+            return nullptr;
+        }
        if (modified_inheritance) {
-            new_map->MarkAsModified(true, GetModifiedTicks());
+            map->MarkAsModified(true, GetModifiedTicks());
            if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
-                MarkForAsyncFlush(new_map);
+                MarkForAsyncFlush(map);
            }
        }
-        Register(new_map, write_inheritance);
-        return new_map;
+        return map;
    }

-    void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
-                     std::vector<MapInterval>& overlaps) {
+    void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
+                     const VectorMapInterval& overlaps) {
        const IntervalType base_interval{start, end};
        IntervalSet interval_set{};
        interval_set.add(base_interval);
        for (auto& overlap : overlaps) {
-            const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
+            const IntervalType subtract{overlap->start, overlap->end};
            interval_set.subtract(subtract);
        }
        for (auto& interval : interval_set) {
-            std::size_t size = interval.upper() - interval.lower();
-            if (size > 0) {
-                staging_buffer.resize(size);
-                system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(interval.lower()), size,
-                                staging_buffer.data());
+            const std::size_t size = interval.upper() - interval.lower();
+            if (size == 0) {
+                continue;
            }
+            staging_buffer.resize(size);
+            system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+            UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());
        }
    }

-    std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) {
+    VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
+        VectorMapInterval result;
        if (size == 0) {
-            return {};
+            return result;
        }

-        std::vector<MapInterval> objects{};
-        const IntervalType interval{addr, addr + size};
-        for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
-            objects.push_back(pair.second);
+        const VAddr addr_end = addr + size;
+        auto it = mapped_addresses.lower_bound(addr);
+        if (it != mapped_addresses.begin()) {
+            --it;
        }
-
-        return objects;
+        while (it != mapped_addresses.end() && it->start < addr_end) {
+            if (it->Overlaps(addr, addr_end)) {
+                result.push_back(&*it);
+            }
+            ++it;
+        }
+        return result;
    }

    /// Returns a ticks counter used for tracking when cached objects were last modified
@@ -405,20 +425,24 @@ private:
        return ++modified_ticks;
    }

-    void FlushMap(MapInterval map) {
-        std::size_t size = map->GetEnd() - map->GetStart();
-        OwnerBuffer block = blocks[map->GetStart() >> block_page_bits];
+    void FlushMap(MapInterval* map) {
+        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+        ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+        std::shared_ptr<Buffer> block = it->second;
+
+        const std::size_t size = map->end - map->start;
        staging_buffer.resize(size);
-        DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data());
-        system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size);
+        DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());
+        system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
        map->MarkAsModified(false, 0);
    }

-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
+    template <typename Callable>
+    BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
        AlignBuffer(alignment);
        const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
+        callable(buffer_ptr);

        buffer_ptr += size;
        buffer_offset += size;
@@ -432,97 +456,89 @@ private:
        buffer_offset = offset_aligned;
    }

-    OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
-        const std::size_t old_size = buffer->GetSize();
-        const std::size_t new_size = old_size + block_page_size;
-        const VAddr cpu_addr = buffer->GetCpuAddr();
-        OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(buffer, new_buffer, 0, 0, old_size);
-        buffer->SetEpoch(epoch);
-        pending_destruction.push_back(buffer);
+    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+        const std::size_t old_size = buffer->Size();
+        const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+        const VAddr cpu_addr = buffer->CpuAddr();
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+        CopyBlock(*buffer, *new_buffer, 0, 0, old_size);
+        QueueDestruction(std::move(buffer));
+
        const VAddr cpu_addr_end = cpu_addr + new_size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
        }
+
        return new_buffer;
    }

-    OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
-        const std::size_t size_1 = first->GetSize();
-        const std::size_t size_2 = second->GetSize();
-        const VAddr first_addr = first->GetCpuAddr();
-        const VAddr second_addr = second->GetCpuAddr();
+    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+                                        std::shared_ptr<Buffer> second) {
+        const std::size_t size_1 = first->Size();
+        const std::size_t size_2 = second->Size();
+        const VAddr first_addr = first->CpuAddr();
+        const VAddr second_addr = second->CpuAddr();
        const VAddr new_addr = std::min(first_addr, second_addr);
        const std::size_t new_size = size_1 + size_2;
-        OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
-        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
-        first->SetEpoch(epoch);
-        second->SetEpoch(epoch);
-        pending_destruction.push_back(first);
-        pending_destruction.push_back(second);
+
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+        CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1);
+        CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2);
+        QueueDestruction(std::move(first));
+        QueueDestruction(std::move(second));
+
        const VAddr cpu_addr_end = new_addr + new_size - 1;
-        u64 page_start = new_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
        }
        return new_buffer;
    }

-    OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
-        OwnerBuffer found;
+    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+        std::shared_ptr<Buffer> found;
+
        const VAddr cpu_addr_end = cpu_addr + size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
            auto it = blocks.find(page_start);
            if (it == blocks.end()) {
                if (found) {
                    found = EnlargeBlock(found);
-                } else {
-                    const VAddr start_addr = (page_start << block_page_bits);
-                    found = CreateBlock(start_addr, block_page_size);
-                    blocks[page_start] = found;
-                }
-            } else {
-                if (found) {
-                    if (found == it->second) {
-                        ++page_start;
-                        continue;
-                    }
-                    found = MergeBlocks(found, it->second);
-                } else {
-                    found = it->second;
+                    continue;
                }
+                const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+                blocks.insert_or_assign(page_start, found);
+                continue;
+            }
+            if (!found) {
+                found = it->second;
+                continue;
+            }
+            if (found != it->second) {
+                found = MergeBlocks(std::move(found), it->second);
            }
-            ++page_start;
        }
-        return found;
+        return found.get();
    }

-    void MarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void MarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
            auto it = written_pages.find(page_start);
            if (it != written_pages.end()) {
                it->second = it->second + 1;
            } else {
-                written_pages[page_start] = 1;
+                written_pages.insert_or_assign(page_start, 1);
            }
-            page_start++;
        }
    }

-    void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
            auto it = written_pages.find(page_start);
            if (it != written_pages.end()) {
                if (it->second > 1) {
@@ -531,25 +547,27 @@ private:
                    written_pages.erase(it);
                }
            }
-            page_start++;
        }
    }

-    bool IsRegionWritten(const VAddr start, const VAddr end) const {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    bool IsRegionWritten(VAddr start, VAddr end) const {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
            if (written_pages.count(page_start) > 0) {
                return true;
            }
-            page_start++;
        }
        return false;
    }

-    void MarkForAsyncFlush(MapInterval& map) {
+    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+        buffer->SetEpoch(epoch);
+        pending_destruction.push(std::move(buffer));
+    }
+
+    void MarkForAsyncFlush(MapInterval* map) {
        if (!uncommitted_flushes) {
-            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>();
+            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
        }
        uncommitted_flushes->insert(map);
    }
@@ -558,7 +576,7 @@ private:
    Core::System& system;

    std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
+    BufferType stream_buffer_handle;

    bool invalidated = false;

@@ -566,27 +584,23 @@ private:
    u64 buffer_offset = 0;
    u64 buffer_offset_base = 0;

-    using IntervalSet = boost::icl::interval_set<VAddr>;
-    using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>;
-    using IntervalType = typename IntervalCache::interval_type;
-    IntervalCache mapped_addresses;
+    MapIntervalAllocator mapped_addresses_allocator;
+    boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
+        mapped_addresses;

-    static constexpr u64 write_page_bit = 11;
    std::unordered_map<u64, u32> written_pages;
+    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;

-    static constexpr u64 block_page_bits = 21;
-    static constexpr u64 block_page_size = 1ULL << block_page_bits;
-    std::unordered_map<u64, OwnerBuffer> blocks;
-
-    std::list<OwnerBuffer> pending_destruction;
+    std::queue<std::shared_ptr<Buffer>> pending_destruction;
    u64 epoch = 0;
    u64 modified_ticks = 0;

    std::vector<u8> staging_buffer;
-    std::list<MapInterval> marked_for_unregister;

-    std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{};
-    std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes;
+    std::list<MapInterval*> marked_for_unregister;
+
+    std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
+    std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;

    std::recursive_mutex mutex;
 };
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ b/src/video_core/buffer_cache/map_interval.cpp
@@ -0,0 +1,33 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "video_core/buffer_cache/map_interval.h"
+
+namespace VideoCommon {
+
+MapIntervalAllocator::MapIntervalAllocator() {
+    FillFreeList(first_chunk);
+}
+
+MapIntervalAllocator::~MapIntervalAllocator() = default;
+
+void MapIntervalAllocator::AllocateNewChunk() {
+    *new_chunk = std::make_unique<Chunk>();
+    FillFreeList(**new_chunk);
+    new_chunk = &(*new_chunk)->next;
+}
+
+void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
+    const std::size_t old_size = free_list.size();
+    free_list.resize(old_size + chunk.data.size());
+    std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
+                   [](MapInterval& interval) { return &interval; });
+}
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -4,104 +4,89 @@

 #pragma once

+#include <array>
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <boost/intrusive/set_hook.hpp>
+
 #include "common/common_types.h"
 #include "video_core/gpu.h"

 namespace VideoCommon {

-class MapIntervalBase {
-public:
-    MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
-        : start{start}, end{end}, gpu_addr{gpu_addr} {}
+struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
+    MapInterval() = default;

-    void SetCpuAddress(VAddr new_cpu_addr) {
-        cpu_addr = new_cpu_addr;
+    /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
+
+    explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
+        : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
+
+    bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
+        return start <= other_start && other_end <= end;
    }

-    VAddr GetCpuAddress() const {
-        return cpu_addr;
+    bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
+        return start < other_end && other_start < end;
    }

-    GPUVAddr GetGpuAddress() const {
-        return gpu_addr;
-    }
-
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
-        return (start <= other_start && other_end <= end);
-    }
-
-    bool operator==(const MapIntervalBase& rhs) const {
-        return std::tie(start, end) == std::tie(rhs.start, rhs.end);
-    }
-
-    bool operator!=(const MapIntervalBase& rhs) const {
-        return !operator==(rhs);
-    }
-
-    void MarkAsRegistered(const bool registered) {
-        is_registered = registered;
-    }
-
-    bool IsRegistered() const {
-        return is_registered;
-    }
-
-    void SetMemoryMarked(bool is_memory_marked_) {
-        is_memory_marked = is_memory_marked_;
-    }
-
-    bool IsMemoryMarked() const {
-        return is_memory_marked;
-    }
-
-    void SetSyncPending(bool is_sync_pending_) {
-        is_sync_pending = is_sync_pending_;
-    }
-
-    bool IsSyncPending() const {
-        return is_sync_pending;
-    }
-
-    VAddr GetStart() const {
-        return start;
-    }
-
-    VAddr GetEnd() const {
-        return end;
-    }
-
-    void MarkAsModified(const bool is_modified_, const u64 tick) {
+    void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
        is_modified = is_modified_;
-        ticks = tick;
+        ticks = ticks_;
    }

-    bool IsModified() const {
-        return is_modified;
+    boost::intrusive::set_member_hook<> member_hook_;
+    VAddr start = 0;
+    VAddr end = 0;
+    GPUVAddr gpu_addr = 0;
+    u64 ticks = 0;
+    bool is_written = false;
+    bool is_modified = false;
+    bool is_registered = false;
+    bool is_memory_marked = false;
+    bool is_sync_pending = false;
+};
+
+struct MapIntervalCompare {
+    constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
+        return lhs.start < rhs.start;
+    }
+};
+
+class MapIntervalAllocator {
+public:
+    MapIntervalAllocator();
+    ~MapIntervalAllocator();
+
+    MapInterval* Allocate() {
+        if (free_list.empty()) {
+            AllocateNewChunk();
+        }
+        MapInterval* const interval = free_list.back();
+        free_list.pop_back();
+        return interval;
    }

-    u64 GetModificationTick() const {
-        return ticks;
-    }
-
-    void MarkAsWritten(const bool is_written_) {
-        is_written = is_written_;
-    }
-
-    bool IsWritten() const {
-        return is_written;
+    void Release(MapInterval* interval) {
+        free_list.push_back(interval);
    }

 private:
-    VAddr start;
-    VAddr end;
-    GPUVAddr gpu_addr;
-    VAddr cpu_addr{};
-    bool is_written{};
-    bool is_modified{};
-    bool is_registered{};
-    bool is_memory_marked{};
-    bool is_sync_pending{};
-    u64 ticks{};
+    struct Chunk {
+        std::unique_ptr<Chunk> next;
+        std::array<MapInterval, 0x8000> data;
+    };
+
+    void AllocateNewChunk();
+
+    void FillFreeList(Chunk& chunk);
+
+    std::vector<MapInterval*> free_list;
+    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
+
+    Chunk first_chunk;
 };

 } // namespace VideoCommon
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                     MemoryManager& memory_manager)
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
-      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+      macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
    dirty.flags.flip();
-
    InitializeRegisterDefaults();
 }

@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.rasterize_enable = 1;
    regs.rt_separate_frag_data = 1;
    regs.framebuffer_srgb = 1;
+    regs.line_width_aliased = 1.0f;
+    regs.line_width_smooth = 1.0f;
    regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+    regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+    regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;

    shadow_state = regs;

@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }

-void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
    // Reset the current macro.
    executing_macro = 0;

@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
        ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());

    // Execute the current macro.
-    macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+    macro_engine->Execute(macro_positions[entry], parameters);
    if (mme_draw.current_mode != MMEDrawMode::Undefined) {
        FlushMMEInlineDraw();
    }
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {

        // Call the macro when there are no more parameters in the command buffer
        if (is_last_call) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
            macro_params.clear();
        }
        return;
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
        break;
    }
    case MAXWELL3D_REG_INDEX(macros.data): {
-        ProcessMacroUpload(arg);
+        macro_engine->AddCode(regs.macros.upload_address, arg);
        break;
    }
    case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,

        // Call the macro when there are no more parameters in the command buffer
        if (amount == methods_pending) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
            macro_params.clear();
        }
        return;
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 }

 void Maxwell3D::ProcessMacroUpload(u32 data) {
-    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
-               "upload_address exceeded macro_memory size!");
-    macro_memory[regs.macros.upload_address++] = data;
+    macro_engine->AddCode(regs.macros.upload_address++, data);
 }

 void Maxwell3D::ProcessMacroBind(u32 data) {
@@ -457,8 +458,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {

 void Maxwell3D::ProcessQueryGet() {
    // TODO(Subv): Support the other query units.
-    ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
-               "Units other than CROP are unimplemented");
+    if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
+        LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
+    }

    switch (regs.query.query_get.operation) {
    case Regs::QueryOperation::Release:
@@ -534,8 +536,8 @@ void Maxwell3D::ProcessCounterReset() {
        rasterizer.ResetCounter(QueryType::SamplesPassed);
        break;
    default:
-        LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
-                    static_cast<int>(regs.counter_reset));
+        LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
+                  static_cast<int>(regs.counter_reset));
        break;
    }
 }
@@ -592,8 +594,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
                         system.GPU().GetTicks());
        return {};
    default:
-        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
-                          static_cast<u32>(regs.query.query_get.select.Value()));
+        LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
+                  static_cast<u32>(regs.query.query_get.select.Value()));
        return 1;
    }
 }
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
 #include "video_core/engines/engine_upload.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/gpu.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro.h"
 #include "video_core/textures/texture.h"

 namespace Core {
@@ -1411,15 +1411,6 @@ public:

    const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;

-    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
-    /// we've seen used.
-    using MacroMemory = std::array<u32, 0x40000>;
-
-    /// Gets a reference to macro memory.
-    const MacroMemory& GetMacroMemory() const {
-        return macro_memory;
-    }
-
    bool ShouldExecute() const {
        return execute_on;
    }
@@ -1468,16 +1459,13 @@ private:

    std::array<bool, Regs::NUM_REGS> mme_inline{};

-    /// Memory for macro code
-    MacroMemory macro_memory;
-
    /// Macro method that is currently being executed / being fed parameters.
    u32 executing_macro = 0;
    /// Parameters that have been submitted to the macro call so far.
    std::vector<u32> macro_params;

    /// Interpreter for the macro codes uploaded to the GPU.
-    MacroInterpreter macro_interpreter;
+    std::unique_ptr<MacroEngine> macro_engine;

    static constexpr u32 null_cb_data = 0xFFFFFFFF;
    struct {
@@ -1506,7 +1494,7 @@ private:
     * @param num_parameters Number of arguments
     * @param parameters Arguments to the method call
     */
-    void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
+    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);

    /// Handles writes to the macro uploading register.
    void ProcessMacroUpload(u32 data);
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,45 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+void MacroEngine::AddCode(u32 method, u32 data) {
+    uploaded_macro_code[method].push_back(data);
+}
+
+void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+    auto compiled_macro = macro_cache.find(method);
+    if (compiled_macro != macro_cache.end()) {
+        compiled_macro->second->Execute(parameters, method);
+    } else {
+        // Macro not compiled, check if it's uploaded and if so, compile it
+        auto macro_code = uploaded_macro_code.find(method);
+        if (macro_code == uploaded_macro_code.end()) {
+            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+            return;
+        }
+        macro_cache[method] = Compile(macro_code->second);
+        macro_cache[method]->Execute(parameters, method);
+    }
+}
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+    if (Settings::values.disable_macro_jit) {
+        return std::make_unique<MacroInterpreter>(maxwell3d);
+    }
+#ifdef ARCHITECTURE_x86_64
+    return std::make_unique<MacroJITx64>(maxwell3d);
+#else
+    return std::make_unique<MacroInterpreter>(maxwell3d);
+#endif
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,128 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+namespace Engines {
+class Maxwell3D;
+}
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+enum class Operation : u32 {
+    ALU = 0,
+    AddImmediate = 1,
+    ExtractInsert = 2,
+    ExtractShiftLeftImmediate = 3,
+    ExtractShiftLeftRegister = 4,
+    Read = 5,
+    Unused = 6, // This operation doesn't seem to be a valid encoding.
+    Branch = 7,
+};
+
+enum class ALUOperation : u32 {
+    Add = 0,
+    AddWithCarry = 1,
+    Subtract = 2,
+    SubtractWithBorrow = 3,
+    // Operations 4-7 don't seem to be valid encodings.
+    Xor = 8,
+    Or = 9,
+    And = 10,
+    AndNot = 11,
+    Nand = 12
+};
+
+enum class ResultOperation : u32 {
+    IgnoreAndFetch = 0,
+    Move = 1,
+    MoveAndSetMethod = 2,
+    FetchAndSend = 3,
+    MoveAndSend = 4,
+    FetchAndSetMethod = 5,
+    MoveAndSetMethodFetchAndSend = 6,
+    MoveAndSetMethodSend = 7
+};
+
+enum class BranchCondition : u32 {
+    Zero = 0,
+    NotZero = 1,
+};
+
+union Opcode {
+    u32 raw;
+    BitField<0, 3, Operation> operation;
+    BitField<4, 3, ResultOperation> result_operation;
+    BitField<4, 1, BranchCondition> branch_condition;
+    // If set on a branch, then the branch doesn't have a delay slot.
+    BitField<5, 1, u32> branch_annul;
+    BitField<7, 1, u32> is_exit;
+    BitField<8, 3, u32> dst;
+    BitField<11, 3, u32> src_a;
+    BitField<14, 3, u32> src_b;
+    // The signed immediate overlaps the second source operand and the alu operation.
+    BitField<14, 18, s32> immediate;
+
+    BitField<17, 5, ALUOperation> alu_operation;
+
+    // Bitfield instructions data
+    BitField<17, 5, u32> bf_src_bit;
+    BitField<22, 5, u32> bf_size;
+    BitField<27, 5, u32> bf_dst_bit;
+
+    u32 GetBitfieldMask() const {
+        return (1 << bf_size) - 1;
+    }
+
+    s32 GetBranchTarget() const {
+        return static_cast<s32>(immediate * sizeof(u32));
+    }
+};
+
+union MethodAddress {
+    u32 raw;
+    BitField<0, 12, u32> address;
+    BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class CachedMacro {
+public:
+    virtual ~CachedMacro() = default;
+    /**
+     * Executes the macro code with the specified input parameters.
+     * @param code The macro byte code to execute
+     * @param parameters The parameters of the macro
+     */
+    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+class MacroEngine {
+public:
+    virtual ~MacroEngine() = default;
+
+    // Store the uploaded macro code to compile them when they're called.
+    void AddCode(u32 method, u32 data);
+
+    // Compiles the macro if its not in the cache, and executes the compiled macro
+    void Execute(u32 method, const std::vector<u32>& parameters);
+
+protected:
+    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

@@ -6,109 +6,46 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro_interpreter.h"

 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));

 namespace Tegra {
-namespace {
-enum class Operation : u32 {
-    ALU = 0,
-    AddImmediate = 1,
-    ExtractInsert = 2,
-    ExtractShiftLeftImmediate = 3,
-    ExtractShiftLeftRegister = 4,
-    Read = 5,
-    Unused = 6, // This operation doesn't seem to be a valid encoding.
-    Branch = 7,
-};
-} // Anonymous namespace
-
-enum class MacroInterpreter::ALUOperation : u32 {
-    Add = 0,
-    AddWithCarry = 1,
-    Subtract = 2,
-    SubtractWithBorrow = 3,
-    // Operations 4-7 don't seem to be valid encodings.
-    Xor = 8,
-    Or = 9,
-    And = 10,
-    AndNot = 11,
-    Nand = 12
-};
-
-enum class MacroInterpreter::ResultOperation : u32 {
-    IgnoreAndFetch = 0,
-    Move = 1,
-    MoveAndSetMethod = 2,
-    FetchAndSend = 3,
-    MoveAndSend = 4,
-    FetchAndSetMethod = 5,
-    MoveAndSetMethodFetchAndSend = 6,
-    MoveAndSetMethodSend = 7
-};
-
-enum class MacroInterpreter::BranchCondition : u32 {
-    Zero = 0,
-    NotZero = 1,
-};
-
-union MacroInterpreter::Opcode {
-    u32 raw;
-    BitField<0, 3, Operation> operation;
-    BitField<4, 3, ResultOperation> result_operation;
-    BitField<4, 1, BranchCondition> branch_condition;
-    // If set on a branch, then the branch doesn't have a delay slot.
-    BitField<5, 1, u32> branch_annul;
-    BitField<7, 1, u32> is_exit;
-    BitField<8, 3, u32> dst;
-    BitField<11, 3, u32> src_a;
-    BitField<14, 3, u32> src_b;
-    // The signed immediate overlaps the second source operand and the alu operation.
-    BitField<14, 18, s32> immediate;
-
-    BitField<17, 5, ALUOperation> alu_operation;
-
-    // Bitfield instructions data
-    BitField<17, 5, u32> bf_src_bit;
-    BitField<22, 5, u32> bf_size;
-    BitField<27, 5, u32> bf_dst_bit;
-
-    u32 GetBitfieldMask() const {
-        return (1 << bf_size) - 1;
-    }
-
-    s32 GetBranchTarget() const {
-        return static_cast<s32>(immediate * sizeof(u32));
-    }
-};
-
 MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}

-void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}
+
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
+                                           const std::vector<u32>& code)
+    : maxwell3d(maxwell3d), code(code) {}
+
+void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
    MICROPROFILE_SCOPE(MacroInterp);
    Reset();

    registers[1] = parameters[0];
+    num_parameters = parameters.size();

    if (num_parameters > parameters_capacity) {
        parameters_capacity = num_parameters;
        this->parameters = std::make_unique<u32[]>(num_parameters);
    }
-    std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
+    std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
    this->num_parameters = num_parameters;

    // Execute the code until we hit an exit condition.
    bool keep_executing = true;
    while (keep_executing) {
-        keep_executing = Step(offset, false);
+        keep_executing = Step(false);
    }

    // Assert the the macro used all the input parameters
    ASSERT(next_parameter_index == num_parameters);
 }

-void MacroInterpreter::Reset() {
+void MacroInterpreterImpl::Reset() {
    registers = {};
    pc = 0;
    delayed_pc = {};
@@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {
    carry_flag = false;
 }

-bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
    u32 base_address = pc;

-    Opcode opcode = GetOpcode(offset);
+    Macro::Opcode opcode = GetOpcode();
    pc += 4;

    // Update the program counter if we were delayed
@@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
    }

    switch (opcode.operation) {
-    case Operation::ALU: {
+    case Macro::Operation::ALU: {
        u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
                                  GetRegister(opcode.src_b));
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::AddImmediate: {
+    case Macro::Operation::AddImmediate: {
        ProcessResult(opcode.result_operation, opcode.dst,
                      GetRegister(opcode.src_a) + opcode.immediate);
        break;
    }
-    case Operation::ExtractInsert: {
+    case Macro::Operation::ExtractInsert: {
        u32 dst = GetRegister(opcode.src_a);
        u32 src = GetRegister(opcode.src_b);

@@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
        ProcessResult(opcode.result_operation, opcode.dst, dst);
        break;
    }
-    case Operation::ExtractShiftLeftImmediate: {
+    case Macro::Operation::ExtractShiftLeftImmediate: {
        u32 dst = GetRegister(opcode.src_a);
        u32 src = GetRegister(opcode.src_b);

@@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::ExtractShiftLeftRegister: {
+    case Macro::Operation::ExtractShiftLeftRegister: {
        u32 dst = GetRegister(opcode.src_a);
        u32 src = GetRegister(opcode.src_b);

@@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::Read: {
+    case Macro::Operation::Read: {
        u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::Branch: {
+    case Macro::Operation::Branch: {
        ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
        u32 value = GetRegister(opcode.src_a);
        bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {

            delayed_pc = base_address + opcode.GetBranchTarget();
            // Execute one more instruction due to the delay slot.
-            return Step(offset, true);
+            return Step(true);
        }
        break;
    }
@@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
    // cause an exit if it's executed inside a delay slot.
    if (opcode.is_exit && !is_delay_slot) {
        // Exit has a delay slot, execute the next instruction
-        Step(offset, true);
+        Step(true);
        return false;
    }

    return true;
 }

-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
-    const auto& macro_memory{maxwell3d.GetMacroMemory()};
-    ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
-    return {macro_memory[offset + pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
    switch (operation) {
-    case ALUOperation::Add: {
+    case Macro::ALUOperation::Add: {
        const u64 result{static_cast<u64>(src_a) + src_b};
        carry_flag = result > 0xffffffff;
        return static_cast<u32>(result);
    }
-    case ALUOperation::AddWithCarry: {
+    case Macro::ALUOperation::AddWithCarry: {
        const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
        carry_flag = result > 0xffffffff;
        return static_cast<u32>(result);
    }
-    case ALUOperation::Subtract: {
+    case Macro::ALUOperation::Subtract: {
        const u64 result{static_cast<u64>(src_a) - src_b};
        carry_flag = result < 0x100000000;
        return static_cast<u32>(result);
    }
-    case ALUOperation::SubtractWithBorrow: {
+    case Macro::ALUOperation::SubtractWithBorrow: {
        const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
        carry_flag = result < 0x100000000;
        return static_cast<u32>(result);
    }
-    case ALUOperation::Xor:
+    case Macro::ALUOperation::Xor:
        return src_a ^ src_b;
-    case ALUOperation::Or:
+    case Macro::ALUOperation::Or:
        return src_a | src_b;
-    case ALUOperation::And:
+    case Macro::ALUOperation::And:
        return src_a & src_b;
-    case ALUOperation::AndNot:
+    case Macro::ALUOperation::AndNot:
        return src_a & ~src_b;
-    case ALUOperation::Nand:
+    case Macro::ALUOperation::Nand:
        return ~(src_a & src_b);

    default:
@@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
    }
 }

-void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
    switch (operation) {
-    case ResultOperation::IgnoreAndFetch:
+    case Macro::ResultOperation::IgnoreAndFetch:
        // Fetch parameter and ignore result.
        SetRegister(reg, FetchParameter());
        break;
-    case ResultOperation::Move:
+    case Macro::ResultOperation::Move:
        // Move result.
        SetRegister(reg, result);
        break;
-    case ResultOperation::MoveAndSetMethod:
+    case Macro::ResultOperation::MoveAndSetMethod:
        // Move result and use as Method Address.
        SetRegister(reg, result);
        SetMethodAddress(result);
        break;
-    case ResultOperation::FetchAndSend:
+    case Macro::ResultOperation::FetchAndSend:
        // Fetch parameter and send result.
        SetRegister(reg, FetchParameter());
        Send(result);
        break;
-    case ResultOperation::MoveAndSend:
+    case Macro::ResultOperation::MoveAndSend:
        // Move and send result.
        SetRegister(reg, result);
        Send(result);
        break;
-    case ResultOperation::FetchAndSetMethod:
+    case Macro::ResultOperation::FetchAndSetMethod:
        // Fetch parameter and use result as Method Address.
        SetRegister(reg, FetchParameter());
        SetMethodAddress(result);
        break;
-    case ResultOperation::MoveAndSetMethodFetchAndSend:
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
        // Move result and use as Method Address, then fetch and send parameter.
        SetRegister(reg, result);
        SetMethodAddress(result);
        Send(FetchParameter());
        break;
-    case ResultOperation::MoveAndSetMethodSend:
+    case Macro::ResultOperation::MoveAndSetMethodSend:
        // Move result and use as Method Address, then send bits 12:17 of result.
        SetRegister(reg, result);
        SetMethodAddress(result);
@@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
    }
 }

-u32 MacroInterpreter::FetchParameter() {
-    ASSERT(next_parameter_index < num_parameters);
-    return parameters[next_parameter_index++];
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    switch (cond) {
+    case Macro::BranchCondition::Zero:
+        return value == 0;
+    case Macro::BranchCondition::NotZero:
+        return value != 0;
+    }
+    UNREACHABLE();
+    return true;
 }

-u32 MacroInterpreter::GetRegister(u32 register_id) const {
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0);
+    ASSERT(pc < code.size() * sizeof(u32));
+    return {code[pc / sizeof(u32)]};
+}
+
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
    return registers.at(register_id);
 }

-void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
    // Register 0 is hardwired as the zero register.
    // Ensure no writes to it actually occur.
    if (register_id == 0) {
@@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
    registers.at(register_id) = value;
 }

-void MacroInterpreter::SetMethodAddress(u32 address) {
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
    method_address.raw = address;
 }

-void MacroInterpreter::Send(u32 value) {
+void MacroInterpreterImpl::Send(u32 value) {
    maxwell3d.CallMethodFromMME(method_address.address, value);
    // Increment the method address by the method increment.
    method_address.address.Assign(method_address.address.Value() +
                                  method_address.increment.Value());
 }

-u32 MacroInterpreter::Read(u32 method) const {
+u32 MacroInterpreterImpl::Read(u32 method) const {
    return maxwell3d.GetRegisterValue(method);
 }

-bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
-    switch (cond) {
-    case BranchCondition::Zero:
-        return value == 0;
-    case BranchCondition::NotZero:
-        return value != 0;
-    }
-    UNREACHABLE();
-    return true;
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters);
+    return parameters[next_parameter_index++];
 }

 } // namespace Tegra
--- a/src/video_core/macro/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

 #pragma once
-
 #include <array>
 #include <optional>
-
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "video_core/macro/macro.h"

 namespace Tegra {
 namespace Engines {
 class Maxwell3D;
 }

-class MacroInterpreter final {
+class MacroInterpreter final : public MacroEngine {
 public:
    explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);

-    /**
-     * Executes the macro code with the specified input parameters.
-     * @param offset Offset to start execution at.
-     * @param parameters The parameters of the macro.
-     */
-    void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;

 private:
-    enum class ALUOperation : u32;
-    enum class BranchCondition : u32;
-    enum class ResultOperation : u32;
+    Engines::Maxwell3D& maxwell3d;
+};

-    union Opcode;
-
-    union MethodAddress {
-        u32 raw;
-        BitField<0, 12, u32> address;
-        BitField<12, 6, u32> increment;
-    };
+class MacroInterpreterImpl : public CachedMacro {
+public:
+    MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    void Execute(const std::vector<u32>& parameters, u32 method) override;

+private:
    /// Resets the execution engine state, zeroing registers, etc.
    void Reset();

@@ -49,20 +42,20 @@ private:
     * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
     * previous instruction.
     */
-    bool Step(u32 offset, bool is_delay_slot);
+    bool Step(bool is_delay_slot);

    /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);

    /// Performs the result operation on the input result and stores it in the specified register
    /// (if necessary).
-    void ProcessResult(ResultOperation operation, u32 reg, u32 result);
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);

    /// Evaluates the branch condition and returns whether the branch should be taken or not.
-    bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;

    /// Reads an opcode at the current program counter location.
-    Opcode GetOpcode(u32 offset) const;
+    Macro::Opcode GetOpcode() const;

    /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
    u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
    /// Program counter to execute at after the delay slot is executed.
    std::optional<u32> delayed_pc;

-    static constexpr std::size_t NumMacroRegisters = 8;
-
    /// General purpose macro registers.
-    std::array<u32, NumMacroRegisters> registers = {};
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};

    /// Method address to use for the next Send instruction.
-    MethodAddress method_address = {};
+    Macro::MethodAddress method_address = {};

    /// Input parameters of the current macro.
    std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
    u32 next_parameter_index = 0;

    bool carry_flag = false;
+    const std::vector<u32>& code;
 };
+
 } // namespace Tegra
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,640 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
+MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
+
+namespace Tegra {
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
+static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
+static const Xbyak::Reg64 STATE = Xbyak::util::r11;
+static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
+static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
+static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
+static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
+static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
+static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
+
+static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
+    PARAMETERS,
+    REGISTERS,
+    STATE,
+    NEXT_PARAMETER,
+    RESULT,
+    METHOD_ADDRESS,
+    BRANCH_HOLDER,
+});
+
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+
+std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
+}
+
+MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
+    : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
+    Compile();
+}
+
+MacroJITx64Impl::~MacroJITx64Impl() = default;
+
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+    MICROPROFILE_SCOPE(MacroJitExecute);
+    ASSERT_OR_EXECUTE(program != nullptr, { return; });
+    JITState state{};
+    state.maxwell3d = &maxwell3d;
+    state.registers = {};
+    state.parameters = parameters.data();
+    program(&state);
+}
+
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
+    const bool is_a_zero = opcode.src_a == 0;
+    const bool is_b_zero = opcode.src_b == 0;
+    const bool valid_operation = !is_a_zero && !is_b_zero;
+    const bool is_move_operation = !is_a_zero && is_b_zero;
+    const bool has_zero_register = is_a_zero || is_b_zero;
+
+    Xbyak::Reg64 src_a;
+    Xbyak::Reg32 src_b;
+
+    if (!optimizer.zero_reg_skip) {
+        src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+        src_b = Compile_GetRegister(opcode.src_b, ebx);
+    } else {
+        if (!is_a_zero) {
+            src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+        }
+        if (!is_b_zero) {
+            src_b = Compile_GetRegister(opcode.src_b, ebx);
+        }
+    }
+    Xbyak::Label skip_carry{};
+
+    bool has_emitted = false;
+
+    switch (opcode.alu_operation) {
+    case Macro::ALUOperation::Add:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                add(src_a, src_b);
+            }
+        } else {
+            add(src_a, src_b);
+        }
+
+        if (!optimizer.can_skip_carry) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::AddWithCarry:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+        adc(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Subtract:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                sub(src_a, src_b);
+                has_emitted = true;
+            }
+        } else {
+            sub(src_a, src_b);
+            has_emitted = true;
+        }
+        if (!optimizer.can_skip_carry && has_emitted) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::SubtractWithBorrow:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+        sbb(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Xor:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                xor_(src_a, src_b);
+            }
+        } else {
+            xor_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Or:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                or_(src_a, src_b);
+            }
+        } else {
+            or_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::And:
+        if (optimizer.zero_reg_skip) {
+            if (!has_zero_register) {
+                and_(src_a, src_b);
+            }
+        } else {
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::AndNot:
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) {
+                not_(src_b);
+                and_(src_a, src_b);
+            }
+        } else {
+            not_(src_b);
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Nand:
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) {
+                and_(src_a, src_b);
+                not_(src_a);
+            }
+        } else {
+            and_(src_a, src_b);
+            not_(src_a);
+        }
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
+                          static_cast<std::size_t>(opcode.alu_operation.Value()));
+        break;
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
+    if (optimizer.skip_dummy_addimmediate) {
+        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+        // without doing anything. In our case we can just not emit anything.
+        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+            return;
+        }
+    }
+    // Check for redundant moves
+    if (optimizer.optimize_for_method_move &&
+        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+        if (next_opcode.has_value()) {
+            const auto next = *next_opcode;
+            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+                return;
+            }
+        }
+    }
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        if (opcode.immediate > 2) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+    auto src = Compile_GetRegister(opcode.src_b, eax);
+
+    if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
+        shr(src, opcode.bf_src_bit);
+    } else if (opcode.bf_src_bit == 31) {
+        xor_(src, src);
+    }
+    // Don't bother masking the whole register since we're using a 32 bit register
+    if (opcode.bf_size != 31 && opcode.bf_size != 0) {
+        and_(src, opcode.GetBitfieldMask());
+    } else if (opcode.bf_size == 0) {
+        xor_(src, src);
+    }
+    if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
+        shl(src, opcode.bf_dst_bit);
+    } else if (opcode.bf_dst_bit == 31) {
+        xor_(src, src);
+    }
+
+    const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
+    if (mask != 0xffffffff) {
+        and_(dst, mask);
+    }
+    or_(dst, src);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, eax);
+    auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    shr(src, al);
+    if (opcode.bf_size != 0 && opcode.bf_size != 31) {
+        and_(src, opcode.GetBitfieldMask());
+    } else if (opcode.bf_size == 0) {
+        xor_(src, src);
+    }
+
+    if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
+        shl(src, opcode.bf_dst_bit);
+    } else if (opcode.bf_dst_bit == 31) {
+        xor_(src, src);
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, eax);
+    auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+
+    if (opcode.bf_size != 31) {
+        and_(src, opcode.GetBitfieldMask());
+    }
+    shl(src, al);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
+    return maxwell3d->GetRegisterValue(method);
+}
+
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+    maxwell3d->CallMethodFromMME(method_address.address, value);
+}
+
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        if (opcode.immediate > 2) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]);
+    mov(Common::X64::ABI_PARAM2, RESULT);
+    Common::X64::CallFarFunction(*this, &Read);
+    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    mov(RESULT, Common::X64::ABI_RETURN.cvt32());
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
+    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]);
+    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
+    mov(Common::X64::ABI_PARAM3, value);
+    Common::X64::CallFarFunction(*this, &Send);
+    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+
+    Xbyak::Label dont_process{};
+    // Get increment
+    test(METHOD_ADDRESS, 0x3f000);
+    // If zero, method address doesn't update
+    je(dont_process);
+
+    mov(ecx, METHOD_ADDRESS);
+    and_(METHOD_ADDRESS, 0xfff);
+    shr(ecx, 12);
+    and_(ecx, 0x3f);
+    lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
+    sal(ecx, 12);
+    or_(eax, ecx);
+
+    mov(METHOD_ADDRESS, eax);
+
+    L(dont_process);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
+    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+    const s32 jump_address =
+        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
+
+    Xbyak::Label end;
+    auto value = Compile_GetRegister(opcode.src_a, eax);
+    test(value, value);
+    if (optimizer.has_delayed_pc) {
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            jne(end, T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            je(end, T_NEAR);
+            break;
+        }
+
+        if (opcode.branch_annul) {
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+        } else {
+            Xbyak::Label handle_post_exit{};
+            Xbyak::Label skip{};
+            jmp(skip, T_NEAR);
+            if (opcode.is_exit) {
+                L(handle_post_exit);
+                // Execute 1 instruction
+                mov(BRANCH_HOLDER, end_of_code);
+                // Jump to next instruction to skip delay slot check
+                jmp(labels[jump_address], T_NEAR);
+            } else {
+                L(handle_post_exit);
+                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+                jmp(labels[jump_address], T_NEAR);
+            }
+            L(skip);
+            mov(BRANCH_HOLDER, handle_post_exit);
+            jmp(delay_skip[pc], T_NEAR);
+        }
+    } else {
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            je(labels[jump_address], T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            jne(labels[jump_address], T_NEAR);
+            break;
+        }
+    }
+
+    L(end);
+}
+
+void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
+    optimizer.can_skip_carry = true;
+    optimizer.has_delayed_pc = false;
+    for (auto raw_op : code) {
+        Macro::Opcode op{};
+        op.raw = raw_op;
+
+        if (op.operation == Macro::Operation::ALU) {
+            // Scan for any ALU operations which actually use the carry flag, if they don't exist in
+            // our current code we can skip emitting the carry flag handling operations
+            if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
+                optimizer.can_skip_carry = false;
+            }
+        }
+
+        if (op.operation == Macro::Operation::Branch) {
+            if (!op.branch_annul) {
+                optimizer.has_delayed_pc = true;
+            }
+        }
+    }
+}
+
+void MacroJITx64Impl::Compile() {
+    MICROPROFILE_SCOPE(MacroJitCompile);
+    bool keep_executing = true;
+    labels.fill(Xbyak::Label());
+
+    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    // JIT state
+    mov(STATE, Common::X64::ABI_PARAM1);
+    mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
+                          static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
+    mov(REGISTERS, Common::X64::ABI_PARAM1);
+    add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
+    xor_(RESULT, RESULT);
+    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+    xor_(NEXT_PARAMETER, NEXT_PARAMETER);
+    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+    mov(dword[REGISTERS + 4], Compile_FetchParameter());
+
+    // Track get register for zero registers and mark it as no-op
+    optimizer.zero_reg_skip = true;
+
+    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+    // completely skip the entire code path and no emit anything
+    optimizer.skip_dummy_addimmediate = true;
+
+    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+    // one if our register isn't "dirty"
+    optimizer.optimize_for_method_move = true;
+
+    // Check to see if we can skip emitting certain instructions
+    Optimizer_ScanFlags();
+
+    const u32 op_count = static_cast<u32>(code.size());
+    for (u32 i = 0; i < op_count; i++) {
+        if (i < op_count - 1) {
+            pc = i + 1;
+            next_opcode = GetOpCode();
+        } else {
+            next_opcode = {};
+        }
+        pc = i;
+        Compile_NextInstruction();
+    }
+
+    L(end_of_code);
+
+    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    ret();
+    ready();
+    program = getCode<ProgramType>();
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() {
+    const auto opcode = GetOpCode();
+    if (labels[pc].getAddress()) {
+        return false;
+    }
+
+    L(labels[pc]);
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU:
+        Compile_ALU(opcode);
+        break;
+    case Macro::Operation::AddImmediate:
+        Compile_AddImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractInsert:
+        Compile_ExtractInsert(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftImmediate:
+        Compile_ExtractShiftLeftImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftRegister:
+        Compile_ExtractShiftLeftRegister(opcode);
+        break;
+    case Macro::Operation::Read:
+        Compile_Read(opcode);
+        break;
+    case Macro::Operation::Branch:
+        Compile_Branch(opcode);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+        break;
+    }
+
+    if (optimizer.has_delayed_pc) {
+        if (opcode.is_exit) {
+            mov(rax, end_of_code);
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            cmove(BRANCH_HOLDER, rax);
+            // Jump to next instruction to skip delay slot check
+            je(labels[pc + 1], T_NEAR);
+        } else {
+            // TODO(ogniK): Optimize delay slot branching
+            Xbyak::Label no_delay_slot{};
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            je(no_delay_slot, T_NEAR);
+            mov(rax, BRANCH_HOLDER);
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(rax);
+            L(no_delay_slot);
+        }
+        L(delay_skip[pc]);
+        if (opcode.is_exit) {
+            return false;
+        }
+    } else {
+        test(BRANCH_HOLDER, BRANCH_HOLDER);
+        jne(end_of_code, T_NEAR);
+        if (opcode.is_exit) {
+            inc(BRANCH_HOLDER);
+            return false;
+        }
+    }
+    return true;
+}
+
+Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
+    mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
+    inc(NEXT_PARAMETER);
+    return eax;
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
+    if (index == 0) {
+        // Register 0 is always zero
+        xor_(dst, dst);
+    } else {
+        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+    }
+
+    return dst;
+}
+
+Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
+    if (index == 0) {
+        // Register 0 is always zero
+        xor_(dst, dst);
+    } else {
+        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+    }
+
+    return dst;
+}
+
+void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
+    Xbyak::Label zero{}, end{};
+    xor_(ecx, ecx);
+    shr(dst, 32);
+    setne(cl);
+    mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
+    auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
+        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+        // register.
+        if (reg == 0) {
+            return;
+        }
+        mov(dword[REGISTERS + reg * sizeof(u32)], result);
+    };
+    auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
+
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch:
+        SetRegister(reg, Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::Move:
+        SetRegister(reg, RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter and send result.
+        SetRegister(reg, Compile_FetchParameter());
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, RESULT);
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter and use result as Method Address.
+        SetRegister(reg, Compile_FetchParameter());
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        Compile_Send(Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        shr(RESULT, 12);
+        and_(RESULT, 0b111111);
+        Compile_Send(RESULT);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
+    }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const {
+    ASSERT(pc < code.size());
+    return {code[pc]};
+}
+
+std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
+    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,100 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <xbyak.h>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/x64/xbyak_abi.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
+constexpr size_t MAX_CODE_SIZE = 0x10000;
+
+class MacroJITx64 final : public MacroEngine {
+public:
+    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
+
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
+public:
+    MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    ~MacroJITx64Impl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+    void Compile_ALU(Macro::Opcode opcode);
+    void Compile_AddImmediate(Macro::Opcode opcode);
+    void Compile_ExtractInsert(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+    void Compile_Read(Macro::Opcode opcode);
+    void Compile_Branch(Macro::Opcode opcode);
+
+private:
+    void Optimizer_ScanFlags();
+
+    void Compile();
+    bool Compile_NextInstruction();
+
+    Xbyak::Reg32 Compile_FetchParameter();
+    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
+    Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
+    void Compile_WriteCarry(Xbyak::Reg64 dst);
+
+    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+    void Compile_Send(Xbyak::Reg32 value);
+
+    Macro::Opcode GetOpCode() const;
+    std::bitset<32> PersistentCallerSavedRegs() const;
+
+    struct JITState {
+        Engines::Maxwell3D* maxwell3d{};
+        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+        const u32* parameters{};
+        u32 carry_flag{};
+    };
+    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
+    using ProgramType = void (*)(JITState*);
+
+    struct OptimizerState {
+        bool can_skip_carry{};
+        bool has_delayed_pc{};
+        bool zero_reg_skip{};
+        bool skip_dummy_addimmediate{};
+        bool optimize_for_method_move{};
+    };
+    OptimizerState optimizer{};
+
+    std::optional<Macro::Opcode> next_opcode{};
+    ProgramType program{nullptr};
+
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
+    Xbyak::Label end_of_code{};
+
+    bool is_delay_slot{};
+    u32 pc{};
+    std::optional<u32> delayed_pc;
+
+    const std::vector<u32>& code;
+    Engines::Maxwell3D& maxwell3d;
+};
+
+} // namespace Tegra
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -56,9 +56,27 @@ public:
        last_modified_ticks = cache.GetModifiedTicks();
    }

+    void SetMemoryMarked(bool is_memory_marked_) {
+        is_memory_marked = is_memory_marked_;
+    }
+
+    bool IsMemoryMarked() const {
+        return is_memory_marked;
+    }
+
+    void SetSyncPending(bool is_sync_pending_) {
+        is_sync_pending = is_sync_pending_;
+    }
+
+    bool IsSyncPending() const {
+        return is_sync_pending;
+    }
+
 private:
    bool is_registered{};      ///< Whether the object is currently registered with the cache
    bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)
+    bool is_memory_marked{};   ///< Whether the object is marking rasterizer memory.
+    bool is_sync_pending{};    ///< Whether the object is pending deletion.
    u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
    VAddr cpu_addr{};          ///< Cpu address memory, unique from emulated virtual address space
 };
@@ -94,6 +112,30 @@ public:
        }
    }

+    void OnCPUWrite(VAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+
+        for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
+            if (object->IsRegistered()) {
+                UnmarkMemory(object);
+                object->SetSyncPending(true);
+                marked_for_unregister.emplace_back(object);
+            }
+        }
+    }
+
+    void SyncGuestHost() {
+        std::lock_guard lock{mutex};
+
+        for (const auto& object : marked_for_unregister) {
+            if (object->IsRegistered()) {
+                object->SetSyncPending(false);
+                Unregister(object);
+            }
+        }
+        marked_for_unregister.clear();
+    }
+
    /// Invalidates everything in the cache
    void InvalidateAll() {
        std::lock_guard lock{mutex};
@@ -120,19 +162,32 @@ protected:
        interval_cache.add({GetInterval(object), ObjectSet{object}});
        map_cache.insert({object->GetCpuAddr(), object});
        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
+        object->SetMemoryMarked(true);
    }

    /// Unregisters an object from the cache
    virtual void Unregister(const T& object) {
        std::lock_guard lock{mutex};

+        UnmarkMemory(object);
        object->SetIsRegistered(false);
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
+        if (object->IsSyncPending()) {
+            marked_for_unregister.remove(object);
+            object->SetSyncPending(false);
+        }
        const VAddr addr = object->GetCpuAddr();
        interval_cache.subtract({GetInterval(object), ObjectSet{object}});
        map_cache.erase(addr);
    }

+    void UnmarkMemory(const T& object) {
+        if (!object->IsMemoryMarked()) {
+            return;
+        }
+        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
+        object->SetMemoryMarked(false);
+    }
+
    /// Returns a ticks counter used for tracking when cached objects were last modified
    u64 GetModifiedTicks() {
        std::lock_guard lock{mutex};
@@ -194,4 +249,5 @@ private:
    IntervalCache interval_cache; ///< Cache of objects
    u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
    VideoCore::RasterizerInterface& rasterizer;
+    std::list<T> marked_for_unregister;
 };
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,6 +8,7 @@

 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -21,13 +22,12 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;

 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));

-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
-    : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
    gl_buffer.Create();
    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
 }

-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;

 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                               const Device& device, std::size_t stream_size)
@@ -47,12 +47,8 @@ OGLBufferCache::~OGLBufferCache() {
    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
 }

-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(cpu_addr, size);
-}
-
-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(cpu_addr, size);
 }

 GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -61,7 +57,7 @@ GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {

 void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                                     const u8* data) {
-    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
+    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
                         static_cast<GLsizeiptr>(size), data);
 }

@@ -69,20 +65,20 @@ void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                       u8* data) {
    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
+    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
                            static_cast<GLsizeiptr>(size), data);
 }

 void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
+    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
 }

 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                             std::size_t size) {
    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint& cbuf = cbufs[cbuf_cursor++];
+    const GLuint cbuf = cbufs[cbuf_cursor++];
    glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
    return {cbuf, 0};
 }
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -24,17 +24,12 @@ class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;

-class CachedBufferBlock;
-
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
-
-class CachedBufferBlock : public VideoCommon::BufferBlock {
+class Buffer : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(VAddr cpu_addr, const std::size_t size);
+    ~Buffer();

-    GLuint GetHandle() const {
+    GLuint Handle() const {
        return gl_buffer.handle;
    }

@@ -42,6 +37,7 @@ private:
    OGLBuffer gl_buffer;
 };

+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 class OGLBufferCache final : public GenericBufferCache {
 public:
    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
@@ -55,9 +51,7 @@ public:
    }

 protected:
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    GLuint ToHandle(const Buffer& buffer) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;

    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                         const u8* data) override;
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <optional>
 #include <vector>

@@ -13,6 +14,7 @@

 #include "common/logging/log.h"
 #include "common/scope_exit.h"
+#include "core/settings.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"

@@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;

 constexpr u32 NumStages = 5;

-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
-                                  GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
-                                  GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+    GL_MAX_VERTEX_UNIFORM_BLOCKS,          GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS,        GL_MAX_COMPUTE_UNIFORM_BLOCKS};

 constexpr std::array LimitSSBOs = {
-    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS,          GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
    GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
-    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS,        GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};

-constexpr std::array LimitSamplers = {
-    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};

-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
-                                    GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+    GL_MAX_VERTEX_IMAGE_UNIFORMS,          GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS,        GL_MAX_COMPUTE_IMAGE_UNIFORMS};

 template <typename T>
 T GetInteger(GLenum pname) {
@@ -84,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
    return std::exchange(base, base + amount);
 }

+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max;
+    std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
+                   [](GLenum pname) { return GetInteger<u32>(pname); });
+    return max;
+}
+
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
    std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;

@@ -132,6 +144,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
 }

 bool IsASTCSupported() {
+    static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
    static constexpr std::array formats = {
        GL_COMPRESSED_RGBA_ASTC_4x4_KHR,           GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
        GL_COMPRESSED_RGBA_ASTC_5x5_KHR,           GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -148,25 +161,43 @@ bool IsASTCSupported() {
        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR,  GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
    };
-    return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
-               GLint supported;
-               glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
-                                     &supported);
-               return supported == GL_TRUE;
-           }) == formats.end();
+    static constexpr std::array required_support = {
+        GL_VERTEX_TEXTURE,   GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+        GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE,     GL_COMPUTE_TEXTURE,
+    };
+
+    for (const GLenum target : targets) {
+        for (const GLenum format : formats) {
+            for (const GLenum support : required_support) {
+                GLint value;
+                glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+                if (value != GL_FULL_SUPPORT) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
 }

 } // Anonymous namespace

-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
-    const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
    const std::vector extensions = GetExtensions();

    const bool is_nvidia = vendor == "NVIDIA Corporation";
    const bool is_amd = vendor == "ATI Technologies Inc.";
-    const bool is_intel = vendor == "Intel";
-    const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
+
+    bool disable_fast_buffer_sub_data = false;
+    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+        LOG_WARNING(
+            Render_OpenGL,
+            "Beta driver 443.24 is known to have issues. There might be performance issues.");
+        disable_fast_buffer_sub_data = true;
+    }

    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -181,16 +212,24 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
    has_variable_aoffi = TestVariableAoffi();
    has_component_indexing_bug = is_amd;
    has_precise_bug = TestPreciseBug();
-    has_broken_compute = is_intel_proprietary;
-    has_fast_buffer_sub_data = is_nvidia;
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+    use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
+                           GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
+                           GLAD_GL_NV_transform_feedback2;

    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
    LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
    LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
+
+    if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
+        LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
+    }
 }

 Device::Device(std::nullptr_t) {
-    uniform_buffer_alignment = 0;
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
    max_vertex_attributes = 16;
    max_varyings = 15;
    has_warp_intrinsics = true;
@@ -198,9 +237,6 @@ Device::Device(std::nullptr_t) {
    has_vertex_viewport_layer = true;
    has_image_load_formatted = true;
    has_variable_aoffi = true;
-    has_component_indexing_bug = false;
-    has_broken_compute = false;
-    has_precise_bug = false;
 }

 bool Device::TestVariableAoffi() {
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
    explicit Device();
    explicit Device(std::nullptr_t);

+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+    }
+
    const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
        return base_bindings[stage_index];
    }
@@ -80,19 +84,20 @@ public:
        return has_precise_bug;
    }

-    bool HasBrokenCompute() const {
-        return has_broken_compute;
-    }
-
    bool HasFastBufferSubData() const {
        return has_fast_buffer_sub_data;
    }

+    bool UseAssemblyShaders() const {
+        return use_assembly_shaders;
+    }
+
 private:
    static bool TestVariableAoffi();
    static bool TestPreciseBug();

-    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
    std::size_t uniform_buffer_alignment{};
    std::size_t shader_storage_alignment{};
    u32 max_vertex_attributes{};
@@ -105,8 +110,8 @@ private:
    bool has_variable_aoffi{};
    bool has_component_indexing_bug{};
    bool has_precise_bug{};
-    bool has_broken_compute{};
    bool has_fast_buffer_sub_data{};
+    bool use_assembly_shaders{};
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -4,6 +4,7 @@

 #include "common/assert.h"

+#include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_fence_manager.h"

 namespace OpenGL {
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255

 namespace {

+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+    NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+    NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
 constexpr std::size_t NumSupportedVertexAttributes = 16;

 template <typename Engine, typename Entry>
@@ -87,6 +93,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
    return buffer.size;
 }

+/// Translates hardware transform feedback indices
+/// @param location Hardware location
+/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+    const u8 index = location / 4;
+    if (index >= 8 && index <= 39) {
+        return {GL_GENERIC_ATTRIB_NV, index - 8};
+    }
+    if (index >= 48 && index <= 55) {
+        return {GL_TEXTURE_COORD_NV, index - 48};
+    }
+    switch (index) {
+    case 7:
+        return {GL_POSITION, 0};
+    case 40:
+        return {GL_PRIMARY_COLOR_NV, 0};
+    case 41:
+        return {GL_SECONDARY_COLOR_NV, 0};
+    case 42:
+        return {GL_BACK_PRIMARY_COLOR_NV, 0};
+    case 43:
+        return {GL_BACK_SECONDARY_COLOR_NV, 0};
+    }
+    UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
+    return {GL_POSITION, 0};
+}
+
 void oglEnable(GLenum cap, bool state) {
    (state ? glEnable : glDisable)(cap);
 }
@@ -94,17 +128,33 @@ void oglEnable(GLenum cap, bool state) {
 } // Anonymous namespace

 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                                   ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                                   StateTracker& state_tracker)
-    : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker},
+                                   const Device& device, ScreenInfo& info,
+                                   ProgramManager& program_manager, StateTracker& state_tracker)
+    : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
+                                                                            state_tracker},
      shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
      buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
      fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
      screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
    CheckExtensions();
+
+    unified_uniform_buffer.Create();
+    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
+    if (device.UseAssemblyShaders()) {
+        glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+        for (const GLuint cbuf : staging_cbufs) {
+            glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
+                                 nullptr, 0);
+        }
+    }
 }

-RasterizerOpenGL::~RasterizerOpenGL() {}
+RasterizerOpenGL::~RasterizerOpenGL() {
+    if (device.UseAssemblyShaders()) {
+        glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+    }
+}

 void RasterizerOpenGL::CheckExtensions() {
    if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -230,6 +280,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
    MICROPROFILE_SCOPE(OpenGL_Shader);
    auto& gpu = system.GPU().Maxwell3D();
+    std::size_t num_ssbos = 0;
    u32 clip_distances = 0;

    for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -261,6 +312,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {

        Shader shader{shader_cache.GetStageProgram(program)};

+        if (device.UseAssemblyShaders()) {
+            // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
+            // all stages share the same bindings.
+            const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
+            ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
+            num_ssbos += num_stage_ssbos;
+        }
+
        // Stage indices are 0 - 5
        const std::size_t stage = index == 0 ? 0 : index - 1;
        SetupDrawConstBuffers(stage, shader);
@@ -526,6 +585,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    SyncFramebufferSRGB();

    buffer_cache.Acquire();
+    current_cbuf = 0;

    std::size_t buffer_size = CalculateVertexArraysSize();

@@ -535,9 +595,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    }

    // Uniform space for the 5 shader stages
-    buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
-                  (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
-                      Maxwell::MaxShaderStage;
+    buffer_size =
+        Common::AlignUp<std::size_t>(buffer_size, 4) +
+        (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;

    // Add space for at least 18 constant buffers
    buffer_size += Maxwell::MaxConstBuffers *
@@ -558,12 +618,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    }

    // Setup emulation uniform buffer.
-    GLShader::MaxwellUniformData ubo;
-    ubo.SetFromRegs(gpu);
-    const auto [buffer, offset] =
-        buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-    glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
-                      static_cast<GLsizeiptr>(sizeof(ubo)));
+    if (!device.UseAssemblyShaders()) {
+        MaxwellUniformData ubo;
+        ubo.SetFromRegs(gpu);
+        const auto [buffer, offset] =
+            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+                          static_cast<GLsizeiptr>(sizeof(ubo)));
+    }

    // Setup shaders and their used resources.
    texture_cache.GuardSamplers(true);
@@ -630,16 +692,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 }

 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    if (device.HasBrokenCompute()) {
-        return;
-    }
-
    buffer_cache.Acquire();
+    current_cbuf = 0;

    auto kernel = shader_cache.GetComputeKernel(code_addr);
    SetupComputeTextures(kernel);
    SetupComputeImages(kernel);
-    program_manager.BindComputeShader(kernel->GetHandle());

    const std::size_t buffer_size =
        Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -652,6 +710,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
    buffer_cache.Unmap();

    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    program_manager.BindCompute(kernel->GetHandle());
    glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
    ++num_queued_commands;
 }
@@ -701,15 +760,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
        return;
    }
    texture_cache.OnCPUWrite(addr, size);
-    shader_cache.InvalidateRegion(addr, size);
+    shader_cache.OnCPUWrite(addr, size);
    buffer_cache.OnCPUWrite(addr, size);
-    query_cache.InvalidateRegion(addr, size);
 }

 void RasterizerOpenGL::SyncGuestHost() {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
    texture_cache.SyncGuestHost();
    buffer_cache.SyncGuestHost();
+    shader_cache.SyncGuestHost();
 }

 void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -812,39 +871,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 }

 void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+    static constexpr std::array PARAMETER_LUT = {
+        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
+
    MICROPROFILE_SCOPE(OpenGL_UBO);
    const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
    const auto& shader_stage = stages[stage_index];
+    const auto& entries = shader->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
+    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;

-    u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetEntries().const_buffers) {
-        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+    const auto base_bindings = device.GetBaseBindings(stage_index);
+    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+    for (const auto& entry : entries.const_buffers) {
+        const u32 index = entry.GetIndex();
+        const auto& buffer = shader_stage.const_buffers[index];
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+                                           entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
    }
 }

 void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
    MICROPROFILE_SCOPE(OpenGL_UBO);
    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;

    u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
        const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
        const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
        Tegra::Engines::ConstBufferInfo buffer;
        buffer.address = config.Address();
        buffer.size = config.size;
        buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
    }
 }

-void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
+                                        const Tegra::Engines::ConstBufferInfo& buffer,
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
    if (!buffer.enabled) {
        // Set values to zero to unbind buffers
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
-                          sizeof(float));
+        if (device.UseAssemblyShaders()) {
+            glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
+        } else {
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
+                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+        }
        return;
    }

@@ -852,10 +944,29 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
    // UBO alignment requirements.
    const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));

-    const auto alignment = device.GetUniformBufferAlignment();
-    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                          device.HasFastBufferSubData());
-    glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
+            cbuf = staging_cbuf;
+            offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+        return;
+    }
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    }
 }

 void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
@@ -863,7 +974,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
    auto& memory_manager{gpu.MemoryManager()};
    const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};

-    u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
+    u32 binding =
+        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
    for (const auto& entry : shader->GetEntries().global_memory_entries) {
        const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
        const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
@@ -929,16 +1041,12 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
        glBindTextureUnit(binding, 0);
        return;
    }
-    glBindTextureUnit(binding, view->GetTexture());
-
-    if (view->GetSurfaceParams().IsBuffer()) {
-        return;
+    const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
+                                           texture.tic.z_source, texture.tic.w_source);
+    glBindTextureUnit(binding, handle);
+    if (!view->GetSurfaceParams().IsBuffer()) {
+        glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
    }
-    // Apply swizzle to textures that are not buffers.
-    view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
-                       texture.tic.w_source);
-
-    glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
 }

 void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
@@ -967,14 +1075,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
        glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
        return;
    }
-    if (!tic.IsBuffer()) {
-        view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
-    }
    if (entry.is_written) {
        view->MarkAsModified(texture_cache.Tick());
    }
-    glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
-                       view->GetFormat());
+    const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
 }

 void RasterizerOpenGL::SyncViewport() {
@@ -983,6 +1088,26 @@ void RasterizerOpenGL::SyncViewport() {
    const auto& regs = gpu.regs;

    const bool dirty_viewport = flags[Dirty::Viewports];
+    const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+    if (dirty_clip_control || flags[Dirty::FrontFace]) {
+        flags[Dirty::FrontFace] = false;
+
+        GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+        if (regs.screen_y_control.triangle_rast_flip != 0 &&
+            regs.viewport_transform[0].scale_y < 0.0f) {
+            switch (mode) {
+            case GL_CW:
+                mode = GL_CCW;
+                break;
+            case GL_CCW:
+                mode = GL_CW;
+                break;
+            }
+        }
+        glFrontFace(mode);
+    }
+
    if (dirty_viewport || flags[Dirty::ClipControl]) {
        flags[Dirty::ClipControl] = false;

@@ -1080,11 +1205,6 @@ void RasterizerOpenGL::SyncCullMode() {
            glDisable(GL_CULL_FACE);
        }
    }
-
-    if (flags[Dirty::FrontFace]) {
-        flags[Dirty::FrontFace] = false;
-        glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
-    }
 }

 void RasterizerOpenGL::SyncPrimitiveRestart() {
@@ -1455,12 +1575,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
    oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }

+void RasterizerOpenGL::SyncTransformFeedback() {
+    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
+    // when this is required.
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    static constexpr std::size_t STRIDE = 3;
+    std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+    GLint* cursor = attribs.data();
+    GLint* current_stream = streams.data();
+
+    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+        const auto& layout = regs.tfb_layouts[feedback];
+        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+        if (layout.varying_count == 0) {
+            continue;
+        }
+
+        *current_stream = static_cast<GLint>(feedback);
+        if (current_stream != streams.data()) {
+            // When stepping one stream, push the expected token
+            cursor[0] = GL_NEXT_BUFFER_NV;
+            cursor[1] = 0;
+            cursor[2] = 0;
+            cursor += STRIDE;
+        }
+        ++current_stream;
+
+        const auto& locations = regs.tfb_varying_locs[feedback];
+        std::optional<u8> current_index;
+        for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+            const u8 location = locations[offset];
+            const u8 index = location / 4;
+
+            if (current_index == index) {
+                // Increase number of components of the previous attachment
+                ++cursor[-2];
+                continue;
+            }
+            current_index = index;
+
+            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+            cursor[1] = 1;
+            cursor += STRIDE;
+        }
+    }
+
+    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+    const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+                                       GL_INTERLEAVED_ATTRIBS);
+}
+
 void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
    const auto& regs = system.GPU().Maxwell3D().regs;
    if (regs.tfb_enabled == 0) {
        return;
    }

+    if (device.UseAssemblyShaders()) {
+        SyncTransformFeedback();
+    }
+
    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1487,6 +1665,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
                          static_cast<GLsizeiptr>(size));
    }

+    // We may have to call BeginTransformFeedbackNV here since they seem to call different
+    // implementations on Nvidia's driver (the pointer is different) but we are using
+    // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
+    // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
    glBeginTransformFeedback(GL_POINTS);
 }

--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -56,8 +56,8 @@ struct DrawParameters;
 class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
    explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                              ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                              StateTracker& state_tracker);
+                              const Device& device, ScreenInfo& info,
+                              ProgramManager& program_manager, StateTracker& state_tracker);
    ~RasterizerOpenGL() override;

    void Draw(bool is_indexed, bool is_instanced) override;
@@ -106,8 +106,9 @@ private:
    void SetupComputeConstBuffers(const Shader& kernel);

    /// Configures a constant buffer.
-    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+    void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);

    /// Configures the current global memory entries to use for the draw command.
    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -201,6 +202,10 @@ private:
    /// Syncs the framebuffer sRGB state to match the guest state
    void SyncFramebufferSRGB();

+    /// Syncs transform feedback state to match guest state
+    /// @note Only valid on assembly shaders
+    void SyncTransformFeedback();
+
    /// Begin a transform feedback
    void BeginTransformFeedback(GLenum primitive_mode);

@@ -224,7 +229,7 @@ private:

    void SetupShaders(GLenum primitive_mode);

-    const Device device;
+    const Device& device;

    TextureCacheOpenGL texture_cache;
    ShaderCacheOpenGL shader_cache;
@@ -236,7 +241,7 @@ private:

    Core::System& system;
    ScreenInfo& screen_info;
-    GLShader::ProgramManager& program_manager;
+    ProgramManager& program_manager;
    StateTracker& state_tracker;

    static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@@ -248,6 +253,13 @@ private:
    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
        enabled_transform_feedback_buffers;

+    static constexpr std::size_t NUM_CONSTANT_BUFFERS =
+        Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+        Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+    std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
+    std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
+
    /// Number of commands queued to the OpenGL driver. Reseted on flush.
    std::size_t num_queued_commands = 0;

--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -125,6 +125,15 @@ void OGLProgram::Release() {
    handle = 0;
 }

+void OGLAssemblyProgram::Release() {
+    if (handle == 0) {
+        return;
+    }
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteProgramsARB(1, &handle);
+    handle = 0;
+}
+
 void OGLPipeline::Create() {
    if (handle != 0)
        return;
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -167,6 +167,22 @@ public:
    GLuint handle = 0;
 };

+class OGLAssemblyProgram : private NonCopyable {
+public:
+    OGLAssemblyProgram() = default;
+
+    OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLAssemblyProgram() {
+        Release();
+    }
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 class OGLPipeline : private NonCopyable {
 public:
    OGLPipeline() = default;
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
    return {};
 }

+constexpr GLenum AssemblyEnum(ShaderType shader_type) {
+    switch (shader_type) {
+    case ShaderType::Vertex:
+        return GL_VERTEX_PROGRAM_NV;
+    case ShaderType::TesselationControl:
+        return GL_TESS_CONTROL_PROGRAM_NV;
+    case ShaderType::TesselationEval:
+        return GL_TESS_EVALUATION_PROGRAM_NV;
+    case ShaderType::Geometry:
+        return GL_GEOMETRY_PROGRAM_NV;
+    case ShaderType::Fragment:
+        return GL_FRAGMENT_PROGRAM_NV;
+    case ShaderType::Compute:
+        return GL_COMPUTE_PROGRAM_NV;
+    }
+    return {};
+}
+
 std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
    return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }
@@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
    return registry;
 }

-std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
-                                        u64 unique_identifier, const ShaderIR& ir,
-                                        const Registry& registry, bool hint_retrievable = false) {
+ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
+                             const ShaderIR& ir, const Registry& registry,
+                             bool hint_retrievable = false) {
    const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
    LOG_INFO(Render_OpenGL, "{}", shader_id);

-    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
-    OGLShader shader;
-    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+    auto program = std::make_shared<ProgramHandle>();
+
+    if (device.UseAssemblyShaders()) {
+        const std::string arb = "Not implemented";
+
+        GLuint& arb_prog = program->assembly_program.handle;
+
+// Commented out functions signal OpenGL errors but are compatible with apitrace.
+// Use them only to capture and replay on apitrace.
+#if 0
+        glGenProgramsNV(1, &arb_prog);
+        glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
+                        reinterpret_cast<const GLubyte*>(arb.data()));
+#else
+        glGenProgramsARB(1, &arb_prog);
+        glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
+                                static_cast<GLsizei>(arb.size()), arb.data());
+#endif
+        const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
+        if (err && *err) {
+            LOG_CRITICAL(Render_OpenGL, "{}", err);
+            LOG_INFO(Render_OpenGL, "\n{}", arb);
+        }
+    } else {
+        const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
+        OGLShader shader;
+        shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+
+        program->source_program.Create(true, hint_retrievable, shader.handle);
+    }

-    auto program = std::make_shared<OGLProgram>();
-    program->Create(true, hint_retrievable, shader.handle);
    return program;
 }

@@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {

 CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
+                           ShaderEntries entries, ProgramSharedPtr program_)
    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program)} {}
+      size_in_bytes{size_in_bytes}, program{std::move(program_)} {
+    // Assign either the assembly program or source program. We can't have both.
+    handle = program->assembly_program.handle;
+    if (handle == 0) {
+        handle = program->source_program.handle;
+    }
+    ASSERT(handle != 0);
+}

 CachedShader::~CachedShader() = default;

 GLuint CachedShader::GetHandle() const {
    DEBUG_ASSERT(registry->IsConsistent());
-    return program->handle;
+    return handle;
 }

 Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
@@ -191,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
    entry.bindless_samplers = registry->GetBindlessSamplers();
    params.disk_cache.SaveEntry(std::move(entry));

-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+                         MakeEntries(params.device, ir, shader_type), std::move(program)));
 }

 Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
@@ -215,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
    entry.bindless_samplers = registry->GetBindlessSamplers();
    params.disk_cache.SaveEntry(std::move(entry));

-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
+                         MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
 }

 Shader CachedShader::CreateFromCache(const ShaderParameters& params,
@@ -239,7 +291,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        return;
    }

-    const std::vector gl_cache = disk_cache.LoadPrecompiled();
+    std::vector<ShaderDiskCachePrecompiled> gl_cache;
+    if (!device.UseAssemblyShaders()) {
+        // Only load precompiled cache when we are not using assembly shaders
+        gl_cache = disk_cache.LoadPrecompiled();
+    }
    const auto supported_formats = GetSupportedFormats();

    // Track if precompiled cache was altered during loading to know if we have to
@@ -278,7 +334,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
            auto registry = MakeRegistry(entry);
            const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);

-            std::shared_ptr<OGLProgram> program;
+            ProgramSharedPtr program;
            if (precompiled_entry) {
                // If the shader is precompiled, attempt to load it with
                program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -294,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
            PrecompiledShader shader;
            shader.program = std::move(program);
            shader.registry = std::move(registry);
-            shader.entries = MakeEntries(ir);
+            shader.entries = MakeEntries(device, ir, entry.type);

            std::scoped_lock lock{mutex};
            if (callback) {
@@ -332,6 +388,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        return;
    }

+    if (device.UseAssemblyShaders()) {
+        // Don't store precompiled binaries for assembly shaders.
+        return;
+    }
+
    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
    // before precompiling them

@@ -339,7 +400,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        const u64 id = (*transferable)[i].unique_identifier;
        const auto it = find_precompiled(id);
        if (it == gl_cache.end()) {
-            const GLuint program = runtime_cache.at(id).program->handle;
+            const GLuint program = runtime_cache.at(id).program->source_program.handle;
            disk_cache.SavePrecompiled(id, program);
            precompiled_cache_altered = true;
        }
@@ -350,7 +411,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    }
 }

-std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
    const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
    const std::unordered_set<GLenum>& supported_formats) {
    if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -358,15 +419,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
        return {};
    }

-    auto program = std::make_shared<OGLProgram>();
-    program->handle = glCreateProgram();
-    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(program->handle, precompiled_entry.binary_format,
-                    precompiled_entry.binary.data(),
+    auto program = std::make_shared<ProgramHandle>();
+    GLuint& handle = program->source_program.handle;
+    handle = glCreateProgram();
+    glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
                    static_cast<GLsizei>(precompiled_entry.binary.size()));

    GLint link_status;
-    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
+    glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
    if (link_status == GL_FALSE) {
        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
        return {};
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -43,8 +43,14 @@ struct UnspecializedShader;
 using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;

+struct ProgramHandle {
+    OGLProgram source_program;
+    OGLAssemblyProgram assembly_program;
+};
+using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
+
 struct PrecompiledShader {
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
    std::shared_ptr<VideoCommon::Shader::Registry> registry;
    ShaderEntries entries;
 };
@@ -87,12 +93,13 @@ public:
 private:
    explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);
+                          ShaderEntries entries, ProgramSharedPtr program);

    std::shared_ptr<VideoCommon::Shader::Registry> registry;
    ShaderEntries entries;
    std::size_t size_in_bytes = 0;
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
+    GLuint handle = 0;
 };

 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -115,7 +122,7 @@ protected:
    void FlushObjectInner(const Shader& object) override {}

 private:
-    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+    ProgramSharedPtr GeneratePrecompiledProgram(
        const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
        const std::unordered_set<GLenum>& supported_formats);

--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -61,8 +61,8 @@ struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;

-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);

 constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }

+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+    // We waste one UBO for emulation
+    const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+    return num_ubos > num_available_ubos;
+}
+
 struct GenericVaryingDescription {
    std::string name;
    u8 first_element = 0;
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
 public:
    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                            ShaderType stage, std::string_view identifier, std::string_view suffix)
-        : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+          suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+                                                      UseUnifiedUniforms(device, ir, stage)} {
        if (stage != ShaderType::Compute) {
            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
        }
@@ -618,7 +626,9 @@ private:
                break;
            }
        }
-        if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+        if (stage != ShaderType::Geometry &&
+            (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
            if (ir.UsesLayer()) {
                code.AddLine("int gl_Layer;");
            }
@@ -647,6 +657,16 @@ private:
        --code.scope;
        code.AddLine("}};");
        code.AddNewLine();
+
+        if (stage == ShaderType::Geometry) {
+            if (ir.UsesLayer()) {
+                code.AddLine("out int gl_Layer;");
+            }
+            if (ir.UsesViewportIndex()) {
+                code.AddLine("out int gl_ViewportIndex;");
+            }
+        }
+        code.AddNewLine();
    }

    void DeclareRegisters() {
@@ -834,12 +854,24 @@ private:
    }

    void DeclareConstantBuffers() {
+        if (use_unified_uniforms) {
+            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+                                static_cast<u32>(ir.GetGlobalMemory().size());
+            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+                         binding);
+            code.AddLine("    uint cbufs[];");
+            code.AddLine("}};");
+            code.AddNewLine();
+            return;
+        }
+
        u32 binding = device.GetBaseBindings(stage).uniform_buffer;
-        for (const auto& buffers : ir.GetConstantBuffers()) {
-            const auto index = buffers.first;
+        for (const auto [index, info] : ir.GetConstantBuffers()) {
+            const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+            const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
            code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
                         GetConstBufferBlock(index));
-            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), size);
            code.AddLine("}};");
            code.AddNewLine();
        }
@@ -1038,42 +1070,51 @@ private:

        if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
            const Node offset = cbuf->GetOffset();
+            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
            if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                // Direct access
                const u32 offset_imm = immediate->GetValue();
                ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                if (use_unified_uniforms) {
+                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+                            Type::Uint};
+                } else {
+                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                            Type::Uint};
+                }
+            }
+
+            // Indirect access
+            if (use_unified_uniforms) {
+                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+                                    Visit(offset).AsUint()),
                        Type::Uint};
            }

-            if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+            const std::string final_offset = code.GenerateTemporary();
+            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());

-                if (!device.HasComponentIndexingBug()) {
-                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                        final_offset, final_offset),
-                            Type::Uint};
-                }
-
-                // AMD's proprietary GLSL compiler emits ill code for variable component access.
-                // To bypass this driver bug generate 4 ifs, one per each component.
-                const std::string pack = code.GenerateTemporary();
-                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
-                             final_offset);
-
-                const std::string result = code.GenerateTemporary();
-                code.AddLine("uint {};", result);
-                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
-                                 pack, GetSwizzle(swizzle));
-                }
-                return {result, Type::Uint};
+            if (!device.HasComponentIndexingBug()) {
+                return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                    final_offset, final_offset),
+                        Type::Uint};
            }

-            UNREACHABLE_MSG("Unmanaged offset node type");
+            // AMD's proprietary GLSL compiler emits ill code for variable component access.
+            // To bypass this driver bug generate 4 ifs, one per each component.
+            const std::string pack = code.GenerateTemporary();
+            code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                         final_offset);
+
+            const std::string result = code.GenerateTemporary();
+            code.AddLine("uint {};", result);
+            for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+                             GetSwizzle(swizzle));
+            }
+            return {result, Type::Uint};
        }

        if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1538,7 +1579,9 @@ private:
        Expression target;
        if (const auto gpr = std::get_if<GprNode>(&*dest)) {
            if (gpr->GetIndex() == Register::ZeroIndex) {
-                // Writing to Register::ZeroIndex is a no op
+                // Writing to Register::ZeroIndex is a no op but we still have to visit the source
+                // as it might have side effects.
+                code.AddLine("{};", Visit(src).GetCode());
                return {};
            }
            target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -2309,6 +2352,18 @@ private:
        return {"gl_SubGroupInvocationARB", Type::Uint};
    }

+    template <const std::string_view& comparison>
+    Expression ThreadMask(Operation) {
+        if (device.HasWarpIntrinsics()) {
+            return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
+        }
+        if (device.HasShaderBallot()) {
+            return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
+        }
+        LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
+        return {"0U", Type::Uint};
+    }
+
    Expression ShuffleIndexed(Operation operation) {
        std::string value = VisitOperand(operation, 0).AsFloat();

@@ -2321,7 +2376,21 @@ private:
        return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
    }

-    Expression MemoryBarrierGL(Operation) {
+    Expression Barrier(Operation) {
+        if (!ir.IsDecompiled()) {
+            LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
+            return {};
+        }
+        code.AddLine("barrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGroup(Operation) {
+        code.AddLine("groupMemoryBarrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGlobal(Operation) {
        code.AddLine("memoryBarrier();");
        return {};
    }
@@ -2337,6 +2406,12 @@ private:
        static constexpr std::string_view NotEqual = "!=";
        static constexpr std::string_view GreaterEqual = ">=";

+        static constexpr std::string_view Eq = "Eq";
+        static constexpr std::string_view Ge = "Ge";
+        static constexpr std::string_view Gt = "Gt";
+        static constexpr std::string_view Le = "Le";
+        static constexpr std::string_view Lt = "Lt";
+
        static constexpr std::string_view Add = "Add";
        static constexpr std::string_view Min = "Min";
        static constexpr std::string_view Max = "Max";
@@ -2554,9 +2629,16 @@ private:
        &GLSLDecompiler::VoteEqual,

        &GLSLDecompiler::ThreadId,
+        &GLSLDecompiler::ThreadMask<Func::Eq>,
+        &GLSLDecompiler::ThreadMask<Func::Ge>,
+        &GLSLDecompiler::ThreadMask<Func::Gt>,
+        &GLSLDecompiler::ThreadMask<Func::Le>,
+        &GLSLDecompiler::ThreadMask<Func::Lt>,
        &GLSLDecompiler::ShuffleIndexed,

-        &GLSLDecompiler::MemoryBarrierGL,
+        &GLSLDecompiler::Barrier,
+        &GLSLDecompiler::MemoryBarrierGroup,
+        &GLSLDecompiler::MemoryBarrierGlobal,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

@@ -2669,6 +2751,7 @@ private:
    const std::string_view identifier;
    const std::string_view suffix;
    const Header header;
+    const bool use_unified_uniforms;
    std::unordered_map<u8, VaryingTFB> transform_feedback;

    ShaderWriter code;
@@ -2864,7 +2947,7 @@ void GLSLDecompiler::DecompileAST() {

 } // Anonymous namespace

-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
    ShaderEntries entries;
    for (const auto& cbuf : ir.GetConstantBuffers()) {
        entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2885,6 +2968,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
        entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
    }
    entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
    return entries;
 }

--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
    std::vector<GlobalMemoryEntry> global_memory_entries;
    std::vector<SamplerEntry> samplers;
    std::vector<ImageEntry> images;
-    u32 clip_distances{};
    std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };

-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);

 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                            const VideoCommon::Shader::Registry& registry,
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -6,47 +6,111 @@

 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"

-namespace OpenGL::GLShader {
+namespace OpenGL {

-ProgramManager::ProgramManager() = default;
+ProgramManager::ProgramManager(const Device& device) {
+    use_assembly_programs = device.UseAssemblyShaders();
+    if (use_assembly_programs) {
+        glEnable(GL_COMPUTE_PROGRAM_NV);
+    } else {
+        graphics_pipeline.Create();
+        glBindProgramPipeline(graphics_pipeline.handle);
+    }
+}

 ProgramManager::~ProgramManager() = default;

-void ProgramManager::Create() {
-    graphics_pipeline.Create();
-    glBindProgramPipeline(graphics_pipeline.handle);
+void ProgramManager::BindCompute(GLuint program) {
+    if (use_assembly_programs) {
+        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
+    } else {
+        is_graphics_bound = false;
+        glUseProgram(program);
+    }
 }

 void ProgramManager::BindGraphicsPipeline() {
+    if (use_assembly_programs) {
+        UpdateAssemblyPrograms();
+    } else {
+        UpdateSourcePrograms();
+    }
+}
+
+void ProgramManager::BindHostPipeline(GLuint pipeline) {
+    if (use_assembly_programs) {
+        if (geometry_enabled) {
+            geometry_enabled = false;
+            old_state.geometry = 0;
+            glDisable(GL_GEOMETRY_PROGRAM_NV);
+        }
+    } else {
+        if (!is_graphics_bound) {
+            glUseProgram(0);
+        }
+    }
+    glBindProgramPipeline(pipeline);
+}
+
+void ProgramManager::RestoreGuestPipeline() {
+    if (use_assembly_programs) {
+        glBindProgramPipeline(0);
+    } else {
+        glBindProgramPipeline(graphics_pipeline.handle);
+    }
+}
+
+void ProgramManager::UpdateAssemblyPrograms() {
+    const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        if (current == 0) {
+            if (enabled) {
+                enabled = false;
+                glDisable(stage);
+            }
+            return;
+        }
+        if (!enabled) {
+            enabled = true;
+            glEnable(stage);
+        }
+        glBindProgramARB(stage, current);
+    };
+
+    update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
+                 old_state.geometry);
+    update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
+                 old_state.fragment);
+
+    old_state = current_state;
+}
+
+void ProgramManager::UpdateSourcePrograms() {
    if (!is_graphics_bound) {
        is_graphics_bound = true;
        glUseProgram(0);
    }

-    // Avoid updating the pipeline when values have no changed
-    if (old_state == current_state) {
-        return;
-    }
-
-    // Workaround for AMD bug
-    static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
-                                            GL_FRAGMENT_SHADER_BIT};
    const GLuint handle = graphics_pipeline.handle;
-    glUseProgramStages(handle, all_used_stages, 0);
-    glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
-    glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
-    glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+    const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        glUseProgramStages(handle, stage, current);
+    };
+    update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
+    update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);

    old_state = current_state;
 }

-void ProgramManager::BindComputeShader(GLuint program) {
-    is_graphics_bound = false;
-    glUseProgram(program);
-}
-
 void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
    const auto& regs = maxwell.regs;

@@ -54,4 +118,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
    y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
 }

-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -11,7 +11,9 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"

-namespace OpenGL::GLShader {
+namespace OpenGL {
+
+class Device;

 /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
 /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,

 class ProgramManager {
 public:
-    explicit ProgramManager();
+    explicit ProgramManager(const Device& device);
    ~ProgramManager();

-    void Create();
+    /// Binds a compute program
+    void BindCompute(GLuint program);

-    /// Updates the graphics pipeline and binds it.
+    /// Updates bound programs.
    void BindGraphicsPipeline();

-    /// Binds a compute shader.
-    void BindComputeShader(GLuint program);
+    /// Binds an OpenGL pipeline object unsynchronized with the guest state.
+    void BindHostPipeline(GLuint pipeline);
+
+    /// Rewinds BindHostPipeline state changes.
+    void RestoreGuestPipeline();

    void UseVertexShader(GLuint program) {
-        current_state.vertex_shader = program;
+        current_state.vertex = program;
    }

    void UseGeometryShader(GLuint program) {
-        current_state.geometry_shader = program;
+        current_state.geometry = program;
    }

    void UseFragmentShader(GLuint program) {
-        current_state.fragment_shader = program;
+        current_state.fragment = program;
    }

 private:
    struct PipelineState {
-        bool operator==(const PipelineState& rhs) const noexcept {
-            return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
-                   geometry_shader == rhs.geometry_shader;
-        }
-
-        bool operator!=(const PipelineState& rhs) const noexcept {
-            return !operator==(rhs);
-        }
-
-        GLuint vertex_shader = 0;
-        GLuint fragment_shader = 0;
-        GLuint geometry_shader = 0;
+        GLuint vertex = 0;
+        GLuint geometry = 0;
+        GLuint fragment = 0;
    };

+    /// Update NV_gpu_program5 programs.
+    void UpdateAssemblyPrograms();
+
+    /// Update GLSL programs.
+    void UpdateSourcePrograms();
+
    OGLPipeline graphics_pipeline;
-    OGLPipeline compute_pipeline;
+
    PipelineState current_state;
    PipelineState old_state;
+
+    bool use_assembly_programs = false;
+
    bool is_graphics_bound = true;
+
+    bool vertex_enabled = false;
+    bool geometry_enabled = false;
+    bool fragment_enabled = false;
 };

-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -49,14 +49,6 @@ OGLStreamBuffer::~OGLStreamBuffer() {
    gl_buffer.Release();
 }

-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
    ASSERT(size <= buffer_size);
    ASSERT(alignment <= buffer_size);
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -17,9 +17,6 @@ public:
                             bool use_persistent = true);
    ~OGLStreamBuffer();

-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
    /*
     * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
     * and the optional alignment requirement.
@@ -32,6 +29,14 @@ public:

    void Unmap(GLsizeiptr size);

+    GLuint Handle() const {
+        return gl_buffer.handle;
+    }
+
+    GLsizeiptr Size() const {
+        return buffer_size;
+    }
+
 private:
    OGLBuffer gl_buffer;

--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
 namespace {

 struct FormatTuple {
-    GLint internal_format;
+    GLenum internal_format;
    GLenum format = GL_NONE;
    GLenum type = GL_NONE;
 };
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
    return texture;
 }

+constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
+                            SwizzleSource w_source) {
+    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
 } // Anonymous namespace

 CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
@@ -381,7 +387,7 @@ void CachedSurface::DecorateSurfaceName() {
 }

 void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
-    LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
+    LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
 }

 View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -397,14 +403,12 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
 }

 CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
-                                     const bool is_proxy)
-    : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
-    target = GetTextureTarget(params.target);
-    format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
+                                     bool is_proxy)
+    : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
+      target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
    if (!is_proxy) {
-        texture_view = CreateTextureView();
+        main_view = CreateTextureView();
    }
-    swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
 }

 CachedSurfaceView::~CachedSurfaceView() = default;
@@ -447,27 +451,49 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
    }
 }

-void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
+GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
                                     SwizzleSource z_source, SwizzleSource w_source) {
-    u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
-    if (new_swizzle == swizzle)
-        return;
-    swizzle = new_swizzle;
-    const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
-                                   GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
-    const GLuint handle = GetTexture();
-    const PixelFormat format = surface.GetSurfaceParams().pixel_format;
-    switch (format) {
+    if (GetSurfaceParams().IsBuffer()) {
+        return GetTexture();
+    }
+    const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+    if (current_swizzle == new_swizzle) {
+        return current_view;
+    }
+    current_swizzle = new_swizzle;
+
+    const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
+    OGLTextureView& view = entry->second;
+    if (!is_cache_miss) {
+        current_view = view.handle;
+        return view.handle;
+    }
+    view = CreateTextureView();
+    current_view = view.handle;
+
+    std::array swizzle{x_source, y_source, z_source, w_source};
+
+    switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
    case PixelFormat::Z24S8:
    case PixelFormat::Z32FS8:
    case PixelFormat::S8Z24:
-        glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
+        UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
+        glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
                            GetComponent(format, x_source == SwizzleSource::R));
-        break;
-    default:
-        glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
+
+        // Make sure we sample the first component
+        std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
+            return value == SwizzleSource::G ? SwizzleSource::R : value;
+        });
+        [[fallthrough]];
+    default: {
+        const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
+                                       GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
+        glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
        break;
    }
+    }
+    return view.handle;
 }

 OGLTextureView CachedSurfaceView::CreateTextureView() const {
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -83,7 +83,7 @@ public:
    /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
    void Attach(GLenum attachment, GLenum target) const;

-    void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
+    GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
                      Tegra::Texture::SwizzleSource y_source,
                      Tegra::Texture::SwizzleSource z_source,
                      Tegra::Texture::SwizzleSource w_source);
@@ -98,7 +98,7 @@ public:
        if (is_proxy) {
            return surface.GetTexture();
        }
-        return texture_view.handle;
+        return main_view.handle;
    }

    GLenum GetFormat() const {
@@ -110,23 +110,19 @@ public:
    }

 private:
-    u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
-                      Tegra::Texture::SwizzleSource y_source,
-                      Tegra::Texture::SwizzleSource z_source,
-                      Tegra::Texture::SwizzleSource w_source) const {
-        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
-               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
-    }
-
    OGLTextureView CreateTextureView() const;

    CachedSurface& surface;
-    GLenum target{};
-    GLenum format{};
+    const GLenum format;
+    const GLenum target;
+    const bool is_proxy;

-    OGLTextureView texture_view;
-    u32 swizzle{};
-    bool is_proxy{};
+    std::unordered_map<u32, OGLTextureView> view_cache;
+    OGLTextureView main_view;
+
+    // Use an invalid default so it always fails the comparison test
+    u32 current_swizzle = 0xffffffff;
+    GLuint current_view = 0;
 };

 class TextureCacheOpenGL final : public TextureCacheBase {
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -316,7 +316,7 @@ public:
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
                               Core::Frontend::GraphicsContext& context)
    : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
-      has_debug_tool{HasDebugTool()} {}
+      program_manager{device}, has_debug_tool{HasDebugTool()} {}

 RendererOpenGL::~RendererOpenGL() = default;

@@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
    vertex_program.Create(true, false, vertex_shader.handle);
    fragment_program.Create(true, false, fragment_shader.handle);

-    // Create program pipeline
-    program_manager.Create();
+    pipeline.Create();
+    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
+    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);

    // Generate VBO handle for drawing
    vertex_buffer.Create();
@@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
    if (rasterizer) {
        return;
    }
-    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
+    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
                                                    program_manager, state_tracker);
 }

@@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
    state_tracker.NotifyClipControl();
    state_tracker.NotifyAlphaTest();

-    program_manager.UseVertexShader(vertex_program.handle);
-    program_manager.UseGeometryShader(0);
-    program_manager.UseFragmentShader(fragment_program.handle);
-    program_manager.BindGraphicsPipeline();
+    program_manager.BindHostPipeline(pipeline.handle);

    glEnable(GL_CULL_FACE);
    if (screen_info.display_srgb) {
@@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {

    glClear(GL_COLOR_BUFFER_BIT);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+    program_manager.RestoreGuestPipeline();
 }

 bool RendererOpenGL::TryPresent(int timeout_ms) {
@@ -751,8 +751,9 @@ void RendererOpenGL::RenderScreenshot() {
 }

 bool RendererOpenGL::Init() {
-    if (GLAD_GL_KHR_debug) {
+    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
        glEnable(GL_DEBUG_OUTPUT);
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
        glDebugMessageCallback(DebugHandler, nullptr);
    }

--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "common/math_util.h"
 #include "video_core/renderer_base.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -95,6 +96,7 @@ private:
    Core::Frontend::EmuWindow& emu_window;
    Core::System& system;
    Core::Frontend::GraphicsContext& context;
+    const Device device;

    StateTracker state_tracker{system};

@@ -102,13 +104,14 @@ private:
    OGLBuffer vertex_buffer;
    OGLProgram vertex_program;
    OGLProgram fragment_program;
+    OGLPipeline pipeline;
    OGLFramebuffer screenshot_framebuffer;

    /// Display information for Switch screen
    ScreenInfo screen_info;

    /// Global dummy shader pipeline
-    GLShader::ProgramManager program_manager;
+    ProgramManager program_manager;

    /// OpenGL framebuffer data
    std::vector<u8> gl_framebuffer_data;
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -71,8 +71,7 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept {
    const u32 topology_index = static_cast<u32>(regs.draw.topology.Value());

    u32 packed_front_face = PackFrontFace(regs.front_face);
-    if (regs.screen_y_control.triangle_rast_flip != 0 &&
-        regs.viewport_transform[0].scale_y > 0.0f) {
+    if (regs.screen_y_control.triangle_rast_flip != 0) {
        // Flip front face
        packed_front_face = 1 - packed_front_face;
    }
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -142,14 +142,14 @@ struct FormatTuple {
    {VK_FORMAT_BC6H_UFLOAT_BLOCK},                              // BC6H_UF16
    {VK_FORMAT_BC6H_SFLOAT_BLOCK},                              // BC6H_SF16
    {VK_FORMAT_ASTC_4x4_UNORM_BLOCK},                           // ASTC_2D_4X4
-    {VK_FORMAT_B8G8R8A8_UNORM},                                 // BGRA8
+    {VK_FORMAT_B8G8R8A8_UNORM, Attachable},                     // BGRA8
    {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage},      // RGBA32F
    {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage},            // RG32F
    {VK_FORMAT_R32_SFLOAT, Attachable | Storage},               // R32F
    {VK_FORMAT_R16_SFLOAT, Attachable | Storage},               // R16F
    {VK_FORMAT_R16_UNORM, Attachable | Storage},                // R16U
    {VK_FORMAT_UNDEFINED},                                      // R16S
-    {VK_FORMAT_UNDEFINED},                                      // R16UI
+    {VK_FORMAT_R16_UINT, Attachable | Storage},                 // R16UI
    {VK_FORMAT_UNDEFINED},                                      // R16I
    {VK_FORMAT_R16G16_UNORM, Attachable | Storage},             // RG16
    {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage},            // RG16F
@@ -168,7 +168,7 @@ struct FormatTuple {
    {VK_FORMAT_ASTC_8x8_UNORM_BLOCK},                           // ASTC_2D_8X8
    {VK_FORMAT_UNDEFINED},                                      // ASTC_2D_8X5
    {VK_FORMAT_UNDEFINED},                                      // ASTC_2D_5X4
-    {VK_FORMAT_UNDEFINED},                                      // BGRA8_SRGB
+    {VK_FORMAT_B8G8R8A8_SRGB, Attachable},                      // BGRA8_SRGB
    {VK_FORMAT_BC1_RGBA_SRGB_BLOCK},                            // DXT1_SRGB
    {VK_FORMAT_BC2_SRGB_BLOCK},                                 // DXT23_SRGB
    {VK_FORMAT_BC3_SRGB_BLOCK},                                 // DXT45_SRGB
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -7,6 +7,7 @@
 #include <memory>

 #include "core/core.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -36,8 +37,8 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch

 } // Anonymous namespace

-CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                                     VAddr cpu_addr, std::size_t size)
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
+               std::size_t size)
    : VideoCommon::BufferBlock{cpu_addr, size} {
    VkBufferCreateInfo ci;
    ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
@@ -53,7 +54,7 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
    buffer.commit = memory_manager.Commit(buffer.handle, false);
 }

-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;

 VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                             const VKDevice& device, VKMemoryManager& memory_manager,
@@ -66,12 +67,8 @@ VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::S

 VKBufferCache::~VKBufferCache() = default;

-Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
 }

 VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
@@ -90,7 +87,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
    std::memcpy(staging.commit->Map(size), data, size);

    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
+    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
                      size](vk::CommandBuffer cmdbuf) {
        cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});

@@ -113,7 +110,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                      u8* data) {
    const auto& staging = staging_pool.GetUnusedBuffer(size, true);
    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
+    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
                      size](vk::CommandBuffer cmdbuf) {
        VkBufferMemoryBarrier barrier;
        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -140,8 +137,8 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
 void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                              std::size_t dst_offset, std::size_t size) {
    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
-                      dst_offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset,
+                      size](vk::CommandBuffer cmdbuf) {
        cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});

        std::array<VkBufferMemoryBarrier, 2> barriers;
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -24,13 +24,13 @@ class VKDevice;
 class VKMemoryManager;
 class VKScheduler;

-class CachedBufferBlock final : public VideoCommon::BufferBlock {
+class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                               VAddr cpu_addr, std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
+                    std::size_t size);
+    ~Buffer();

-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const {
        return *buffer.handle;
    }

@@ -38,8 +38,6 @@ private:
    VKBuffer buffer;
 };

-using Buffer = std::shared_ptr<CachedBufferBlock>;
-
 class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
 public:
    explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
@@ -50,9 +48,7 @@ public:
    VkBuffer GetEmptyBuffer(std::size_t size) override;

 protected:
-    VkBuffer ToHandle(const Buffer& buffer) override;
-
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;

    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                         const u8* data) override;
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {
    };
    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());
    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size());
-    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());
    add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());
    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());

    VkDescriptorSetLayoutCreateInfo ci;
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {
        {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},
        {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
        {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64},
+        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
        {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};

    VkDescriptorPoolCreateInfo ci;
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -73,75 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType

 std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
    vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) {
-    static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32,
-                                        VK_FORMAT_A8B8G8R8_UINT_PACK32,
-                                        VK_FORMAT_A8B8G8R8_SNORM_PACK32,
-                                        VK_FORMAT_A8B8G8R8_SRGB_PACK32,
-                                        VK_FORMAT_B5G6R5_UNORM_PACK16,
-                                        VK_FORMAT_A2B10G10R10_UNORM_PACK32,
-                                        VK_FORMAT_A1R5G5B5_UNORM_PACK16,
-                                        VK_FORMAT_R32G32B32A32_SFLOAT,
-                                        VK_FORMAT_R32G32B32A32_UINT,
-                                        VK_FORMAT_R32G32_SFLOAT,
-                                        VK_FORMAT_R32G32_UINT,
-                                        VK_FORMAT_R16G16B16A16_UINT,
-                                        VK_FORMAT_R16G16B16A16_SNORM,
-                                        VK_FORMAT_R16G16B16A16_UNORM,
-                                        VK_FORMAT_R16G16_UNORM,
-                                        VK_FORMAT_R16G16_SNORM,
-                                        VK_FORMAT_R16G16_SFLOAT,
-                                        VK_FORMAT_R16_UNORM,
-                                        VK_FORMAT_R8G8B8A8_SRGB,
-                                        VK_FORMAT_R8G8_UNORM,
-                                        VK_FORMAT_R8G8_SNORM,
-                                        VK_FORMAT_R8G8_UINT,
-                                        VK_FORMAT_R8_UNORM,
-                                        VK_FORMAT_R8_UINT,
-                                        VK_FORMAT_B10G11R11_UFLOAT_PACK32,
-                                        VK_FORMAT_R32_SFLOAT,
-                                        VK_FORMAT_R32_UINT,
-                                        VK_FORMAT_R32_SINT,
-                                        VK_FORMAT_R16_SFLOAT,
-                                        VK_FORMAT_R16G16B16A16_SFLOAT,
-                                        VK_FORMAT_B8G8R8A8_UNORM,
-                                        VK_FORMAT_R4G4B4A4_UNORM_PACK16,
-                                        VK_FORMAT_D32_SFLOAT,
-                                        VK_FORMAT_D16_UNORM,
-                                        VK_FORMAT_D16_UNORM_S8_UINT,
-                                        VK_FORMAT_D24_UNORM_S8_UINT,
-                                        VK_FORMAT_D32_SFLOAT_S8_UINT,
-                                        VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
-                                        VK_FORMAT_BC2_UNORM_BLOCK,
-                                        VK_FORMAT_BC3_UNORM_BLOCK,
-                                        VK_FORMAT_BC4_UNORM_BLOCK,
-                                        VK_FORMAT_BC5_UNORM_BLOCK,
-                                        VK_FORMAT_BC5_SNORM_BLOCK,
-                                        VK_FORMAT_BC7_UNORM_BLOCK,
-                                        VK_FORMAT_BC6H_UFLOAT_BLOCK,
-                                        VK_FORMAT_BC6H_SFLOAT_BLOCK,
-                                        VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
-                                        VK_FORMAT_BC2_SRGB_BLOCK,
-                                        VK_FORMAT_BC3_SRGB_BLOCK,
-                                        VK_FORMAT_BC7_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
-                                        VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
-                                        VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
-                                        VK_FORMAT_E5B9G9R9_UFLOAT_PACK32};
+    static constexpr std::array formats{
+        VK_FORMAT_A8B8G8R8_UNORM_PACK32,
+        VK_FORMAT_A8B8G8R8_UINT_PACK32,
+        VK_FORMAT_A8B8G8R8_SNORM_PACK32,
+        VK_FORMAT_A8B8G8R8_SRGB_PACK32,
+        VK_FORMAT_B5G6R5_UNORM_PACK16,
+        VK_FORMAT_A2B10G10R10_UNORM_PACK32,
+        VK_FORMAT_A1R5G5B5_UNORM_PACK16,
+        VK_FORMAT_R32G32B32A32_SFLOAT,
+        VK_FORMAT_R32G32B32A32_UINT,
+        VK_FORMAT_R32G32_SFLOAT,
+        VK_FORMAT_R32G32_UINT,
+        VK_FORMAT_R16G16B16A16_UINT,
+        VK_FORMAT_R16G16B16A16_SNORM,
+        VK_FORMAT_R16G16B16A16_UNORM,
+        VK_FORMAT_R16G16_UNORM,
+        VK_FORMAT_R16G16_SNORM,
+        VK_FORMAT_R16G16_SFLOAT,
+        VK_FORMAT_R16_UNORM,
+        VK_FORMAT_R16_UINT,
+        VK_FORMAT_R8G8B8A8_SRGB,
+        VK_FORMAT_R8G8_UNORM,
+        VK_FORMAT_R8G8_SNORM,
+        VK_FORMAT_R8G8_UINT,
+        VK_FORMAT_R8_UNORM,
+        VK_FORMAT_R8_UINT,
+        VK_FORMAT_B10G11R11_UFLOAT_PACK32,
+        VK_FORMAT_R32_SFLOAT,
+        VK_FORMAT_R32_UINT,
+        VK_FORMAT_R32_SINT,
+        VK_FORMAT_R16_SFLOAT,
+        VK_FORMAT_R16G16B16A16_SFLOAT,
+        VK_FORMAT_B8G8R8A8_UNORM,
+        VK_FORMAT_B8G8R8A8_SRGB,
+        VK_FORMAT_R4G4B4A4_UNORM_PACK16,
+        VK_FORMAT_D32_SFLOAT,
+        VK_FORMAT_D16_UNORM,
+        VK_FORMAT_D16_UNORM_S8_UINT,
+        VK_FORMAT_D24_UNORM_S8_UINT,
+        VK_FORMAT_D32_SFLOAT_S8_UINT,
+        VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
+        VK_FORMAT_BC2_UNORM_BLOCK,
+        VK_FORMAT_BC3_UNORM_BLOCK,
+        VK_FORMAT_BC4_UNORM_BLOCK,
+        VK_FORMAT_BC5_UNORM_BLOCK,
+        VK_FORMAT_BC5_SNORM_BLOCK,
+        VK_FORMAT_BC7_UNORM_BLOCK,
+        VK_FORMAT_BC6H_UFLOAT_BLOCK,
+        VK_FORMAT_BC6H_SFLOAT_BLOCK,
+        VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
+        VK_FORMAT_BC2_SRGB_BLOCK,
+        VK_FORMAT_BC3_SRGB_BLOCK,
+        VK_FORMAT_BC7_SRGB_BLOCK,
+        VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
+        VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
+        VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
+        VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
+        VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
+        VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
+        VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
+        VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
+        VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
+        VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
+        VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
+        VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
+        VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
+        VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
+        VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
+        VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
+        VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
+    };
    std::unordered_map<VkFormat, VkFormatProperties> format_properties;
    for (const auto format : formats) {
        format_properties.emplace(format, physical.GetFormatProperties(format));
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -7,6 +7,7 @@
 #include <memory>

 #include "video_core/fence_manager.h"
+#include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/wrapper.h"

 namespace Core {
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -45,6 +45,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
 constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
 constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
 constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;

 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
@@ -104,8 +105,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
    u32 binding = base_binding;
    AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);
    AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers);
-    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);
    AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers);
+    AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);
    AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);
    return binding;
 }
@@ -312,7 +314,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
        ASSERT(point_size != 0.0f);
    }
    for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
-        specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
+        const auto& attribute = fixed_state.vertex_input.attributes[i];
+        specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
+        specialization.attribute_types[i] = attribute.Type();
    }
    specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;

@@ -329,8 +333,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {

        const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        ASSERT(cpu_addr);
-        const auto shader = TryGet(*cpu_addr);
+        const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
        ASSERT(shader);

        const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
@@ -376,16 +379,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3
        return;
    }

-    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) {
-        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-        // crash.
+    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER ||
+                  descriptor_type == STORAGE_TEXEL_BUFFER) {
+        // Nvidia has a bug where updating multiple texels at once causes the driver to crash.
+        // Note: Fixed in driver Windows 443.24, Linux 440.66.15
        for (u32 i = 0; i < count; ++i) {
            VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();
            entry.dstBinding = binding + i;
            entry.dstArrayElement = 0;
            entry.descriptorCount = 1;
            entry.descriptorType = descriptor_type;
-            entry.offset = offset + i * entry_size;
+            entry.offset = static_cast<std::size_t>(offset + i * entry_size);
            entry.stride = entry_size;
        }
    } else if (count > 0) {
@@ -406,8 +410,9 @@ void FillDescriptorUpdateTemplateEntries(
    std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {
    AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);
    AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers);
-    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers);
+    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);
    AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers);
+    AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);
    AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);
 }

--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -468,8 +468,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
    const auto& entries = pipeline.GetEntries();
    SetupComputeConstBuffers(entries);
    SetupComputeGlobalBuffers(entries);
-    SetupComputeTexelBuffers(entries);
+    SetupComputeUniformTexels(entries);
    SetupComputeTextures(entries);
+    SetupComputeStorageTexels(entries);
    SetupComputeImages(entries);

    buffer_cache.Unmap();
@@ -532,14 +533,14 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
        return;
    }
    texture_cache.OnCPUWrite(addr, size);
-    pipeline_cache.InvalidateRegion(addr, size);
+    pipeline_cache.OnCPUWrite(addr, size);
    buffer_cache.OnCPUWrite(addr, size);
-    query_cache.InvalidateRegion(addr, size);
 }

 void RasterizerVulkan::SyncGuestHost() {
    texture_cache.SyncGuestHost();
    buffer_cache.SyncGuestHost();
+    pipeline_cache.SyncGuestHost();
 }

 void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -787,8 +788,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
        const auto& entries = shader->GetEntries();
        SetupGraphicsConstBuffers(entries, stage);
        SetupGraphicsGlobalBuffers(entries, stage);
-        SetupGraphicsTexelBuffers(entries, stage);
+        SetupGraphicsUniformTexels(entries, stage);
        SetupGraphicsTextures(entries, stage);
+        SetupGraphicsStorageTexels(entries, stage);
        SetupGraphicsImages(entries, stage);
    }
    texture_cache.GuardSamplers(false);
@@ -838,6 +840,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
    if (regs.tfb_enabled == 0) {
        return;
    }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
+        return;
+    }

    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
@@ -866,6 +872,9 @@ void RasterizerVulkan::EndTransformFeedback() {
    if (regs.tfb_enabled == 0) {
        return;
    }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        return;
+    }

    scheduler.Record(
        [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
@@ -877,14 +886,10 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex

    for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
        const auto& attrib = regs.vertex_attrib_format[index];
-        if (!attrib.IsValid()) {
+        if (attrib.IsConstant()) {
            vertex_input.SetAttribute(index, false, 0, 0, {}, {});
            continue;
        }
-
-        [[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
-        ASSERT(buffer.IsEnabled());
-
        vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
                                  attrib.size.Value());
    }
@@ -980,12 +985,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
    }
 }

-void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
    MICROPROFILE_SCOPE(Vulkan_Textures);
    const auto& gpu = system.GPU().Maxwell3D();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
        const auto image = GetTextureInfo(gpu, entry, stage).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
    }
 }

@@ -1000,6 +1005,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
    }
 }

+void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().Maxwell3D();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, stage).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
    MICROPROFILE_SCOPE(Vulkan_Images);
    const auto& gpu = system.GPU().Maxwell3D();
@@ -1032,12 +1046,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
    }
 }

-void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) {
+void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
    MICROPROFILE_SCOPE(Vulkan_Textures);
    const auto& gpu = system.GPU().KeplerCompute();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
        const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
    }
 }

@@ -1052,6 +1066,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
    }
 }

+void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().KeplerCompute();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
    MICROPROFILE_SCOPE(Vulkan_Images);
    const auto& gpu = system.GPU().KeplerCompute();
@@ -1101,8 +1124,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
    update_descriptor_queue.AddBuffer(buffer, offset, size);
 }

-void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic,
-                                        const TexelBufferEntry& entry) {
+void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
+                                          const UniformTexelEntry& entry) {
    const auto view = texture_cache.GetTextureSurface(tic, entry);
    ASSERT(view->IsBufferView());

@@ -1124,6 +1147,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
    sampled_views.push_back(ImageView{std::move(view), image_layout});
 }

+void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
+                                         const StorageTexelEntry& entry) {
+    const auto view = texture_cache.GetImageSurface(tic, entry);
+    ASSERT(view->IsBufferView());
+
+    update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
+}
+
 void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
    auto view = texture_cache.GetImageSurface(tic, entry);

--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -193,12 +193,15 @@ private:
    /// Setup global buffers in the graphics pipeline.
    void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);

-    /// Setup texel buffers in the graphics pipeline.
-    void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
+    /// Setup uniform texels in the graphics pipeline.
+    void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);

    /// Setup textures in the graphics pipeline.
    void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);

+    /// Setup storage texels in the graphics pipeline.
+    void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage);
+
    /// Setup images in the graphics pipeline.
    void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);

@@ -209,11 +212,14 @@ private:
    void SetupComputeGlobalBuffers(const ShaderEntries& entries);

    /// Setup texel buffers in the compute pipeline.
-    void SetupComputeTexelBuffers(const ShaderEntries& entries);
+    void SetupComputeUniformTexels(const ShaderEntries& entries);

    /// Setup textures in the compute pipeline.
    void SetupComputeTextures(const ShaderEntries& entries);

+    /// Setup storage texels in the compute pipeline.
+    void SetupComputeStorageTexels(const ShaderEntries& entries);
+
    /// Setup images in the compute pipeline.
    void SetupComputeImages(const ShaderEntries& entries);

@@ -222,10 +228,12 @@ private:

    void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);

-    void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry);
+    void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);

    void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry);

+    void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry);
+
    void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);

    void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -400,8 +400,9 @@ private:
        u32 binding = specialization.base_binding;
        binding = DeclareConstantBuffers(binding);
        binding = DeclareGlobalBuffers(binding);
-        binding = DeclareTexelBuffers(binding);
+        binding = DeclareUniformTexels(binding);
        binding = DeclareSamplers(binding);
+        binding = DeclareStorageTexels(binding);
        binding = DeclareImages(binding);

        const Id main = OpFunction(t_void, {}, TypeFunction(t_void));
@@ -515,6 +516,16 @@ private:
    void DeclareCommon() {
        thread_id =
            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
+        thread_masks[0] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
+        thread_masks[1] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
+        thread_masks[2] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
+        thread_masks[3] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
+        thread_masks[4] =
+            DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
    }

    void DeclareVertex() {
@@ -731,8 +742,10 @@ private:
            if (!IsGenericAttribute(index)) {
                continue;
            }
-
            const u32 location = GetGenericAttributeLocation(index);
+            if (!IsAttributeEnabled(location)) {
+                continue;
+            }
            const auto type_descriptor = GetAttributeType(location);
            Id type;
            if (IsInputAttributeArray()) {
@@ -877,7 +890,7 @@ private:
        return binding;
    }

-    u32 DeclareTexelBuffers(u32 binding) {
+    u32 DeclareUniformTexels(u32 binding) {
        for (const auto& sampler : ir.GetSamplers()) {
            if (!sampler.is_buffer) {
                continue;
@@ -898,7 +911,7 @@ private:
            Decorate(id, spv::Decoration::Binding, binding++);
            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);

-            texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id});
+            uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});
        }
        return binding;
    }
@@ -933,31 +946,48 @@ private:
        return binding;
    }

-    u32 DeclareImages(u32 binding) {
+    u32 DeclareStorageTexels(u32 binding) {
        for (const auto& image : ir.GetImages()) {
-            const auto [dim, arrayed] = GetImageDim(image);
-            constexpr int depth = 0;
-            constexpr bool ms = false;
-            constexpr int sampled = 2; // This won't be accessed with a sampler
-            constexpr auto format = spv::ImageFormat::Unknown;
-            const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
-            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
-            const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
-            AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
-
-            Decorate(id, spv::Decoration::Binding, binding++);
-            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
-            if (image.is_read && !image.is_written) {
-                Decorate(id, spv::Decoration::NonWritable);
-            } else if (image.is_written && !image.is_read) {
-                Decorate(id, spv::Decoration::NonReadable);
+            if (image.type != Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
            }
-
-            images.emplace(image.index, StorageImage{image_type, id});
+            DeclareImage(image, binding);
        }
        return binding;
    }

+    u32 DeclareImages(u32 binding) {
+        for (const auto& image : ir.GetImages()) {
+            if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
+            }
+            DeclareImage(image, binding);
+        }
+        return binding;
+    }
+
+    void DeclareImage(const Image& image, u32& binding) {
+        const auto [dim, arrayed] = GetImageDim(image);
+        constexpr int depth = 0;
+        constexpr bool ms = false;
+        constexpr int sampled = 2; // This won't be accessed with a sampler
+        const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown;
+        const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
+        const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
+        const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
+        AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
+
+        Decorate(id, spv::Decoration::Binding, binding++);
+        Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+        if (image.is_read && !image.is_written) {
+            Decorate(id, spv::Decoration::NonWritable);
+        } else if (image.is_written && !image.is_read) {
+            Decorate(id, spv::Decoration::NonReadable);
+        }
+
+        images.emplace(image.index, StorageImage{image_type, id});
+    }
+
    bool IsRenderTargetEnabled(u32 rt) const {
        for (u32 component = 0; component < 4; ++component) {
            if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
@@ -976,6 +1006,10 @@ private:
        return stage == ShaderType::TesselationControl;
    }

+    bool IsAttributeEnabled(u32 location) const {
+        return stage != ShaderType::Vertex || specialization.enabled_attributes[location];
+    }
+
    u32 GetNumInputVertices() const {
        switch (stage) {
        case ShaderType::Geometry:
@@ -1071,8 +1105,7 @@ private:

    void VisitBasicBlock(const NodeBlock& bb) {
        for (const auto& node : bb) {
-            [[maybe_unused]] const Type type = Visit(node).type;
-            ASSERT(type == Type::Void);
+            Visit(node);
        }
    }

@@ -1192,16 +1225,20 @@ private:
                UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
                return {v_float_zero, Type::Float};
            default:
-                if (IsGenericAttribute(attribute)) {
-                    const u32 location = GetGenericAttributeLocation(attribute);
-                    const auto type_descriptor = GetAttributeType(location);
-                    const Type type = type_descriptor.type;
-                    const Id attribute_id = input_attributes.at(attribute);
-                    const std::vector elements = {element};
-                    const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
-                    return {OpLoad(GetTypeDefinition(type), pointer), type};
+                if (!IsGenericAttribute(attribute)) {
+                    break;
                }
-                break;
+                const u32 location = GetGenericAttributeLocation(attribute);
+                if (!IsAttributeEnabled(location)) {
+                    // Disabled attributes (also known as constant attributes) always return zero.
+                    return {v_float_zero, Type::Float};
+                }
+                const auto type_descriptor = GetAttributeType(location);
+                const Type type = type_descriptor.type;
+                const Id attribute_id = input_attributes.at(attribute);
+                const std::vector elements = {element};
+                const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
+                return {OpLoad(GetTypeDefinition(type), pointer), type};
            }
            UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
            return {v_float_zero, Type::Float};
@@ -1237,7 +1274,7 @@ private:
                } else {
                    UNREACHABLE_MSG("Unmanaged offset node type");
                }
-                pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index,
+                pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,
                                        buffer_element);
            }
            return {OpLoad(t_float, pointer), Type::Float};
@@ -1362,7 +1399,9 @@ private:
        Expression target{};
        if (const auto gpr = std::get_if<GprNode>(&*dest)) {
            if (gpr->GetIndex() == Register::ZeroIndex) {
-                // Writing to Register::ZeroIndex is a no op
+                // Writing to Register::ZeroIndex is a no op but we still have to visit its source
+                // because it might have side effects.
+                Visit(src);
                return {};
            }
            target = {registers.at(gpr->GetIndex()), Type::Float};
@@ -1590,7 +1629,7 @@ private:

        const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);
        const Id carry = OpCompositeExtract(t_uint, result, 1);
-        return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool};
+        return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};
    }

    Expression LogicalAssign(Operation operation) {
@@ -1653,7 +1692,7 @@ private:
        const auto& meta = std::get<MetaTexture>(operation.GetMeta());
        const u32 index = meta.sampler.index;
        if (meta.sampler.is_buffer) {
-            const auto& entry = texel_buffers.at(index);
+            const auto& entry = uniform_texels.at(index);
            return OpLoad(entry.image_type, entry.image);
        } else {
            const auto& entry = sampled_images.at(index);
@@ -1930,39 +1969,20 @@ private:
        return {};
    }

-    Expression AtomicImageAdd(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
+    Expression AtomicImage(Operation operation) {
+        const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+        ASSERT(meta.values.size() == 1);

-    Expression AtomicImageMin(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
+        const Id coordinate = GetCoordinates(operation, Type::Int);
+        const Id image = images.at(meta.image.index).image;
+        const Id sample = v_uint_zero;
+        const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample);

-    Expression AtomicImageMax(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageAnd(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageOr(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageXor(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageExchange(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
+        const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
+        const Id semantics = v_uint_zero;
+        const Id value = AsUint(Visit(meta.values[0]));
+        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
    }

    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
@@ -1977,7 +1997,7 @@ private:
            return {v_float_zero, Type::Float};
        }
        const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
-        const Id semantics = Constant(t_uint, 0);
+        const Id semantics = v_uint_zero;
        const Id value = AsUint(Visit(operation[1]));

        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
@@ -2175,14 +2195,37 @@ private:
        return {OpLoad(t_uint, thread_id), Type::Uint};
    }

+    template <std::size_t index>
+    Expression ThreadMask(Operation) {
+        // TODO(Rodrigo): Handle devices with different warp sizes
+        const Id mask = thread_masks[index];
+        return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
+    }
+
    Expression ShuffleIndexed(Operation operation) {
        const Id value = AsFloat(Visit(operation[0]));
        const Id index = AsUint(Visit(operation[1]));
        return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float};
    }

-    Expression MemoryBarrierGL(Operation) {
-        const auto scope = spv::Scope::Device;
+    Expression Barrier(Operation) {
+        if (!ir.IsDecompiled()) {
+            LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled");
+            return {};
+        }
+
+        const auto scope = spv::Scope::Workgroup;
+        const auto memory = spv::Scope::Workgroup;
+        const auto semantics =
+            spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
+        OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
+                         Constant(t_uint, static_cast<u32>(memory)),
+                         Constant(t_uint, static_cast<u32>(semantics)));
+        return {};
+    }
+
+    template <spv::Scope scope>
+    Expression MemoryBarrier(Operation) {
        const auto semantics =
            spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
            spv::MemorySemanticsMask::WorkgroupMemory |
@@ -2578,11 +2621,11 @@ private:

        &SPIRVDecompiler::ImageLoad,
        &SPIRVDecompiler::ImageStore,
-        &SPIRVDecompiler::AtomicImageAdd,
-        &SPIRVDecompiler::AtomicImageAnd,
-        &SPIRVDecompiler::AtomicImageOr,
-        &SPIRVDecompiler::AtomicImageXor,
-        &SPIRVDecompiler::AtomicImageExchange,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,

        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
@@ -2639,9 +2682,16 @@ private:
        &SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,

        &SPIRVDecompiler::ThreadId,
+        &SPIRVDecompiler::ThreadMask<0>, // Eq
+        &SPIRVDecompiler::ThreadMask<1>, // Ge
+        &SPIRVDecompiler::ThreadMask<2>, // Gt
+        &SPIRVDecompiler::ThreadMask<3>, // Le
+        &SPIRVDecompiler::ThreadMask<4>, // Lt
        &SPIRVDecompiler::ShuffleIndexed,

-        &SPIRVDecompiler::MemoryBarrierGL,
+        &SPIRVDecompiler::Barrier,
+        &SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
+        &SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

@@ -2717,8 +2767,11 @@ private:
        Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
    const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);

+    const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint);
+
    const Id v_float_zero = Constant(t_float, 0.0f);
    const Id v_float_one = Constant(t_float, 1.0f);
+    const Id v_uint_zero = Constant(t_uint, 0);

    // Nvidia uses these defaults for varyings (e.g. position and generic attributes)
    const Id v_varying_default =
@@ -2743,15 +2796,16 @@ private:
    std::unordered_map<u8, GenericVaryingDescription> output_attributes;
    std::map<u32, Id> constant_buffers;
    std::map<GlobalMemoryBase, Id> global_buffers;
-    std::map<u32, TexelBuffer> texel_buffers;
+    std::map<u32, TexelBuffer> uniform_texels;
    std::map<u32, SampledImage> sampled_images;
+    std::map<u32, TexelBuffer> storage_texels;
    std::map<u32, StorageImage> images;

+    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
    Id instance_index{};
    Id vertex_index{};
    Id base_instance{};
    Id base_vertex{};
-    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
    Id frag_depth{};
    Id frag_coord{};
    Id front_facing{};
@@ -2763,6 +2817,7 @@ private:
    Id workgroup_id{};
    Id local_invocation_id{};
    Id thread_id{};
+    std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt

    VertexIndices in_indices;
    VertexIndices out_indices;
@@ -3006,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
    }
    for (const auto& sampler : ir.GetSamplers()) {
        if (sampler.is_buffer) {
-            entries.texel_buffers.emplace_back(sampler);
+            entries.uniform_texels.emplace_back(sampler);
        } else {
            entries.samplers.emplace_back(sampler);
        }
    }
    for (const auto& image : ir.GetImages()) {
-        entries.images.emplace_back(image);
+        if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+            entries.storage_texels.emplace_back(image);
+        } else {
+            entries.images.emplace_back(image);
+        }
    }
    for (const auto& attribute : ir.GetInputAttributes()) {
        if (IsGenericAttribute(attribute)) {
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -21,8 +21,9 @@ class VKDevice;
 namespace Vulkan {

 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using TexelBufferEntry = VideoCommon::Shader::Sampler;
+using UniformTexelEntry = VideoCommon::Shader::Sampler;
 using SamplerEntry = VideoCommon::Shader::Sampler;
+using StorageTexelEntry = VideoCommon::Shader::Image;
 using ImageEntry = VideoCommon::Shader::Image;

 constexpr u32 DESCRIPTOR_SET = 0;
@@ -66,13 +67,15 @@ private:
 struct ShaderEntries {
    u32 NumBindings() const {
        return static_cast<u32>(const_buffers.size() + global_buffers.size() +
-                                texel_buffers.size() + samplers.size() + images.size());
+                                uniform_texels.size() + samplers.size() + storage_texels.size() +
+                                images.size());
    }

    std::vector<ConstBufferEntry> const_buffers;
    std::vector<GlobalBufferEntry> global_buffers;
-    std::vector<TexelBufferEntry> texel_buffers;
+    std::vector<UniformTexelEntry> uniform_texels;
    std::vector<SamplerEntry> samplers;
+    std::vector<StorageTexelEntry> storage_texels;
    std::vector<ImageEntry> images;
    std::set<u32> attributes;
    std::array<bool, Maxwell::NumClipDistances> clip_distances{};
@@ -88,7 +91,8 @@ struct Specialization final {
    u32 shared_memory_size{};

    // Graphics specific
-    std::optional<float> point_size{};
+    std::optional<float> point_size;
+    std::bitset<Maxwell::NumVertexAttributes> enabled_attributes;
    std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
    bool ndc_minus_one_to_one{};
 };
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,7 +35,7 @@ public:
    /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
    void Unmap(u64 size);

-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const {
        return *buffer;
    }

--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,
    ci.pNext = nullptr;
    ci.flags = 0;
    ci.size = static_cast<VkDeviceSize>(host_memory_size);
-    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
-               VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
+               VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    ci.queueFamilyIndexCount = 0;
    ci.pQueueFamilyIndices = nullptr;
@@ -354,26 +354,23 @@ CachedSurfaceView::~CachedSurfaceView() = default;

 VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
                                         SwizzleSource z_source, SwizzleSource w_source) {
-    const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
-    if (last_image_view && last_swizzle == swizzle) {
+    const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+    if (last_image_view && last_swizzle == new_swizzle) {
        return last_image_view;
    }
-    last_swizzle = swizzle;
+    last_swizzle = new_swizzle;

-    const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle);
+    const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
    auto& image_view = entry->second;
    if (!is_cache_miss) {
        return last_image_view = *image_view;
    }

-    auto swizzle_x = MaxwellToVK::SwizzleSource(x_source);
-    auto swizzle_y = MaxwellToVK::SwizzleSource(y_source);
-    auto swizzle_z = MaxwellToVK::SwizzleSource(z_source);
-    auto swizzle_w = MaxwellToVK::SwizzleSource(w_source);
-
+    std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source),
+                       MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)};
    if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) {
        // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here.
-        std::swap(swizzle_x, swizzle_z);
+        std::swap(swizzle[0], swizzle[2]);
    }

    // Games can sample depth or stencil values on textures. This is decided by the swizzle value on
@@ -395,11 +392,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
            UNIMPLEMENTED();
        }

-        // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity
-        swizzle_x = VK_COMPONENT_SWIZZLE_R;
-        swizzle_y = VK_COMPONENT_SWIZZLE_G;
-        swizzle_z = VK_COMPONENT_SWIZZLE_B;
-        swizzle_w = VK_COMPONENT_SWIZZLE_A;
+        // Make sure we sample the first component
+        std::transform(
+            swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) {
+                return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component;
+            });
    }

    VkImageViewCreateInfo ci;
@@ -409,7 +406,7 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
    ci.image = surface.GetImageHandle();
    ci.viewType = image_view_type;
    ci.format = surface.GetImage().GetFormat();
-    ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
+    ci.components = {swizzle[0], swizzle[1], swizzle[2], swizzle[3]};
    ci.subresourceRange.aspectMask = aspect;
    ci.subresourceRange.baseMipLevel = base_level;
    ci.subresourceRange.levelCount = num_levels;
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
    }
    case OpCode::Id::RED: {
        UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
-        UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
        const auto [real_address, base_address, descriptor] =
            TrackGlobalMemory(bb, instr, true, true);
        if (!real_address || !base_address) {
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
        }
        Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
        Node value = GetRegister(instr.gpr0);
-        bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
+        bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
        break;
    }
    case OpCode::Id::ATOM: {
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                return Operation(OperationCode::YNegate);
            case SystemVariable::InvocationInfo:
                LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
-                return Immediate(0U);
+                return Immediate(0x00ff'0000U);
            case SystemVariable::WscaleFactorXY:
                UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented");
                return Immediate(0U);
@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                return Operation(OperationCode::WorkGroupIdY);
            case SystemVariable::CtaIdZ:
                return Operation(OperationCode::WorkGroupIdZ);
+            case SystemVariable::EqMask:
+            case SystemVariable::LtMask:
+            case SystemVariable::LeMask:
+            case SystemVariable::GtMask:
+            case SystemVariable::GeMask:
+                uses_warps = true;
+                switch (instr.sys20) {
+                case SystemVariable::EqMask:
+                    return Operation(OperationCode::ThreadEqMask);
+                case SystemVariable::LtMask:
+                    return Operation(OperationCode::ThreadLtMask);
+                case SystemVariable::LeMask:
+                    return Operation(OperationCode::ThreadLeMask);
+                case SystemVariable::GtMask:
+                    return Operation(OperationCode::ThreadGtMask);
+                case SystemVariable::GeMask:
+                    return Operation(OperationCode::ThreadGeMask);
+                default:
+                    UNREACHABLE();
+                    return Immediate(0u);
+                }
            default:
                UNIMPLEMENTED_MSG("Unhandled system move: {}",
                                  static_cast<u32>(instr.sys20.Value()));
@@ -272,10 +293,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
        break;
    }
+    case OpCode::Id::BAR: {
+        UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
+        bb.push_back(Operation(OperationCode::Barrier));
+        break;
+    }
    case OpCode::Id::MEMBAR: {
-        UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
        UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
-        bb.push_back(Operation(OperationCode::MemoryBarrierGL));
+        const OperationCode type = [instr] {
+            switch (instr.membar.type) {
+            case Tegra::Shader::MembarType::CTA:
+                return OperationCode::MemoryBarrierGroup;
+            case Tegra::Shader::MembarType::GL:
+                return OperationCode::MemoryBarrierGlobal;
+            default:
+                UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
+                return OperationCode::MemoryBarrierGlobal;
+            }
+        }();
+        bb.push_back(Operation(type));
        break;
    }
    case OpCode::Id::DEPBAR: {
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -226,9 +226,16 @@ enum class OperationCode {
    VoteEqual,    /// (bool) -> bool

    ThreadId,       /// () -> uint
+    ThreadEqMask,   /// () -> uint
+    ThreadGeMask,   /// () -> uint
+    ThreadGtMask,   /// () -> uint
+    ThreadLeMask,   /// () -> uint
+    ThreadLtMask,   /// () -> uint
    ShuffleIndexed, /// (uint value, uint index) -> uint

-    MemoryBarrierGL, /// () -> void
+    Barrier,             /// () -> void
+    MemoryBarrierGroup,  /// () -> void
+    MemoryBarrierGlobal, /// () -> void

    Amount,
 };
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
    ComponentType alpha_component;
    bool is_srgb;
 };
-constexpr std::array<Table, 77> DefinitionTable = {{
+constexpr std::array<Table, 78> DefinitionTable = {{
    {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
    {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
    {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -98,6 +98,7 @@ constexpr std::array<Table, 77> DefinitionTable = {{
    {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
    {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
    {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
+    {TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
    {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},

    {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -14,6 +14,7 @@
 #include <unordered_map>
 #include <vector>

+#include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_map.hpp>
 #include <boost/range/iterator_range.hpp>

@@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;

 template <typename TSurface, typename TView>
 class TextureCache {
+    using VectorSurface = boost::container::small_vector<TSurface, 1>;

 public:
    void InvalidateRegion(VAddr addr, std::size_t size) {
@@ -308,18 +310,20 @@ public:
        dst_surface.first->MarkAsModified(true, Tick());
    }

-    TSurface TryFindFramebufferSurface(VAddr addr) {
+    TSurface TryFindFramebufferSurface(VAddr addr) const {
        if (!addr) {
            return nullptr;
        }
        const VAddr page = addr >> registry_page_bits;
-        std::vector<TSurface>& list = registry[page];
-        for (auto& surface : list) {
-            if (surface->GetCpuAddr() == addr) {
-                return surface;
-            }
+        const auto it = registry.find(page);
+        if (it == registry.end()) {
+            return nullptr;
        }
-        return nullptr;
+        const auto& list = it->second;
+        const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
+            return surface->GetCpuAddr() == addr;
+        });
+        return found != list.end() ? *found : nullptr;
    }

    u64 Tick() {
@@ -498,7 +502,7 @@ private:
     * @param untopological Indicates to the recycler that the texture has no way
     *                      to match the overlaps due to topological reasons.
     **/
-    RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params,
+    RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
                                 const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
        if (Settings::IsGPULevelExtreme()) {
            return RecycleStrategy::Flush;
@@ -538,9 +542,8 @@ private:
     * @param untopological     Indicates to the recycler that the texture has no way to match the
     *                          overlaps due to topological reasons.
     **/
-    std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
-                                              const SurfaceParams& params, const GPUVAddr gpu_addr,
-                                              const bool preserve_contents,
+    std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
+                                              const GPUVAddr gpu_addr, const bool preserve_contents,
                                              const MatchTopologyResult untopological) {
        const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
        for (auto& surface : overlaps) {
@@ -650,47 +653,65 @@ private:
     * @param params   The parameters on the new surface.
     * @param gpu_addr The starting address of the new surface.
     **/
-    std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps,
+    std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
                                                                    const SurfaceParams& params,
-                                                                    const GPUVAddr gpu_addr) {
+                                                                    GPUVAddr gpu_addr) {
        if (params.target == SurfaceTarget::Texture3D) {
-            return {};
+            return std::nullopt;
        }
-        bool modified = false;
+        const auto test_modified = [](TSurface& surface) { return surface->IsModified(); };
        TSurface new_surface = GetUncachedSurface(gpu_addr, params);
-        u32 passed_tests = 0;
+
+        if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) {
+            LoadSurface(new_surface);
+            for (const auto& surface : overlaps) {
+                Unregister(surface);
+            }
+            Register(new_surface);
+            return {{new_surface, new_surface->GetMainView()}};
+        }
+
+        std::size_t passed_tests = 0;
        for (auto& surface : overlaps) {
            const SurfaceParams& src_params = surface->GetSurfaceParams();
-            if (src_params.is_layered || src_params.num_levels > 1) {
-                // We send this cases to recycle as they are more complex to handle
-                return {};
-            }
-            const std::size_t candidate_size = surface->GetSizeInBytes();
-            auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
+            const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
            if (!mipmap_layer) {
                continue;
            }
-            const auto [layer, mipmap] = *mipmap_layer;
-            if (new_surface->GetMipmapSize(mipmap) != candidate_size) {
+            const auto [base_layer, base_mipmap] = *mipmap_layer;
+            if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) {
                continue;
            }
-            modified |= surface->IsModified();
-            // Now we got all the data set up
-            const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
-            const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
-            const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1);
-            passed_tests++;
-            ImageCopy(surface, new_surface, copy_params);
+            ++passed_tests;
+
+            // Copy all mipmaps and layers
+            const u32 block_width = params.GetDefaultBlockWidth();
+            const u32 block_height = params.GetDefaultBlockHeight();
+            for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) {
+                const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
+                const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
+                if (width < block_width || height < block_height) {
+                    // Current APIs forbid copying small compressed textures, avoid errors
+                    break;
+                }
+                const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
+                                             src_params.depth);
+                ImageCopy(surface, new_surface, copy_params);
+            }
        }
        if (passed_tests == 0) {
-            return {};
-            // In Accurate GPU all tests should pass, else we recycle
-        } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
-            return {};
+            return std::nullopt;
        }
+        if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
+            // In Accurate GPU all tests should pass, else we recycle
+            return std::nullopt;
+        }
+
+        const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified);
        for (const auto& surface : overlaps) {
            Unregister(surface);
        }
+
        new_surface->MarkAsModified(modified, Tick());
        Register(new_surface);
        return {{new_surface, new_surface->GetMainView()}};
@@ -708,7 +729,7 @@ private:
     * @param preserve_contents Indicates that the new surface should be loaded from memory or
     *                          left blank.
     */
-    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
+    std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
                                                               const SurfaceParams& params,
                                                               const GPUVAddr gpu_addr,
                                                               const VAddr cpu_addr,
@@ -810,7 +831,7 @@ private:
            TSurface& current_surface = iter->second;
            const auto topological_result = current_surface->MatchesTopology(params);
            if (topological_result != MatchTopologyResult::FullMatch) {
-                std::vector<TSurface> overlaps{current_surface};
+                VectorSurface overlaps{current_surface};
                return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                      topological_result);
            }
@@ -868,12 +889,9 @@ private:
            // two things either the candidate surface is a supertexture of the overlap
            // or they don't match in any known way.
            if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) {
-                if (current_surface->GetGpuAddr() == gpu_addr) {
-                    std::optional<std::pair<TSurface, TView>> view =
-                        TryReconstructSurface(overlaps, params, gpu_addr);
-                    if (view) {
-                        return *view;
-                    }
+                const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr);
+                if (view) {
+                    return *view;
                }
                return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                      MatchTopologyResult::FullMatch);
@@ -991,7 +1009,9 @@ private:
        params.target = target;
        params.is_tiled = false;
        params.srgb_conversion = false;
-        params.is_layered = false;
+        params.is_layered =
+            target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray ||
+            target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray;
        params.block_width = 0;
        params.block_height = 0;
        params.block_depth = 0;
@@ -1124,23 +1144,25 @@ private:
        }
    }

-    std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
+    VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
        if (size == 0) {
            return {};
        }
        const VAddr cpu_addr_end = cpu_addr + size;
-        VAddr start = cpu_addr >> registry_page_bits;
        const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
-        std::vector<TSurface> surfaces;
-        while (start <= end) {
-            std::vector<TSurface>& list = registry[start];
-            for (auto& surface : list) {
-                if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) {
-                    surface->MarkAsPicked(true);
-                    surfaces.push_back(surface);
-                }
+        VectorSurface surfaces;
+        for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
+            const auto it = registry.find(start);
+            if (it == registry.end()) {
+                continue;
+            }
+            for (auto& surface : it->second) {
+                if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
+                    continue;
+                }
+                surface->MarkAsPicked(true);
+                surfaces.push_back(surface);
            }
-            start++;
        }
        for (auto& surface : surfaces) {
            surface->MarkAsPicked(false);
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -106,6 +106,9 @@ public:
        format.setVersion(4, 3);
        format.setProfile(QSurfaceFormat::CompatibilityProfile);
        format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
+        if (Settings::values.renderer_debug) {
+            format.setOption(QSurfaceFormat::FormatOption::DebugContext);
+        }
        // TODO: expose a setting for buffer value (ie default/single/double/triple)
        format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
        format.setSwapInterval(0);
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() {
    Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();
    Settings::values.disable_cpu_opt =
        ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool();
+    Settings::values.disable_macro_jit =
+        ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();

    qt_config->endGroup();
 }
@@ -643,6 +645,8 @@ void Config::ReadRendererValues() {
    Settings::values.use_asynchronous_gpu_emulation =
        ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
    Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
+    Settings::values.use_assembly_shaders =
+        ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
    Settings::values.use_fast_gpu_time =
        ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
    Settings::values.force_30fps_mode =
@@ -1009,6 +1013,7 @@ void Config::SaveDebuggingValues() {
    WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);
    WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false);
    WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false);
+    WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);

    qt_config->endGroup();
 }
@@ -1090,6 +1095,8 @@ void Config::SaveRendererValues() {
    WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
                 Settings::values.use_asynchronous_gpu_emulation, false);
    WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
+    WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
+                 false);
    WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
    WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);

--- a/src/yuzu/configuration/configure_debug.cpp
+++ b/src/yuzu/configuration/configure_debug.cpp
@@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {
    ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);
    ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());
    ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug);
+    ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn());
+    ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);
 }

 void ConfigureDebug::ApplyConfiguration() {
@@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {
    Settings::values.quest_flag = ui->quest_flag->isChecked();
    Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();
    Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
+    Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
    Debugger::ToggleConsole();
    Log::Filter filter;
    filter.ParseFilterString(Settings::values.log_filter);
--- a/src/yuzu/configuration/configure_debug.ui
+++ b/src/yuzu/configuration/configure_debug.ui
@@ -148,6 +148,19 @@
        </property>
       </widget>
      </item>
+      <item>
+       <widget class="QCheckBox" name="disable_macro_jit">
+        <property name="enabled">
+         <bool>true</bool>
+        </property>
+        <property name="whatsThis">
+         <string>When checked, it disables the macro Just In Time compiler. Enabled this makes games run slower</string>
+        </property>
+        <property name="text">
+         <string>Disable Macro JIT</string>
+        </property>
+       </widget>
+      </item>
     </layout>
    </widget>
   </item>
--- a/src/yuzu/configuration/configure_graphics_advanced.cpp
+++ b/src/yuzu/configuration/configure_graphics_advanced.cpp
@@ -12,6 +12,9 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)

    ui->setupUi(this);

+    // TODO: Remove this after assembly shaders are fully integrated
+    ui->use_assembly_shaders->setVisible(false);
+
    SetConfiguration();
 }

@@ -22,6 +25,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() {
    ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
    ui->use_vsync->setEnabled(runtime_lock);
    ui->use_vsync->setChecked(Settings::values.use_vsync);
+    ui->use_assembly_shaders->setEnabled(runtime_lock);
+    ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
    ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
    ui->force_30fps_mode->setEnabled(runtime_lock);
    ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
@@ -33,6 +38,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() {
    auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
    Settings::values.gpu_accuracy = gpu_accuracy;
    Settings::values.use_vsync = ui->use_vsync->isChecked();
+    Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
    Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
    Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
    Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();
--- a/src/yuzu/configuration/configure_graphics_advanced.ui
+++ b/src/yuzu/configuration/configure_graphics_advanced.ui
@@ -62,6 +62,16 @@
          </property>
         </widget>
        </item>
+        <item>
+         <widget class="QCheckBox" name="use_assembly_shaders">
+          <property name="toolTip">
+           <string>Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental.</string>
+          </property>
+          <property name="text">
+           <string>Use assembly shaders (experimental, Nvidia OpenGL only)</string>
+          </property>
+         </widget>
+        </item>
        <item>
         <widget class="QCheckBox" name="force_30fps_mode">
          <property name="text">
--- a/src/yuzu/configuration/configure_input_player.cpp
+++ b/src/yuzu/configuration/configure_input_player.cpp
@@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() {
            SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]);
        }
    }
+
    UpdateButtonLabels();
+    ApplyConfiguration();
 }

 void ConfigureInputPlayer::ClearAll() {
@@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() {
    }

    UpdateButtonLabels();
+    ApplyConfiguration();
 }

 void ConfigureInputPlayer::UpdateButtonLabels() {
--- a/src/yuzu/discord_impl.cpp
+++ b/src/yuzu/discord_impl.cpp
@@ -18,7 +18,7 @@ DiscordImpl::DiscordImpl() {

    // The number is the client ID for yuzu, it's used for images and the
    // application name
-    Discord_Initialize("471872241299226636", &handlers, 1, nullptr);
+    Discord_Initialize("712465656758665259", &handlers, 1, nullptr);
 }

 DiscordImpl::~DiscordImpl() {
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -65,6 +65,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
 #include "common/logging/backend.h"
 #include "common/logging/filter.h"
 #include "common/logging/log.h"
+#include "common/memory_detect.h"
 #include "common/microprofile.h"
 #include "common/scm_rev.h"
 #include "common/scope_exit.h"
@@ -219,6 +220,10 @@ GMainWindow::GMainWindow()
    LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string);
 #endif
    LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString());
+    LOG_INFO(Frontend, "Host RAM: {:.2f} GB",
+             Common::GetMemInfo().TotalPhysicalMemory / 1024.0f / 1024 / 1024);
+    LOG_INFO(Frontend, "Host Swap: {:.2f} GB",
+             Common::GetMemInfo().TotalSwapMemory / 1024.0f / 1024 / 1024);
    UpdateWindowTitle();

    show();
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -397,6 +397,8 @@ void Config::ReadValues() {
        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
    Settings::values.use_vsync =
        static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
+    Settings::values.use_assembly_shaders =
+        sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
    Settings::values.use_fast_gpu_time =
        sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);

@@ -430,6 +432,8 @@ void Config::ReadValues() {
    Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);
    Settings::values.disable_cpu_opt =
        sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false);
+    Settings::values.disable_macro_jit =
+        sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);

    const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");
    std::stringstream ss(title_list);
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -134,6 +134,10 @@ max_anisotropy =
 # 0 (default): Off, 1: On
 use_vsync =

+# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required.
+# 0 (default): Off, 1: On
+use_assembly_shaders =
+
 # Turns on the frame limiter, which will limit frames output to the target game speed
 # 0: Off, 1: On (default)
 use_frame_limit =
@@ -287,6 +291,8 @@ quest_flag =
 # Determines whether or not JIT CPU optimizations are enabled
 # false: Optimizations Enabled, true: Optimizations Disabled
 disable_cpu_opt =
+# Enables/Disables the macro JIT compiler
+disable_macro_jit=false

 [WebService]
 # Whether or not to enable telemetry
--- a/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
+++ b/src/yuzu_cmd/emu_window/emu_window_sdl2_gl.cpp
@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
    SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
    SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
    SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
+    if (Settings::values.renderer_debug) {
+        SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+    }
    SDL_GL_SetSwapInterval(0);

    std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,