bcat: Disable Boxcat backend by default

This commit disables the Boxcat backend by default for new users of yuzu. There are several reasons for this: 1. Boxcat currently only has an impact on 3 games and doesn't influence any of their core mechanics. 2. It causes a plethora of issues when enabled, such as games like Crash Team Racing, Diablo 3 and Tales of Vesperia not booting at all or hanging. 3. It causes https://github.com/yuzu-emu/yuzu/issues/2957 to happen, which makes the configuration menu totally unusable for many Linux users of yuzu. I think these points show that the negative impact of Boxcat currently outweighs its benefits and that it should therefore be disabled by default. Users who are eager to use the extra features it provides can still turn it on in the settings.
Merge pull request #3521 from ReinUsesLisp/nsight-debug
2020-03-17 15:24:26 +01:00 · 2020-03-16 22:52:42 -04:00 · 2020-03-16 04:03:34 -03:00 · 2020-03-16 03:59:08 -03:00 · 2020-03-15 21:26:54 -03:00 · 2020-03-15 21:24:53 -03:00
148 changed files with 5240 additions and 4086 deletions
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -57,8 +57,6 @@ set(HASH_FILES
    "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h"
-    "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp"
-    "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h"
    "${VIDEO_CORE}/shader/decode/arithmetic.cpp"
    "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp"
    "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp"
@@ -91,8 +89,6 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/ast.h"
    "${VIDEO_CORE}/shader/compiler_settings.cpp"
    "${VIDEO_CORE}/shader/compiler_settings.h"
-    "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
-    "${VIDEO_CORE}/shader/const_buffer_locker.h"
    "${VIDEO_CORE}/shader/control_flow.cpp"
    "${VIDEO_CORE}/shader/control_flow.h"
    "${VIDEO_CORE}/shader/decode.cpp"
@@ -101,9 +97,13 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/node.h"
    "${VIDEO_CORE}/shader/node_helper.cpp"
    "${VIDEO_CORE}/shader/node_helper.h"
+    "${VIDEO_CORE}/shader/registry.cpp"
+    "${VIDEO_CORE}/shader/registry.h"
    "${VIDEO_CORE}/shader/shader_ir.cpp"
    "${VIDEO_CORE}/shader/shader_ir.h"
    "${VIDEO_CORE}/shader/track.cpp"
+    "${VIDEO_CORE}/shader/transform_feedback.cpp"
+    "${VIDEO_CORE}/shader/transform_feedback.h"
 )
 set(COMBINED "")
 foreach (F IN LISTS HASH_FILES)
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 yuzu emulator
 =============
-[![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu)
+[![Travis CI Build Status](https://travis-ci.com/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.com/yuzu-emu/yuzu)
 [![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/)

 yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).
@@ -21,7 +21,7 @@ For development discussion, please join us on [Discord](https://discord.gg/XQV6d

 Most of the development happens on GitHub. It's also where [our central repository](https://github.com/yuzu-emu/yuzu) is hosted.

-If you want to contribute please take a look at the [Contributor's Guide](CONTRIBUTING.md) and [Developer Information](https://github.com/yuzu-emu/yuzu/wiki/Developer-Information). You should as well contact any of the developers on Discord in order to know about the current state of the emulator.
+If you want to contribute please take a look at the [Contributor's Guide](CONTRIBUTING.md) and [Developer Information](https://github.com/yuzu-emu/yuzu/wiki/Developer-Information). You should also contact any of the developers on Discord in order to know about the current state of the emulator.

 ### Building

--- a/externals/microprofile/microprofile.h
+++ b/externals/microprofile/microprofile.h
@@ -243,6 +243,7 @@ typedef uint32_t ThreadIdType;
 #define MICROPROFILE_DEFINE_GPU(var, name, color) MicroProfileToken g_mp_##var = MicroProfileGetToken("GPU", name, color, MicroProfileTokenTypeGpu)
 #define MICROPROFILE_TOKEN_PASTE0(a, b) a ## b
 #define MICROPROFILE_TOKEN_PASTE(a, b)  MICROPROFILE_TOKEN_PASTE0(a,b)
+#define MICROPROFILE_TOKEN(var) g_mp_##var
 #define MICROPROFILE_SCOPE(var) MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(g_mp_##var)
 #define MICROPROFILE_SCOPE_TOKEN(token) MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(token)
 #define MICROPROFILE_SCOPEI(group, name, color) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__) = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeCpu); MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo,__LINE__)( MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__))
--- a/src/audio_core/algorithm/interpolate.cpp
+++ b/src/audio_core/algorithm/interpolate.cpp
@@ -8,13 +8,14 @@
 #include <climits>
 #include <cmath>
 #include <vector>
+
 #include "audio_core/algorithm/interpolate.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"

 namespace AudioCore {

-constexpr std::array<s16, 512> curve_lut0 = {
+constexpr std::array<s16, 512> curve_lut0{
    6600,  19426, 6722,  3,     6479,  19424, 6845,  9,     6359,  19419, 6968,  15,    6239,
    19412, 7093,  22,    6121,  19403, 7219,  28,    6004,  19391, 7345,  34,    5888,  19377,
    7472,  41,    5773,  19361, 7600,  48,    5659,  19342, 7728,  55,    5546,  19321, 7857,
@@ -56,7 +57,7 @@ constexpr std::array<s16, 512> curve_lut0 = {
    19403, 6121,  22,    7093,  19412, 6239,  15,    6968,  19419, 6359,  9,     6845,  19424,
    6479,  3,     6722,  19426, 6600};

-constexpr std::array<s16, 512> curve_lut1 = {
+constexpr std::array<s16, 512> curve_lut1{
    -68,   32639, 69,    -5,    -200,  32630, 212,   -15,   -328,  32613, 359,   -26,   -450,
    32586, 512,   -36,   -568,  32551, 669,   -47,   -680,  32507, 832,   -58,   -788,  32454,
    1000,  -69,   -891,  32393, 1174,  -80,   -990,  32323, 1352,  -92,   -1084, 32244, 1536,
@@ -98,7 +99,7 @@ constexpr std::array<s16, 512> curve_lut1 = {
    32551, -568,  -36,   512,   32586, -450,  -26,   359,   32613, -328,  -15,   212,   32630,
    -200,  -5,    69,    32639, -68};

-constexpr std::array<s16, 512> curve_lut2 = {
+constexpr std::array<s16, 512> curve_lut2{
    3195,  26287, 3329,  -32,   3064,  26281, 3467,  -34,   2936,  26270, 3608,  -38,   2811,
    26253, 3751,  -42,   2688,  26230, 3897,  -46,   2568,  26202, 4046,  -50,   2451,  26169,
    4199,  -54,   2338,  26130, 4354,  -58,   2227,  26085, 4512,  -63,   2120,  26035, 4673,
@@ -146,10 +147,10 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,

    if (ratio <= 0) {
        LOG_CRITICAL(Audio, "Nonsensical interpolation ratio {}", ratio);
-        ratio = 1.0;
+        return input;
    }

-    const int step = static_cast<int>(ratio * 0x8000);
+    const s32 step{static_cast<s32>(ratio * 0x8000)};
    const std::array<s16, 512>& lut = [step] {
        if (step > 0xaaaa) {
            return curve_lut0;
@@ -160,28 +161,37 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,
        return curve_lut2;
    }();

-    std::vector<s16> output(static_cast<std::size_t>(input.size() / ratio));
-    int in_offset = 0;
-    for (std::size_t out_offset = 0; out_offset < output.size(); out_offset += 2) {
-        const int lut_index = (state.fraction >> 8) * 4;
+    const std::size_t num_frames{input.size() / 2};

-        const int l = input[(in_offset + 0) * 2 + 0] * lut[lut_index + 0] +
-                      input[(in_offset + 1) * 2 + 0] * lut[lut_index + 1] +
-                      input[(in_offset + 2) * 2 + 0] * lut[lut_index + 2] +
-                      input[(in_offset + 3) * 2 + 0] * lut[lut_index + 3];
+    std::vector<s16> output;
+    output.reserve(static_cast<std::size_t>(input.size() / ratio + InterpolationState::taps));

-        const int r = input[(in_offset + 0) * 2 + 1] * lut[lut_index + 0] +
-                      input[(in_offset + 1) * 2 + 1] * lut[lut_index + 1] +
-                      input[(in_offset + 2) * 2 + 1] * lut[lut_index + 2] +
-                      input[(in_offset + 3) * 2 + 1] * lut[lut_index + 3];
+    for (std::size_t frame{}; frame < num_frames; ++frame) {
+        const std::size_t lut_index{(state.fraction >> 8) * InterpolationState::taps};

-        const int new_offset = state.fraction + step;
+        std::rotate(state.history.begin(), state.history.end() - 1, state.history.end());
+        state.history[0][0] = input[frame * 2 + 0];
+        state.history[0][1] = input[frame * 2 + 1];

-        in_offset += new_offset >> 15;
-        state.fraction = new_offset & 0x7fff;
+        while (state.position <= 1.0) {
+            const s32 left{state.history[0][0] * lut[lut_index + 0] +
+                           state.history[1][0] * lut[lut_index + 1] +
+                           state.history[2][0] * lut[lut_index + 2] +
+                           state.history[3][0] * lut[lut_index + 3]};
+            const s32 right{state.history[0][1] * lut[lut_index + 0] +
+                            state.history[1][1] * lut[lut_index + 1] +
+                            state.history[2][1] * lut[lut_index + 2] +
+                            state.history[3][1] * lut[lut_index + 3]};
+            const s32 new_offset{state.fraction + step};

-        output[out_offset + 0] = static_cast<s16>(std::clamp(l >> 15, SHRT_MIN, SHRT_MAX));
-        output[out_offset + 1] = static_cast<s16>(std::clamp(r >> 15, SHRT_MIN, SHRT_MAX));
+            state.fraction = new_offset & 0x7fff;
+
+            output.emplace_back(static_cast<s16>(std::clamp(left >> 15, SHRT_MIN, SHRT_MAX)));
+            output.emplace_back(static_cast<s16>(std::clamp(right >> 15, SHRT_MIN, SHRT_MAX)));
+
+            state.position += ratio;
+        }
+        state.position -= 1.0;
    }

    return output;
--- a/src/audio_core/algorithm/interpolate.h
+++ b/src/audio_core/algorithm/interpolate.h
@@ -6,12 +6,17 @@

 #include <array>
 #include <vector>
+
 #include "common/common_types.h"

 namespace AudioCore {

 struct InterpolationState {
-    int fraction = 0;
+    static constexpr std::size_t taps{4};
+    static constexpr std::size_t history_size{taps * 2 - 1};
+    std::array<std::array<s16, 2>, history_size> history{};
+    double position{};
+    s32 fraction{};
 };

 /// Interpolates input signal to produce output signal.
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -8,6 +8,7 @@
 #include "audio_core/cubeb_sink.h"
 #include "audio_core/stream.h"
 #include "audio_core/time_stretch.h"
+#include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/ring_buffer.h"
 #include "core/settings.h"
@@ -65,12 +66,25 @@ public:
    void EnqueueSamples(u32 source_num_channels, const std::vector<s16>& samples) override {
        if (source_num_channels > num_channels) {
            // Downsample 6 channels to 2
+            ASSERT_MSG(source_num_channels == 6, "Channel count must be 6");
+
            std::vector<s16> buf;
            buf.reserve(samples.size() * num_channels / source_num_channels);
            for (std::size_t i = 0; i < samples.size(); i += source_num_channels) {
-                for (std::size_t ch = 0; ch < num_channels; ch++) {
-                    buf.push_back(samples[i + ch]);
-                }
+                // Downmixing implementation taken from the ATSC standard
+                const s16 left{samples[i + 0]};
+                const s16 right{samples[i + 1]};
+                const s16 center{samples[i + 2]};
+                const s16 surround_left{samples[i + 4]};
+                const s16 surround_right{samples[i + 5]};
+                // Not used in the ATSC reference implementation
+                [[maybe_unused]] const s16 low_frequency_effects { samples[i + 3] };
+
+                constexpr s32 clev{707}; // center mixing level coefficient
+                constexpr s32 slev{707}; // surround mixing level coefficient
+
+                buf.push_back(left + (clev * center / 1000) + (slev * surround_left / 1000));
+                buf.push_back(right + (clev * center / 1000) + (slev * surround_right / 1000));
            }
            queue.Push(buf);
            return;
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -38,8 +38,6 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h"
-      "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp"
-      "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h"
      "${VIDEO_CORE}/shader/decode/arithmetic.cpp"
      "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp"
      "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp"
@@ -72,8 +70,6 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/ast.h"
      "${VIDEO_CORE}/shader/compiler_settings.cpp"
      "${VIDEO_CORE}/shader/compiler_settings.h"
-      "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
-      "${VIDEO_CORE}/shader/const_buffer_locker.h"
      "${VIDEO_CORE}/shader/control_flow.cpp"
      "${VIDEO_CORE}/shader/control_flow.h"
      "${VIDEO_CORE}/shader/decode.cpp"
@@ -82,9 +78,13 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/node.h"
      "${VIDEO_CORE}/shader/node_helper.cpp"
      "${VIDEO_CORE}/shader/node_helper.h"
+      "${VIDEO_CORE}/shader/registry.cpp"
+      "${VIDEO_CORE}/shader/registry.h"
      "${VIDEO_CORE}/shader/shader_ir.cpp"
      "${VIDEO_CORE}/shader/shader_ir.h"
      "${VIDEO_CORE}/shader/track.cpp"
+      "${VIDEO_CORE}/shader/transform_feedback.cpp"
+      "${VIDEO_CORE}/shader/transform_feedback.h"
      # and also check that the scm_rev files haven't changed
      "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.cpp.in"
      "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.h"
--- a/src/common/math_util.h
+++ b/src/common/math_util.h
@@ -24,17 +24,29 @@ struct Rectangle {
        : left(left), top(top), right(right), bottom(bottom) {}

    T GetWidth() const {
-        return std::abs(static_cast<std::make_signed_t<T>>(right - left));
+        if constexpr (std::is_floating_point_v<T>) {
+            return std::abs(right - left);
+        } else {
+            return std::abs(static_cast<std::make_signed_t<T>>(right - left));
+        }
    }
+
    T GetHeight() const {
-        return std::abs(static_cast<std::make_signed_t<T>>(bottom - top));
+        if constexpr (std::is_floating_point_v<T>) {
+            return std::abs(bottom - top);
+        } else {
+            return std::abs(static_cast<std::make_signed_t<T>>(bottom - top));
+        }
    }
+
    Rectangle<T> TranslateX(const T x) const {
        return Rectangle{left + x, top, right + x, bottom};
    }
+
    Rectangle<T> TranslateY(const T y) const {
        return Rectangle{left, top + y, right, bottom + y};
    }
+
    Rectangle<T> Scale(const float s) const {
        return Rectangle{left, top, static_cast<T>(left + GetWidth() * s),
                         static_cast<T>(top + GetHeight() * s)};
--- a/src/common/page_table.cpp
+++ b/src/common/page_table.cpp
@@ -16,7 +16,6 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) {

    pointers.resize(num_page_table_entries);
    attributes.resize(num_page_table_entries);
-    backing_addr.resize(num_page_table_entries);

    // The default is a 39-bit address space, which causes an initial 1GB allocation size. If the
    // vector size is subsequently decreased (via resize), the vector might not automatically
@@ -25,6 +24,17 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) {

    pointers.shrink_to_fit();
    attributes.shrink_to_fit();
+}
+
+BackingPageTable::BackingPageTable(std::size_t page_size_in_bits) : PageTable{page_size_in_bits} {}
+
+BackingPageTable::~BackingPageTable() = default;
+
+void BackingPageTable::Resize(std::size_t address_space_width_in_bits) {
+    PageTable::Resize(address_space_width_in_bits);
+    const std::size_t num_page_table_entries = 1ULL
+                                               << (address_space_width_in_bits - page_size_in_bits);
+    backing_addr.resize(num_page_table_entries);
    backing_addr.shrink_to_fit();
 }

--- a/src/common/page_table.h
+++ b/src/common/page_table.h
@@ -76,9 +76,20 @@ struct PageTable {
     */
    std::vector<PageType> attributes;

-    std::vector<u64> backing_addr;
-
    const std::size_t page_size_in_bits{};
 };

+/**
+ * A more advanced page table that can also store a backing address for each entry, for use
+ * when it depends on another MMU.
+ */
+struct BackingPageTable : PageTable {
+    explicit BackingPageTable(std::size_t page_size_in_bits);
+    ~BackingPageTable();
+
+    void Resize(std::size_t address_space_width_in_bits);
+
+    std::vector<u64> backing_addr;
+};
+
 } // namespace Common
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -595,8 +595,12 @@ endif()

 if (ARCHITECTURE_x86_64)
    target_sources(core PRIVATE
-        arm/dynarmic/arm_dynarmic.cpp
-        arm/dynarmic/arm_dynarmic.h
+        arm/dynarmic/arm_dynarmic_32.cpp
+        arm/dynarmic/arm_dynarmic_32.h
+        arm/dynarmic/arm_dynarmic_64.cpp
+        arm/dynarmic/arm_dynarmic_64.h
+        arm/dynarmic/arm_dynarmic_cp15.cpp
+        arm/dynarmic/arm_dynarmic_cp15.h
    )
    target_link_libraries(core PRIVATE dynarmic)
 endif()
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -25,7 +25,20 @@ public:
    explicit ARM_Interface(System& system_) : system{system_} {}
    virtual ~ARM_Interface() = default;

-    struct ThreadContext {
+    struct ThreadContext32 {
+        std::array<u32, 16> cpu_registers;
+        u32 cpsr;
+        std::array<u8, 4> padding;
+        std::array<u64, 32> fprs;
+        u32 fpscr;
+        u32 fpexc;
+        u32 tpidr;
+    };
+    // Internally within the kernel, it expects the AArch32 version of the
+    // thread context to be 344 bytes in size.
+    static_assert(sizeof(ThreadContext32) == 0x158);
+
+    struct ThreadContext64 {
        std::array<u64, 31> cpu_registers;
        u64 sp;
        u64 pc;
@@ -38,7 +51,7 @@ public:
    };
    // Internally within the kernel, it expects the AArch64 version of the
    // thread context to be 800 bytes in size.
-    static_assert(sizeof(ThreadContext) == 0x320);
+    static_assert(sizeof(ThreadContext64) == 0x320);

    /// Runs the CPU until an event happens
    virtual void Run() = 0;
@@ -130,17 +143,10 @@ public:
     */
    virtual void SetTPIDR_EL0(u64 value) = 0;

-    /**
-     * Saves the current CPU context
-     * @param ctx Thread context to save
-     */
-    virtual void SaveContext(ThreadContext& ctx) = 0;
-
-    /**
-     * Loads a CPU context
-     * @param ctx Thread context to load
-     */
-    virtual void LoadContext(const ThreadContext& ctx) = 0;
+    virtual void SaveContext(ThreadContext32& ctx) = 0;
+    virtual void SaveContext(ThreadContext64& ctx) = 0;
+    virtual void LoadContext(const ThreadContext32& ctx) = 0;
+    virtual void LoadContext(const ThreadContext64& ctx) = 0;

    /// Clears the exclusive monitor's state.
    virtual void ClearExclusiveState() = 0;
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -0,0 +1,208 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cinttypes>
+#include <memory>
+#include <dynarmic/A32/a32.h>
+#include <dynarmic/A32/config.h>
+#include <dynarmic/A32/context.h>
+#include "common/microprofile.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
+#include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/core.h"
+#include "core/core_manager.h"
+#include "core/core_timing.h"
+#include "core/hle/kernel/svc.h"
+#include "core/memory.h"
+
+namespace Core {
+
+class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks {
+public:
+    explicit DynarmicCallbacks32(ARM_Dynarmic_32& parent) : parent(parent) {}
+
+    u8 MemoryRead8(u32 vaddr) override {
+        return parent.system.Memory().Read8(vaddr);
+    }
+    u16 MemoryRead16(u32 vaddr) override {
+        return parent.system.Memory().Read16(vaddr);
+    }
+    u32 MemoryRead32(u32 vaddr) override {
+        return parent.system.Memory().Read32(vaddr);
+    }
+    u64 MemoryRead64(u32 vaddr) override {
+        return parent.system.Memory().Read64(vaddr);
+    }
+
+    void MemoryWrite8(u32 vaddr, u8 value) override {
+        parent.system.Memory().Write8(vaddr, value);
+    }
+    void MemoryWrite16(u32 vaddr, u16 value) override {
+        parent.system.Memory().Write16(vaddr, value);
+    }
+    void MemoryWrite32(u32 vaddr, u32 value) override {
+        parent.system.Memory().Write32(vaddr, value);
+    }
+    void MemoryWrite64(u32 vaddr, u64 value) override {
+        parent.system.Memory().Write64(vaddr, value);
+    }
+
+    void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
+        UNIMPLEMENTED();
+    }
+
+    /// Dynarmic callback for an exception raised by guest code. No exception kind is handled
+    /// specially yet: every one is logged and then reported as unimplemented.
+    /// @param pc        Guest address supplied with the exception; its code word is logged too.
+    /// @param exception Exception kind supplied by Dynarmic; logged as its numeric value.
+    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
+        // UndefinedInstruction, UnpredictableInstruction and Breakpoint all share this path.
+        // Logged under Core_ARM, the category the 64-bit Dynarmic backend uses, since this is
+        // a CPU-side event rather than a GPU one.
+        LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+                     static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
+        UNIMPLEMENTED();
+    }
+
+    void CallSVC(u32 swi) override {
+        Kernel::CallSVC(parent.system, swi);
+    }
+
+    void AddTicks(u64 ticks) override {
+        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
+        // rough approximation of the amount of executed ticks in the system, it may be thrown off
+        // if not all cores are doing a similar amount of work. Instead of doing this, we should
+        // devise a way so that timing is consistent across all cores without increasing the ticks 4
+        // times.
+        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        // Always execute at least one tick.
+        amortized_ticks = std::max<u64>(amortized_ticks, 1);
+
+        parent.system.CoreTiming().AddTicks(amortized_ticks);
+        num_interpreted_instructions = 0;
+    }
+    u64 GetTicksRemaining() override {
+        return std::max(parent.system.CoreTiming().GetDowncount(), {});
+    }
+
+    ARM_Dynarmic_32& parent;
+    std::size_t num_interpreted_instructions{};
+    u64 tpidrro_el0{};
+    u64 tpidr_el0{};
+};
+
+/// Creates an A32 JIT bound to this core's callbacks and CP15 registers; args are unused so far.
+std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table,
+                                                             std::size_t address_space_bits) const {
+    Dynarmic::A32::UserConfig config;
+    config.callbacks = cb.get();
+    // TODO(bunnei): Implement page table for 32-bit (config.page_table = &page_table.pointers)
+    // Explicit const_cast: this method is const, yet CP15 is given a non-const register pointer.
+    config.coprocessors[15] = std::make_shared<DynarmicCP15>(const_cast<u32*>(CP15_regs.data()));
+    config.define_unpredictable_behaviour = true;
+    return std::make_shared<Dynarmic::A32::Jit>(config);
+}
+
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_32, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
+
+void ARM_Dynarmic_32::Run() {
+    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_32);
+    jit->Run();
+}
+
+void ARM_Dynarmic_32::Step() {
+    cb->InterpreterFallback(jit->Regs()[15], 1);
+}
+
+ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
+                                 std::size_t core_index)
+    : ARM_Interface{system},
+      cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index},
+      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
+
+ARM_Dynarmic_32::~ARM_Dynarmic_32() = default;
+
+void ARM_Dynarmic_32::SetPC(u64 pc) {
+    jit->Regs()[15] = static_cast<u32>(pc);
+}
+
+u64 ARM_Dynarmic_32::GetPC() const {
+    return jit->Regs()[15];
+}
+
+u64 ARM_Dynarmic_32::GetReg(int index) const {
+    return jit->Regs()[index];
+}
+
+void ARM_Dynarmic_32::SetReg(int index, u64 value) {
+    jit->Regs()[index] = static_cast<u32>(value);
+}
+
+u128 ARM_Dynarmic_32::GetVectorReg(int index) const {
+    return {};
+}
+
+void ARM_Dynarmic_32::SetVectorReg(int index, u128 value) {}
+
+u32 ARM_Dynarmic_32::GetPSTATE() const {
+    return jit->Cpsr();
+}
+
+void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) {
+    jit->SetCpsr(cpsr);
+}
+
+u64 ARM_Dynarmic_32::GetTlsAddress() const {
+    return CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+}
+
+void ARM_Dynarmic_32::SetTlsAddress(VAddr address) {
+    CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)] = static_cast<u32>(address);
+}
+
+u64 ARM_Dynarmic_32::GetTPIDR_EL0() const {
+    return cb->tpidr_el0;
+}
+
+void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) {
+    cb->tpidr_el0 = value;
+}
+
+void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) {
+    Dynarmic::A32::Context context;
+    jit->SaveContext(context);
+    ctx.cpu_registers = context.Regs();
+    ctx.cpsr = context.Cpsr();
+}
+
+void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) {
+    Dynarmic::A32::Context context;
+    context.Regs() = ctx.cpu_registers;
+    context.SetCpsr(ctx.cpsr);
+    jit->LoadContext(context);
+}
+
+void ARM_Dynarmic_32::PrepareReschedule() {
+    jit->HaltExecution();
+}
+
+void ARM_Dynarmic_32::ClearInstructionCache() {
+    jit->ClearCache();
+}
+
+void ARM_Dynarmic_32::ClearExclusiveState() {}
+
+void ARM_Dynarmic_32::PageTableChanged(Common::PageTable& page_table,
+                                       std::size_t new_address_space_size_in_bits) {
+    auto key = std::make_pair(&page_table, new_address_space_size_in_bits);
+    auto iter = jit_cache.find(key);
+    if (iter != jit_cache.end()) {
+        jit = iter->second;
+        return;
+    }
+    jit = MakeJit(page_table, new_address_space_size_in_bits);
+    jit_cache.emplace(key, jit);
+}
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -0,0 +1,77 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include <dynarmic/A32/a32.h>
+#include <dynarmic/A64/a64.h>
+#include <dynarmic/A64/exclusive_monitor.h>
+#include "common/common_types.h"
+#include "common/hash.h"
+#include "core/arm/arm_interface.h"
+#include "core/arm/exclusive_monitor.h"
+
+namespace Memory {
+class Memory;
+}
+
+namespace Core {
+
+class DynarmicCallbacks32;
+class DynarmicExclusiveMonitor;
+class System;
+
+class ARM_Dynarmic_32 final : public ARM_Interface {
+public:
+    ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ~ARM_Dynarmic_32() override;
+
+    void SetPC(u64 pc) override;
+    u64 GetPC() const override;
+    u64 GetReg(int index) const override;
+    void SetReg(int index, u64 value) override;
+    u128 GetVectorReg(int index) const override;
+    void SetVectorReg(int index, u128 value) override;
+    u32 GetPSTATE() const override;
+    void SetPSTATE(u32 pstate) override;
+    void Run() override;
+    void Step() override;
+    VAddr GetTlsAddress() const override;
+    void SetTlsAddress(VAddr address) override;
+    void SetTPIDR_EL0(u64 value) override;
+    u64 GetTPIDR_EL0() const override;
+
+    void SaveContext(ThreadContext32& ctx) override;
+    void SaveContext(ThreadContext64& ctx) override {}
+    void LoadContext(const ThreadContext32& ctx) override;
+    void LoadContext(const ThreadContext64& ctx) override {}
+
+    void PrepareReschedule() override;
+    void ClearExclusiveState() override;
+
+    void ClearInstructionCache() override;
+    void PageTableChanged(Common::PageTable& new_page_table,
+                          std::size_t new_address_space_size_in_bits) override;
+
+private:
+    std::shared_ptr<Dynarmic::A32::Jit> MakeJit(Common::PageTable& page_table,
+                                                std::size_t address_space_bits) const;
+
+    using JitCacheKey = std::pair<Common::PageTable*, std::size_t>;
+    using JitCacheType =
+        std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>;
+
+    friend class DynarmicCallbacks32;
+    std::unique_ptr<DynarmicCallbacks32> cb;
+    JitCacheType jit_cache;
+    std::shared_ptr<Dynarmic::A32::Jit> jit;
+    std::size_t core_index;
+    DynarmicExclusiveMonitor& exclusive_monitor;
+    std::array<u32, 84> CP15_regs{};
+};
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -8,7 +8,7 @@
 #include <dynarmic/A64/config.h>
 #include "common/logging/log.h"
 #include "common/microprofile.h"
-#include "core/arm/dynarmic/arm_dynarmic.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #include "core/core.h"
 #include "core/core_manager.h"
 #include "core/core_timing.h"
@@ -25,9 +25,9 @@ namespace Core {

 using Vector = Dynarmic::A64::Vector;

-class ARM_Dynarmic_Callbacks : public Dynarmic::A64::UserCallbacks {
+class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks {
 public:
-    explicit ARM_Dynarmic_Callbacks(ARM_Dynarmic& parent) : parent(parent) {}
+    explicit DynarmicCallbacks64(ARM_Dynarmic_64& parent) : parent(parent) {}

    u8 MemoryRead8(u64 vaddr) override {
        return parent.system.Memory().Read8(vaddr);
@@ -68,7 +68,7 @@ public:
        LOG_INFO(Core_ARM, "Unicorn fallback @ 0x{:X} for {} instructions (instr = {:08X})", pc,
                 num_instructions, MemoryReadCode(pc));

-        ARM_Interface::ThreadContext ctx;
+        ARM_Interface::ThreadContext64 ctx;
        parent.SaveContext(ctx);
        parent.inner_unicorn.LoadContext(ctx);
        parent.inner_unicorn.ExecuteInstructions(num_instructions);
@@ -90,7 +90,7 @@ public:
                parent.jit->HaltExecution();
                parent.SetPC(pc);
                Kernel::Thread* const thread = parent.system.CurrentScheduler().GetCurrentThread();
-                parent.SaveContext(thread->GetContext());
+                parent.SaveContext(thread->GetContext64());
                GDBStub::Break();
                GDBStub::SendTrap(thread, 5);
                return;
@@ -126,14 +126,14 @@ public:
        return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks());
    }

-    ARM_Dynarmic& parent;
+    ARM_Dynarmic_64& parent;
    std::size_t num_interpreted_instructions = 0;
    u64 tpidrro_el0 = 0;
    u64 tpidr_el0 = 0;
 };

-std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& page_table,
-                                                          std::size_t address_space_bits) const {
+std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable& page_table,
+                                                             std::size_t address_space_bits) const {
    Dynarmic::A64::UserConfig config;

    // Callbacks
@@ -159,79 +159,79 @@ std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& pag
    // Unpredictable instructions
    config.define_unpredictable_behaviour = true;

-    return std::make_unique<Dynarmic::A64::Jit>(config);
+    return std::make_shared<Dynarmic::A64::Jit>(config);
 }

-MICROPROFILE_DEFINE(ARM_Jit_Dynarmic, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));
+MICROPROFILE_DEFINE(ARM_Jit_Dynarmic_64, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64));

-void ARM_Dynarmic::Run() {
-    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic);
+void ARM_Dynarmic_64::Run() {
+    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic_64);

    jit->Run();
 }

-void ARM_Dynarmic::Step() {
+void ARM_Dynarmic_64::Step() {
    cb->InterpreterFallback(jit->GetPC(), 1);
 }

-ARM_Dynarmic::ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor,
-                           std::size_t core_index)
+ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor,
+                                 std::size_t core_index)
    : ARM_Interface{system},
-      cb(std::make_unique<ARM_Dynarmic_Callbacks>(*this)), inner_unicorn{system},
+      cb(std::make_unique<DynarmicCallbacks64>(*this)), inner_unicorn{system},
      core_index{core_index}, exclusive_monitor{
                                  dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}

-ARM_Dynarmic::~ARM_Dynarmic() = default;
+ARM_Dynarmic_64::~ARM_Dynarmic_64() = default;

-void ARM_Dynarmic::SetPC(u64 pc) {
+void ARM_Dynarmic_64::SetPC(u64 pc) {
    jit->SetPC(pc);
 }

-u64 ARM_Dynarmic::GetPC() const {
+u64 ARM_Dynarmic_64::GetPC() const {
    return jit->GetPC();
 }

-u64 ARM_Dynarmic::GetReg(int index) const {
+u64 ARM_Dynarmic_64::GetReg(int index) const {
    return jit->GetRegister(index);
 }

-void ARM_Dynarmic::SetReg(int index, u64 value) {
+void ARM_Dynarmic_64::SetReg(int index, u64 value) {
    jit->SetRegister(index, value);
 }

-u128 ARM_Dynarmic::GetVectorReg(int index) const {
+u128 ARM_Dynarmic_64::GetVectorReg(int index) const {
    return jit->GetVector(index);
 }

-void ARM_Dynarmic::SetVectorReg(int index, u128 value) {
+void ARM_Dynarmic_64::SetVectorReg(int index, u128 value) {
    jit->SetVector(index, value);
 }

-u32 ARM_Dynarmic::GetPSTATE() const {
+u32 ARM_Dynarmic_64::GetPSTATE() const {
    return jit->GetPstate();
 }

-void ARM_Dynarmic::SetPSTATE(u32 pstate) {
+void ARM_Dynarmic_64::SetPSTATE(u32 pstate) {
    jit->SetPstate(pstate);
 }

-u64 ARM_Dynarmic::GetTlsAddress() const {
+u64 ARM_Dynarmic_64::GetTlsAddress() const {
    return cb->tpidrro_el0;
 }

-void ARM_Dynarmic::SetTlsAddress(VAddr address) {
+void ARM_Dynarmic_64::SetTlsAddress(VAddr address) {
    cb->tpidrro_el0 = address;
 }

-u64 ARM_Dynarmic::GetTPIDR_EL0() const {
+u64 ARM_Dynarmic_64::GetTPIDR_EL0() const {
    return cb->tpidr_el0;
 }

-void ARM_Dynarmic::SetTPIDR_EL0(u64 value) {
+void ARM_Dynarmic_64::SetTPIDR_EL0(u64 value) {
    cb->tpidr_el0 = value;
 }

-void ARM_Dynarmic::SaveContext(ThreadContext& ctx) {
+void ARM_Dynarmic_64::SaveContext(ThreadContext64& ctx) {
    ctx.cpu_registers = jit->GetRegisters();
    ctx.sp = jit->GetSP();
    ctx.pc = jit->GetPC();
@@ -242,7 +242,7 @@ void ARM_Dynarmic::SaveContext(ThreadContext& ctx) {
    ctx.tpidr = cb->tpidr_el0;
 }

-void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) {
+void ARM_Dynarmic_64::LoadContext(const ThreadContext64& ctx) {
    jit->SetRegisters(ctx.cpu_registers);
    jit->SetSP(ctx.sp);
    jit->SetPC(ctx.pc);
@@ -253,25 +253,32 @@ void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) {
    SetTPIDR_EL0(ctx.tpidr);
 }

-void ARM_Dynarmic::PrepareReschedule() {
+void ARM_Dynarmic_64::PrepareReschedule() {
    jit->HaltExecution();
 }

-void ARM_Dynarmic::ClearInstructionCache() {
+void ARM_Dynarmic_64::ClearInstructionCache() {
    jit->ClearCache();
 }

-void ARM_Dynarmic::ClearExclusiveState() {
+void ARM_Dynarmic_64::ClearExclusiveState() {
    jit->ClearExclusiveState();
 }

-void ARM_Dynarmic::PageTableChanged(Common::PageTable& page_table,
-                                    std::size_t new_address_space_size_in_bits) {
+// Switches this core to a JIT instance for the given page table and address space width.
+// Instances are kept in jit_cache, so returning to a previously seen pair reuses its JIT
+// instead of building a new one.
+void ARM_Dynarmic_64::PageTableChanged(Common::PageTable& page_table,
+                                       std::size_t new_address_space_size_in_bits) {
+    // NOTE(review): the key holds the page table's address, not its contents - confirm a page
+    // table cannot be destroyed and another created at the same address while it is cached.
+    auto key = std::make_pair(&page_table, new_address_space_size_in_bits);
+    auto iter = jit_cache.find(key);
+    if (iter != jit_cache.end()) {
+        jit = iter->second;
+        return;
+    }
    jit = MakeJit(page_table, new_address_space_size_in_bits);
+    jit_cache.emplace(key, jit);
 }

-DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory_, std::size_t core_count)
-    : monitor(core_count), memory{memory_} {}
+DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count)
+    : monitor(core_count), memory{memory} {}

 DynarmicExclusiveMonitor::~DynarmicExclusiveMonitor() = default;

--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -5,9 +5,12 @@
 #pragma once

 #include <memory>
+#include <unordered_map>
+
 #include <dynarmic/A64/a64.h>
 #include <dynarmic/A64/exclusive_monitor.h>
 #include "common/common_types.h"
+#include "common/hash.h"
 #include "core/arm/arm_interface.h"
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
@@ -18,14 +21,14 @@ class Memory;

 namespace Core {

-class ARM_Dynarmic_Callbacks;
+class DynarmicCallbacks64;
 class DynarmicExclusiveMonitor;
 class System;

-class ARM_Dynarmic final : public ARM_Interface {
+class ARM_Dynarmic_64 final : public ARM_Interface {
 public:
-    ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
-    ~ARM_Dynarmic() override;
+    ARM_Dynarmic_64(System& system, ExclusiveMonitor& exclusive_monitor, std::size_t core_index);
+    ~ARM_Dynarmic_64() override;

    void SetPC(u64 pc) override;
    u64 GetPC() const override;
@@ -42,8 +45,10 @@ public:
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;

-    void SaveContext(ThreadContext& ctx) override;
-    void LoadContext(const ThreadContext& ctx) override;
+    void SaveContext(ThreadContext32& ctx) override {}
+    void SaveContext(ThreadContext64& ctx) override;
+    void LoadContext(const ThreadContext32& ctx) override {}
+    void LoadContext(const ThreadContext64& ctx) override;

    void PrepareReschedule() override;
    void ClearExclusiveState() override;
@@ -53,12 +58,17 @@ public:
                          std::size_t new_address_space_size_in_bits) override;

 private:
-    std::unique_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table,
+    std::shared_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table,
                                                std::size_t address_space_bits) const;

-    friend class ARM_Dynarmic_Callbacks;
-    std::unique_ptr<ARM_Dynarmic_Callbacks> cb;
-    std::unique_ptr<Dynarmic::A64::Jit> jit;
+    using JitCacheKey = std::pair<Common::PageTable*, std::size_t>;
+    using JitCacheType =
+        std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A64::Jit>, Common::PairHash>;
+
+    friend class DynarmicCallbacks64;
+    std::unique_ptr<DynarmicCallbacks64> cb;
+    JitCacheType jit_cache;
+    std::shared_ptr<Dynarmic::A64::Jit> jit;
    ARM_Unicorn inner_unicorn;

    std::size_t core_index;
@@ -67,7 +77,7 @@ private:

 class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
 public:
-    explicit DynarmicExclusiveMonitor(Memory::Memory& memory_, std::size_t core_count);
+    explicit DynarmicExclusiveMonitor(Memory::Memory& memory, std::size_t core_count);
    ~DynarmicExclusiveMonitor() override;

    void SetExclusive(std::size_t core_index, VAddr addr) override;
@@ -80,7 +90,7 @@ public:
    bool ExclusiveWrite128(std::size_t core_index, VAddr vaddr, u128 value) override;

 private:
-    friend class ARM_Dynarmic;
+    friend class ARM_Dynarmic_64;
    Dynarmic::A64::ExclusiveMonitor monitor;
    Memory::Memory& memory;
 };
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
@@ -0,0 +1,80 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+
+using Callback = Dynarmic::A32::Coprocessor::Callback;
+using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord;
+using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords;
+
+// No internal coprocessor operations are provided: always yields an empty optional.
+std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1,
+                                                               CoprocReg CRd, CoprocReg CRn,
+                                                               CoprocReg CRm, unsigned opc2) {
+    return std::nullopt;
+}
+
+// Resolves single-word CP15 writes. Handled encodings yield a pointer to the backing register
+// slot; everything else yields an empty result.
+CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
+                                                         CoprocReg CRm, unsigned opc2) {
+    // TODO(merry): Privileged CP15 registers
+
+    // Every write handled below uses the non-"two" encoding with opc1 == 0.
+    if (two || opc1 != 0) {
+        return {};
+    }
+
+    const auto slot = [this](CP15Register reg) { return &CP15[static_cast<std::size_t>(reg)]; };
+
+    if (CRn == CoprocReg::C7) {
+        // These are dummy writes, the values written here are ignored.
+        if (CRm == CoprocReg::C5 && opc2 == 4) {
+            return slot(CP15Register::CP15_FLUSH_PREFETCH_BUFFER);
+        }
+        if (CRm == CoprocReg::C10 && opc2 == 4) {
+            return slot(CP15Register::CP15_DATA_SYNC_BARRIER);
+        }
+        if (CRm == CoprocReg::C10 && opc2 == 5) {
+            return slot(CP15Register::CP15_DATA_MEMORY_BARRIER);
+        }
+        return {};
+    }
+
+    if (CRn == CoprocReg::C13 && CRm == CoprocReg::C0 && opc2 == 2) {
+        return slot(CP15Register::CP15_THREAD_UPRW);
+    }
+
+    return {};
+}
+
+// Two-word writes are not handled: always returns a default-constructed result.
+CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    return CallbackOrAccessTwoWords{};
+}
+
+// Resolves single-word CP15 reads. Only the two thread ID registers selected through c13/c0 are
+// handled; every other encoding yields an empty result.
+CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn,
+                                                        CoprocReg CRm, unsigned opc2) {
+    // TODO(merry): Privileged CP15 registers
+
+    const bool is_thread_id_read =
+        !two && opc1 == 0 && CRn == CoprocReg::C13 && CRm == CoprocReg::C0;
+    if (!is_thread_id_read) {
+        return {};
+    }
+
+    // opc2 picks which of the thread ID registers is exposed.
+    if (opc2 == 2) {
+        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+    }
+    if (opc2 == 3) {
+        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+    }
+    return {};
+}
+
+// Two-word reads are not handled: always returns a default-constructed result.
+CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    return CallbackOrAccessTwoWords{};
+}
+
+// Coprocessor loads are not handled: always yields an empty optional.
+std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
+                                                       std::optional<u8> option) {
+    return std::nullopt;
+}
+
+// Coprocessor stores are not handled: always yields an empty optional.
+std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
+                                                        std::optional<u8> option) {
+    return std::nullopt;
+}
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
@@ -0,0 +1,152 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+
+#include <dynarmic/A32/coprocessor.h>
+#include "common/common_types.h"
+
+// Names for the CP15 register slots. DynarmicCP15 casts these enumerators to std::size_t and uses
+// them as indices into its u32 register array, so the numeric order below is significant.
+// A few enumerators are explicit aliases; they do not add extra slots.
+enum class CP15Register {
+    // c0 - Information registers
+    CP15_MAIN_ID,
+    CP15_CACHE_TYPE,
+    CP15_TCM_STATUS,
+    CP15_TLB_TYPE,
+    CP15_CPU_ID,
+    CP15_PROCESSOR_FEATURE_0,
+    CP15_PROCESSOR_FEATURE_1,
+    CP15_DEBUG_FEATURE_0,
+    CP15_AUXILIARY_FEATURE_0,
+    CP15_MEMORY_MODEL_FEATURE_0,
+    CP15_MEMORY_MODEL_FEATURE_1,
+    CP15_MEMORY_MODEL_FEATURE_2,
+    CP15_MEMORY_MODEL_FEATURE_3,
+    CP15_ISA_FEATURE_0,
+    CP15_ISA_FEATURE_1,
+    CP15_ISA_FEATURE_2,
+    CP15_ISA_FEATURE_3,
+    CP15_ISA_FEATURE_4,
+
+    // c1 - Control registers
+    CP15_CONTROL,
+    CP15_AUXILIARY_CONTROL,
+    CP15_COPROCESSOR_ACCESS_CONTROL,
+
+    // c2 - Translation table registers
+    CP15_TRANSLATION_BASE_TABLE_0,
+    CP15_TRANSLATION_BASE_TABLE_1,
+    CP15_TRANSLATION_BASE_CONTROL,
+    CP15_DOMAIN_ACCESS_CONTROL,
+    CP15_RESERVED,
+
+    // c5 - Fault status registers
+    // CP15_COMBINED_DATA_FSR is an alias of CP15_FAULT_STATUS. Because CP15_INST_FSR follows the
+    // alias, it takes the next value and therefore equals CP15_INSTR_FAULT_STATUS.
+    CP15_FAULT_STATUS,
+    CP15_INSTR_FAULT_STATUS,
+    CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS,
+    CP15_INST_FSR,
+
+    // c6 - Fault Address registers
+    // CP15_COMBINED_DATA_FAR is an alias of CP15_FAULT_ADDRESS; CP15_WFAR continues from it.
+    CP15_FAULT_ADDRESS,
+    CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS,
+    CP15_WFAR,
+    CP15_IFAR,
+
+    // c7 - Cache operation registers
+    CP15_WAIT_FOR_INTERRUPT,
+    CP15_PHYS_ADDRESS,
+    CP15_INVALIDATE_INSTR_CACHE,
+    CP15_INVALIDATE_INSTR_CACHE_USING_MVA,
+    CP15_INVALIDATE_INSTR_CACHE_USING_INDEX,
+    CP15_FLUSH_PREFETCH_BUFFER,
+    CP15_FLUSH_BRANCH_TARGET_CACHE,
+    CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY,
+    CP15_INVALIDATE_DATA_CACHE,
+    CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
+    CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
+    CP15_INVALIDATE_DATA_AND_INSTR_CACHE,
+    CP15_CLEAN_DATA_CACHE,
+    CP15_CLEAN_DATA_CACHE_LINE_USING_MVA,
+    CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX,
+    CP15_DATA_SYNC_BARRIER,
+    CP15_DATA_MEMORY_BARRIER,
+    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE,
+    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
+    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
+
+    // c8 - TLB operations
+    CP15_INVALIDATE_ITLB,
+    CP15_INVALIDATE_ITLB_SINGLE_ENTRY,
+    CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH,
+    CP15_INVALIDATE_ITLB_ENTRY_ON_MVA,
+    CP15_INVALIDATE_DTLB,
+    CP15_INVALIDATE_DTLB_SINGLE_ENTRY,
+    CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH,
+    CP15_INVALIDATE_DTLB_ENTRY_ON_MVA,
+    CP15_INVALIDATE_UTLB,
+    CP15_INVALIDATE_UTLB_SINGLE_ENTRY,
+    CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH,
+    CP15_INVALIDATE_UTLB_ENTRY_ON_MVA,
+
+    // c9 - Data cache lockdown register
+    CP15_DATA_CACHE_LOCKDOWN,
+
+    // c10 - TLB/Memory map registers
+    CP15_TLB_LOCKDOWN,
+    CP15_PRIMARY_REGION_REMAP,
+    CP15_NORMAL_REGION_REMAP,
+
+    // c13 - Thread related registers
+    CP15_PID,
+    CP15_CONTEXT_ID,
+    CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write
+    CP15_THREAD_URO,  // Thread ID register - User Read Only (Privileged R/W)
+    CP15_THREAD_PRW,  // Thread ID register - Privileged R/W only.
+
+    // c15 - Performance and TLB lockdown registers
+    CP15_PERFORMANCE_MONITOR_CONTROL,
+    CP15_CYCLE_COUNTER,
+    CP15_COUNT_0,
+    CP15_COUNT_1,
+    CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY,
+    CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY,
+    CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS,
+    CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS,
+    CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE,
+    CP15_TLB_DEBUG_CONTROL,
+
+    // Skyeye defined
+    CP15_TLB_FAULT_ADDR,
+    CP15_TLB_FAULT_STATUS,
+
+    // Not an actual register.
+    // All registers should be defined above this.
+    CP15_REGISTER_COUNT,
+};
+
+// Implementation of Dynarmic's A32 Coprocessor interface for CP15. The Compile* overrides either
+// return an empty result or a pointer into the register array supplied at construction.
+class DynarmicCP15 final : public Dynarmic::A32::Coprocessor {
+public:
+    using CoprocReg = Dynarmic::A32::CoprocReg;
+
+    // cp15 is the register array; it is indexed with CP15Register values by the overrides below.
+    explicit DynarmicCP15(u32* cp15) : CP15(cp15){};
+
+    std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd,
+                                                     CoprocReg CRn, CoprocReg CRm,
+                                                     unsigned opc2) override;
+    CallbackOrAccessOneWord CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
+                                               CoprocReg CRm, unsigned opc2) override;
+    CallbackOrAccessTwoWords CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) override;
+    CallbackOrAccessOneWord CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn, CoprocReg CRm,
+                                              unsigned opc2) override;
+    CallbackOrAccessTwoWords CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) override;
+    std::optional<Callback> CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
+                                             std::optional<u8> option) override;
+    std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
+                                              std::optional<u8> option) override;
+
+private:
+    // Raw pointer to the register array passed to the constructor; this class never frees it.
+    u32* CP15{};
+};
--- a/src/core/arm/exclusive_monitor.cpp
+++ b/src/core/arm/exclusive_monitor.cpp
@@ -3,7 +3,7 @@
 // Refer to the license.txt file included.

 #ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/memory.h"
--- a/src/core/arm/unicorn/arm_unicorn.cpp
+++ b/src/core/arm/unicorn/arm_unicorn.cpp
@@ -53,7 +53,7 @@ static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int si
                               void* user_data) {
    auto* const system = static_cast<System*>(user_data);

-    ARM_Interface::ThreadContext ctx{};
+    ARM_Interface::ThreadContext64 ctx{};
    system->CurrentArmInterface().SaveContext(ctx);
    ASSERT_MSG(false, "Attempted to read from unmapped memory: 0x{:X}, pc=0x{:X}, lr=0x{:X}", addr,
               ctx.pc, ctx.cpu_registers[30]);
@@ -179,7 +179,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
        }

        Kernel::Thread* const thread = system.CurrentScheduler().GetCurrentThread();
-        SaveContext(thread->GetContext());
+        SaveContext(thread->GetContext64());
        if (last_bkpt_hit || GDBStub::IsMemoryBreak() || GDBStub::GetCpuStepFlag()) {
            last_bkpt_hit = false;
            GDBStub::Break();
@@ -188,7 +188,7 @@ void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
    }
 }

-void ARM_Unicorn::SaveContext(ThreadContext& ctx) {
+void ARM_Unicorn::SaveContext(ThreadContext64& ctx) {
    int uregs[32];
    void* tregs[32];

@@ -215,7 +215,7 @@ void ARM_Unicorn::SaveContext(ThreadContext& ctx) {
    CHECKED(uc_reg_read_batch(uc, uregs, tregs, 32));
 }

-void ARM_Unicorn::LoadContext(const ThreadContext& ctx) {
+void ARM_Unicorn::LoadContext(const ThreadContext64& ctx) {
    int uregs[32];
    void* tregs[32];

--- a/src/core/arm/unicorn/arm_unicorn.h
+++ b/src/core/arm/unicorn/arm_unicorn.h
@@ -30,8 +30,6 @@ public:
    void SetTlsAddress(VAddr address) override;
    void SetTPIDR_EL0(u64 value) override;
    u64 GetTPIDR_EL0() const override;
-    void SaveContext(ThreadContext& ctx) override;
-    void LoadContext(const ThreadContext& ctx) override;
    void PrepareReschedule() override;
    void ClearExclusiveState() override;
    void ExecuteInstructions(std::size_t num_instructions);
@@ -41,6 +39,11 @@ public:
    void PageTableChanged(Common::PageTable&, std::size_t) override {}
    void RecordBreak(GDBStub::BreakpointAddress bkpt);

+    void SaveContext(ThreadContext32& ctx) override {}
+    void SaveContext(ThreadContext64& ctx) override;
+    void LoadContext(const ThreadContext32& ctx) override {}
+    void LoadContext(const ThreadContext64& ctx) override;
+
 private:
    static void InterruptHook(uc_engine* uc, u32 int_no, void* user_data);

--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -174,6 +174,7 @@ struct System::Impl {
        }
        interrupt_manager = std::make_unique<Core::Hardware::InterruptManager>(system);
        gpu_core = VideoCore::CreateGPU(system);
+        renderer->Rasterizer().SetupDirtyFlags();

        is_powered_on = true;
        exit_lock = false;
--- a/src/core/core_manager.cpp
+++ b/src/core/core_manager.cpp
@@ -6,9 +6,6 @@
 #include <mutex>

 #include "common/logging/log.h"
-#ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic.h"
-#endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -48,8 +48,8 @@ FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) {
    u32 width, height;

    if (Settings::values.use_docked_mode) {
-        width = ScreenDocked::WidthDocked * res_scale;
-        height = ScreenDocked::HeightDocked * res_scale;
+        width = ScreenDocked::Width * res_scale;
+        height = ScreenDocked::Height * res_scale;
    } else {
        width = ScreenUndocked::Width * res_scale;
        height = ScreenUndocked::Height * res_scale;
--- a/src/core/frontend/framebuffer_layout.h
+++ b/src/core/frontend/framebuffer_layout.h
@@ -8,15 +8,15 @@

 namespace Layout {

-enum ScreenUndocked : u32 {
-    Width = 1280,
-    Height = 720,
-};
+// Base framebuffer dimensions used by FrameLayoutFromResolutionScale when docked mode is off.
+namespace ScreenUndocked {
+constexpr u32 Width = 1280;
+constexpr u32 Height = 720;
+} // namespace ScreenUndocked

-enum ScreenDocked : u32 {
-    WidthDocked = 1920,
-    HeightDocked = 1080,
-};
+// Base framebuffer dimensions used by FrameLayoutFromResolutionScale when docked mode is on.
+namespace ScreenDocked {
+constexpr u32 Width = 1920;
+constexpr u32 Height = 1080;
+} // namespace ScreenDocked

 enum class AspectRatio {
    Default,
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -217,7 +217,7 @@ static u64 RegRead(std::size_t id, Kernel::Thread* thread = nullptr) {
        return 0;
    }

-    const auto& thread_context = thread->GetContext();
+    const auto& thread_context = thread->GetContext64();

    if (id < SP_REGISTER) {
        return thread_context.cpu_registers[id];
@@ -239,7 +239,7 @@ static void RegWrite(std::size_t id, u64 val, Kernel::Thread* thread = nullptr)
        return;
    }

-    auto& thread_context = thread->GetContext();
+    auto& thread_context = thread->GetContext64();

    if (id < SP_REGISTER) {
        thread_context.cpu_registers[id] = val;
@@ -259,7 +259,7 @@ static u128 FpuRead(std::size_t id, Kernel::Thread* thread = nullptr) {
        return u128{0};
    }

-    auto& thread_context = thread->GetContext();
+    auto& thread_context = thread->GetContext64();

    if (id >= UC_ARM64_REG_Q0 && id < FPCR_REGISTER) {
        return thread_context.vector_registers[id - UC_ARM64_REG_Q0];
@@ -275,7 +275,7 @@ static void FpuWrite(std::size_t id, u128 val, Kernel::Thread* thread = nullptr)
        return;
    }

-    auto& thread_context = thread->GetContext();
+    auto& thread_context = thread->GetContext64();

    if (id >= UC_ARM64_REG_Q0 && id < FPCR_REGISTER) {
        thread_context.vector_registers[id - UC_ARM64_REG_Q0] = val;
@@ -916,7 +916,7 @@ static void WriteRegister() {
    // Update ARM context, skipping scheduler - no running threads at this point
    Core::System::GetInstance()
        .ArmInterface(current_core)
-        .LoadContext(current_thread->GetContext());
+        .LoadContext(current_thread->GetContext64());

    SendReply("OK");
 }
@@ -947,7 +947,7 @@ static void WriteRegisters() {
    // Update ARM context, skipping scheduler - no running threads at this point
    Core::System::GetInstance()
        .ArmInterface(current_core)
-        .LoadContext(current_thread->GetContext());
+        .LoadContext(current_thread->GetContext64());

    SendReply("OK");
 }
@@ -1019,7 +1019,7 @@ static void Step() {
        // Update ARM context, skipping scheduler - no running threads at this point
        Core::System::GetInstance()
            .ArmInterface(current_core)
-            .LoadContext(current_thread->GetContext());
+            .LoadContext(current_thread->GetContext64());
    }
    step_loop = true;
    halt_loop = true;
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -186,6 +186,10 @@ struct KernelCore::Impl {
            return;
        }

+        for (auto& core : cores) {
+            core.SetIs64Bit(process->Is64BitProcess());
+        }
+
        system.Memory().SetCurrentPageTable(*process);
    }

--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@@ -5,7 +5,8 @@
 #include "common/logging/log.h"
 #include "core/arm/arm_interface.h"
 #ifdef ARCHITECTURE_x86_64
-#include "core/arm/dynarmic/arm_dynarmic.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
+#include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"
@@ -20,13 +21,17 @@ PhysicalCore::PhysicalCore(Core::System& system, std::size_t id,
                           Core::ExclusiveMonitor& exclusive_monitor)
    : core_index{id} {
 #ifdef ARCHITECTURE_x86_64
-    arm_interface = std::make_unique<Core::ARM_Dynarmic>(system, exclusive_monitor, core_index);
+    arm_interface_32 =
+        std::make_unique<Core::ARM_Dynarmic_32>(system, exclusive_monitor, core_index);
+    arm_interface_64 =
+        std::make_unique<Core::ARM_Dynarmic_64>(system, exclusive_monitor, core_index);
+
 #else
    arm_interface = std::make_shared<Core::ARM_Unicorn>(system);
    LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
 #endif

-    scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface, core_index);
+    scheduler = std::make_unique<Kernel::Scheduler>(system, core_index);
 }

 PhysicalCore::~PhysicalCore() = default;
@@ -48,4 +53,12 @@ void PhysicalCore::Shutdown() {
    scheduler->Shutdown();
 }

+// Points arm_interface at the 64-bit or the 32-bit interface owned by this core.
+// Both interfaces stay alive; only the active pointer changes.
+void PhysicalCore::SetIs64Bit(bool is_64_bit) {
+    Core::ARM_Interface* const selected =
+        is_64_bit ? arm_interface_64.get() : arm_interface_32.get();
+    arm_interface = selected;
+}
+
 } // namespace Kernel
--- a/src/core/hle/kernel/physical_core.h
+++ b/src/core/hle/kernel/physical_core.h
@@ -68,10 +68,14 @@ public:
        return *scheduler;
    }

+    void SetIs64Bit(bool is_64_bit);
+
 private:
    std::size_t core_index;
-    std::unique_ptr<Core::ARM_Interface> arm_interface;
+    std::unique_ptr<Core::ARM_Interface> arm_interface_32;
+    std::unique_ptr<Core::ARM_Interface> arm_interface_64;
    std::unique_ptr<Kernel::Scheduler> scheduler;
+    Core::ARM_Interface* arm_interface{};
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -42,7 +42,8 @@ void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority) {

    // Register 1 must be a handle to the main thread
    const Handle thread_handle = owner_process.GetHandleTable().Create(thread).Unwrap();
-    thread->GetContext().cpu_registers[1] = thread_handle;
+    thread->GetContext32().cpu_registers[1] = thread_handle;
+    thread->GetContext64().cpu_registers[1] = thread_handle;

    // Threads by default are dormant, wake up the main thread so it runs when the scheduler fires
    thread->ResumeFromWait();
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -383,8 +383,8 @@ void GlobalScheduler::Unlock() {
    // TODO(Blinkhawk): Setup the interrupts and change context on current core.
 }

-Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id)
-    : system(system), cpu_core(cpu_core), core_id(core_id) {}
+Scheduler::Scheduler(Core::System& system, std::size_t core_id)
+    : system{system}, core_id{core_id} {}

 Scheduler::~Scheduler() = default;

@@ -422,9 +422,10 @@ void Scheduler::UnloadThread() {

    // Save context for previous thread
    if (previous_thread) {
-        cpu_core.SaveContext(previous_thread->GetContext());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());

        if (previous_thread->GetStatus() == ThreadStatus::Running) {
            // This is only the case when a reschedule is triggered without the current thread
@@ -451,9 +452,10 @@ void Scheduler::SwitchContext() {

    // Save context for previous thread
    if (previous_thread) {
-        cpu_core.SaveContext(previous_thread->GetContext());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext32());
+        system.ArmInterface(core_id).SaveContext(previous_thread->GetContext64());
        // Save the TPIDR_EL0 system register in case it was modified.
-        previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+        previous_thread->SetTPIDR_EL0(system.ArmInterface(core_id).GetTPIDR_EL0());

        if (previous_thread->GetStatus() == ThreadStatus::Running) {
            // This is only the case when a reschedule is triggered without the current thread
@@ -481,9 +483,10 @@ void Scheduler::SwitchContext() {
            system.Kernel().MakeCurrentProcess(thread_owner_process);
        }

-        cpu_core.LoadContext(new_thread->GetContext());
-        cpu_core.SetTlsAddress(new_thread->GetTLSAddress());
-        cpu_core.SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
+        system.ArmInterface(core_id).LoadContext(new_thread->GetContext32());
+        system.ArmInterface(core_id).LoadContext(new_thread->GetContext64());
+        system.ArmInterface(core_id).SetTlsAddress(new_thread->GetTLSAddress());
+        system.ArmInterface(core_id).SetTPIDR_EL0(new_thread->GetTPIDR_EL0());
    } else {
        current_thread = nullptr;
        // Note: We do not reset the current process and current page table when idling because
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -181,7 +181,7 @@ private:

 class Scheduler final {
 public:
-    explicit Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id);
+    explicit Scheduler(Core::System& system, std::size_t core_id);
    ~Scheduler();

    /// Returns whether there are any threads that are ready to run.
@@ -235,7 +235,6 @@ private:
    std::shared_ptr<Thread> selected_thread = nullptr;

    Core::System& system;
-    Core::ARM_Interface& cpu_core;
    u64 last_context_switch_time = 0;
    u64 idle_selection_count = 0;
    const std::size_t core_id;
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -187,6 +187,13 @@ static ResultCode SetHeapSize(Core::System& system, VAddr* heap_addr, u64 heap_s
    return RESULT_SUCCESS;
 }

+/// 32-bit counterpart of SetHeapSize: the heap address is written back truncated to u32.
+static ResultCode SetHeapSize32(Core::System& system, u32* heap_addr, u32 heap_size) {
+    VAddr full_heap_addr{};
+    const ResultCode status{SetHeapSize(system, &full_heap_addr, heap_size)};
+
+    // The output is stored regardless of the result code.
+    *heap_addr = static_cast<u32>(full_heap_addr);
+    return status;
+}
+
 static ResultCode SetMemoryPermission(Core::System& system, VAddr addr, u64 size, u32 prot) {
    LOG_TRACE(Kernel_SVC, "called, addr=0x{:X}, size=0x{:X}, prot=0x{:X}", addr, size, prot);

@@ -371,6 +378,12 @@ static ResultCode ConnectToNamedPort(Core::System& system, Handle* out_handle,
    return RESULT_SUCCESS;
 }

+/// 32-bit counterpart of ConnectToNamedPort; the arguments are forwarded as-is.
+static ResultCode ConnectToNamedPort32(Core::System& system, Handle* out_handle,
+                                       u32 port_name_address) {
+    const ResultCode status{ConnectToNamedPort(system, out_handle, port_name_address)};
+    return status;
+}
+
 /// Makes a blocking IPC call to an OS service.
 static ResultCode SendSyncRequest(Core::System& system, Handle handle) {
    const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
@@ -390,6 +403,10 @@ static ResultCode SendSyncRequest(Core::System& system, Handle handle) {
    return session->SendSyncRequest(SharedFrom(thread), system.Memory());
 }

+/// 32-bit counterpart of SendSyncRequest; the handle is forwarded as-is.
+static ResultCode SendSyncRequest32(Core::System& system, Handle handle) {
+    const ResultCode status{SendSyncRequest(system, handle)};
+    return status;
+}
+
 /// Get the ID for the specified thread.
 static ResultCode GetThreadId(Core::System& system, u64* thread_id, Handle thread_handle) {
    LOG_TRACE(Kernel_SVC, "called thread=0x{:08X}", thread_handle);
@@ -405,6 +422,17 @@ static ResultCode GetThreadId(Core::System& system, u64* thread_id, Handle threa
    return RESULT_SUCCESS;
 }

+static ResultCode GetThreadId32(Core::System& system, u32* thread_id_low, u32* thread_id_high,
+                                Handle thread_handle) {
+    u64 thread_id{};
+    const ResultCode result{GetThreadId(system, &thread_id, thread_handle)};
+    // Split the 64-bit ID: bits 0-31 go to thread_id_low, bits 32-63 to thread_id_high.
+    *thread_id_low = static_cast<u32>(thread_id & std::numeric_limits<u32>::max());
+    *thread_id_high = static_cast<u32>(thread_id >> 32);
+    // Both outputs are stored even on failure; thread_id is 0 unless GetThreadId wrote to it.
+    return result;
+}
+
 /// Gets the ID of the specified process or a specified thread's owning process.
 static ResultCode GetProcessId(Core::System& system, u64* process_id, Handle handle) {
    LOG_DEBUG(Kernel_SVC, "called handle=0x{:08X}", handle);
@@ -479,6 +507,12 @@ static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr
    return result;
 }

+static ResultCode WaitSynchronization32(Core::System& system, u32 timeout_low, u32 handles_address,
+                                        s32 handle_count, u32 timeout_high, Handle* index) {
+    const s64 timeout_ns{static_cast<s64>((u64{timeout_high} << 32) | timeout_low)};
+    return WaitSynchronization(system, index, handles_address, handle_count, timeout_ns);
+}
+
 /// Resumes a thread waiting on WaitSynchronization
 static ResultCode CancelSynchronization(Core::System& system, Handle thread_handle) {
    LOG_TRACE(Kernel_SVC, "called thread=0x{:X}", thread_handle);
@@ -917,6 +951,18 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
    }
 }

+static ResultCode GetInfo32(Core::System& system, u32* result_low, u32* result_high, u32 sub_id_low,
+                            u32 info_id, u32 handle, u32 sub_id_high) {
+    // Rebuild the 64-bit sub ID from the two 32-bit halves supplied by the caller.
+    const u64 full_sub_id{(u64{sub_id_high} << 32) | sub_id_low};
+    u64 info_value{};
+    const ResultCode status{GetInfo(system, &info_value, info_id, handle, full_sub_id)};
+    // Split the 64-bit value into its upper and lower words; both are stored even on failure.
+    *result_high = static_cast<u32>(info_value >> 32);
+    *result_low = static_cast<u32>(info_value);
+    return status;
+}
+
 /// Maps memory at a desired address
 static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
    LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);
@@ -1058,7 +1104,7 @@ static ResultCode GetThreadContext(Core::System& system, VAddr thread_context, H
        return ERR_BUSY;
    }

-    Core::ARM_Interface::ThreadContext ctx = thread->GetContext();
+    Core::ARM_Interface::ThreadContext64 ctx = thread->GetContext64();
    // Mask away mode bits, interrupt bits, IL bit, and other reserved bits.
    ctx.pstate &= 0xFF0FFE20;

@@ -1088,6 +1134,10 @@ static ResultCode GetThreadPriority(Core::System& system, u32* priority, Handle
    return RESULT_SUCCESS;
 }

+static ResultCode GetThreadPriority32(Core::System& system, u32* priority, Handle handle) {
+    return GetThreadPriority(system, priority, handle); // SVC 0x0c in the 32-bit table
+}
+
 /// Sets the priority for the specified thread
 static ResultCode SetThreadPriority(Core::System& system, Handle handle, u32 priority) {
    LOG_TRACE(Kernel_SVC, "called");
@@ -1259,6 +1309,11 @@ static ResultCode QueryMemory(Core::System& system, VAddr memory_info_address,
                              query_address);
 }

+static ResultCode QueryMemory32(Core::System& system, u32 memory_info_address,
+                                u32 page_info_address, u32 query_address) { // SVC 0x06 (32-bit)
+    return QueryMemory(system, memory_info_address, page_info_address, query_address);
+}
+
 static ResultCode MapProcessCodeMemory(Core::System& system, Handle process_handle, u64 dst_address,
                                       u64 src_address, u64 size) {
    LOG_DEBUG(Kernel_SVC,
@@ -1675,6 +1730,10 @@ static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_
    }
 }

+static void SignalProcessWideKey32(Core::System& system, u32 condition_variable_addr, s32 target) {
+    SignalProcessWideKey(system, condition_variable_addr, target); // address is taken as VAddr
+}
+
 // Wait for an address (via Address Arbiter)
 static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type, s32 value,
                                 s64 timeout) {
@@ -1760,6 +1819,10 @@ static ResultCode CloseHandle(Core::System& system, Handle handle) {
    return handle_table.Close(handle);
 }

+static ResultCode CloseHandle32(Core::System& system, Handle handle) {
+    return CloseHandle(system, handle); // same arguments as the 64-bit handler
+}
+
 /// Clears the signaled state of an event or process.
 static ResultCode ResetSignal(Core::System& system, Handle handle) {
    LOG_DEBUG(Kernel_SVC, "called handle 0x{:08X}", handle);
@@ -2317,69 +2380,196 @@ struct FunctionDef {
 };
 } // namespace

-static const FunctionDef SVC_Table[] = {
+static const FunctionDef SVC_Table_32[] = { // indexed by SVC number; nullptr = no handler
    {0x00, nullptr, "Unknown"},
-    {0x01, SvcWrap<SetHeapSize>, "SetHeapSize"},
-    {0x02, SvcWrap<SetMemoryPermission>, "SetMemoryPermission"},
-    {0x03, SvcWrap<SetMemoryAttribute>, "SetMemoryAttribute"},
-    {0x04, SvcWrap<MapMemory>, "MapMemory"},
-    {0x05, SvcWrap<UnmapMemory>, "UnmapMemory"},
-    {0x06, SvcWrap<QueryMemory>, "QueryMemory"},
-    {0x07, SvcWrap<ExitProcess>, "ExitProcess"},
-    {0x08, SvcWrap<CreateThread>, "CreateThread"},
-    {0x09, SvcWrap<StartThread>, "StartThread"},
-    {0x0A, SvcWrap<ExitThread>, "ExitThread"},
-    {0x0B, SvcWrap<SleepThread>, "SleepThread"},
-    {0x0C, SvcWrap<GetThreadPriority>, "GetThreadPriority"},
-    {0x0D, SvcWrap<SetThreadPriority>, "SetThreadPriority"},
-    {0x0E, SvcWrap<GetThreadCoreMask>, "GetThreadCoreMask"},
-    {0x0F, SvcWrap<SetThreadCoreMask>, "SetThreadCoreMask"},
-    {0x10, SvcWrap<GetCurrentProcessorNumber>, "GetCurrentProcessorNumber"},
-    {0x11, SvcWrap<SignalEvent>, "SignalEvent"},
-    {0x12, SvcWrap<ClearEvent>, "ClearEvent"},
-    {0x13, SvcWrap<MapSharedMemory>, "MapSharedMemory"},
-    {0x14, SvcWrap<UnmapSharedMemory>, "UnmapSharedMemory"},
-    {0x15, SvcWrap<CreateTransferMemory>, "CreateTransferMemory"},
-    {0x16, SvcWrap<CloseHandle>, "CloseHandle"},
-    {0x17, SvcWrap<ResetSignal>, "ResetSignal"},
-    {0x18, SvcWrap<WaitSynchronization>, "WaitSynchronization"},
-    {0x19, SvcWrap<CancelSynchronization>, "CancelSynchronization"},
-    {0x1A, SvcWrap<ArbitrateLock>, "ArbitrateLock"},
-    {0x1B, SvcWrap<ArbitrateUnlock>, "ArbitrateUnlock"},
-    {0x1C, SvcWrap<WaitProcessWideKeyAtomic>, "WaitProcessWideKeyAtomic"},
-    {0x1D, SvcWrap<SignalProcessWideKey>, "SignalProcessWideKey"},
-    {0x1E, SvcWrap<GetSystemTick>, "GetSystemTick"},
-    {0x1F, SvcWrap<ConnectToNamedPort>, "ConnectToNamedPort"},
+    {0x01, SvcWrap32<SetHeapSize32>, "SetHeapSize32"},
+    {0x02, nullptr, "Unknown"},
+    {0x03, nullptr, "SetMemoryAttribute32"},
+    {0x04, nullptr, "MapMemory32"},
+    {0x05, nullptr, "UnmapMemory32"},
+    {0x06, SvcWrap32<QueryMemory32>, "QueryMemory32"},
+    {0x07, nullptr, "ExitProcess32"},
+    {0x08, nullptr, "CreateThread32"},
+    {0x09, nullptr, "StartThread32"},
+    {0x0a, nullptr, "ExitThread32"},
+    {0x0b, nullptr, "SleepThread32"},
+    {0x0c, SvcWrap32<GetThreadPriority32>, "GetThreadPriority32"},
+    {0x0d, nullptr, "SetThreadPriority32"},
+    {0x0e, nullptr, "GetThreadCoreMask32"},
+    {0x0f, nullptr, "SetThreadCoreMask32"},
+    {0x10, nullptr, "GetCurrentProcessorNumber32"},
+    {0x11, nullptr, "SignalEvent32"},
+    {0x12, nullptr, "ClearEvent32"},
+    {0x13, nullptr, "MapSharedMemory32"},
+    {0x14, nullptr, "UnmapSharedMemory32"},
+    {0x15, nullptr, "CreateTransferMemory32"},
+    {0x16, SvcWrap32<CloseHandle32>, "CloseHandle32"},
+    {0x17, nullptr, "ResetSignal32"},
+    {0x18, SvcWrap32<WaitSynchronization32>, "WaitSynchronization32"},
+    {0x19, nullptr, "CancelSynchronization32"},
+    {0x1a, nullptr, "ArbitrateLock32"},
+    {0x1b, nullptr, "ArbitrateUnlock32"},
+    {0x1c, nullptr, "WaitProcessWideKeyAtomic32"},
+    {0x1d, SvcWrap32<SignalProcessWideKey32>, "SignalProcessWideKey32"},
+    {0x1e, nullptr, "GetSystemTick32"},
+    {0x1f, SvcWrap32<ConnectToNamedPort32>, "ConnectToNamedPort32"},
+    {0x20, nullptr, "Unknown"},
+    {0x21, SvcWrap32<SendSyncRequest32>, "SendSyncRequest32"},
+    {0x22, nullptr, "SendSyncRequestWithUserBuffer32"},
+    {0x23, nullptr, "Unknown"},
+    {0x24, nullptr, "GetProcessId32"},
+    {0x25, SvcWrap32<GetThreadId32>, "GetThreadId32"},
+    {0x26, nullptr, "Break32"},
+    {0x27, nullptr, "OutputDebugString32"},
+    {0x28, nullptr, "Unknown"},
+    {0x29, SvcWrap32<GetInfo32>, "GetInfo32"},
+    {0x2a, nullptr, "Unknown"},
+    {0x2b, nullptr, "Unknown"},
+    {0x2c, nullptr, "MapPhysicalMemory32"},
+    {0x2d, nullptr, "UnmapPhysicalMemory32"},
+    {0x2e, nullptr, "Unknown"},
+    {0x2f, nullptr, "Unknown"},
+    {0x30, nullptr, "Unknown"},
+    {0x31, nullptr, "Unknown"},
+    {0x32, nullptr, "SetThreadActivity32"},
+    {0x33, nullptr, "GetThreadContext32"},
+    {0x34, nullptr, "WaitForAddress32"},
+    {0x35, nullptr, "SignalToAddress32"},
+    {0x36, nullptr, "Unknown"},
+    {0x37, nullptr, "Unknown"},
+    {0x38, nullptr, "Unknown"},
+    {0x39, nullptr, "Unknown"},
+    {0x3a, nullptr, "Unknown"},
+    {0x3b, nullptr, "Unknown"},
+    {0x3c, nullptr, "Unknown"},
+    {0x3d, nullptr, "Unknown"},
+    {0x3e, nullptr, "Unknown"},
+    {0x3f, nullptr, "Unknown"},
+    {0x40, nullptr, "CreateSession32"},
+    {0x41, nullptr, "AcceptSession32"},
+    {0x42, nullptr, "Unknown"},
+    {0x43, nullptr, "ReplyAndReceive32"},
+    {0x44, nullptr, "Unknown"},
+    {0x45, nullptr, "CreateEvent32"},
+    {0x46, nullptr, "Unknown"},
+    {0x47, nullptr, "Unknown"},
+    {0x48, nullptr, "Unknown"},
+    {0x49, nullptr, "Unknown"},
+    {0x4a, nullptr, "Unknown"},
+    {0x4b, nullptr, "Unknown"},
+    {0x4c, nullptr, "Unknown"},
+    {0x4d, nullptr, "Unknown"},
+    {0x4e, nullptr, "Unknown"},
+    {0x4f, nullptr, "Unknown"},
+    {0x50, nullptr, "Unknown"},
+    {0x51, nullptr, "Unknown"},
+    {0x52, nullptr, "Unknown"},
+    {0x53, nullptr, "Unknown"},
+    {0x54, nullptr, "Unknown"},
+    {0x55, nullptr, "Unknown"},
+    {0x56, nullptr, "Unknown"},
+    {0x57, nullptr, "Unknown"},
+    {0x58, nullptr, "Unknown"},
+    {0x59, nullptr, "Unknown"},
+    {0x5a, nullptr, "Unknown"},
+    {0x5b, nullptr, "Unknown"},
+    {0x5c, nullptr, "Unknown"},
+    {0x5d, nullptr, "Unknown"},
+    {0x5e, nullptr, "Unknown"},
+    {0x5f, nullptr, "FlushProcessDataCache32"},
+    {0x60, nullptr, "Unknown"},
+    {0x61, nullptr, "Unknown"},
+    {0x62, nullptr, "Unknown"},
+    {0x63, nullptr, "Unknown"},
+    {0x64, nullptr, "Unknown"},
+    {0x65, nullptr, "GetProcessList32"},
+    {0x66, nullptr, "Unknown"},
+    {0x67, nullptr, "Unknown"},
+    {0x68, nullptr, "Unknown"},
+    {0x69, nullptr, "Unknown"},
+    {0x6a, nullptr, "Unknown"},
+    {0x6b, nullptr, "Unknown"},
+    {0x6c, nullptr, "Unknown"},
+    {0x6d, nullptr, "Unknown"},
+    {0x6e, nullptr, "Unknown"},
+    {0x6f, nullptr, "GetSystemInfo32"},
+    {0x70, nullptr, "CreatePort32"},
+    {0x71, nullptr, "ManageNamedPort32"},
+    {0x72, nullptr, "ConnectToPort32"},
+    {0x73, nullptr, "SetProcessMemoryPermission32"},
+    {0x74, nullptr, "Unknown"},
+    {0x75, nullptr, "Unknown"},
+    {0x76, nullptr, "Unknown"},
+    {0x77, nullptr, "MapProcessCodeMemory32"},
+    {0x78, nullptr, "UnmapProcessCodeMemory32"},
+    {0x79, nullptr, "Unknown"},
+    {0x7a, nullptr, "Unknown"},
+    {0x7b, nullptr, "TerminateProcess32"},
+};
+
+static const FunctionDef SVC_Table_64[] = {
+    {0x00, nullptr, "Unknown"},
+    {0x01, SvcWrap64<SetHeapSize>, "SetHeapSize"},
+    {0x02, SvcWrap64<SetMemoryPermission>, "SetMemoryPermission"},
+    {0x03, SvcWrap64<SetMemoryAttribute>, "SetMemoryAttribute"},
+    {0x04, SvcWrap64<MapMemory>, "MapMemory"},
+    {0x05, SvcWrap64<UnmapMemory>, "UnmapMemory"},
+    {0x06, SvcWrap64<QueryMemory>, "QueryMemory"},
+    {0x07, SvcWrap64<ExitProcess>, "ExitProcess"},
+    {0x08, SvcWrap64<CreateThread>, "CreateThread"},
+    {0x09, SvcWrap64<StartThread>, "StartThread"},
+    {0x0A, SvcWrap64<ExitThread>, "ExitThread"},
+    {0x0B, SvcWrap64<SleepThread>, "SleepThread"},
+    {0x0C, SvcWrap64<GetThreadPriority>, "GetThreadPriority"},
+    {0x0D, SvcWrap64<SetThreadPriority>, "SetThreadPriority"},
+    {0x0E, SvcWrap64<GetThreadCoreMask>, "GetThreadCoreMask"},
+    {0x0F, SvcWrap64<SetThreadCoreMask>, "SetThreadCoreMask"},
+    {0x10, SvcWrap64<GetCurrentProcessorNumber>, "GetCurrentProcessorNumber"},
+    {0x11, SvcWrap64<SignalEvent>, "SignalEvent"},
+    {0x12, SvcWrap64<ClearEvent>, "ClearEvent"},
+    {0x13, SvcWrap64<MapSharedMemory>, "MapSharedMemory"},
+    {0x14, SvcWrap64<UnmapSharedMemory>, "UnmapSharedMemory"},
+    {0x15, SvcWrap64<CreateTransferMemory>, "CreateTransferMemory"},
+    {0x16, SvcWrap64<CloseHandle>, "CloseHandle"},
+    {0x17, SvcWrap64<ResetSignal>, "ResetSignal"},
+    {0x18, SvcWrap64<WaitSynchronization>, "WaitSynchronization"},
+    {0x19, SvcWrap64<CancelSynchronization>, "CancelSynchronization"},
+    {0x1A, SvcWrap64<ArbitrateLock>, "ArbitrateLock"},
+    {0x1B, SvcWrap64<ArbitrateUnlock>, "ArbitrateUnlock"},
+    {0x1C, SvcWrap64<WaitProcessWideKeyAtomic>, "WaitProcessWideKeyAtomic"},
+    {0x1D, SvcWrap64<SignalProcessWideKey>, "SignalProcessWideKey"},
+    {0x1E, SvcWrap64<GetSystemTick>, "GetSystemTick"},
+    {0x1F, SvcWrap64<ConnectToNamedPort>, "ConnectToNamedPort"},
    {0x20, nullptr, "SendSyncRequestLight"},
-    {0x21, SvcWrap<SendSyncRequest>, "SendSyncRequest"},
+    {0x21, SvcWrap64<SendSyncRequest>, "SendSyncRequest"},
    {0x22, nullptr, "SendSyncRequestWithUserBuffer"},
    {0x23, nullptr, "SendAsyncRequestWithUserBuffer"},
-    {0x24, SvcWrap<GetProcessId>, "GetProcessId"},
-    {0x25, SvcWrap<GetThreadId>, "GetThreadId"},
-    {0x26, SvcWrap<Break>, "Break"},
-    {0x27, SvcWrap<OutputDebugString>, "OutputDebugString"},
+    {0x24, SvcWrap64<GetProcessId>, "GetProcessId"},
+    {0x25, SvcWrap64<GetThreadId>, "GetThreadId"},
+    {0x26, SvcWrap64<Break>, "Break"},
+    {0x27, SvcWrap64<OutputDebugString>, "OutputDebugString"},
    {0x28, nullptr, "ReturnFromException"},
-    {0x29, SvcWrap<GetInfo>, "GetInfo"},
+    {0x29, SvcWrap64<GetInfo>, "GetInfo"},
    {0x2A, nullptr, "FlushEntireDataCache"},
    {0x2B, nullptr, "FlushDataCache"},
-    {0x2C, SvcWrap<MapPhysicalMemory>, "MapPhysicalMemory"},
-    {0x2D, SvcWrap<UnmapPhysicalMemory>, "UnmapPhysicalMemory"},
+    {0x2C, SvcWrap64<MapPhysicalMemory>, "MapPhysicalMemory"},
+    {0x2D, SvcWrap64<UnmapPhysicalMemory>, "UnmapPhysicalMemory"},
    {0x2E, nullptr, "GetFutureThreadInfo"},
    {0x2F, nullptr, "GetLastThreadInfo"},
-    {0x30, SvcWrap<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"},
-    {0x31, SvcWrap<GetResourceLimitCurrentValue>, "GetResourceLimitCurrentValue"},
-    {0x32, SvcWrap<SetThreadActivity>, "SetThreadActivity"},
-    {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"},
-    {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"},
-    {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"},
+    {0x30, SvcWrap64<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"},
+    {0x31, SvcWrap64<GetResourceLimitCurrentValue>, "GetResourceLimitCurrentValue"},
+    {0x32, SvcWrap64<SetThreadActivity>, "SetThreadActivity"},
+    {0x33, SvcWrap64<GetThreadContext>, "GetThreadContext"},
+    {0x34, SvcWrap64<WaitForAddress>, "WaitForAddress"},
+    {0x35, SvcWrap64<SignalToAddress>, "SignalToAddress"},
    {0x36, nullptr, "SynchronizePreemptionState"},
    {0x37, nullptr, "Unknown"},
    {0x38, nullptr, "Unknown"},
    {0x39, nullptr, "Unknown"},
    {0x3A, nullptr, "Unknown"},
    {0x3B, nullptr, "Unknown"},
-    {0x3C, SvcWrap<KernelDebug>, "KernelDebug"},
-    {0x3D, SvcWrap<ChangeKernelTraceState>, "ChangeKernelTraceState"},
+    {0x3C, SvcWrap64<KernelDebug>, "KernelDebug"},
+    {0x3D, SvcWrap64<ChangeKernelTraceState>, "ChangeKernelTraceState"},
    {0x3E, nullptr, "Unknown"},
    {0x3F, nullptr, "Unknown"},
    {0x40, nullptr, "CreateSession"},
@@ -2387,7 +2577,7 @@ static const FunctionDef SVC_Table[] = {
    {0x42, nullptr, "ReplyAndReceiveLight"},
    {0x43, nullptr, "ReplyAndReceive"},
    {0x44, nullptr, "ReplyAndReceiveWithUserBuffer"},
-    {0x45, SvcWrap<CreateEvent>, "CreateEvent"},
+    {0x45, SvcWrap64<CreateEvent>, "CreateEvent"},
    {0x46, nullptr, "Unknown"},
    {0x47, nullptr, "Unknown"},
    {0x48, nullptr, "MapPhysicalMemoryUnsafe"},
@@ -2398,9 +2588,9 @@ static const FunctionDef SVC_Table[] = {
    {0x4D, nullptr, "SleepSystem"},
    {0x4E, nullptr, "ReadWriteRegister"},
    {0x4F, nullptr, "SetProcessActivity"},
-    {0x50, SvcWrap<CreateSharedMemory>, "CreateSharedMemory"},
-    {0x51, SvcWrap<MapTransferMemory>, "MapTransferMemory"},
-    {0x52, SvcWrap<UnmapTransferMemory>, "UnmapTransferMemory"},
+    {0x50, SvcWrap64<CreateSharedMemory>, "CreateSharedMemory"},
+    {0x51, SvcWrap64<MapTransferMemory>, "MapTransferMemory"},
+    {0x52, SvcWrap64<UnmapTransferMemory>, "UnmapTransferMemory"},
    {0x53, nullptr, "CreateInterruptEvent"},
    {0x54, nullptr, "QueryPhysicalAddress"},
    {0x55, nullptr, "QueryIoMapping"},
@@ -2419,8 +2609,8 @@ static const FunctionDef SVC_Table[] = {
    {0x62, nullptr, "TerminateDebugProcess"},
    {0x63, nullptr, "GetDebugEvent"},
    {0x64, nullptr, "ContinueDebugEvent"},
-    {0x65, SvcWrap<GetProcessList>, "GetProcessList"},
-    {0x66, SvcWrap<GetThreadList>, "GetThreadList"},
+    {0x65, SvcWrap64<GetProcessList>, "GetProcessList"},
+    {0x66, SvcWrap64<GetThreadList>, "GetThreadList"},
    {0x67, nullptr, "GetDebugThreadContext"},
    {0x68, nullptr, "SetDebugThreadContext"},
    {0x69, nullptr, "QueryDebugProcessMemory"},
@@ -2436,24 +2626,32 @@ static const FunctionDef SVC_Table[] = {
    {0x73, nullptr, "SetProcessMemoryPermission"},
    {0x74, nullptr, "MapProcessMemory"},
    {0x75, nullptr, "UnmapProcessMemory"},
-    {0x76, SvcWrap<QueryProcessMemory>, "QueryProcessMemory"},
-    {0x77, SvcWrap<MapProcessCodeMemory>, "MapProcessCodeMemory"},
-    {0x78, SvcWrap<UnmapProcessCodeMemory>, "UnmapProcessCodeMemory"},
+    {0x76, SvcWrap64<QueryProcessMemory>, "QueryProcessMemory"},
+    {0x77, SvcWrap64<MapProcessCodeMemory>, "MapProcessCodeMemory"},
+    {0x78, SvcWrap64<UnmapProcessCodeMemory>, "UnmapProcessCodeMemory"},
    {0x79, nullptr, "CreateProcess"},
    {0x7A, nullptr, "StartProcess"},
    {0x7B, nullptr, "TerminateProcess"},
-    {0x7C, SvcWrap<GetProcessInfo>, "GetProcessInfo"},
-    {0x7D, SvcWrap<CreateResourceLimit>, "CreateResourceLimit"},
-    {0x7E, SvcWrap<SetResourceLimitLimitValue>, "SetResourceLimitLimitValue"},
+    {0x7C, SvcWrap64<GetProcessInfo>, "GetProcessInfo"},
+    {0x7D, SvcWrap64<CreateResourceLimit>, "CreateResourceLimit"},
+    {0x7E, SvcWrap64<SetResourceLimitLimitValue>, "SetResourceLimitLimitValue"},
    {0x7F, nullptr, "CallSecureMonitor"},
 };

-static const FunctionDef* GetSVCInfo(u32 func_num) {
-    if (func_num >= std::size(SVC_Table)) {
+static const FunctionDef* GetSVCInfo32(u32 func_num) {
+    if (func_num >= std::size(SVC_Table_32)) { // past the end of the 32-bit table
        LOG_ERROR(Kernel_SVC, "Unknown svc=0x{:02X}", func_num);
        return nullptr;
    }
-    return &SVC_Table[func_num];
+    return &SVC_Table_32[func_num]; // the entry's func may still be nullptr
+}
+
+static const FunctionDef* GetSVCInfo64(u32 func_num) {
+    if (func_num < std::size(SVC_Table_64)) {
+        return &SVC_Table_64[func_num]; // in range: hand back the 64-bit table entry
+    }
+    LOG_ERROR(Kernel_SVC, "Unknown svc=0x{:02X}", func_num);
+    return nullptr;
 }

 MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70));
@@ -2464,7 +2662,8 @@ void CallSVC(Core::System& system, u32 immediate) {
    // Lock the global kernel mutex when we enter the kernel HLE.
    std::lock_guard lock{HLE::g_hle_lock};

-    const FunctionDef* info = GetSVCInfo(immediate);
+    const FunctionDef* info = system.CurrentProcess()->Is64BitProcess() ? GetSVCInfo64(immediate)
+                                                                        : GetSVCInfo32(immediate);
    if (info) {
        if (info->func) {
            info->func(system);
--- a/src/core/hle/kernel/svc_wrap.h
+++ b/src/core/hle/kernel/svc_wrap.h
@@ -15,6 +15,10 @@ static inline u64 Param(const Core::System& system, int n) {
    return system.CurrentArmInterface().GetReg(n);
 }

+static inline u32 Param32(const Core::System& system, int n) {
+    return static_cast<u32>(system.CurrentArmInterface().GetReg(n)); // register n narrowed to u32
+}
+
 /**
 * HLE a function return from the current ARM userland process
 * @param system System context
@@ -24,40 +28,44 @@ static inline void FuncReturn(Core::System& system, u64 result) {
    system.CurrentArmInterface().SetReg(0, result);
 }

+static inline void FuncReturn32(Core::System& system, u32 result) {
+    system.CurrentArmInterface().SetReg(0, static_cast<u64>(result)); // widened u32 into reg 0
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Function wrappers that return type ResultCode

 template <ResultCode func(Core::System&, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0)).raw);
 }

 template <ResultCode func(Core::System&, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1)).raw);
 }

 template <ResultCode func(Core::System&, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw);
 }

 template <ResultCode func(Core::System&, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(
        system,
        func(system, static_cast<u32>(Param(system, 0)), static_cast<u32>(Param(system, 1))).raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1),
                            Param(system, 2), Param(system, 3))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32*)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param = 0;
    const u32 retval = func(system, &param).raw;
    system.CurrentArmInterface().SetReg(1, param);
@@ -65,7 +73,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, static_cast<u32>(Param(system, 1))).raw;
    system.CurrentArmInterface().SetReg(1, param_1);
@@ -73,7 +81,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u32*)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    u32 param_2 = 0;
    const u32 retval = func(system, &param_1, &param_2).raw;
@@ -86,7 +94,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1)).raw;
    system.CurrentArmInterface().SetReg(1, param_1);
@@ -94,7 +102,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval =
        func(system, &param_1, Param(system, 1), static_cast<u32>(Param(system, 2))).raw;
@@ -104,7 +112,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64*, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval = func(system, &param_1, static_cast<u32>(Param(system, 1))).raw;

@@ -113,12 +121,12 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1))).raw);
 }

 template <ResultCode func(Core::System&, u64*, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1)).raw;

@@ -127,7 +135,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64*, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval = func(system, &param_1, static_cast<u32>(Param(system, 1)),
                            static_cast<u32>(Param(system, 2)))
@@ -138,19 +146,19 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1)).raw);
 }

 template <ResultCode func(Core::System&, u32, u32, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)),
                            static_cast<u32>(Param(system, 1)), Param(system, 2))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32, u32*, u64*)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    u64 param_2 = 0;
    const ResultCode retval = func(system, static_cast<u32>(Param(system, 2)), &param_1, &param_2);
@@ -161,54 +169,54 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u64, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1),
                            static_cast<u32>(Param(system, 2)), static_cast<u32>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64, u64, u32, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1),
                            static_cast<u32>(Param(system, 2)), Param(system, 3))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1),
                            static_cast<u32>(Param(system, 2)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1), Param(system, 2)).raw);
 }

 template <ResultCode func(Core::System&, u64, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(
        system,
        func(system, Param(system, 0), Param(system, 1), static_cast<u32>(Param(system, 2))).raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0)), Param(system, 1),
                            Param(system, 2), static_cast<u32>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u32, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(
        system,
        func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2)).raw);
 }

 template <ResultCode func(Core::System&, u32*, u64, u64, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), static_cast<u32>(Param(system, 2)),
                            static_cast<s64>(Param(system, 3)))
@@ -219,14 +227,14 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u64, u32, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1),
                            static_cast<u32>(Param(system, 2)), static_cast<s64>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64*, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u64 param_1 = 0;
    const u32 retval =
        func(system, &param_1, Param(system, 1), Param(system, 2), Param(system, 3)).raw;
@@ -236,7 +244,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64, u64, u64, u32, s32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), Param(system, 2), Param(system, 3),
                            static_cast<u32>(Param(system, 4)), static_cast<s32>(Param(system, 5)))
@@ -247,7 +255,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u32*, u64, u64, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), Param(system, 2),
                            static_cast<u32>(Param(system, 3)))
@@ -258,7 +266,7 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, Handle*, u64, u32, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    u32 param_1 = 0;
    const u32 retval = func(system, &param_1, Param(system, 1), static_cast<u32>(Param(system, 2)),
                            static_cast<u32>(Param(system, 3)))
@@ -269,14 +277,14 @@ void SvcWrap(Core::System& system) {
 }

 template <ResultCode func(Core::System&, u64, u32, s32, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1)),
                            static_cast<s32>(Param(system, 2)), static_cast<s64>(Param(system, 3)))
                           .raw);
 }

 template <ResultCode func(Core::System&, u64, u32, s32, s32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1)),
                            static_cast<s32>(Param(system, 2)), static_cast<s32>(Param(system, 3)))
                           .raw);
@@ -286,7 +294,7 @@ void SvcWrap(Core::System& system) {
 // Function wrappers that return type u32

 template <u32 func(Core::System&)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system));
 }

@@ -294,7 +302,7 @@ void SvcWrap(Core::System& system) {
 // Function wrappers that return type u64

 template <u64 func(Core::System&)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) {
    FuncReturn(system, func(system));
 }

@@ -302,44 +310,110 @@ void SvcWrap(Core::System& system) {
 /// Function wrappers that return type void

 template <void func(Core::System&)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // no guest arguments are read and nothing is returned
    func(system);
 }

 template <void func(Core::System&, u32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Param 0 is cast to u32; nothing is returned
    func(system, static_cast<u32>(Param(system, 0)));
 }

 template <void func(Core::System&, u32, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Param 0 is cast to u32; Params 1-3 are passed uncast
    func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2),
         Param(system, 3));
 }

 template <void func(Core::System&, s64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Param 0 is cast to s64; nothing is returned
    func(system, static_cast<s64>(Param(system, 0)));
 }

 template <void func(Core::System&, u64, s32)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Param 0 is passed uncast; Param 1 is cast to s32
    func(system, Param(system, 0), static_cast<s32>(Param(system, 1)));
 }

 template <void func(Core::System&, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Params 0-1 are passed uncast; nothing is returned
    func(system, Param(system, 0), Param(system, 1));
 }

 template <void func(Core::System&, u64, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Params 0-2 are passed uncast; nothing is returned
    func(system, Param(system, 0), Param(system, 1), Param(system, 2));
 }

 template <void func(Core::System&, u32, u64, u64)>
-void SvcWrap(Core::System& system) {
+void SvcWrap64(Core::System& system) { // Param 0 is cast to u32; Params 1-2 are passed uncast
    func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2));
 }

+// Used by QueryMemory32 (in: Param32 0-2; the raw result code goes to FuncReturn32)
+template <ResultCode func(Core::System&, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    const u32 result = func(system, Param32(system, 0), Param32(system, 1), Param32(system, 2)).raw;
+    FuncReturn32(system, result);
+}
+
+// Used by GetInfo32 (in: Param32 0-3, out: registers 1 and 2)
+template <ResultCode func(Core::System&, u32*, u32*, u32, u32, u32, u32)>
+void SvcWrap32(Core::System& system) {
+    u32 out_1{};
+    u32 out_2{};
+
+    const u32 result = func(system, &out_1, &out_2, Param32(system, 0), Param32(system, 1),
+                            Param32(system, 2), Param32(system, 3))
+                           .raw;
+    // Store both outputs in registers 1 and 2, then report the result code.
+    system.CurrentArmInterface().SetReg(1, out_1);
+    system.CurrentArmInterface().SetReg(2, out_2);
+    FuncReturn(system, result);
+}
+
+// Used by GetThreadPriority32, ConnectToNamedPort32 (in: Param32 1, out: register 1)
+template <ResultCode func(Core::System&, u32*, u32)>
+void SvcWrap32(Core::System& system) {
+    u32 output{};
+    const u32 result = func(system, &output, Param32(system, 1)).raw;
+    system.CurrentArmInterface().SetReg(1, output);
+    FuncReturn(system, result);
+}
+
+// Used by GetThreadId32 (in: Param32 1, out: registers 1 and 2)
+template <ResultCode func(Core::System&, u32*, u32*, u32)>
+void SvcWrap32(Core::System& system) {
+    u32 out_1{};
+    u32 out_2{};
+    const u32 result = func(system, &out_1, &out_2, Param32(system, 1)).raw;
+
+    system.CurrentArmInterface().SetReg(1, out_1);
+    system.CurrentArmInterface().SetReg(2, out_2);
+    FuncReturn(system, result);
+}
+
+// Used by SignalProcessWideKey32 (in: Param32 0 as u32, Param32 1 cast to s32; no return value)
+template <void func(Core::System&, u32, s32)>
+void SvcWrap32(Core::System& system) {
+    func(system, Param32(system, 0), static_cast<s32>(Param32(system, 1)));
+}
+
+// Used by SendSyncRequest32 (in: Param32 0; the raw result code goes to FuncReturn)
+template <ResultCode func(Core::System&, u32)>
+void SvcWrap32(Core::System& system) {
+    FuncReturn(system, func(system, Param32(system, 0)).raw);
+}
+
+// Used by WaitSynchronization32 (in: Param32 0-3, out: register 1)
+template <ResultCode func(Core::System&, u32, u32, s32, u32, Handle*)>
+void SvcWrap32(Core::System& system) {
+    u32 output{};
+    const u32 result = func(system, Param32(system, 0), Param32(system, 1),
+                            static_cast<s32>(Param32(system, 2)), Param32(system, 3), &output)
+                           .raw;
+    system.CurrentArmInterface().SetReg(1, output);
+    FuncReturn(system, result);
+}
+
 } // namespace Kernel
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -133,15 +133,16 @@ void Thread::CancelWait() {
    ResumeFromWait();
 }

-/**
- * Resets a thread context, making it ready to be scheduled and run by the CPU
- * @param context Thread context to reset
- * @param stack_top Address of the top of the stack
- * @param entry_point Address of entry point for execution
- * @param arg User argument for thread
- */
-static void ResetThreadContext(Core::ARM_Interface::ThreadContext& context, VAddr stack_top,
-                               VAddr entry_point, u64 arg) {
+static void ResetThreadContext32(Core::ARM_Interface::ThreadContext32& context, u32 stack_top,
+                                 u32 entry_point, u32 arg) {
+    context = {};                            // start from a value-initialized context
+    context.cpu_registers[13] = stack_top;   // r13 receives the top of the stack
+    context.cpu_registers[15] = entry_point; // r15 receives the entry point address
+    context.cpu_registers[0] = arg;          // r0 receives the user argument
+}
+
+static void ResetThreadContext64(Core::ARM_Interface::ThreadContext64& context, VAddr stack_top,
+                                 VAddr entry_point, u64 arg) {
    context = {};
    context.cpu_registers[0] = arg;
    context.pc = entry_point;
@@ -198,9 +199,9 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin

    thread->owner_process->RegisterThread(thread.get());

-    // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
-    // to initialize the context
-    ResetThreadContext(thread->context, stack_top, entry_point, arg);
+    ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
+                         static_cast<u32>(entry_point), static_cast<u32>(arg));
+    ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);

    return MakeResult<std::shared_ptr<Thread>>(std::move(thread));
 }
@@ -213,11 +214,13 @@ void Thread::SetPriority(u32 priority) {
 }

 void Thread::SetWaitSynchronizationResult(ResultCode result) {
-    context.cpu_registers[0] = result.raw;
+    context_32.cpu_registers[0] = result.raw; // register 0 of both contexts gets the raw result
+    context_64.cpu_registers[0] = result.raw;
 }

 void Thread::SetWaitSynchronizationOutput(s32 output) {
-    context.cpu_registers[1] = output;
+    context_32.cpu_registers[1] = output; // register 1 of both contexts gets the output value
+    context_64.cpu_registers[1] = output;
 }

 s32 Thread::GetSynchronizationObjectIndex(std::shared_ptr<SynchronizationObject> object) const {
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -102,7 +102,8 @@ public:

    using MutexWaitingThreads = std::vector<std::shared_ptr<Thread>>;

-    using ThreadContext = Core::ARM_Interface::ThreadContext;
+    using ThreadContext32 = Core::ARM_Interface::ThreadContext32;
+    using ThreadContext64 = Core::ARM_Interface::ThreadContext64;

    using ThreadSynchronizationObjects = std::vector<std::shared_ptr<SynchronizationObject>>;

@@ -273,12 +274,20 @@ public:
        return status == ThreadStatus::WaitSynch;
    }

-    ThreadContext& GetContext() {
-        return context;
+    ThreadContext32& GetContext32() { // mutable view of the ThreadContext32 member
+        return context_32;
    }

-    const ThreadContext& GetContext() const {
-        return context;
+    const ThreadContext32& GetContext32() const { // read-only view of the ThreadContext32 member
+        return context_32;
+    }
+
+    ThreadContext64& GetContext64() { // mutable view of the ThreadContext64 member
+        return context_64;
+    }
+
+    const ThreadContext64& GetContext64() const { // read-only view of the ThreadContext64 member
+        return context_64;
    }

    ThreadStatus GetStatus() const {
@@ -466,7 +475,8 @@ private:
    void AdjustSchedulingOnPriority(u32 old_priority);
    void AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core);

-    Core::ARM_Interface::ThreadContext context{};
+    ThreadContext32 context_32{};
+    ThreadContext64 context_64{};

    u64 thread_id = 0;

--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -191,8 +191,6 @@ void NVFlinger::Compose() {
        // Search for a queued buffer and acquire it
        auto buffer = buffer_queue.AcquireBuffer();

-        MicroProfileFlip();
-
        if (!buffer) {
            continue;
        }
@@ -206,6 +204,8 @@ void NVFlinger::Compose() {
            gpu.WaitFence(fence.id, fence.value);
        }

+        MicroProfileFlip();
+
        // Now send the buffer to the GPU for drawing.
        // TODO(Subv): Support more than just disp0. The display device selection is probably based
        // on which display we're drawing (Default, Internal, External, etc)
--- a/src/core/loader/deconstructed_rom_directory.cpp
+++ b/src/core/loader/deconstructed_rom_directory.cpp
@@ -129,12 +129,6 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
    }
    metadata.Print();

-    const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()};
-    if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit ||
-        arch_bits == FileSys::ProgramAddressSpaceType::Is32BitNoMap) {
-        return {ResultStatus::Error32BitISA, {}};
-    }
-
    if (process.LoadFromMetadata(metadata).IsError()) {
        return {ResultStatus::ErrorUnableToParseKernelMetadata, {}};
    }
--- a/src/core/reporter.cpp
+++ b/src/core/reporter.cpp
@@ -111,7 +111,7 @@ json GetProcessorStateDataAuto(Core::System& system) {
    const auto& vm_manager{process->VMManager()};
    auto& arm{system.CurrentArmInterface()};

-    Core::ARM_Interface::ThreadContext context{};
+    Core::ARM_Interface::ThreadContext64 context{};
    arm.SaveContext(context);

    return GetProcessorStateData(process->Is64BitProcess() ? "AArch64" : "AArch32",
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -430,6 +430,7 @@ struct Values {

    float resolution_factor;
    int aspect_ratio;
+    int max_anisotropy;
    bool use_frame_limit;
    u16 frame_limit;
    bool use_disk_shader_cache;
--- a/src/input_common/udp/client.cpp
+++ b/src/input_common/udp/client.cpp
@@ -32,8 +32,16 @@ public:
                    SocketCallback callback)
        : callback(std::move(callback)), timer(io_service),
          socket(io_service, udp::endpoint(udp::v4(), 0)), client_id(client_id),
-          pad_index(pad_index),
-          send_endpoint(udp::endpoint(boost::asio::ip::make_address_v4(host), port)) {}
+          pad_index(pad_index) {
+        boost::system::error_code ec{};
+        auto ipv4 = boost::asio::ip::make_address_v4(host, ec);
+        if (ec.failed()) {
+            LOG_ERROR(Input, "Invalid IPv4 address \"{}\" provided to socket", host);
+            ipv4 = boost::asio::ip::address_v4{};
+        }
+
+        send_endpoint = {udp::endpoint(ipv4, port)};
+    }

    void Stop() {
        io_service.stop();
@@ -85,17 +93,18 @@ private:
    }

    void HandleSend(const boost::system::error_code& error) {
+        boost::system::error_code _ignored{}; // send_to reports failures here instead of throwing
        // Send a request for getting port info for the pad
        Request::PortInfo port_info{1, {pad_index, 0, 0, 0}};
        const auto port_message = Request::Create(port_info, client_id);
        std::memcpy(&send_buffer1, &port_message, PORT_INFO_SIZE);
-        socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint);
+        socket.send_to(boost::asio::buffer(send_buffer1), send_endpoint, {}, _ignored);

        // Send a request for getting pad data for the pad
        Request::PadData pad_data{Request::PadData::Flags::Id, pad_index, EMPTY_MAC_ADDRESS};
        const auto pad_message = Request::Create(pad_data, client_id);
        std::memcpy(send_buffer2.data(), &pad_message, PAD_DATA_SIZE);
-        socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint);
+        socket.send_to(boost::asio::buffer(send_buffer2), send_endpoint, {}, _ignored);
        StartSend(timer.expiry());
    }

--- a/src/input_common/udp/protocol.cpp
+++ b/src/input_common/udp/protocol.cpp
@@ -31,7 +31,6 @@ namespace Response {
 */
 std::optional<Type> Validate(u8* data, std::size_t size) {
    if (size < sizeof(Header)) {
-        LOG_DEBUG(Input, "Invalid UDP packet received");
        return std::nullopt;
    }
    Header header{};
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -2,6 +2,8 @@ add_library(video_core STATIC
    buffer_cache/buffer_block.h
    buffer_cache/buffer_cache.h
    buffer_cache/map_interval.h
+    dirty_flags.cpp
+    dirty_flags.h
    dma_pusher.cpp
    dma_pusher.h
    engines/const_buffer_engine_interface.h
@@ -63,14 +65,12 @@ add_library(video_core STATIC
    renderer_opengl/gl_shader_decompiler.h
    renderer_opengl/gl_shader_disk_cache.cpp
    renderer_opengl/gl_shader_disk_cache.h
-    renderer_opengl/gl_shader_gen.cpp
-    renderer_opengl/gl_shader_gen.h
    renderer_opengl/gl_shader_manager.cpp
    renderer_opengl/gl_shader_manager.h
    renderer_opengl/gl_shader_util.cpp
    renderer_opengl/gl_shader_util.h
-    renderer_opengl/gl_state.cpp
-    renderer_opengl/gl_state.h
+    renderer_opengl/gl_state_tracker.cpp
+    renderer_opengl/gl_state_tracker.h
    renderer_opengl/gl_stream_buffer.cpp
    renderer_opengl/gl_stream_buffer.h
    renderer_opengl/gl_texture_cache.cpp
@@ -116,8 +116,6 @@ add_library(video_core STATIC
    shader/ast.h
    shader/compiler_settings.cpp
    shader/compiler_settings.h
-    shader/const_buffer_locker.cpp
-    shader/const_buffer_locker.h
    shader/control_flow.cpp
    shader/control_flow.h
    shader/decode.cpp
@@ -126,9 +124,13 @@ add_library(video_core STATIC
    shader/node_helper.cpp
    shader/node_helper.h
    shader/node.h
+    shader/registry.cpp
+    shader/registry.h
    shader/shader_ir.cpp
    shader/shader_ir.h
    shader/track.cpp
+    shader/transform_feedback.cpp
+    shader/transform_feedback.h
    surface.cpp
    surface.h
    texture_cache/format_lookup_table.cpp
@@ -198,6 +200,8 @@ if (ENABLE_VULKAN)
        renderer_vulkan/vk_shader_util.h
        renderer_vulkan/vk_staging_buffer_pool.cpp
        renderer_vulkan/vk_staging_buffer_pool.h
+        renderer_vulkan/vk_state_tracker.cpp
+        renderer_vulkan/vk_state_tracker.h
        renderer_vulkan/vk_stream_buffer.cpp
        renderer_vulkan/vk_stream_buffer.h
        renderer_vulkan/vk_swapchain.cpp
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -0,0 +1,46 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <cstddef>
+
+#include "common/common_types.h"
+#include "video_core/dirty_flags.h"
+
+#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name)
+#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32))
+
+namespace VideoCommon::Dirty {
+
+using Tegra::Engines::Maxwell3D;
+
+void SetupCommonOnWriteStores(Tegra::Engines::Maxwell3D::DirtyState::Flags& store) {
+    store[ZetaBuffer] = true;
+    store[RenderTargets] = true;
+    for (std::size_t rt = 0; rt < Maxwell3D::Regs::NumRenderTargets; ++rt) {
+        store[ColorBuffer0 + rt] = true; // one color buffer flag per render target
+    }
+}
+
+// Maps render target and zeta registers to their dirty flags in tables[0] and tables[1].
+void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
+    static constexpr std::size_t first = OFF(rt);
+    static constexpr std::size_t per_rt = NUM(rt[0]);
+    for (std::size_t i = 0; i < Maxwell3D::Regs::NumRenderTargets; ++i) {
+        FillBlock(tables, first + i * per_rt, per_rt, ColorBuffer0 + i, RenderTargets);
+    }
+
+    // zeta_flags[n] is the flag stored in tables[n] for every zeta register.
+    static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
+    for (std::size_t n = 0; n < std::size(zeta_flags); ++n) {
+        const u8 zeta_flag = zeta_flags[n];
+        auto& zeta_table = tables[n];
+        zeta_table[OFF(zeta_enable)] = zeta_flag;
+        zeta_table[OFF(zeta_width)] = zeta_flag;
+        zeta_table[OFF(zeta_height)] = zeta_flag;
+        FillBlock(zeta_table, OFF(zeta), NUM(zeta), zeta_flag);
+    }
+}
+
+} // namespace VideoCommon::Dirty
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -0,0 +1,51 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+
+namespace VideoCommon::Dirty {
+
+enum : u8 {
+    NullEntry = 0,
+    // NOTE(review): NullEntry presumably marks table entries that track no flag — confirm
+    RenderTargets,
+    ColorBuffer0, // ColorBuffer0 + i addresses the flag of render target i
+    ColorBuffer1,
+    ColorBuffer2,
+    ColorBuffer3,
+    ColorBuffer4,
+    ColorBuffer5,
+    ColorBuffer6,
+    ColorBuffer7,
+    ZetaBuffer,
+    // NOTE(review): looks like the first index after the common entries — confirm with users
+    LastCommonEntry,
+};
+
+/// Sets the num entries of table starting at begin to dirty_index, converted to u8.
+template <typename Integer>
+void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Table& table, std::size_t begin,
+               std::size_t num, Integer dirty_index) {
+    std::fill_n(std::begin(table) + begin, num, static_cast<u8>(dirty_index));
+}
+
+template <typename Integer1, typename Integer2>
+void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_t begin,
+               std::size_t num, Integer1 index_a, Integer2 index_b) {
+    FillBlock(tables[1], begin, num, index_b); // same range in tables[1] receives index_b
+    FillBlock(tables[0], begin, num, index_a); // same range in tables[0] receives index_a
+}
+/// Sets the RenderTargets, ZetaBuffer and per-render-target ColorBuffer flags in store.
+void SetupCommonOnWriteStores(Tegra::Engines::Maxwell3D::DirtyState::Flags& store);
+/// Fills tables with the dirty flag assigned to each render target and zeta register.
+void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
+
+} // namespace VideoCommon::Dirty
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() {
    MICROPROFILE_SCOPE(DispatchCalls);

    // On entering GPU code, assume all memory may be touched by the ARM core.
-    gpu.Maxwell3D().dirty.OnMemoryWrite();
+    gpu.Maxwell3D().OnMemoryWrite();

    dma_pushbuffer_subindex = 0;

--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -16,11 +16,12 @@ namespace Tegra::Engines {

 struct SamplerDescriptor {
    union {
-        BitField<0, 20, Tegra::Shader::TextureType> texture_type;
-        BitField<20, 1, u32> is_array;
-        BitField<21, 1, u32> is_buffer;
-        BitField<22, 1, u32> is_shadow;
-        u32 raw{};
+        u32 raw = 0;
+        BitField<0, 2, Tegra::Shader::TextureType> texture_type;
+        BitField<2, 3, Tegra::Texture::ComponentType> component_type;
+        BitField<5, 1, u32> is_array;
+        BitField<6, 1, u32> is_buffer;
+        BitField<7, 1, u32> is_shadow;
    };

    bool operator==(const SamplerDescriptor& rhs) const noexcept {
@@ -31,68 +32,48 @@ struct SamplerDescriptor {
        return !operator==(rhs);
    }

-    static SamplerDescriptor FromTicTexture(Tegra::Texture::TextureType tic_texture_type) {
+    static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) {
+        using Tegra::Shader::TextureType;
        SamplerDescriptor result;
-        switch (tic_texture_type) {
+        // Fields not assigned below stay zero: raw is default-initialized to 0 in the union.
+        // The component type is only used to determine the shading language type, so r_type
+        // alone is recorded instead of the type of every component on color textures.
+        result.component_type.Assign(tic.r_type.Value());
+
+        switch (tic.texture_type.Value()) {
        case Tegra::Texture::TextureType::Texture1D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture1D);
            return result;
        case Tegra::Texture::TextureType::Texture2D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
            return result;
        case Tegra::Texture::TextureType::Texture3D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture3D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture3D);
            return result;
        case Tegra::Texture::TextureType::TextureCubemap:
-            result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::TextureCube);
            return result;
        case Tegra::Texture::TextureType::Texture1DArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
+            result.texture_type.Assign(TextureType::Texture1D);
            result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
            return result;
        case Tegra::Texture::TextureType::Texture2DArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
+            result.texture_type.Assign(TextureType::Texture2D);
            result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
            return result;
        case Tegra::Texture::TextureType::Texture1DBuffer:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
-            result.is_array.Assign(0);
+            result.texture_type.Assign(TextureType::Texture1D);
            result.is_buffer.Assign(1);
-            result.is_shadow.Assign(0);
            return result;
        case Tegra::Texture::TextureType::Texture2DNoMipmap:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
            return result;
        case Tegra::Texture::TextureType::TextureCubeArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube);
+            result.texture_type.Assign(TextureType::TextureCube);
            result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
            return result;
        default:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D); // unlisted types fall back to 2D
            return result;
        }
    }
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -39,7 +39,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
+            system.GPU().Maxwell3D().OnMemoryWrite();
        }
        break;
    }
@@ -89,7 +89,7 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con

    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
    return result;
 }
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
+            system.GPU().Maxwell3D().OnMemoryWrite();
        }
        break;
    }
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -26,7 +26,8 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste
                     MemoryManager& memory_manager)
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
-    InitDirtySettings();
+    dirty.flags.flip();
+
    InitializeRegisterDefaults();
 }

@@ -75,8 +76,8 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.stencil_back_mask = 0xFFFFFFFF;

    regs.depth_test_func = Regs::ComparisonOp::Always;
-    regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise;
-    regs.cull.cull_face = Regs::Cull::CullFace::Back;
+    regs.front_face = Regs::FrontFace::CounterClockWise;
+    regs.cull_face = Regs::CullFace::Back;

    // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
    // register carrying a default value. Assume it's OpenGL's default (1).
@@ -95,7 +96,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.rasterize_enable = 1;
    regs.rt_separate_frag_data = 1;
    regs.framebuffer_srgb = 1;
-    regs.cull.front_face = Maxwell3D::Regs::Cull::FrontFace::ClockWise;
+    regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;

    mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_end_gl)] = true;
    mme_inline[MAXWELL3D_REG_INDEX(draw.vertex_begin_gl)] = true;
@@ -103,164 +104,6 @@ void Maxwell3D::InitializeRegisterDefaults() {
    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }

-#define DIRTY_REGS_POS(field_name) static_cast<u8>(offsetof(Maxwell3D::DirtyRegs, field_name))
-
-void Maxwell3D::InitDirtySettings() {
-    const auto set_block = [this](std::size_t start, std::size_t range, u8 position) {
-        const auto start_itr = dirty_pointers.begin() + start;
-        const auto end_itr = start_itr + range;
-        std::fill(start_itr, end_itr, position);
-    };
-    dirty.regs.fill(true);
-
-    // Init Render Targets
-    constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
-    constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
-    constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8;
-    u8 rt_dirty_reg = DIRTY_REGS_POS(render_target);
-    for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
-        set_block(rt_reg, registers_per_rt, rt_dirty_reg);
-        ++rt_dirty_reg;
-    }
-    constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
-    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
-    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag;
-    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag;
-    constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
-    constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta);
-    set_block(zeta_reg, registers_in_zeta, depth_buffer_flag);
-
-    // Init Vertex Arrays
-    constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
-    constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
-    constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
-    u8 va_dirty_reg = DIRTY_REGS_POS(vertex_array);
-    u8 vi_dirty_reg = DIRTY_REGS_POS(vertex_instance);
-    for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
-         vertex_reg += vertex_array_size) {
-        set_block(vertex_reg, 3, va_dirty_reg);
-        // The divisor concerns vertex array instances
-        dirty_pointers[static_cast<std::size_t>(vertex_reg) + 3] = vi_dirty_reg;
-        ++va_dirty_reg;
-        ++vi_dirty_reg;
-    }
-    constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
-    constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
-    constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
-    va_dirty_reg = DIRTY_REGS_POS(vertex_array);
-    for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
-         vertex_reg += vertex_limit_size) {
-        set_block(vertex_reg, vertex_limit_size, va_dirty_reg);
-        va_dirty_reg++;
-    }
-    constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
-    constexpr u32 vertex_instance_size =
-        sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
-    constexpr u32 vertex_instance_end =
-        vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
-    vi_dirty_reg = DIRTY_REGS_POS(vertex_instance);
-    for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
-         vertex_reg += vertex_instance_size) {
-        set_block(vertex_reg, vertex_instance_size, vi_dirty_reg);
-        vi_dirty_reg++;
-    }
-    set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
-              DIRTY_REGS_POS(vertex_attrib_format));
-
-    // Init Shaders
-    constexpr u32 shader_registers_count =
-        sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
-    set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count,
-              DIRTY_REGS_POS(shaders));
-
-    // State
-
-    // Viewport
-    constexpr u8 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
-    constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
-    constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
-    set_block(viewport_start, viewport_size, viewport_dirty_reg);
-    constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control);
-    constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32);
-    set_block(view_volume_start, view_volume_size, viewport_dirty_reg);
-
-    // Viewport transformation
-    constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform);
-    constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32);
-    set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform));
-
-    // Cullmode
-    constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull);
-    constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32);
-    set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode));
-
-    // Screen y control
-    dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control);
-
-    // Primitive Restart
-    constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart);
-    constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32);
-    set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));
-
-    // Depth Test
-    constexpr u8 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
-    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;
-
-    // Stencil Test
-    constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test);
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;
-
-    // Color Mask
-    constexpr u8 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
-    dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
-    set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
-              color_mask_dirty_reg);
-    // Blend State
-    constexpr u8 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
-    set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
-              blend_state_dirty_reg);
-    dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
-    set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg);
-    set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32),
-              blend_state_dirty_reg);
-
-    // Scissor State
-    constexpr u8 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
-    set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
-              scissor_test_dirty_reg);
-
-    // Polygon Offset
-    constexpr u8 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
-    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;
-
-    // Depth bounds
-    constexpr u8 depth_bounds_values_dirty_reg = DIRTY_REGS_POS(depth_bounds_values);
-    dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[0])] = depth_bounds_values_dirty_reg;
-    dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[1])] = depth_bounds_values_dirty_reg;
-}
-
 void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
    // Reset the current macro.
    executing_macro = 0;
@@ -319,19 +162,9 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {

    if (regs.reg_array[method] != method_call.argument) {
        regs.reg_array[method] = method_call.argument;
-        const std::size_t dirty_reg = dirty_pointers[method];
-        if (dirty_reg) {
-            dirty.regs[dirty_reg] = true;
-            if (dirty_reg >= DIRTY_REGS_POS(vertex_array) &&
-                dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) {
-                dirty.vertex_array_buffers = true;
-            } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) &&
-                       dirty_reg < DIRTY_REGS_POS(vertex_instances)) {
-                dirty.vertex_instances = true;
-            } else if (dirty_reg >= DIRTY_REGS_POS(render_target) &&
-                       dirty_reg < DIRTY_REGS_POS(render_settings)) {
-                dirty.render_settings = true;
-            }
+
+        for (const auto& table : dirty.tables) {
+            dirty.flags[table[method]] = true;
        }
    }

@@ -419,7 +252,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            dirty.OnMemoryWrite();
+            OnMemoryWrite();
        }
        break;
    }
@@ -727,7 +560,7 @@ void Maxwell3D::FinishCBData() {

    const u32 id = cb_data_state.id;
    memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
-    dirty.OnMemoryWrite();
+    OnMemoryWrite();

    cb_data_state.id = null_cb_data;
    cb_data_state.current = null_cb_data;
@@ -805,7 +638,7 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b

    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
    return result;
 }
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,6 +6,7 @@

 #include <array>
 #include <bitset>
+#include <limits>
 #include <optional>
 #include <type_traits>
 #include <unordered_map>
@@ -66,6 +67,7 @@ public:
        static constexpr std::size_t NumVaryings = 31;
        static constexpr std::size_t NumImages = 8; // TODO(Rodrigo): Investigate this number
        static constexpr std::size_t NumClipDistances = 8;
+        static constexpr std::size_t NumTransformFeedbackBuffers = 4;
        static constexpr std::size_t MaxShaderProgram = 6;
        static constexpr std::size_t MaxShaderStage = 5;
        // Maximum number of const buffers per shader stage.
@@ -431,21 +433,15 @@ public:
            GeneratedPrimitives = 0x1F,
        };

-        struct Cull {
-            enum class FrontFace : u32 {
-                ClockWise = 0x0900,
-                CounterClockWise = 0x0901,
-            };
+        enum class FrontFace : u32 {
+            ClockWise = 0x0900,
+            CounterClockWise = 0x0901,
+        };

-            enum class CullFace : u32 {
-                Front = 0x0404,
-                Back = 0x0405,
-                FrontAndBack = 0x0408,
-            };
-
-            u32 enabled;
-            FrontFace front_face;
-            CullFace cull_face;
+        enum class CullFace : u32 {
+            Front = 0x0404,
+            Back = 0x0405,
+            FrontAndBack = 0x0408,
        };

        struct Blend {
@@ -529,6 +525,12 @@ public:
            FractionalEven = 2,
        };

+        enum class PolygonMode : u32 {
+            Point = 0x1b00,
+            Line = 0x1b01,
+            Fill = 0x1b02,
+        };
+
        struct RenderTargetConfig {
            u32 address_high;
            u32 address_low;
@@ -574,7 +576,7 @@ public:
            f32 translate_z;
            INSERT_UNION_PADDING_WORDS(2);

-            Common::Rectangle<s32> GetRect() const {
+            Common::Rectangle<f32> GetRect() const {
                return {
                    GetX(),               // left
                    GetY() + GetHeight(), // top
@@ -583,20 +585,20 @@ public:
                };
            };

-            s32 GetX() const {
-                return static_cast<s32>(std::max(0.0f, translate_x - std::fabs(scale_x)));
+            f32 GetX() const {
+                return std::max(0.0f, translate_x - std::fabs(scale_x));
            }

-            s32 GetY() const {
-                return static_cast<s32>(std::max(0.0f, translate_y - std::fabs(scale_y)));
+            f32 GetY() const {
+                return std::max(0.0f, translate_y - std::fabs(scale_y));
            }

-            s32 GetWidth() const {
-                return static_cast<s32>(translate_x + std::fabs(scale_x)) - GetX();
+            f32 GetWidth() const {
+                return translate_x + std::fabs(scale_x) - GetX();
            }

-            s32 GetHeight() const {
-                return static_cast<s32>(translate_y + std::fabs(scale_y)) - GetY();
+            f32 GetHeight() const {
+                return translate_y + std::fabs(scale_y) - GetY();
            }
        };

@@ -626,6 +628,29 @@ public:
            float depth_range_far;
        };

+        struct TransformFeedbackBinding {
+            u32 buffer_enable;
+            u32 address_high;
+            u32 address_low;
+            s32 buffer_size;
+            s32 buffer_offset;
+            INSERT_UNION_PADDING_WORDS(3);
+
+            GPUVAddr Address() const {
+                const GPUVAddr upper_bits = static_cast<GPUVAddr>(address_high) << 32;
+                return upper_bits | address_low; // address_low supplies the low 32 bits.
+            }
+        };
+        static_assert(sizeof(TransformFeedbackBinding) == 32);
+
+        struct TransformFeedbackLayout {
+            u32 stream;
+            u32 varying_count;
+            u32 stride;
+            INSERT_UNION_PADDING_WORDS(1);
+        };
+        static_assert(sizeof(TransformFeedbackLayout) == 16);
+
        bool IsShaderConfigEnabled(std::size_t index) const {
            // The VertexB is always enabled.
            if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) {
@@ -634,6 +659,10 @@ public:
            return shader_config[index].enable != 0;
        }

+        bool IsShaderConfigEnabled(Regs::ShaderProgram type) const {
+            return IsShaderConfigEnabled(static_cast<std::size_t>(type));
+        }
+
        union {
            struct {
                INSERT_UNION_PADDING_WORDS(0x45);
@@ -682,7 +711,13 @@ public:

                u32 rasterize_enable;

-                INSERT_UNION_PADDING_WORDS(0xF1);
+                std::array<TransformFeedbackBinding, NumTransformFeedbackBuffers> tfb_bindings;
+
+                INSERT_UNION_PADDING_WORDS(0xC0);
+
+                std::array<TransformFeedbackLayout, NumTransformFeedbackBuffers> tfb_layouts;
+
+                INSERT_UNION_PADDING_WORDS(0x1);

                u32 tfb_enabled;

@@ -710,7 +745,12 @@ public:

                s32 clear_stencil;

-                INSERT_UNION_PADDING_WORDS(0x7);
+                INSERT_UNION_PADDING_WORDS(0x2);
+
+                PolygonMode polygon_mode_front;
+                PolygonMode polygon_mode_back;
+
+                INSERT_UNION_PADDING_WORDS(0x3);

                u32 polygon_offset_point_enable;
                u32 polygon_offset_line_enable;
@@ -769,7 +809,11 @@ public:
                    BitField<12, 4, u32> viewport;
                } clear_flags;

-                INSERT_UNION_PADDING_WORDS(0x19);
+                INSERT_UNION_PADDING_WORDS(0x10);
+
+                u32 fill_rectangle;
+
+                INSERT_UNION_PADDING_WORDS(0x8);

                std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;

@@ -872,16 +916,7 @@ public:

                INSERT_UNION_PADDING_WORDS(0x35);

-                union {
-                    BitField<0, 1, u32> c0;
-                    BitField<1, 1, u32> c1;
-                    BitField<2, 1, u32> c2;
-                    BitField<3, 1, u32> c3;
-                    BitField<4, 1, u32> c4;
-                    BitField<5, 1, u32> c5;
-                    BitField<6, 1, u32> c6;
-                    BitField<7, 1, u32> c7;
-                } clip_distance_enabled;
+                u32 clip_distance_enabled;

                u32 samplecnt_enable;

@@ -1060,7 +1095,9 @@ public:

                INSERT_UNION_PADDING_WORDS(1);

-                Cull cull;
+                u32 cull_test_enabled;
+                FrontFace front_face;
+                CullFace cull_face;

                u32 pixel_center_integer;

@@ -1199,7 +1236,11 @@ public:

                u32 tex_cb_index;

-                INSERT_UNION_PADDING_WORDS(0x395);
+                INSERT_UNION_PADDING_WORDS(0x7D);
+
+                std::array<std::array<u8, 128>, NumTransformFeedbackBuffers> tfb_varying_locs;
+
+                INSERT_UNION_PADDING_WORDS(0x298);

                struct {
                    /// Compressed address of a buffer that holds information about bound SSBOs.
@@ -1238,79 +1279,6 @@ public:

    State state{};

-    struct DirtyRegs {
-        static constexpr std::size_t NUM_REGS = 256;
-        static_assert(NUM_REGS - 1 <= std::numeric_limits<u8>::max());
-
-        union {
-            struct {
-                bool null_dirty;
-
-                // Vertex Attributes
-                bool vertex_attrib_format;
-
-                // Vertex Arrays
-                std::array<bool, 32> vertex_array;
-
-                bool vertex_array_buffers;
-
-                // Vertex Instances
-                std::array<bool, 32> vertex_instance;
-
-                bool vertex_instances;
-
-                // Render Targets
-                std::array<bool, 8> render_target;
-                bool depth_buffer;
-
-                bool render_settings;
-
-                // Shaders
-                bool shaders;
-
-                // Rasterizer State
-                bool viewport;
-                bool clip_coefficient;
-                bool cull_mode;
-                bool primitive_restart;
-                bool depth_test;
-                bool stencil_test;
-                bool blend_state;
-                bool scissor_test;
-                bool transform_feedback;
-                bool color_mask;
-                bool polygon_offset;
-                bool depth_bounds_values;
-
-                // Complementary
-                bool viewport_transform;
-                bool screen_y_control;
-
-                bool memory_general;
-            };
-            std::array<bool, NUM_REGS> regs;
-        };
-
-        void ResetVertexArrays() {
-            vertex_array.fill(true);
-            vertex_array_buffers = true;
-        }
-
-        void ResetRenderTargets() {
-            depth_buffer = true;
-            render_target.fill(true);
-            render_settings = true;
-        }
-
-        void OnMemoryWrite() {
-            shaders = true;
-            memory_general = true;
-            ResetRenderTargets();
-            ResetVertexArrays();
-        }
-
-    } dirty{};
-
    /// Reads a register value located at the input method address
    u32 GetRegisterValue(u32 method) const;

@@ -1356,6 +1324,11 @@ public:
        return execute_on;
    }

+    /// Notify a memory write has happened.
+    void OnMemoryWrite() {
+        dirty.flags |= dirty.on_write_stores;
+    }
+
    enum class MMEDrawMode : u32 {
        Undefined,
        Array,
@@ -1371,6 +1344,16 @@ public:
        u32 gl_end_count{};
    } mme_draw;

+    struct DirtyState {
+        using Flags = std::bitset<std::numeric_limits<u8>::max()>; // One bit per dirty flag index.
+        using Table = std::array<u8, Regs::NUM_REGS>; // Register method -> dirty flag index.
+        using Tables = std::array<Table, 2>;
+
+        Flags flags;
+        Flags on_write_stores; // OR-ed into `flags` by OnMemoryWrite().
+        Tables tables{}; // CallMethod() sets flags[table[method]] per table on register change.
+    } dirty;
+
 private:
    void InitializeRegisterDefaults();

@@ -1417,8 +1400,6 @@ private:
    /// Retrieves information about a specific TSC entry from the TSC buffer.
    Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;

-    void InitDirtySettings();
-
    /**
     * Call a macro on this engine.
     * @param method Method to call
@@ -1485,6 +1466,8 @@ ASSERT_REG_POSITION(tess_mode, 0xC8);
 ASSERT_REG_POSITION(tess_level_outer, 0xC9);
 ASSERT_REG_POSITION(tess_level_inner, 0xCD);
 ASSERT_REG_POSITION(rasterize_enable, 0xDF);
+ASSERT_REG_POSITION(tfb_bindings, 0xE0);
+ASSERT_REG_POSITION(tfb_layouts, 0x1C0);
 ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
 ASSERT_REG_POSITION(rt, 0x200);
 ASSERT_REG_POSITION(viewport_transform, 0x280);
@@ -1494,6 +1477,8 @@ ASSERT_REG_POSITION(depth_mode, 0x35F);
 ASSERT_REG_POSITION(clear_color[0], 0x360);
 ASSERT_REG_POSITION(clear_depth, 0x364);
 ASSERT_REG_POSITION(clear_stencil, 0x368);
+ASSERT_REG_POSITION(polygon_mode_front, 0x36B);
+ASSERT_REG_POSITION(polygon_mode_back, 0x36C);
 ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370);
 ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371);
 ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
@@ -1507,6 +1492,7 @@ ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
 ASSERT_REG_POSITION(depth_bounds, 0x3E7);
 ASSERT_REG_POSITION(zeta, 0x3F8);
 ASSERT_REG_POSITION(clear_flags, 0x43E);
+ASSERT_REG_POSITION(fill_rectangle, 0x44F);
 ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
@@ -1561,7 +1547,9 @@ ASSERT_REG_POSITION(index_array, 0x5F2);
 ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F);
 ASSERT_REG_POSITION(instanced_arrays, 0x620);
 ASSERT_REG_POSITION(vp_point_size, 0x644);
-ASSERT_REG_POSITION(cull, 0x646);
+ASSERT_REG_POSITION(cull_test_enabled, 0x646);
+ASSERT_REG_POSITION(front_face, 0x647);
+ASSERT_REG_POSITION(cull_face, 0x648);
 ASSERT_REG_POSITION(pixel_center_integer, 0x649);
 ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B);
 ASSERT_REG_POSITION(view_volume_clip_control, 0x64F);
@@ -1578,6 +1566,7 @@ ASSERT_REG_POSITION(firmware, 0x8C0);
 ASSERT_REG_POSITION(const_buffer, 0x8E0);
 ASSERT_REG_POSITION(cb_bind[0], 0x904);
 ASSERT_REG_POSITION(tex_cb_index, 0x982);
+ASSERT_REG_POSITION(tfb_varying_locs, 0xA00);
 ASSERT_REG_POSITION(ssbo_info, 0xD18);
 ASSERT_REG_POSITION(tex_info_buffers.address[0], 0xD2A);
 ASSERT_REG_POSITION(tex_info_buffers.size[0], 0xD2F);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -57,7 +57,7 @@ void MaxwellDMA::HandleCopy() {
    }

    // All copies here update the main memory, so mark all rasterizer states as invalid.
-    system.GPU().Maxwell3D().dirty.OnMemoryWrite();
+    system.GPU().Maxwell3D().OnMemoryWrite();

    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -911,14 +911,9 @@ union Instruction {
    } fadd32i;

    union {
-        BitField<20, 8, u64> shift_position;
-        BitField<28, 8, u64> shift_length;
-        BitField<48, 1, u64> negate_b;
-        BitField<49, 1, u64> negate_a;
-
-        u64 GetLeftShiftValue() const {
-            return 32 - (shift_position + shift_length);
-        }
+        BitField<40, 1, u64> brev;
+        BitField<47, 1, u64> rd_cc;
+        BitField<48, 1, u64> is_signed;
    } bfe;

    union {
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 {
    RGBA32_FLOAT = 0xC0,
    RGBA32_UINT = 0xC2,
    RGBA16_UNORM = 0xC6,
+    RGBA16_SNORM = 0xC7,
    RGBA16_UINT = 0xC9,
    RGBA16_FLOAT = 0xCA,
    RG32_FLOAT = 0xCB,
--- a/src/video_core/guest_driver.cpp
+++ b/src/video_core/guest_driver.cpp
@@ -4,13 +4,15 @@

 #include <algorithm>
 #include <limits>
+#include <vector>

+#include "common/common_types.h"
 #include "video_core/guest_driver.h"

 namespace VideoCore {

-void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) {
-    if (texture_handler_size_deduced) {
+void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) {
+    if (texture_handler_size) {
        return;
    }
    const std::size_t size = bound_offsets.size();
@@ -29,7 +31,6 @@ void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offse
    if (min_val > 2) {
        return;
    }
-    texture_handler_size_deduced = true;
    texture_handler_size = min_texture_handler_size * min_val;
 }

--- a/src/video_core/guest_driver.h
+++ b/src/video_core/guest_driver.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <optional>
 #include <vector>

 #include "common/common_types.h"
@@ -17,25 +18,29 @@ namespace VideoCore {
 */
 class GuestDriverProfile {
 public:
-    void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets);
+    explicit GuestDriverProfile() = default;
+    explicit GuestDriverProfile(std::optional<u32> texture_handler_size)
+        : texture_handler_size{texture_handler_size} {}
+
+    void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);

    u32 GetTextureHandlerSize() const {
-        return texture_handler_size;
+        return texture_handler_size.value_or(default_texture_handler_size);
    }

-    bool TextureHandlerSizeKnown() const {
-        return texture_handler_size_deduced;
+    bool IsTextureHandlerSizeKnown() const {
+        return texture_handler_size.has_value();
    }

 private:
    // Minimum size of texture handler any driver can use.
    static constexpr u32 min_texture_handler_size = 4;
-    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily
-    // use 4 bytes instead. Thus, certain drivers may squish the size.
+
+    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead.
+    // Thus, certain drivers may squish the size.
    static constexpr u32 default_texture_handler_size = 8;

-    u32 texture_handler_size = default_texture_handler_size;
-    bool texture_handler_size_deduced = false;
+    std::optional<u32> texture_handler_size; // Empty until deduced or supplied by the caller.
 };

 } // namespace VideoCore
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -174,7 +174,7 @@ private:
    /// End of address space, based on address space in bits.
    static constexpr GPUVAddr address_space_end{1ULL << address_space_width};

-    Common::PageTable page_table{page_bits};
+    Common::BackingPageTable page_table{page_bits};
    VMAMap vma_map;
    VideoCore::RasterizerInterface& rasterizer;

--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
    MortonCopy<true, PixelFormat::R8UI>,
    MortonCopy<true, PixelFormat::RGBA16F>,
    MortonCopy<true, PixelFormat::RGBA16U>,
+    MortonCopy<true, PixelFormat::RGBA16S>,
    MortonCopy<true, PixelFormat::RGBA16UI>,
    MortonCopy<true, PixelFormat::R11FG11FB10F>,
    MortonCopy<true, PixelFormat::RGBA32UI>,
@@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
    MortonCopy<false, PixelFormat::R8U>,
    MortonCopy<false, PixelFormat::R8UI>,
    MortonCopy<false, PixelFormat::RGBA16F>,
+    MortonCopy<false, PixelFormat::RGBA16S>,
    MortonCopy<false, PixelFormat::RGBA16U>,
    MortonCopy<false, PixelFormat::RGBA16UI>,
    MortonCopy<false, PixelFormat::R11FG11FB10F>,
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -25,7 +25,6 @@ constexpr std::size_t NumQueryTypes = 1;

 enum class LoadCallbackStage {
    Prepare,
-    Decompile,
    Build,
    Complete,
 };
@@ -89,6 +88,9 @@ public:
    virtual void LoadDiskResources(const std::atomic_bool& stop_loading = false,
                                   const DiskResourceLoadCallback& callback = {}) {}

+    /// Initializes renderer dirty flags
+    virtual void SetupDirtyFlags() {}
+
    /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver.
    GuestDriverProfile& AccessGuestDriverProfile() {
        return guest_driver_profile;
--- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp
@@ -11,7 +11,6 @@
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_framebuffer_cache.h"
-#include "video_core/renderer_opengl/gl_state.h"

 namespace OpenGL {

@@ -36,8 +35,7 @@ OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheK
    framebuffer.Create();

    // TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs.
-    local_state.draw.draw_framebuffer = framebuffer.handle;
-    local_state.ApplyFramebufferState();
+    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle);

    if (key.zeta) {
        const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil;
--- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_framebuffer_cache.h
@@ -13,7 +13,6 @@
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
-#include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/gl_texture_cache.h"

 namespace OpenGL {
@@ -63,7 +62,6 @@ public:
 private:
    OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key);

-    OpenGLState local_state;
    std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache;
 };

--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -30,7 +30,7 @@
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
-#include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/gl_texture_cache.h"
 #include "video_core/renderer_opengl/utils.h"
 #include "video_core/textures/texture.h"
@@ -55,7 +55,8 @@ struct DrawParameters;
 class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
    explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                              ScreenInfo& info);
+                              ScreenInfo& info, GLShader::ProgramManager& program_manager,
+                              StateTracker& state_tracker);
    ~RasterizerOpenGL() override;

    void Draw(bool is_indexed, bool is_instanced) override;
@@ -76,6 +77,7 @@ public:
                           u32 pixel_stride) override;
    void LoadDiskResources(const std::atomic_bool& stop_loading,
                           const VideoCore::DiskResourceLoadCallback& callback) override;
+    void SetupDirtyFlags() override;

    /// Returns true when there are commands queued to the OpenGL server.
    bool AnyCommandQueued() const {
@@ -86,8 +88,7 @@ private:
    /// Configures the color and depth framebuffer states.
    void ConfigureFramebuffers();

-    void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
-                                   bool using_depth_fb, bool using_stencil_fb);
+    void ConfigureClearFramebuffer(bool using_color_fb, bool using_depth_fb, bool using_stencil_fb);

    /// Configures the current constbuffers to use for the draw command.
    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
@@ -97,7 +98,7 @@ private:

    /// Configures a constant buffer.
    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const GLShader::ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry);

    /// Configures the current global memory entries to use for the draw command.
    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -106,7 +107,7 @@ private:
    void SetupComputeGlobalMemory(const Shader& kernel);

    /// Configures a constant buffer.
-    void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
+    void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                           std::size_t size);

    /// Configures the current textures to use for the draw command.
@@ -117,7 +118,7 @@ private:

    /// Configures a texture.
    void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
-                      const GLShader::SamplerEntry& entry);
+                      const SamplerEntry& entry);

    /// Configures images in a graphics shader.
    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
@@ -126,15 +127,16 @@ private:
    void SetupComputeImages(const Shader& shader);

    /// Configures an image.
-    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
-                    const GLShader::ImageEntry& entry);
+    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);

    /// Syncs the viewport and depth range to match the guest state
-    void SyncViewport(OpenGLState& current_state);
+    void SyncViewport();
+
+    /// Syncs the depth clamp state
+    void SyncDepthClamp();

    /// Syncs the clip enabled status to match the guest state
-    void SyncClipEnabled(
-        const std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances>& clip_mask);
+    void SyncClipEnabled(u32 clip_mask);

    /// Syncs the clip coefficients to match the guest state
    void SyncClipCoef();
@@ -164,16 +166,16 @@ private:
    void SyncMultiSampleState();

    /// Syncs the scissor test state to match the guest state
-    void SyncScissorTest(OpenGLState& current_state);
-
-    /// Syncs the transform feedback state to match the guest state
-    void SyncTransformFeedback();
+    void SyncScissorTest();

    /// Syncs the point state to match the guest state
    void SyncPointState();

    /// Syncs the rasterizer enable state to match the guest state
-    void SyncRasterizeEnable(OpenGLState& current_state);
+    void SyncRasterizeEnable();
+
+    /// Syncs polygon modes to match the guest state
+    void SyncPolygonModes();

    /// Syncs Color Mask
    void SyncColorMask();
@@ -184,6 +186,15 @@ private:
    /// Syncs the alpha test state to match the guest state
    void SyncAlphaTest();

+    /// Syncs the framebuffer sRGB state to match the guest state
+    void SyncFramebufferSRGB();
+
+    /// Begin a transform feedback
+    void BeginTransformFeedback(GLenum primitive_mode);
+
+    /// End a transform feedback
+    void EndTransformFeedback();
+
    /// Check for extension that are not strictly required but are needed for correct emulation
    void CheckExtensions();

@@ -191,18 +202,17 @@ private:

    std::size_t CalculateIndexBufferSize() const;

-    /// Updates and returns a vertex array object representing current vertex format
-    GLuint SetupVertexFormat();
+    /// Updates the current vertex format
+    void SetupVertexFormat();

-    void SetupVertexBuffer(GLuint vao);
-    void SetupVertexInstances(GLuint vao);
+    void SetupVertexBuffer();
+    void SetupVertexInstances();

    GLintptr SetupIndexBuffer();

    void SetupShaders(GLenum primitive_mode);

    const Device device;
-    OpenGLState state;

    TextureCacheOpenGL texture_cache;
    ShaderCacheOpenGL shader_cache;
@@ -212,22 +222,25 @@ private:

    Core::System& system;
    ScreenInfo& screen_info;
-
-    std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
-    std::map<std::array<Tegra::Engines::Maxwell3D::Regs::VertexAttribute,
-                        Tegra::Engines::Maxwell3D::Regs::NumVertexAttributes>,
-             OGLVertexArray>
-        vertex_array_cache;
+    GLShader::ProgramManager& program_manager;
+    StateTracker& state_tracker;

    static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
    OGLBufferCache buffer_cache;

-    VertexArrayPushBuffer vertex_array_pushbuffer;
+    VertexArrayPushBuffer vertex_array_pushbuffer{state_tracker};
    BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
    BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};

+    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        transform_feedback_buffers;
+    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        enabled_transform_feedback_buffers;
+
    /// Number of commands queued to the OpenGL driver. Reseted on flush.
    std::size_t num_queued_commands = 0;
+
+    u32 last_clip_distance_mask = 0;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -8,7 +8,6 @@
 #include "common/microprofile.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_util.h"
-#include "video_core/renderer_opengl/gl_state.h"

 MICROPROFILE_DEFINE(OpenGL_ResourceCreation, "OpenGL", "Resource Creation", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_ResourceDeletion, "OpenGL", "Resource Deletion", MP_RGB(128, 128, 192));
@@ -20,7 +19,7 @@ void OGLRenderbuffer::Create() {
        return;

    MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
-    glGenRenderbuffers(1, &handle);
+    glCreateRenderbuffers(1, &handle);
 }

 void OGLRenderbuffer::Release() {
@@ -29,7 +28,6 @@ void OGLRenderbuffer::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteRenderbuffers(1, &handle);
-    OpenGLState::GetCurState().ResetRenderbuffer(handle).Apply();
    handle = 0;
 }

@@ -47,7 +45,6 @@ void OGLTexture::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteTextures(1, &handle);
-    OpenGLState::GetCurState().UnbindTexture(handle).Apply();
    handle = 0;
 }

@@ -65,7 +62,6 @@ void OGLTextureView::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteTextures(1, &handle);
-    OpenGLState::GetCurState().UnbindTexture(handle).Apply();
    handle = 0;
 }

@@ -83,7 +79,6 @@ void OGLSampler::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteSamplers(1, &handle);
-    OpenGLState::GetCurState().ResetSampler(handle).Apply();
    handle = 0;
 }

@@ -127,7 +122,6 @@ void OGLProgram::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteProgram(handle);
-    OpenGLState::GetCurState().ResetProgram(handle).Apply();
    handle = 0;
 }

@@ -145,7 +139,6 @@ void OGLPipeline::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteProgramPipelines(1, &handle);
-    OpenGLState::GetCurState().ResetPipeline(handle).Apply();
    handle = 0;
 }

@@ -189,24 +182,6 @@ void OGLSync::Release() {
    handle = 0;
 }

-void OGLVertexArray::Create() {
-    if (handle != 0)
-        return;
-
-    MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
-    glCreateVertexArrays(1, &handle);
-}
-
-void OGLVertexArray::Release() {
-    if (handle == 0)
-        return;
-
-    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
-    glDeleteVertexArrays(1, &handle);
-    OpenGLState::GetCurState().ResetVertexArray(handle).Apply();
-    handle = 0;
-}
-
 void OGLFramebuffer::Create() {
    if (handle != 0)
        return;
@@ -221,7 +196,6 @@ void OGLFramebuffer::Release() {

    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
    glDeleteFramebuffers(1, &handle);
-    OpenGLState::GetCurState().ResetFramebuffer(handle).Apply();
    handle = 0;
 }

--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -241,31 +241,6 @@ public:
    GLsync handle = 0;
 };

-class OGLVertexArray : private NonCopyable {
-public:
-    OGLVertexArray() = default;
-
-    OGLVertexArray(OGLVertexArray&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
-
-    ~OGLVertexArray() {
-        Release();
-    }
-
-    OGLVertexArray& operator=(OGLVertexArray&& o) noexcept {
-        Release();
-        handle = std::exchange(o.handle, 0);
-        return *this;
-    }
-
-    /// Creates a new internal OpenGL resource and stores the handle
-    void Create();
-
-    /// Deletes the internal OpenGL resource
-    void Release();
-
-    GLuint handle = 0;
-};
-
 class OGLFramebuffer : private NonCopyable {
 public:
    OGLFramebuffer() = default;
--- a/src/video_core/renderer_opengl/gl_sampler_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_sampler_cache.cpp
@@ -38,7 +38,7 @@ OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc
        glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy());
    } else if (GLAD_GL_EXT_texture_filter_anisotropic) {
        glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy());
-    } else if (tsc.GetMaxAnisotropy() != 1) {
+    } else {
        LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver");
    }

--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -2,12 +2,16 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <atomic>
+#include <functional>
 #include <mutex>
 #include <optional>
 #include <string>
 #include <thread>
 #include <unordered_set>
+
 #include <boost/functional/hash.hpp>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
@@ -22,14 +26,16 @@
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
+#include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/utils.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace OpenGL {

 using Tegra::Engines::ShaderType;
-using VideoCommon::Shader::ConstBufferLocker;
 using VideoCommon::Shader::ProgramCode;
+using VideoCommon::Shader::Registry;
 using VideoCommon::Shader::ShaderIR;

 namespace {
@@ -55,7 +61,7 @@ constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
 }

 /// Calculates the size of a program stream
-std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
+std::size_t CalculateProgramSize(const ProgramCode& program) {
    constexpr std::size_t start_offset = 10;
    // This is the encoded version of BRA that jumps to itself. All Nvidia
    // shaders end with one.
@@ -108,32 +114,9 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) {
    }
 }

-/// Describes primitive behavior on geometry shaders
-constexpr std::pair<const char*, u32> GetPrimitiveDescription(GLenum primitive_mode) {
-    switch (primitive_mode) {
-    case GL_POINTS:
-        return {"points", 1};
-    case GL_LINES:
-    case GL_LINE_STRIP:
-        return {"lines", 2};
-    case GL_LINES_ADJACENCY:
-    case GL_LINE_STRIP_ADJACENCY:
-        return {"lines_adjacency", 4};
-    case GL_TRIANGLES:
-    case GL_TRIANGLE_STRIP:
-    case GL_TRIANGLE_FAN:
-        return {"triangles", 3};
-    case GL_TRIANGLES_ADJACENCY:
-    case GL_TRIANGLE_STRIP_ADJACENCY:
-        return {"triangles_adjacency", 6};
-    default:
-        return {"points", 1};
-    }
-}
-
 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code,
-                        const ProgramCode& code_b) {
+                        const ProgramCode& code_b = {}) {
    u64 unique_identifier = boost::hash_value(code);
    if (is_a) {
        // VertexA programs include two programs
@@ -142,24 +125,6 @@ u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& co
    return unique_identifier;
 }

-/// Creates an unspecialized program from code streams
-std::string GenerateGLSL(const Device& device, ShaderType shader_type, const ShaderIR& ir,
-                         const std::optional<ShaderIR>& ir_b) {
-    switch (shader_type) {
-    case ShaderType::Vertex:
-        return GLShader::GenerateVertexShader(device, ir, ir_b ? &*ir_b : nullptr);
-    case ShaderType::Geometry:
-        return GLShader::GenerateGeometryShader(device, ir);
-    case ShaderType::Fragment:
-        return GLShader::GenerateFragmentShader(device, ir);
-    case ShaderType::Compute:
-        return GLShader::GenerateComputeShader(device, ir);
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented shader_type={}", static_cast<u32>(shader_type));
-        return {};
-    }
-}
-
 constexpr const char* GetShaderTypeName(ShaderType shader_type) {
    switch (shader_type) {
    case ShaderType::Vertex:
@@ -195,102 +160,38 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
    return {};
 }

-std::string GetShaderId(u64 unique_identifier, ShaderType shader_type) {
+std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
    return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }

-Tegra::Engines::ConstBufferEngineInterface& GetConstBufferEngineInterface(Core::System& system,
-                                                                          ShaderType shader_type) {
-    if (shader_type == ShaderType::Compute) {
-        return system.GPU().KeplerCompute();
-    } else {
-        return system.GPU().Maxwell3D();
+std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
+    const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size};
+    const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer,
+                                                           entry.graphics_info, entry.compute_info};
+    const auto registry = std::make_shared<Registry>(entry.type, info);
+    for (const auto& [address, value] : entry.keys) {
+        const auto [buffer, offset] = address;
+        registry->InsertKey(buffer, offset, value);
    }
-}
-
-std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType shader_type) {
-    return std::make_unique<ConstBufferLocker>(shader_type,
-                                               GetConstBufferEngineInterface(system, shader_type));
-}
-
-void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) {
-    locker.SetBoundBuffer(usage.bound_buffer);
-    for (const auto& key : usage.keys) {
-        const auto [buffer, offset] = key.first;
-        locker.InsertKey(buffer, offset, key.second);
+    for (const auto& [offset, sampler] : entry.bound_samplers) {
+        registry->InsertBoundSampler(offset, sampler);
    }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        locker.InsertBoundSampler(offset, sampler);
-    }
-    for (const auto& [key, sampler] : usage.bindless_samplers) {
+    for (const auto& [key, sampler] : entry.bindless_samplers) {
        const auto [buffer, offset] = key;
-        locker.InsertBindlessSampler(buffer, offset, sampler);
+        registry->InsertBindlessSampler(buffer, offset, sampler);
    }
+    return registry;
 }

-CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderType shader_type,
-                          const ProgramCode& code, const ProgramCode& code_b,
-                          ConstBufferLocker& locker, const ProgramVariant& variant,
-                          bool hint_retrievable = false) {
-    LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, shader_type));
-
-    const bool is_compute = shader_type == ShaderType::Compute;
-    const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
-    const ShaderIR ir(code, main_offset, COMPILER_SETTINGS, locker);
-    std::optional<ShaderIR> ir_b;
-    if (!code_b.empty()) {
-        ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker);
-    }
-
-    std::string source = fmt::format(R"(// {}
-#version 430 core
-#extension GL_ARB_separate_shader_objects : enable
-)",
-                                     GetShaderId(unique_identifier, shader_type));
-    if (device.HasShaderBallot()) {
-        source += "#extension GL_ARB_shader_ballot : require\n";
-    }
-    if (device.HasVertexViewportLayer()) {
-        source += "#extension GL_ARB_shader_viewport_layer_array : require\n";
-    }
-    if (device.HasImageLoadFormatted()) {
-        source += "#extension GL_EXT_shader_image_load_formatted : require\n";
-    }
-    if (device.HasWarpIntrinsics()) {
-        source += "#extension GL_NV_gpu_shader5 : require\n"
-                  "#extension GL_NV_shader_thread_group : require\n"
-                  "#extension GL_NV_shader_thread_shuffle : require\n";
-    }
-    // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 operations)
-    // on places where we don't want to.
-    // Thanks to Ryujinx for finding this workaround.
-    source += "#pragma optionNV(fastmath off)\n";
-
-    if (shader_type == ShaderType::Geometry) {
-        const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(variant.primitive_mode);
-        source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices);
-        source += fmt::format("layout ({}) in;\n", glsl_topology);
-    }
-    if (shader_type == ShaderType::Compute) {
-        if (variant.local_memory_size > 0) {
-            source += fmt::format("#define LOCAL_MEMORY_SIZE {}\n",
-                                  Common::AlignUp(variant.local_memory_size, 4) / 4);
-        }
-        source +=
-            fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n",
-                        variant.block_x, variant.block_y, variant.block_z);
-
-        if (variant.shared_memory_size > 0) {
-            // shared_memory_size is described in number of words
-            source += fmt::format("shared uint smem[{}];\n", variant.shared_memory_size);
-        }
-    }
-
-    source += '\n';
-    source += GenerateGLSL(device, shader_type, ir, ir_b);
+std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
+                                        u64 unique_identifier, const ShaderIR& ir,
+                                        const Registry& registry, bool hint_retrievable = false) {
+    const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
+    LOG_INFO(Render_OpenGL, "{}", shader_id);

+    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
    OGLShader shader;
-    shader.Create(source.c_str(), GetGLShaderType(shader_type));
+    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));

    auto program = std::make_shared<OGLProgram>();
    program->Create(true, hint_retrievable, shader.handle);
@@ -298,7 +199,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp
 }

 std::unordered_set<GLenum> GetSupportedFormats() {
-    GLint num_formats{};
+    GLint num_formats;
    glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats);

    std::vector<GLint> formats(num_formats);
@@ -313,115 +214,82 @@ std::unordered_set<GLenum> GetSupportedFormats() {

 } // Anonymous namespace

-CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type,
-                           GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b)
-    : RasterizerCacheObject{params.host_ptr}, system{params.system},
-      disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr},
-      unique_identifier{params.unique_identifier}, shader_type{shader_type},
-      entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} {
-    if (!params.precompiled_variants) {
-        return;
-    }
-    for (const auto& pair : *params.precompiled_variants) {
-        auto locker = MakeLocker(system, shader_type);
-        const auto& usage = pair->first;
-        FillLocker(*locker, usage);
+CachedShader::CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes,
+                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
+                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
+    : RasterizerCacheObject{host_ptr}, registry{std::move(registry)}, entries{std::move(entries)},
+      cpu_addr{cpu_addr}, size_in_bytes{size_in_bytes}, program{std::move(program)} {}

-        std::unique_ptr<LockerVariant>* locker_variant = nullptr;
-        const auto it =
-            std::find_if(locker_variants.begin(), locker_variants.end(), [&](const auto& variant) {
-                return variant->locker->HasEqualKeys(*locker);
-            });
-        if (it == locker_variants.end()) {
-            locker_variant = &locker_variants.emplace_back();
-            *locker_variant = std::make_unique<LockerVariant>();
-            locker_variant->get()->locker = std::move(locker);
-        } else {
-            locker_variant = &*it;
-        }
-        locker_variant->get()->programs.emplace(usage.variant, pair->second);
-    }
+CachedShader::~CachedShader() = default;
+
+GLuint CachedShader::GetHandle() const {
+    DEBUG_ASSERT(registry->IsConsistent());
+    return program->handle;
 }

 Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
                                           Maxwell::ShaderProgram program_type, ProgramCode code,
                                           ProgramCode code_b) {
    const auto shader_type = GetShaderType(program_type);
-    params.disk_cache.SaveRaw(
-        ShaderDiskCacheRaw(params.unique_identifier, shader_type, code, code_b));
+    const std::size_t size_in_bytes = code.size() * sizeof(u64);

-    ConstBufferLocker locker(shader_type, params.system.GPU().Maxwell3D());
-    const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, locker);
+    auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D());
+    const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
    // TODO(Rodrigo): Handle VertexA shaders
    // std::optional<ShaderIR> ir_b;
    // if (!code_b.empty()) {
    //     ir_b.emplace(code_b, STAGE_MAIN_OFFSET);
    // }
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params, shader_type, GLShader::GetEntries(ir), std::move(code), std::move(code_b)));
+    auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry);
+
+    ShaderDiskCacheEntry entry;
+    entry.type = shader_type;
+    entry.code = std::move(code);
+    entry.code_b = std::move(code_b);
+    entry.unique_identifier = params.unique_identifier;
+    entry.bound_buffer = registry->GetBoundBuffer();
+    entry.graphics_info = registry->GetGraphicsInfo();
+    entry.keys = registry->GetKeys();
+    entry.bound_samplers = registry->GetBoundSamplers();
+    entry.bindless_samplers = registry->GetBindlessSamplers();
+    params.disk_cache.SaveEntry(std::move(entry));
+
+    return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr,
+                                                          size_in_bytes, std::move(registry),
+                                                          MakeEntries(ir), std::move(program)));
 }

 Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
-    params.disk_cache.SaveRaw(
-        ShaderDiskCacheRaw(params.unique_identifier, ShaderType::Compute, code));
+    const std::size_t size_in_bytes = code.size() * sizeof(u64);

-    ConstBufferLocker locker(Tegra::Engines::ShaderType::Compute,
-                             params.system.GPU().KeplerCompute());
-    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, locker);
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params, ShaderType::Compute, GLShader::GetEntries(ir), std::move(code), {}));
+    auto& engine = params.system.GPU().KeplerCompute();
+    auto registry = std::make_shared<Registry>(ShaderType::Compute, engine);
+    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
+    const u64 uid = params.unique_identifier;
+    auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry);
+
+    ShaderDiskCacheEntry entry;
+    entry.type = ShaderType::Compute;
+    entry.code = std::move(code);
+    entry.unique_identifier = uid;
+    entry.bound_buffer = registry->GetBoundBuffer();
+    entry.compute_info = registry->GetComputeInfo();
+    entry.keys = registry->GetKeys();
+    entry.bound_samplers = registry->GetBoundSamplers();
+    entry.bindless_samplers = registry->GetBindlessSamplers();
+    params.disk_cache.SaveEntry(std::move(entry));
+
+    return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr,
+                                                          size_in_bytes, std::move(registry),
+                                                          MakeEntries(ir), std::move(program)));
 }

 Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const UnspecializedShader& unspecialized) {
-    return std::shared_ptr<CachedShader>(new CachedShader(params, unspecialized.type,
-                                                          unspecialized.entries, unspecialized.code,
-                                                          unspecialized.code_b));
-}
-
-GLuint CachedShader::GetHandle(const ProgramVariant& variant) {
-    EnsureValidLockerVariant();
-
-    const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant);
-    auto& program = entry->second;
-    if (!is_cache_miss) {
-        return program->handle;
-    }
-
-    program = BuildShader(device, unique_identifier, shader_type, code, code_b,
-                          *curr_locker_variant->locker, variant);
-    disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker));
-
-    LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
-    return program->handle;
-}
-
-bool CachedShader::EnsureValidLockerVariant() {
-    const auto previous_variant = curr_locker_variant;
-    if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) {
-        curr_locker_variant = nullptr;
-    }
-    if (!curr_locker_variant) {
-        for (auto& variant : locker_variants) {
-            if (variant->locker->IsConsistent()) {
-                curr_locker_variant = variant.get();
-            }
-        }
-    }
-    if (!curr_locker_variant) {
-        auto& new_variant = locker_variants.emplace_back();
-        new_variant = std::make_unique<LockerVariant>();
-        new_variant->locker = MakeLocker(system, shader_type);
-        curr_locker_variant = new_variant.get();
-    }
-    return previous_variant == curr_locker_variant;
-}
-
-ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
-                                            const ConstBufferLocker& locker) const {
-    return ShaderDiskCacheUsage{unique_identifier,         variant,
-                                locker.GetBoundBuffer(),   locker.GetKeys(),
-                                locker.GetBoundSamplers(), locker.GetBindlessSamplers()};
+                                     const PrecompiledShader& precompiled_shader,
+                                     std::size_t size_in_bytes) {
+    return std::shared_ptr<CachedShader>(new CachedShader(
+        params.host_ptr, params.cpu_addr, size_in_bytes, precompiled_shader.registry,
+        precompiled_shader.entries, precompiled_shader.program));
 }

 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
@@ -431,16 +299,12 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System&

 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                      const VideoCore::DiskResourceLoadCallback& callback) {
-    const auto transferable = disk_cache.LoadTransferable();
+    const std::optional transferable = disk_cache.LoadTransferable();
    if (!transferable) {
        return;
    }
-    const auto [raws, shader_usages] = *transferable;
-    if (!GenerateUnspecializedShaders(stop_loading, callback, raws) || stop_loading) {
-        return;
-    }

-    const auto dumps = disk_cache.LoadPrecompiled();
+    const std::vector gl_cache = disk_cache.LoadPrecompiled();
    const auto supported_formats = GetSupportedFormats();

    // Track if precompiled cache was altered during loading to know if we have to
@@ -449,77 +313,82 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,

    // Inform the frontend about shader build initialization
    if (callback) {
-        callback(VideoCore::LoadCallbackStage::Build, 0, shader_usages.size());
+        callback(VideoCore::LoadCallbackStage::Build, 0, transferable->size());
    }

    std::mutex mutex;
    std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex
-    std::atomic_bool compilation_failed = false;
+    std::atomic_bool gl_cache_failed = false;

-    const auto Worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
-                            std::size_t end, const std::vector<ShaderDiskCacheUsage>& shader_usages,
-                            const ShaderDumpsMap& dumps) {
+    const auto find_precompiled = [&gl_cache](u64 id) {
+        return std::find_if(gl_cache.begin(), gl_cache.end(),
+                            [id](const auto& entry) { return entry.unique_identifier == id; });
+    };
+
+    const auto worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
+                            std::size_t end) {
        context->MakeCurrent();
        SCOPE_EXIT({ return context->DoneCurrent(); });

        for (std::size_t i = begin; i < end; ++i) {
-            if (stop_loading || compilation_failed) {
+            if (stop_loading) {
                return;
            }
-            const auto& usage{shader_usages[i]};
-            const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)};
-            const auto dump{dumps.find(usage)};
+            const auto& entry = (*transferable)[i];
+            const u64 uid = entry.unique_identifier;
+            const auto it = find_precompiled(uid);
+            const auto precompiled_entry = it != gl_cache.end() ? &*it : nullptr;

-            CachedProgram shader;
-            if (dump != dumps.end()) {
-                // If the shader is dumped, attempt to load it with
-                shader = GeneratePrecompiledProgram(dump->second, supported_formats);
-                if (!shader) {
-                    compilation_failed = true;
-                    return;
+            const bool is_compute = entry.type == ShaderType::Compute;
+            const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
+            auto registry = MakeRegistry(entry);
+            const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
+
+            std::shared_ptr<OGLProgram> program;
+            if (precompiled_entry) {
+                // If the shader is precompiled, attempt to load its program binary first
+                program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
+                if (!program) {
+                    gl_cache_failed = true;
                }
            }
-            if (!shader) {
-                auto locker{MakeLocker(system, unspecialized.type)};
-                FillLocker(*locker, usage);
-
-                shader = BuildShader(device, usage.unique_identifier, unspecialized.type,
-                                     unspecialized.code, unspecialized.code_b, *locker,
-                                     usage.variant, true);
+            if (!program) {
+                // Otherwise compile it from GLSL
+                program = BuildShader(device, entry.type, uid, ir, *registry, true);
            }

+            PrecompiledShader shader;
+            shader.program = std::move(program);
+            shader.registry = std::move(registry);
+            shader.entries = MakeEntries(ir);
+
            std::scoped_lock lock{mutex};
            if (callback) {
                callback(VideoCore::LoadCallbackStage::Build, ++built_shaders,
-                         shader_usages.size());
+                         transferable->size());
            }
-
-            precompiled_programs.emplace(usage, std::move(shader));
-
-            // TODO(Rodrigo): Is there a better way to do this?
-            precompiled_variants[usage.unique_identifier].push_back(
-                precompiled_programs.find(usage));
+            runtime_cache.emplace(entry.unique_identifier, std::move(shader));
        }
    };

    const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)};
-    const std::size_t bucket_size{shader_usages.size() / num_workers};
+    const std::size_t bucket_size{transferable->size() / num_workers};
    std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers);
    std::vector<std::thread> threads(num_workers);
    for (std::size_t i = 0; i < num_workers; ++i) {
        const bool is_last_worker = i + 1 == num_workers;
        const std::size_t start{bucket_size * i};
-        const std::size_t end{is_last_worker ? shader_usages.size() : start + bucket_size};
+        const std::size_t end{is_last_worker ? transferable->size() : start + bucket_size};

        // On some platforms the shared context has to be created from the GUI thread
        contexts[i] = emu_window.CreateSharedContext();
-        threads[i] = std::thread(Worker, contexts[i].get(), start, end, shader_usages, dumps);
+        threads[i] = std::thread(worker, contexts[i].get(), start, end);
    }
    for (auto& thread : threads) {
        thread.join();
    }

-    if (compilation_failed) {
+    if (gl_cache_failed) {
        // Invalidate the precompiled cache if a shader dumped shader was rejected
        disk_cache.InvalidatePrecompiled();
        precompiled_cache_altered = true;
@@ -532,11 +401,12 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
    // before precompiling them

-    for (std::size_t i = 0; i < shader_usages.size(); ++i) {
-        const auto& usage{shader_usages[i]};
-        if (dumps.find(usage) == dumps.end()) {
-            const auto& program{precompiled_programs.at(usage)};
-            disk_cache.SaveDump(usage, program->handle);
+    for (std::size_t i = 0; i < transferable->size(); ++i) {
+        const u64 id = (*transferable)[i].unique_identifier;
+        const auto it = find_precompiled(id);
+        if (it == gl_cache.end()) {
+            const GLuint program = runtime_cache.at(id).program->handle;
+            disk_cache.SavePrecompiled(id, program);
            precompiled_cache_altered = true;
        }
    }
@@ -546,84 +416,33 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    }
 }

-const PrecompiledVariants* ShaderCacheOpenGL::GetPrecompiledVariants(u64 unique_identifier) const {
-    const auto it = precompiled_variants.find(unique_identifier);
-    return it == precompiled_variants.end() ? nullptr : &it->second;
-}
-
-CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
-    const ShaderDiskCacheDump& dump, const std::unordered_set<GLenum>& supported_formats) {
-    if (supported_formats.find(dump.binary_format) == supported_formats.end()) {
-        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format - removing");
+std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+    const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
+    const std::unordered_set<GLenum>& supported_formats) {
+    if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
+        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing");
        return {};
    }

-    CachedProgram shader = std::make_shared<OGLProgram>();
-    shader->handle = glCreateProgram();
-    glProgramParameteri(shader->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(shader->handle, dump.binary_format, dump.binary.data(),
-                    static_cast<GLsizei>(dump.binary.size()));
+    auto program = std::make_shared<OGLProgram>();
+    program->handle = glCreateProgram();
+    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(program->handle, precompiled_entry.binary_format,
+                    precompiled_entry.binary.data(),
+                    static_cast<GLsizei>(precompiled_entry.binary.size()));

-    GLint link_status{};
-    glGetProgramiv(shader->handle, GL_LINK_STATUS, &link_status);
+    GLint link_status;
+    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
    if (link_status == GL_FALSE) {
-        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver - removing");
+        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
        return {};
    }

-    return shader;
-}
-
-bool ShaderCacheOpenGL::GenerateUnspecializedShaders(
-    const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback,
-    const std::vector<ShaderDiskCacheRaw>& raws) {
-    if (callback) {
-        callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
-    }
-
-    for (std::size_t i = 0; i < raws.size(); ++i) {
-        if (stop_loading) {
-            return false;
-        }
-        const auto& raw{raws[i]};
-        const u64 unique_identifier{raw.GetUniqueIdentifier()};
-        const u64 calculated_hash{
-            GetUniqueIdentifier(raw.GetType(), raw.HasProgramA(), raw.GetCode(), raw.GetCodeB())};
-        if (unique_identifier != calculated_hash) {
-            LOG_ERROR(Render_OpenGL,
-                      "Invalid hash in entry={:016x} (obtained hash={:016x}) - "
-                      "removing shader cache",
-                      raw.GetUniqueIdentifier(), calculated_hash);
-            disk_cache.InvalidateTransferable();
-            return false;
-        }
-
-        const u32 main_offset =
-            raw.GetType() == ShaderType::Compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
-        ConstBufferLocker locker(raw.GetType());
-        const ShaderIR ir(raw.GetCode(), main_offset, COMPILER_SETTINGS, locker);
-        // TODO(Rodrigo): Handle VertexA shaders
-        // std::optional<ShaderIR> ir_b;
-        // if (raw.HasProgramA()) {
-        //     ir_b.emplace(raw.GetProgramCodeB(), main_offset);
-        // }
-
-        UnspecializedShader unspecialized;
-        unspecialized.entries = GLShader::GetEntries(ir);
-        unspecialized.type = raw.GetType();
-        unspecialized.code = raw.GetCode();
-        unspecialized.code_b = raw.GetCodeB();
-        unspecialized_shaders.emplace(raw.GetUniqueIdentifier(), unspecialized);
-
-        if (callback) {
-            callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
-        }
-    }
-    return true;
+    return program;
 }

 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
-    if (!system.GPU().Maxwell3D().dirty.shaders) {
+    if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
        return last_shaders[static_cast<std::size_t>(program)];
    }

@@ -647,17 +466,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {

    const auto unique_identifier = GetUniqueIdentifier(
        GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
-    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
    const auto cpu_addr{*memory_manager.GpuToCpuAddress(address)};
-    const ShaderParameters params{system,   disk_cache, precompiled_variants, device,
+    const ShaderParameters params{system,   disk_cache, device,
                                  cpu_addr, host_ptr,   unique_identifier};

-    const auto found = unspecialized_shaders.find(unique_identifier);
-    if (found == unspecialized_shaders.end()) {
+    const auto found = runtime_cache.find(unique_identifier);
+    if (found == runtime_cache.end()) {
        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
                                                     std::move(code_b));
    } else {
-        shader = CachedShader::CreateFromCache(params, found->second);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
    }
    Register(shader);

@@ -672,19 +491,19 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
        return kernel;
    }

-    // No kernel found - create a new one
+    // No kernel found, create a new one
    auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code, {})};
-    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
+    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
    const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
-    const ShaderParameters params{system,   disk_cache, precompiled_variants, device,
+    const ShaderParameters params{system,   disk_cache, device,
                                  cpu_addr, host_ptr,   unique_identifier};

-    const auto found = unspecialized_shaders.find(unique_identifier);
-    if (found == unspecialized_shaders.end()) {
+    const auto found = runtime_cache.find(unique_identifier);
+    if (found == runtime_cache.end()) {
        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
    } else {
-        kernel = CachedShader::CreateFromCache(params, found->second);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
    }

    Register(kernel);
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -22,7 +22,7 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace Core {
@@ -41,22 +41,17 @@ class RasterizerOpenGL;
 struct UnspecializedShader;

 using Shader = std::shared_ptr<CachedShader>;
-using CachedProgram = std::shared_ptr<OGLProgram>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>;
-using PrecompiledVariants = std::vector<PrecompiledPrograms::iterator>;

-struct UnspecializedShader {
-    GLShader::ShaderEntries entries;
-    Tegra::Engines::ShaderType type;
-    ProgramCode code;
-    ProgramCode code_b;
+struct PrecompiledShader { // Value type stored in ShaderCacheOpenGL::runtime_cache
+    std::shared_ptr<OGLProgram> program; // GL program object, held by shared ownership
+    std::shared_ptr<VideoCommon::Shader::Registry> registry; // Registry kept with the program
+    ShaderEntries entries; // Shader entries kept with the program
 };

 struct ShaderParameters {
    Core::System& system;
    ShaderDiskCacheOpenGL& disk_cache;
-    const PrecompiledVariants* precompiled_variants;
    const Device& device;
    VAddr cpu_addr;
    u8* host_ptr;
@@ -65,61 +60,45 @@ struct ShaderParameters {

 class CachedShader final : public RasterizerCacheObject {
 public:
+    ~CachedShader();
+
+    /// Gets the GL program handle for the shader
+    GLuint GetHandle() const;
+
+    /// Returns the guest CPU address of the shader
+    VAddr GetCpuAddr() const override {
+        return cpu_addr;
+    }
+
+    /// Returns the size in bytes of the shader
+    std::size_t GetSizeInBytes() const override {
+        return size_in_bytes;
+    }
+
+    /// Gets the shader entries for the shader
+    const ShaderEntries& GetEntries() const {
+        return entries;
+    }
+
    static Shader CreateStageFromMemory(const ShaderParameters& params,
                                        Maxwell::ShaderProgram program_type,
                                        ProgramCode program_code, ProgramCode program_code_b);
    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);

    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const UnspecializedShader& unspecialized);
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return code.size() * sizeof(u64);
-    }
-
-    /// Gets the shader entries for the shader
-    const GLShader::ShaderEntries& GetShaderEntries() const {
-        return entries;
-    }
-
-    /// Gets the GL program handle for the shader
-    GLuint GetHandle(const ProgramVariant& variant);
+                                  const PrecompiledShader& precompiled_shader,
+                                  std::size_t size_in_bytes); // code.size() * sizeof(u64) at callers

 private:
-    struct LockerVariant {
-        std::unique_ptr<VideoCommon::Shader::ConstBufferLocker> locker;
-        std::unordered_map<ProgramVariant, CachedProgram> programs;
-    };
+    explicit CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes,
+                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
+                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);

-    explicit CachedShader(const ShaderParameters& params, Tegra::Engines::ShaderType shader_type,
-                          GLShader::ShaderEntries entries, ProgramCode program_code,
-                          ProgramCode program_code_b);
-
-    bool EnsureValidLockerVariant();
-
-    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
-                                  const VideoCommon::Shader::ConstBufferLocker& locker) const;
-
-    Core::System& system;
-    ShaderDiskCacheOpenGL& disk_cache;
-    const Device& device;
-
-    VAddr cpu_addr{};
-
-    u64 unique_identifier{};
-    Tegra::Engines::ShaderType shader_type{};
-
-    GLShader::ShaderEntries entries;
-
-    ProgramCode code;
-    ProgramCode code_b;
-
-    LockerVariant* curr_locker_variant = nullptr;
-    std::vector<std::unique_ptr<LockerVariant>> locker_variants;
+    std::shared_ptr<VideoCommon::Shader::Registry> registry;
+    ShaderEntries entries; // Returned by GetEntries()
+    VAddr cpu_addr = 0; // Returned by GetCpuAddr()
+    std::size_t size_in_bytes = 0; // Returned by GetSizeInBytes()
+    std::shared_ptr<OGLProgram> program;
 };

 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -142,25 +121,15 @@ protected:
    void FlushObjectInner(const Shader& object) override {}

 private:
-    bool GenerateUnspecializedShaders(const std::atomic_bool& stop_loading,
-                                      const VideoCore::DiskResourceLoadCallback& callback,
-                                      const std::vector<ShaderDiskCacheRaw>& raws);
-
-    CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
-                                             const std::unordered_set<GLenum>& supported_formats);
-
-    const PrecompiledVariants* GetPrecompiledVariants(u64 unique_identifier) const;
+    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+        const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
+        const std::unordered_set<GLenum>& supported_formats);

    Core::System& system;
    Core::Frontend::EmuWindow& emu_window;
    const Device& device;
-
    ShaderDiskCacheOpenGL disk_cache;
-
-    PrecompiledPrograms precompiled_programs;
-    std::unordered_map<u64, PrecompiledVariants> precompiled_variants;
-
-    std::unordered_map<u64, UnspecializedShader> unspecialized_shaders;
+    std::unordered_map<u64, PrecompiledShader> runtime_cache;

    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 };
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -23,8 +23,9 @@
 #include "video_core/shader/ast.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader/transform_feedback.h"

-namespace OpenGL::GLShader {
+namespace OpenGL {

 namespace {

@@ -36,6 +37,8 @@ using Tegra::Shader::IpaInterpMode;
 using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::Register;
+using VideoCommon::Shader::BuildTransformFeedback;
+using VideoCommon::Shader::Registry;

 using namespace std::string_literals;
 using namespace VideoCommon::Shader;
@@ -48,6 +51,11 @@ class ExprDecompiler;

 enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat };

+constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"}; // Index = components - 1
+
+constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr"; // Prefix of generic input names
+constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr"; // Prefix of generic output names
+
 struct TextureOffset {};
 struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
@@ -56,6 +64,25 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));

+constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
+#define ftou floatBitsToUint
+#define itof intBitsToFloat
+#define utof uintBitsToFloat
+
+bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{
+    bvec2 is_nan1 = isnan(pair1);
+    bvec2 is_nan2 = isnan(pair2);
+    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
+}}
+
+const float fswzadd_modifiers_a[] = float[4](-1.0f,  1.0f, -1.0f,  0.0f );
+const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f,  1.0f, -1.0f );
+
+layout (std140, binding = {}) uniform vs_config {{
+    float y_direction;
+}};
+)";
+
 class ShaderWriter final {
 public:
    void AddExpression(std::string_view text) {
@@ -269,12 +296,41 @@ const char* GetImageTypeDeclaration(Tegra::Shader::ImageType image_type) {
    }
 }

+/// Returns the GLSL input layout name and the vertex count used for a geometry shader input
+std::pair<const char*, u32> GetPrimitiveDescription(Maxwell::PrimitiveTopology topology) {
+    switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return {"points", 1};
+    case Maxwell::PrimitiveTopology::Lines:
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return {"lines", 2};
+    case Maxwell::PrimitiveTopology::LinesAdjacency:
+    case Maxwell::PrimitiveTopology::LineStripAdjacency:
+        return {"lines_adjacency", 4};
+    case Maxwell::PrimitiveTopology::Triangles:
+    case Maxwell::PrimitiveTopology::TriangleStrip:
+    case Maxwell::PrimitiveTopology::TriangleFan:
+        return {"triangles", 3};
+    case Maxwell::PrimitiveTopology::TrianglesAdjacency:
+    case Maxwell::PrimitiveTopology::TriangleStripAdjacency:
+        return {"triangles_adjacency", 6};
+    default: // Any other topology is reported and then treated as points
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return {"points", 1};
+    }
+}
+
 /// Generates code to use for a swizzle operation.
-constexpr const char* GetSwizzle(u32 element) {
+constexpr const char* GetSwizzle(std::size_t element) { // element selects x, y, z or w
    constexpr std::array swizzle = {".x", ".y", ".z", ".w"};
    return swizzle.at(element);
 }

+constexpr const char* GetColorSwizzle(std::size_t element) { // Color counterpart of GetSwizzle
+    constexpr std::array swizzle = {".r", ".g", ".b", ".a"};
+    return swizzle.at(element); // at() bounds-checks element against the four entries
+}
+
 /// Translate topology
 std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
    switch (topology) {
@@ -341,11 +397,66 @@ std::string FlowStackTopName(MetaStackClass stack) {
    return stage == ShaderType::Vertex;
 }

+struct GenericVaryingDescription { // Record stored per component in varying_description
+    std::string name; // GLSL variable name declared for the output
+    u8 first_element = 0; // Component of the attribute where the variable starts
+    bool is_scalar = false; // True when the variable has a single component
+};
+
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderType stage,
-                            std::string suffix)
-        : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                            ShaderType stage, std::string_view identifier, std::string_view suffix)
+        : device{device}, ir{ir}, registry{registry}, stage{stage},
+          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { // Stored as views
+        if (stage != ShaderType::Compute) { // Only non-compute stages read the graphics info
+            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
+        }
+    }
+
+    void Decompile() { // Emits the whole shader into code: declarations first, then main()
+        DeclareHeader();
+        DeclareVertex();
+        DeclareGeometry();
+        DeclareFragment();
+        DeclareCompute();
+        DeclareInputAttributes();
+        DeclareOutputAttributes();
+        DeclareImages();
+        DeclareSamplers();
+        DeclareGlobalMemory();
+        DeclareConstantBuffers();
+        DeclareLocalMemory();
+        DeclareRegisters();
+        DeclarePredicates();
+        DeclareInternalFlags();
+        DeclareCustomVariables();
+        DeclarePhysicalAttributeReader();
+
+        code.AddLine("void main() {{");
+        ++code.scope;
+
+        if (stage == ShaderType::Vertex) { // Vertex shaders start from a default gl_Position
+            code.AddLine("gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);");
+        }
+
+        if (ir.IsDecompiled()) { // The body of main() comes from one of the two paths below
+            DecompileAST();
+        } else {
+            DecompileBranchMode();
+        }
+
+        --code.scope;
+        code.AddLine("}}");
+    }
+
+    std::string GetResult() { // Returns the text accumulated in code
+        return code.GetResult();
+    }
+
+private:
+    friend class ASTDecompiler;
+    friend class ExprDecompiler;

    void DecompileBranchMode() {
        // VM's program counter
@@ -387,43 +498,36 @@ public:

    void DecompileAST();

-    void Decompile() {
-        DeclareVertex();
-        DeclareGeometry();
-        DeclareRegisters();
-        DeclareCustomVariables();
-        DeclarePredicates();
-        DeclareLocalMemory();
-        DeclareInternalFlags();
-        DeclareInputAttributes();
-        DeclareOutputAttributes();
-        DeclareConstantBuffers();
-        DeclareGlobalMemory();
-        DeclareSamplers();
-        DeclareImages();
-        DeclarePhysicalAttributeReader();
-
-        code.AddLine("void execute_{}() {{", suffix);
-        ++code.scope;
-
-        if (ir.IsDecompiled()) {
-            DecompileAST();
-        } else {
-            DecompileBranchMode();
+    void DeclareHeader() { // Emits the optional identifier, #version, extensions and common text
+        if (!identifier.empty()) {
+            code.AddLine("// {}", identifier);
        }
+        code.AddLine("#version 440 core");
+        code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
+        if (device.HasShaderBallot()) { // Remaining extensions depend on device capabilities
+            code.AddLine("#extension GL_ARB_shader_ballot : require");
+        }
+        if (device.HasVertexViewportLayer()) {
+            code.AddLine("#extension GL_ARB_shader_viewport_layer_array : require");
+        }
+        if (device.HasImageLoadFormatted()) {
+            code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
+        }
+        if (device.HasWarpIntrinsics()) {
+            code.AddLine("#extension GL_NV_gpu_shader5 : require");
+            code.AddLine("#extension GL_NV_shader_thread_group : require");
+            code.AddLine("#extension GL_NV_shader_thread_shuffle : require");
+        }
+        // This pragma stops Nvidia's driver from over-optimizing math (probably using fp16
+        // operations) in places where we don't want it to.
+        // Thanks to Ryujinx for finding this workaround.
+        code.AddLine("#pragma optionNV(fastmath off)");

-        --code.scope;
-        code.AddLine("}}");
+        code.AddNewLine();
+
+        code.AddLine(CommonDeclarations, EmulationUniformBlockBinding);
    }

-    std::string GetResult() {
-        return code.GetResult();
-    }
-
-private:
-    friend class ASTDecompiler;
-    friend class ExprDecompiler;
-
    void DeclareVertex() {
        if (!IsVertexShader(stage))
            return;
@@ -436,9 +540,15 @@ private:
            return;
        }

+        const auto& info = registry.GetGraphicsInfo();
+        const auto input_topology = info.primitive_topology;
+        const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(input_topology);
+        max_input_vertices = max_vertices;
+        code.AddLine("layout ({}) in;", glsl_topology);
+
        const auto topology = GetTopologyName(header.common3.output_topology);
-        const auto max_vertices = header.common4.max_output_vertices.Value();
-        code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices);
+        const auto max_output_vertices = header.common4.max_output_vertices.Value();
+        code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_output_vertices);
        code.AddNewLine();

        code.AddLine("in gl_PerVertex {{");
@@ -450,11 +560,40 @@ private:
        DeclareVertexRedeclarations();
    }

+    void DeclareFragment() { // Declares one vec4 color output per render target
+        if (stage != ShaderType::Fragment) {
+            return;
+        }
+        for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) { // Location equals the index
+            code.AddLine("layout (location = {}) out vec4 frag_color{};", rt, rt);
+        }
+    }
+
+    void DeclareCompute() { // Declares shared memory and the workgroup size of a compute shader
+        if (stage != ShaderType::Compute) {
+            return;
+        }
+        const auto& info = registry.GetComputeInfo();
+        if (const u32 size = info.shared_memory_size_in_words; size > 0) { // Skip an empty array
+            code.AddLine("shared uint smem[{}];", size);
+            code.AddNewLine();
+        }
+        code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;",
+                     info.workgroup_size[0], info.workgroup_size[1], info.workgroup_size[2]);
+        code.AddNewLine();
+    }
+
    void DeclareVertexRedeclarations() {
        code.AddLine("out gl_PerVertex {{");
        ++code.scope;

-        code.AddLine("vec4 gl_Position;");
+        auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position);
+        if (!pos_xfb.empty()) {
+            pos_xfb = fmt::format("layout ({}) ", pos_xfb);
+        }
+        const char* pos_type =
+            FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1);
+        code.AddLine("{}{} gl_Position;", pos_xfb, pos_type);

        for (const auto attribute : ir.GetOutputAttributes()) {
            if (attribute == Attribute::Index::ClipDistances0123 ||
@@ -525,18 +664,16 @@ private:
    }

    void DeclareLocalMemory() {
+        u64 local_memory_size = 0; // Compute takes it from the registry, others from the header
        if (stage == ShaderType::Compute) {
-            code.AddLine("#ifdef LOCAL_MEMORY_SIZE");
-            code.AddLine("uint {}[LOCAL_MEMORY_SIZE];", GetLocalMemory());
-            code.AddLine("#endif");
-            return;
+            local_memory_size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
+        } else {
+            local_memory_size = header.GetLocalMemorySize();
        }
-
-        const u64 local_memory_size = header.GetLocalMemorySize();
        if (local_memory_size == 0) {
            return;
        }
-        const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
+        const u64 element_count = Common::AlignUp(local_memory_size, 4) / 4; // Size / 4, rounded up
        code.AddLine("uint {}[{}];", GetLocalMemory(), element_count);
        code.AddNewLine();
    }
@@ -589,7 +726,7 @@ private:
    void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
        const u32 location{GetGenericAttributeIndex(index)};

-        std::string name{GetInputAttribute(index)};
+        std::string name{GetGenericInputAttribute(index)};
        if (stage == ShaderType::Geometry) {
            name = "gs_" + name + "[]";
        }
@@ -626,9 +763,59 @@ private:
        }
    }

+    std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const {
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
+        const auto it = transform_feedback.find(location); // Keys are index * 4 + element
+        if (it == transform_feedback.end()) {
+            return {}; // No transform feedback entry at this location
+        }
+        return it->second.components;
+    }
+
+    std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const {
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
+        const auto it = transform_feedback.find(location); // Keys are index * 4 + element
+        if (it == transform_feedback.end()) {
+            return {}; // Empty string tells callers there is nothing to decorate
+        }
+
+        const VaryingTFB& tfb = it->second;
+        return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer,
+                           tfb.offset, tfb.stride);
+    }
+
    void DeclareOutputAttribute(Attribute::Index index) {
-        const u32 location{GetGenericAttributeIndex(index)};
-        code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index));
+        static constexpr std::string_view swizzle = "xyzw";
+        u8 element = 0;
+        while (element < 4) { // Walk the four components, declaring one variable per step
+            auto xfb = GetTransformFeedbackDecoration(index, element);
+            if (!xfb.empty()) {
+                xfb = fmt::format(", {}", xfb);
+            }
+            const std::size_t remainder = 4 - element; // Width used when no entry exists
+            const std::size_t num_components = GetNumComponents(index, element).value_or(remainder);
+            const char* const type = FLOAT_TYPES.at(num_components - 1);
+
+            const u32 location = GetGenericAttributeIndex(index);
+
+            GenericVaryingDescription description;
+            description.first_element = static_cast<u8>(element);
+            description.is_scalar = num_components == 1;
+            description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME);
+            if (element != 0 || num_components != 4) { // Partial vectors get a swizzle suffix
+                const std::string_view name_swizzle = swizzle.substr(element, num_components);
+                description.name = fmt::format("{}_{}", description.name, name_swizzle);
+            }
+            for (std::size_t i = 0; i < num_components; ++i) { // Register each covered component
+                const u8 offset = static_cast<u8>(location * 4 + element + i);
+                varying_description.insert({offset, description});
+            }
+
+            code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element,
+                         xfb, type, description.name);
+
+            element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);
+        }
    }

    void DeclareConstantBuffers() {
@@ -925,7 +1112,8 @@ private:
                // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
                // set an 0x80000000 index for those and the shader fails to build. Find out why
                // this happens and what's its intent.
-                return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint());
+                return fmt::format("gs_{}[{} % {}]", name, Visit(buffer).AsUint(),
+                                   max_input_vertices.value());
            }
            return std::string(name);
        };
@@ -980,7 +1168,7 @@ private:
            return {"0", Type::Int};
        default:
            if (IsGenericAttribute(attribute)) {
-                return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element),
+                return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element),
                        Type::Float};
            }
            break;
@@ -1049,8 +1237,7 @@ private:
            return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
        default:
            if (IsGenericAttribute(attribute)) {
-                return {
-                    {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
+                return {{GetGenericOutputAttribute(attribute, abuf->GetElement()), Type::Float}};
            }
            UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
            return {};
@@ -1945,7 +2132,7 @@ private:
            // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
            for (u32 component = 0; component < 4; ++component) {
                if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
-                    code.AddLine("FragColor{}[{}] = {};", render_target, component,
+                    code.AddLine("frag_color{}{} = {};", render_target, GetColorSwizzle(component),
                                 SafeGetRegister(current_reg).AsFloat());
                    ++current_reg;
                }
@@ -2261,27 +2448,34 @@ private:
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

    std::string GetRegister(u32 index) const {
-        return GetDeclarationWithSuffix(index, "gpr");
+        return AppendSuffix(index, "gpr");
    }

    std::string GetCustomVariable(u32 index) const {
-        return GetDeclarationWithSuffix(index, "custom_var");
+        return AppendSuffix(index, "custom_var");
    }

    std::string GetPredicate(Tegra::Shader::Pred pred) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred");
+        return AppendSuffix(static_cast<u32>(pred), "pred");
    }

-    std::string GetInputAttribute(Attribute::Index attribute) const {
-        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr");
+    std::string GetGenericInputAttribute(Attribute::Index attribute) const {
+        return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME);
    }

-    std::string GetOutputAttribute(Attribute::Index attribute) const {
-        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr");
+    std::unordered_map<u8, GenericVaryingDescription> varying_description; // Keyed by component
+
+    std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const {
+        const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element);
+        const auto& description = varying_description.at(offset); // at() throws on a missing key
+        if (description.is_scalar) { // Scalars are referenced by name, without indexing
+            return description.name;
+        }
+        return fmt::format("{}[{}]", description.name, element - description.first_element);
    }

    std::string GetConstBuffer(u32 index) const {
-        return GetDeclarationWithSuffix(index, "cbuf");
+        return AppendSuffix(index, "cbuf");
    }

    std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const {
@@ -2294,11 +2488,15 @@ private:
    }

    std::string GetConstBufferBlock(u32 index) const {
-        return GetDeclarationWithSuffix(index, "cbuf_block");
+        return AppendSuffix(index, "cbuf_block");
    }

    std::string GetLocalMemory() const {
-        return "lmem_" + suffix;
+        if (suffix.empty()) { // No trailing underscore when there is no suffix
+            return "lmem";
+        } else {
+            return "lmem_" + std::string{suffix};
+        }
    }

    std::string GetInternalFlag(InternalFlag flag) const {
@@ -2307,19 +2505,27 @@ private:
        const auto index = static_cast<u32>(flag);
        ASSERT(index < static_cast<u32>(InternalFlag::Amount));

-        return fmt::format("{}_{}", InternalFlagNames[index], suffix);
+        if (suffix.empty()) {
+            return InternalFlagNames[index];
+        } else {
+            return fmt::format("{}_{}", InternalFlagNames[index], suffix);
+        }
    }

    std::string GetSampler(const Sampler& sampler) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
+        return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
    }

    std::string GetImage(const Image& image) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image");
+        return AppendSuffix(static_cast<u32>(image.GetIndex()), "image");
    }

-    std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const {
-        return fmt::format("{}_{}_{}", name, index, suffix);
+    std::string AppendSuffix(u32 index, std::string_view name) const { // name + index [+ _suffix]
+        if (suffix.empty()) {
+            return fmt::format("{}{}", name, index);
+        } else {
+            return fmt::format("{}{}_{}", name, index, suffix);
+        }
    }

    u32 GetNumPhysicalInputAttributes() const {
@@ -2334,17 +2540,31 @@ private:
        return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings);
    }

+    bool IsRenderTargetEnabled(u32 render_target) const { // True if any component is enabled
+        for (u32 component = 0; component < 4; ++component) {
+            if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    const Device& device;
    const ShaderIR& ir;
+    const Registry& registry;
    const ShaderType stage;
-    const std::string suffix;
+    const std::string_view identifier;
+    const std::string_view suffix;
    const Header header;
+    std::unordered_map<u8, VaryingTFB> transform_feedback;

    ShaderWriter code;
+
+    std::optional<u32> max_input_vertices;
 };

-std::string GetFlowVariable(u32 i) {
-    return fmt::format("flow_var_{}", i);
+std::string GetFlowVariable(u32 index) { // Builds the name flow_var<index>
+    return fmt::format("flow_var{}", index);
 }

 class ExprDecompiler {
@@ -2531,7 +2751,7 @@ void GLSLDecompiler::DecompileAST() {

 } // Anonymous namespace

-ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
    ShaderEntries entries;
    for (const auto& cbuf : ir.GetConstantBuffers()) {
        entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2547,33 +2767,20 @@ ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
    for (const auto& image : ir.GetImages()) {
        entries.images.emplace_back(image);
    }
-    entries.clip_distances = ir.GetClipDistances();
+    const auto clip_distances = ir.GetClipDistances(); // One flag per clip distance
+    for (std::size_t i = 0; i < std::size(clip_distances); ++i) {
+        entries.clip_distances |= (clip_distances[i] ? 1U : 0U) << i; // Accumulate bit i
+    }
    entries.shader_length = ir.GetLength();
    return entries;
 }

-std::string GetCommonDeclarations() {
-    return R"(#define ftoi floatBitsToInt
-#define ftou floatBitsToUint
-#define itof intBitsToFloat
-#define utof uintBitsToFloat
-
-bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {
-    bvec2 is_nan1 = isnan(pair1);
-    bvec2 is_nan2 = isnan(pair2);
-    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
-}
-
-const float fswzadd_modifiers_a[] = float[4](-1.0f,  1.0f, -1.0f,  0.0f );
-const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f,  1.0f, -1.0f );
-)";
-}
-
-std::string Decompile(const Device& device, const ShaderIR& ir, ShaderType stage,
-                      const std::string& suffix) {
-    GLSLDecompiler decompiler(device, ir, stage, suffix);
+std::string DecompileShader(const Device& device, const ShaderIR& ir, const Registry& registry,
+                            ShaderType stage, std::string_view identifier,
+                            std::string_view suffix) { // Returns the generated shader text
+    GLSLDecompiler decompiler(device, ir, registry, stage, identifier, suffix);
    decompiler.Decompile();
    return decompiler.GetResult();
 }

-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -6,22 +6,18 @@

 #include <array>
 #include <string>
+#include <string_view>
 #include <utility>
 #include <vector>
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

-namespace VideoCommon::Shader {
-class ShaderIR;
-}
-
 namespace OpenGL {
-class Device;
-}

-namespace OpenGL::GLShader {
+class Device;

 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using SamplerEntry = VideoCommon::Shader::Sampler;
@@ -74,15 +70,15 @@ struct ShaderEntries {
    std::vector<GlobalMemoryEntry> global_memory_entries;
    std::vector<SamplerEntry> samplers;
    std::vector<ImageEntry> images;
-    std::array<bool, Maxwell::NumClipDistances> clip_distances{};
+    u32 clip_distances{};
    std::size_t shader_length{};
 };

-ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);

-std::string GetCommonDeclarations();
+std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                            const VideoCommon::Shader::Registry& registry,
+                            Tegra::Engines::ShaderType stage, std::string_view identifier,
+                            std::string_view suffix = {});

-std::string Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
-                      Tegra::Engines::ShaderType stage, const std::string& suffix);
-
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -31,32 +31,24 @@ namespace {

 using ShaderCacheVersionHash = std::array<u8, 64>;

-enum class TransferableEntryKind : u32 {
-    Raw,
-    Usage,
-};
-
 struct ConstBufferKey {
-    u32 cbuf{};
-    u32 offset{};
-    u32 value{};
+    u32 cbuf = 0;
+    u32 offset = 0;
+    u32 value = 0;
 };

 struct BoundSamplerKey {
-    u32 offset{};
-    Tegra::Engines::SamplerDescriptor sampler{};
+    u32 offset = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
 };

 struct BindlessSamplerKey {
-    u32 cbuf{};
-    u32 offset{};
-    Tegra::Engines::SamplerDescriptor sampler{};
+    u32 cbuf = 0;
+    u32 offset = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
 };

-constexpr u32 NativeVersion = 12;
-
-// Making sure sizes doesn't change by accident
-static_assert(sizeof(ProgramVariant) == 20);
+constexpr u32 NativeVersion = 20;

 ShaderCacheVersionHash GetShaderCacheVersionHash() {
    ShaderCacheVersionHash hash{};
@@ -67,61 +59,124 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {

 } // Anonymous namespace

-ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ShaderType type, ProgramCode code,
-                                       ProgramCode code_b)
-    : unique_identifier{unique_identifier}, type{type}, code{std::move(code)}, code_b{std::move(
-                                                                                   code_b)} {}
+ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default;

-ShaderDiskCacheRaw::ShaderDiskCacheRaw() = default;
+ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default;

-ShaderDiskCacheRaw::~ShaderDiskCacheRaw() = default;
-
-bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) {
-    if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64) ||
-        file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
+bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
+    if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
        return false;
    }
-    u32 code_size{};
-    u32 code_size_b{};
+    u32 code_size;
+    u32 code_size_b;
    if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) ||
        file.ReadBytes(&code_size_b, sizeof(u32)) != sizeof(u32)) {
        return false;
    }
-
    code.resize(code_size);
    code_b.resize(code_size_b);

-    if (file.ReadArray(code.data(), code_size) != code_size)
+    if (file.ReadArray(code.data(), code_size) != code_size) {
        return false;
-
+    }
    if (HasProgramA() && file.ReadArray(code_b.data(), code_size_b) != code_size_b) {
        return false;
    }
+
+    u8 is_texture_handler_size_known;
+    u32 texture_handler_size_value;
+    u32 num_keys;
+    u32 num_bound_samplers;
+    u32 num_bindless_samplers;
+    if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
+        file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
+        file.ReadArray(&texture_handler_size_value, 1) != 1 ||
+        file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
+        file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_bindless_samplers, 1) != 1) {
+        return false;
+    }
+    if (is_texture_handler_size_known) {
+        texture_handler_size = texture_handler_size_value;
+    }
+
+    std::vector<ConstBufferKey> flat_keys(num_keys);
+    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
+    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
+        file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
+            flat_bound_samplers.size() ||
+        file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
+            flat_bindless_samplers.size()) {
+        return false;
+    }
+    for (const auto& key : flat_keys) {
+        keys.insert({{key.cbuf, key.offset}, key.value});
+    }
+    for (const auto& key : flat_bound_samplers) {
+        bound_samplers.emplace(key.offset, key.sampler);
+    }
+    for (const auto& key : flat_bindless_samplers) {
+        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    }
+
    return true;
 }

-bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const {
-    if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(static_cast<u32>(type)) != 1 ||
+bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
+    if (file.WriteObject(static_cast<u32>(type)) != 1 ||
        file.WriteObject(static_cast<u32>(code.size())) != 1 ||
        file.WriteObject(static_cast<u32>(code_b.size())) != 1) {
        return false;
    }
-
-    if (file.WriteArray(code.data(), code.size()) != code.size())
+    if (file.WriteArray(code.data(), code.size()) != code.size()) {
        return false;
-
+    }
    if (HasProgramA() && file.WriteArray(code_b.data(), code_b.size()) != code_b.size()) {
        return false;
    }
-    return true;
+
+    if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(bound_buffer) != 1 ||
+        file.WriteObject(static_cast<u8>(texture_handler_size.has_value())) != 1 ||
+        file.WriteObject(texture_handler_size.value_or(0)) != 1 ||
+        file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
+        file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
+        return false;
+    }
+
+    std::vector<ConstBufferKey> flat_keys;
+    flat_keys.reserve(keys.size());
+    for (const auto& [address, value] : keys) {
+        flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
+    }
+
+    std::vector<BoundSamplerKey> flat_bound_samplers;
+    flat_bound_samplers.reserve(bound_samplers.size());
+    for (const auto& [address, sampler] : bound_samplers) {
+        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+    }
+
+    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    flat_bindless_samplers.reserve(bindless_samplers.size());
+    for (const auto& [address, sampler] : bindless_samplers) {
+        flat_bindless_samplers.push_back(
+            BindlessSamplerKey{address.first, address.second, sampler});
+    }
+
+    return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
+           file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
+               flat_bound_samplers.size() &&
+           file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
+               flat_bindless_samplers.size();
 }

 ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}

 ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;

-std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
-ShaderDiskCacheOpenGL::LoadTransferable() {
+std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() {
    // Skip games without title id
    const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0;
    if (!Settings::values.use_disk_shader_cache || !has_title_id) {
@@ -130,17 +185,14 @@ ShaderDiskCacheOpenGL::LoadTransferable() {

    FileUtil::IOFile file(GetTransferablePath(), "rb");
    if (!file.IsOpen()) {
-        LOG_INFO(Render_OpenGL, "No transferable shader cache found for game with title id={}",
-                 GetTitleID());
+        LOG_INFO(Render_OpenGL, "No transferable shader cache found");
        is_usable = true;
        return {};
    }

    u32 version{};
    if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) {
-        LOG_ERROR(Render_OpenGL,
-                  "Failed to get transferable cache version for title id={}, skipping",
-                  GetTitleID());
+        LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it");
        return {};
    }

@@ -158,105 +210,42 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
    }

    // Version is valid, load the shaders
-    constexpr const char error_loading[] = "Failed to load transferable raw entry, skipping";
-    std::vector<ShaderDiskCacheRaw> raws;
-    std::vector<ShaderDiskCacheUsage> usages;
+    std::vector<ShaderDiskCacheEntry> entries;
    while (file.Tell() < file.GetSize()) {
-        TransferableEntryKind kind{};
-        if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) {
-            LOG_ERROR(Render_OpenGL, "Failed to read transferable file, skipping");
-            return {};
-        }
-
-        switch (kind) {
-        case TransferableEntryKind::Raw: {
-            ShaderDiskCacheRaw entry;
-            if (!entry.Load(file)) {
-                LOG_ERROR(Render_OpenGL, error_loading);
-                return {};
-            }
-            transferable.insert({entry.GetUniqueIdentifier(), {}});
-            raws.push_back(std::move(entry));
-            break;
-        }
-        case TransferableEntryKind::Usage: {
-            ShaderDiskCacheUsage usage;
-
-            u32 num_keys{};
-            u32 num_bound_samplers{};
-            u32 num_bindless_samplers{};
-            if (file.ReadArray(&usage.unique_identifier, 1) != 1 ||
-                file.ReadArray(&usage.variant, 1) != 1 ||
-                file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 ||
-                file.ReadArray(&num_bound_samplers, 1) != 1 ||
-                file.ReadArray(&num_bindless_samplers, 1) != 1) {
-                LOG_ERROR(Render_OpenGL, error_loading);
-                return {};
-            }
-
-            std::vector<ConstBufferKey> keys(num_keys);
-            std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
-            std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
-            if (file.ReadArray(keys.data(), keys.size()) != keys.size() ||
-                file.ReadArray(bound_samplers.data(), bound_samplers.size()) !=
-                    bound_samplers.size() ||
-                file.ReadArray(bindless_samplers.data(), bindless_samplers.size()) !=
-                    bindless_samplers.size()) {
-                LOG_ERROR(Render_OpenGL, error_loading);
-                return {};
-            }
-            for (const auto& key : keys) {
-                usage.keys.insert({{key.cbuf, key.offset}, key.value});
-            }
-            for (const auto& key : bound_samplers) {
-                usage.bound_samplers.emplace(key.offset, key.sampler);
-            }
-            for (const auto& key : bindless_samplers) {
-                usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
-            }
-
-            usages.push_back(std::move(usage));
-            break;
-        }
-        default:
-            LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={}, skipping",
-                      static_cast<u32>(kind));
+        ShaderDiskCacheEntry& entry = entries.emplace_back();
+        if (!entry.Load(file)) {
+            LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping");
            return {};
        }
    }

    is_usable = true;
-    return {{std::move(raws), std::move(usages)}};
+    return {std::move(entries)};
 }

-std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>
-ShaderDiskCacheOpenGL::LoadPrecompiled() {
+std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() {
    if (!is_usable) {
        return {};
    }

-    std::string path = GetPrecompiledPath();
-    FileUtil::IOFile file(path, "rb");
+    FileUtil::IOFile file(GetPrecompiledPath(), "rb");
    if (!file.IsOpen()) {
-        LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}",
-                 GetTitleID());
+        LOG_INFO(Render_OpenGL, "No precompiled shader cache found");
        return {};
    }

-    const auto result = LoadPrecompiledFile(file);
-    if (!result) {
-        LOG_INFO(Render_OpenGL,
-                 "Failed to load precompiled cache for game with title id={}, removing",
-                 GetTitleID());
-        file.Close();
-        InvalidatePrecompiled();
-        return {};
+    if (const auto result = LoadPrecompiledFile(file)) {
+        return *result;
    }
-    return *result;
+
+    LOG_INFO(Render_OpenGL, "Failed to load precompiled cache");
+    file.Close();
+    InvalidatePrecompiled();
+    return {};
 }

-std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
-ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
+std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile(
+    FileUtil::IOFile& file) {
    // Read compressed file from disk and decompress to virtual precompiled cache file
    std::vector<u8> compressed(file.GetSize());
    file.ReadBytes(compressed.data(), compressed.size());
@@ -275,58 +264,22 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
        return {};
    }

-    ShaderDumpsMap dumps;
+    std::vector<ShaderDiskCachePrecompiled> entries;
    while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) {
-        u32 num_keys{};
-        u32 num_bound_samplers{};
-        u32 num_bindless_samplers{};
-        ShaderDiskCacheUsage usage;
-        if (!LoadObjectFromPrecompiled(usage.unique_identifier) ||
-            !LoadObjectFromPrecompiled(usage.variant) ||
-            !LoadObjectFromPrecompiled(usage.bound_buffer) ||
-            !LoadObjectFromPrecompiled(num_keys) ||
-            !LoadObjectFromPrecompiled(num_bound_samplers) ||
-            !LoadObjectFromPrecompiled(num_bindless_samplers)) {
-            return {};
-        }
-        std::vector<ConstBufferKey> keys(num_keys);
-        std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
-        std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
-        if (!LoadArrayFromPrecompiled(keys.data(), keys.size()) ||
-            !LoadArrayFromPrecompiled(bound_samplers.data(), bound_samplers.size()) !=
-                bound_samplers.size() ||
-            !LoadArrayFromPrecompiled(bindless_samplers.data(), bindless_samplers.size()) !=
-                bindless_samplers.size()) {
-            return {};
-        }
-        for (const auto& key : keys) {
-            usage.keys.insert({{key.cbuf, key.offset}, key.value});
-        }
-        for (const auto& key : bound_samplers) {
-            usage.bound_samplers.emplace(key.offset, key.sampler);
-        }
-        for (const auto& key : bindless_samplers) {
-            usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
-        }
-
-        ShaderDiskCacheDump dump;
-        if (!LoadObjectFromPrecompiled(dump.binary_format)) {
+        u32 binary_size;
+        auto& entry = entries.emplace_back();
+        if (!LoadObjectFromPrecompiled(entry.unique_identifier) ||
+            !LoadObjectFromPrecompiled(entry.binary_format) ||
+            !LoadObjectFromPrecompiled(binary_size)) {
            return {};
        }

-        u32 binary_length{};
-        if (!LoadObjectFromPrecompiled(binary_length)) {
+        entry.binary.resize(binary_size);
+        if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) {
            return {};
        }
-
-        dump.binary.resize(binary_length);
-        if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) {
-            return {};
-        }
-
-        dumps.emplace(std::move(usage), dump);
    }
-    return dumps;
+    return entries;
 }

 void ShaderDiskCacheOpenGL::InvalidateTransferable() {
@@ -346,13 +299,13 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() {
    }
 }

-void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) {
+void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) {
    if (!is_usable) {
        return;
    }

-    const u64 id = entry.GetUniqueIdentifier();
-    if (transferable.find(id) != transferable.end()) {
+    const u64 id = entry.unique_identifier;
+    if (stored_transferable.find(id) != stored_transferable.end()) {
        // The shader already exists
        return;
    }
@@ -361,71 +314,17 @@ void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) {
    if (!file.IsOpen()) {
        return;
    }
-    if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) {
+    if (!entry.Save(file)) {
        LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing");
        file.Close();
        InvalidateTransferable();
        return;
    }
-    transferable.insert({id, {}});
+
+    stored_transferable.insert(id);
 }

-void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) {
-    if (!is_usable) {
-        return;
-    }
-
-    const auto it = transferable.find(usage.unique_identifier);
-    ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously");
-
-    auto& usages{it->second};
-    if (usages.find(usage) != usages.end()) {
-        // Skip this variant since the shader is already stored.
-        return;
-    }
-    usages.insert(usage);
-
-    FileUtil::IOFile file = AppendTransferableFile();
-    if (!file.IsOpen())
-        return;
-    const auto Close = [&] {
-        LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry, removing");
-        file.Close();
-        InvalidateTransferable();
-    };
-
-    if (file.WriteObject(TransferableEntryKind::Usage) != 1 ||
-        file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 ||
-        file.WriteObject(usage.bound_buffer) != 1 ||
-        file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 ||
-        file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 ||
-        file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) {
-        Close();
-        return;
-    }
-    for (const auto& [pair, value] : usage.keys) {
-        const auto [cbuf, offset] = pair;
-        if (file.WriteObject(ConstBufferKey{cbuf, offset, value}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        if (file.WriteObject(BoundSamplerKey{offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [pair, sampler] : usage.bindless_samplers) {
-        const auto [cbuf, offset] = pair;
-        if (file.WriteObject(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-}
-
-void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint program) {
+void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint program) {
    if (!is_usable) {
        return;
    }
@@ -437,51 +336,19 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
        SavePrecompiledHeaderToVirtualPrecompiledCache();
    }

-    GLint binary_length{};
+    GLint binary_length;
    glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length);

-    GLenum binary_format{};
+    GLenum binary_format;
    std::vector<u8> binary(binary_length);
    glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());

-    const auto Close = [&] {
-        LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing",
-                  usage.unique_identifier);
-        InvalidatePrecompiled();
-    };
-
-    if (!SaveObjectToPrecompiled(usage.unique_identifier) ||
-        !SaveObjectToPrecompiled(usage.variant) || !SaveObjectToPrecompiled(usage.bound_buffer) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) {
-        Close();
-        return;
-    }
-    for (const auto& [pair, value] : usage.keys) {
-        const auto [cbuf, offset] = pair;
-        if (SaveObjectToPrecompiled(ConstBufferKey{cbuf, offset, value}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        if (SaveObjectToPrecompiled(BoundSamplerKey{offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [pair, sampler] : usage.bindless_samplers) {
-        const auto [cbuf, offset] = pair;
-        if (SaveObjectToPrecompiled(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-    if (!SaveObjectToPrecompiled(static_cast<u32>(binary_format)) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) ||
+    if (!SaveObjectToPrecompiled(unique_identifier) || !SaveObjectToPrecompiled(binary_format) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(binary.size())) ||
        !SaveArrayToPrecompiled(binary.data(), binary.size())) {
-        Close();
+        LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing",
+                  unique_identifier);
+        InvalidatePrecompiled();
    }
 }

@@ -534,7 +401,6 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
    if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) {
        LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}",
                  precompiled_path);
-        return;
    }
 }

--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -19,8 +19,7 @@
 #include "common/common_types.h"
 #include "core/file_sys/vfs_vector.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"

 namespace Core {
 class System;
@@ -32,139 +31,39 @@ class IOFile;

 namespace OpenGL {

-struct ShaderDiskCacheUsage;
-struct ShaderDiskCacheDump;
-
 using ProgramCode = std::vector<u64>;
-using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;

-/// Describes the different variants a program can be compiled with.
-struct ProgramVariant final {
-    ProgramVariant() = default;
-
-    /// Graphics constructor.
-    explicit constexpr ProgramVariant(GLenum primitive_mode) noexcept
-        : primitive_mode{primitive_mode} {}
-
-    /// Compute constructor.
-    explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z, u32 shared_memory_size,
-                                      u32 local_memory_size) noexcept
-        : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)},
-          shared_memory_size{shared_memory_size}, local_memory_size{local_memory_size} {}
-
-    // Graphics specific parameters.
-    GLenum primitive_mode{};
-
-    // Compute specific parameters.
-    u32 block_x{};
-    u16 block_y{};
-    u16 block_z{};
-    u32 shared_memory_size{};
-    u32 local_memory_size{};
-
-    bool operator==(const ProgramVariant& rhs) const noexcept {
-        return std::tie(primitive_mode, block_x, block_y, block_z, shared_memory_size,
-                        local_memory_size) == std::tie(rhs.primitive_mode, rhs.block_x, rhs.block_y,
-                                                       rhs.block_z, rhs.shared_memory_size,
-                                                       rhs.local_memory_size);
-    }
-
-    bool operator!=(const ProgramVariant& rhs) const noexcept {
-        return !operator==(rhs);
-    }
-};
-static_assert(std::is_trivially_copyable_v<ProgramVariant>);
-
-/// Describes how a shader is used.
-struct ShaderDiskCacheUsage {
-    u64 unique_identifier{};
-    ProgramVariant variant;
-    u32 bound_buffer{};
-    VideoCommon::Shader::KeyMap keys;
-    VideoCommon::Shader::BoundSamplerMap bound_samplers;
-    VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
-
-    bool operator==(const ShaderDiskCacheUsage& rhs) const {
-        return std::tie(unique_identifier, variant, keys, bound_samplers, bindless_samplers) ==
-               std::tie(rhs.unique_identifier, rhs.variant, rhs.keys, rhs.bound_samplers,
-                        rhs.bindless_samplers);
-    }
-
-    bool operator!=(const ShaderDiskCacheUsage& rhs) const {
-        return !operator==(rhs);
-    }
-};
-
-} // namespace OpenGL
-
-namespace std {
-
-template <>
-struct hash<OpenGL::ProgramVariant> {
-    std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept {
-        return (static_cast<std::size_t>(variant.primitive_mode) << 6) ^
-               static_cast<std::size_t>(variant.block_x) ^
-               (static_cast<std::size_t>(variant.block_y) << 32) ^
-               (static_cast<std::size_t>(variant.block_z) << 48) ^
-               (static_cast<std::size_t>(variant.shared_memory_size) << 16) ^
-               (static_cast<std::size_t>(variant.local_memory_size) << 36);
-    }
-};
-
-template <>
-struct hash<OpenGL::ShaderDiskCacheUsage> {
-    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
-        return static_cast<std::size_t>(usage.unique_identifier) ^
-               std::hash<OpenGL::ProgramVariant>{}(usage.variant);
-    }
-};
-
-} // namespace std
-
-namespace OpenGL {
-
-/// Describes a shader how it's used by the guest GPU
-class ShaderDiskCacheRaw {
-public:
-    explicit ShaderDiskCacheRaw(u64 unique_identifier, Tegra::Engines::ShaderType type,
-                                ProgramCode code, ProgramCode code_b = {});
-    ShaderDiskCacheRaw();
-    ~ShaderDiskCacheRaw();
+/// Describes a shader and how it's used by the guest GPU
+struct ShaderDiskCacheEntry {
+    ShaderDiskCacheEntry();
+    ~ShaderDiskCacheEntry();

    bool Load(FileUtil::IOFile& file);

    bool Save(FileUtil::IOFile& file) const;

-    u64 GetUniqueIdentifier() const {
-        return unique_identifier;
-    }
-
    bool HasProgramA() const {
        return !code.empty() && !code_b.empty();
    }

-    Tegra::Engines::ShaderType GetType() const {
-        return type;
-    }
-
-    const ProgramCode& GetCode() const {
-        return code;
-    }
-
-    const ProgramCode& GetCodeB() const {
-        return code_b;
-    }
-
-private:
-    u64 unique_identifier{};
    Tegra::Engines::ShaderType type{};
    ProgramCode code;
    ProgramCode code_b;
+
+    u64 unique_identifier = 0;
+    std::optional<u32> texture_handler_size;
+    u32 bound_buffer = 0;
+    VideoCommon::Shader::GraphicsInfo graphics_info;
+    VideoCommon::Shader::ComputeInfo compute_info;
+    VideoCommon::Shader::KeyMap keys;
+    VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };

 /// Contains an OpenGL dumped binary program
-struct ShaderDiskCacheDump {
-    GLenum binary_format{};
+struct ShaderDiskCachePrecompiled {
+    u64 unique_identifier = 0;
+    GLenum binary_format = 0;
    std::vector<u8> binary;
 };

@@ -174,11 +73,10 @@ public:
    ~ShaderDiskCacheOpenGL();

    /// Loads transferable cache. If file has a old version or on failure, it deletes the file.
-    std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
-    LoadTransferable();
+    std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable();

    /// Loads current game's precompiled cache. Invalidates on failure.
-    std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> LoadPrecompiled();
+    std::vector<ShaderDiskCachePrecompiled> LoadPrecompiled();

    /// Removes the transferable (and precompiled) cache file.
    void InvalidateTransferable();
@@ -187,21 +85,18 @@ public:
    void InvalidatePrecompiled();

    /// Saves a raw dump to the transferable file. Checks for collisions.
-    void SaveRaw(const ShaderDiskCacheRaw& entry);
-
-    /// Saves shader usage to the transferable file. Does not check for collisions.
-    void SaveUsage(const ShaderDiskCacheUsage& usage);
+    void SaveEntry(const ShaderDiskCacheEntry& entry);

    /// Saves a dump entry to the precompiled file. Does not check for collisions.
-    void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program);
+    void SavePrecompiled(u64 unique_identifier, GLuint program);

    /// Serializes virtual precompiled shader cache file to real file
    void SaveVirtualPrecompiledFile();

 private:
    /// Loads the transferable cache. Returns empty on failure.
-    std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
-    LoadPrecompiledFile(FileUtil::IOFile& file);
+    std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile(
+        FileUtil::IOFile& file);

    /// Opens current game's transferable file and write it's header if it doesn't exist
    FileUtil::IOFile AppendTransferableFile() const;
@@ -270,7 +165,7 @@ private:
    std::size_t precompiled_cache_virtual_file_offset = 0;

    // Stored transferable shaders
-    std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;
+    std::unordered_set<u64> stored_transferable;

    // The cache has been loaded at boot
    bool is_usable{};
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -1,109 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <string>
-
-#include <fmt/format.h>
-
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/engines/shader_type.h"
-#include "video_core/renderer_opengl/gl_device.h"
-#include "video_core/renderer_opengl/gl_shader_decompiler.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
-#include "video_core/shader/shader_ir.h"
-
-namespace OpenGL::GLShader {
-
-using Tegra::Engines::Maxwell3D;
-using Tegra::Engines::ShaderType;
-using VideoCommon::Shader::CompileDepth;
-using VideoCommon::Shader::CompilerSettings;
-using VideoCommon::Shader::ProgramCode;
-using VideoCommon::Shader::ShaderIR;
-
-std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b) {
-    std::string out = GetCommonDeclarations();
-    out += fmt::format(R"(
-layout (std140, binding = {}) uniform vs_config {{
-    float y_direction;
-}};
-
-)",
-                       EmulationUniformBlockBinding);
-    out += Decompile(device, ir, ShaderType::Vertex, "vertex");
-    if (ir_b) {
-        out += Decompile(device, *ir_b, ShaderType::Vertex, "vertex_b");
-    }
-
-    out += R"(
-void main() {
-    gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);
-    execute_vertex();
-)";
-    if (ir_b) {
-        out += "    execute_vertex_b();";
-    }
-    out += "}\n";
-    return out;
-}
-
-std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir) {
-    std::string out = GetCommonDeclarations();
-    out += fmt::format(R"(
-layout (std140, binding = {}) uniform gs_config {{
-    float y_direction;
-}};
-
-)",
-                       EmulationUniformBlockBinding);
-    out += Decompile(device, ir, ShaderType::Geometry, "geometry");
-
-    out += R"(
-void main() {
-    execute_geometry();
-}
-)";
-    return out;
-}
-
-std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir) {
-    std::string out = GetCommonDeclarations();
-    out += fmt::format(R"(
-layout (location = 0) out vec4 FragColor0;
-layout (location = 1) out vec4 FragColor1;
-layout (location = 2) out vec4 FragColor2;
-layout (location = 3) out vec4 FragColor3;
-layout (location = 4) out vec4 FragColor4;
-layout (location = 5) out vec4 FragColor5;
-layout (location = 6) out vec4 FragColor6;
-layout (location = 7) out vec4 FragColor7;
-
-layout (std140, binding = {}) uniform fs_config {{
-    float y_direction;
-}};
-
-)",
-                       EmulationUniformBlockBinding);
-    out += Decompile(device, ir, ShaderType::Fragment, "fragment");
-
-    out += R"(
-void main() {
-    execute_fragment();
-}
-)";
-    return out;
-}
-
-std::string GenerateComputeShader(const Device& device, const ShaderIR& ir) {
-    std::string out = GetCommonDeclarations();
-    out += Decompile(device, ir, ShaderType::Compute, "compute");
-    out += R"(
-void main() {
-    execute_compute();
-}
-)";
-    return out;
-}
-
-} // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -1,34 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <vector>
-
-#include "common/common_types.h"
-#include "video_core/renderer_opengl/gl_shader_decompiler.h"
-#include "video_core/shader/shader_ir.h"
-
-namespace OpenGL {
-class Device;
-}
-
-namespace OpenGL::GLShader {
-
-using VideoCommon::Shader::ProgramCode;
-using VideoCommon::Shader::ShaderIR;
-
-/// Generates the GLSL vertex shader program source code for the given VS program
-std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b);
-
-/// Generates the GLSL geometry shader program source code for the given GS program
-std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir);
-
-/// Generates the GLSL fragment shader program source code for the given FS program
-std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir);
-
-/// Generates the GLSL compute shader program source code for the given CS program
-std::string GenerateComputeShader(const Device& device, const ShaderIR& ir);
-
-} // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -2,45 +2,52 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <glad/glad.h>
+
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"

 namespace OpenGL::GLShader {

-using Tegra::Engines::Maxwell3D;
-
-ProgramManager::ProgramManager() {
-    pipeline.Create();
-}
+ProgramManager::ProgramManager() = default;

 ProgramManager::~ProgramManager() = default;

-void ProgramManager::ApplyTo(OpenGLState& state) {
-    UpdatePipeline();
-    state.draw.shader_program = 0;
-    state.draw.program_pipeline = pipeline.handle;
+void ProgramManager::Create() {
+    graphics_pipeline.Create();
+    glBindProgramPipeline(graphics_pipeline.handle);
 }

-void ProgramManager::UpdatePipeline() {
+void ProgramManager::BindGraphicsPipeline() {
+    if (!is_graphics_bound) {
+        is_graphics_bound = true;
+        glUseProgram(0);
+    }
+
    // Avoid updating the pipeline when values have not changed
    if (old_state == current_state) {
        return;
    }

    // Workaround for AMD bug
-    constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
-                                     GL_FRAGMENT_SHADER_BIT};
-    glUseProgramStages(pipeline.handle, all_used_stages, 0);
-
-    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
-    glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
-    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+    static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
+                                            GL_FRAGMENT_SHADER_BIT};
+    const GLuint handle = graphics_pipeline.handle;
+    glUseProgramStages(handle, all_used_stages, 0);
+    glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
+    glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
+    glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);

    old_state = current_state;
 }

-void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell) {
+void ProgramManager::BindComputeShader(GLuint program) {
+    is_graphics_bound = false;
+    glUseProgram(program);
+}
+
+void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
    const auto& regs = maxwell.regs;

    // Y_NEGATE controls what value S2R returns for the Y_DIRECTION system value.
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -9,7 +9,6 @@
 #include <glad/glad.h>

 #include "video_core/renderer_opengl/gl_resource_manager.h"
-#include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"

 namespace OpenGL::GLShader {
@@ -32,49 +31,47 @@ public:
    explicit ProgramManager();
    ~ProgramManager();

-    void ApplyTo(OpenGLState& state);
+    void Create();

-    void UseProgrammableVertexShader(GLuint program) {
+    /// Updates the graphics pipeline and binds it.
+    void BindGraphicsPipeline();
+
+    /// Binds a compute shader.
+    void BindComputeShader(GLuint program);
+
+    void UseVertexShader(GLuint program) {
        current_state.vertex_shader = program;
    }

-    void UseProgrammableGeometryShader(GLuint program) {
+    void UseGeometryShader(GLuint program) {
        current_state.geometry_shader = program;
    }

-    void UseProgrammableFragmentShader(GLuint program) {
+    void UseFragmentShader(GLuint program) {
        current_state.fragment_shader = program;
    }

-    void UseTrivialGeometryShader() {
-        current_state.geometry_shader = 0;
-    }
-
-    void UseTrivialFragmentShader() {
-        current_state.fragment_shader = 0;
-    }
-
 private:
    struct PipelineState {
-        bool operator==(const PipelineState& rhs) const {
+        bool operator==(const PipelineState& rhs) const noexcept {
            return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
                   geometry_shader == rhs.geometry_shader;
        }

-        bool operator!=(const PipelineState& rhs) const {
+        bool operator!=(const PipelineState& rhs) const noexcept {
            return !operator==(rhs);
        }

-        GLuint vertex_shader{};
-        GLuint fragment_shader{};
-        GLuint geometry_shader{};
+        GLuint vertex_shader = 0;
+        GLuint fragment_shader = 0;
+        GLuint geometry_shader = 0;
    };

-    void UpdatePipeline();
-
-    OGLPipeline pipeline;
+    OGLPipeline graphics_pipeline;
+    OGLPipeline compute_pipeline;
    PipelineState current_state;
    PipelineState old_state;
+    bool is_graphics_bound = true;
 };

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -1,569 +0,0 @@
-// Copyright 2015 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <iterator>
-#include <glad/glad.h>
-#include "common/assert.h"
-#include "common/logging/log.h"
-#include "common/microprofile.h"
-#include "video_core/renderer_opengl/gl_state.h"
-
-MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128));
-
-namespace OpenGL {
-
-using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-
-OpenGLState OpenGLState::cur_state;
-
-namespace {
-
-template <typename T>
-bool UpdateValue(T& current_value, const T new_value) {
-    const bool changed = current_value != new_value;
-    current_value = new_value;
-    return changed;
-}
-
-template <typename T1, typename T2>
-bool UpdateTie(T1 current_value, const T2 new_value) {
-    const bool changed = current_value != new_value;
-    current_value = new_value;
-    return changed;
-}
-
-template <typename T>
-std::optional<std::pair<GLuint, GLsizei>> UpdateArray(T& current_values, const T& new_values) {
-    std::optional<std::size_t> first;
-    std::size_t last;
-    for (std::size_t i = 0; i < std::size(current_values); ++i) {
-        if (!UpdateValue(current_values[i], new_values[i])) {
-            continue;
-        }
-        if (!first) {
-            first = i;
-        }
-        last = i;
-    }
-    if (!first) {
-        return std::nullopt;
-    }
-    return std::make_pair(static_cast<GLuint>(*first), static_cast<GLsizei>(last - *first + 1));
-}
-
-void Enable(GLenum cap, bool enable) {
-    if (enable) {
-        glEnable(cap);
-    } else {
-        glDisable(cap);
-    }
-}
-
-void Enable(GLenum cap, GLuint index, bool enable) {
-    if (enable) {
-        glEnablei(cap, index);
-    } else {
-        glDisablei(cap, index);
-    }
-}
-
-void Enable(GLenum cap, bool& current_value, bool new_value) {
-    if (UpdateValue(current_value, new_value)) {
-        Enable(cap, new_value);
-    }
-}
-
-void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) {
-    if (UpdateValue(current_value, new_value)) {
-        Enable(cap, index, new_value);
-    }
-}
-
-} // Anonymous namespace
-
-OpenGLState::OpenGLState() = default;
-
-void OpenGLState::SetDefaultViewports() {
-    viewports.fill(Viewport{});
-
-    depth_clamp.far_plane = false;
-    depth_clamp.near_plane = false;
-}
-
-void OpenGLState::ApplyFramebufferState() {
-    if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) {
-        glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
-    }
-    if (UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer)) {
-        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
-    }
-}
-
-void OpenGLState::ApplyVertexArrayState() {
-    if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) {
-        glBindVertexArray(draw.vertex_array);
-    }
-}
-
-void OpenGLState::ApplyShaderProgram() {
-    if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) {
-        glUseProgram(draw.shader_program);
-    }
-}
-
-void OpenGLState::ApplyProgramPipeline() {
-    if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) {
-        glBindProgramPipeline(draw.program_pipeline);
-    }
-}
-
-void OpenGLState::ApplyClipDistances() {
-    for (std::size_t i = 0; i < clip_distance.size(); ++i) {
-        Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i],
-               clip_distance[i]);
-    }
-}
-
-void OpenGLState::ApplyPointSize() {
-    Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control);
-    Enable(GL_POINT_SPRITE, cur_state.point.sprite, point.sprite);
-    if (UpdateValue(cur_state.point.size, point.size)) {
-        glPointSize(point.size);
-    }
-}
-
-void OpenGLState::ApplyFragmentColorClamp() {
-    if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) {
-        glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
-                     fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
-    }
-}
-
-void OpenGLState::ApplyMultisample() {
-    Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage,
-           multisample_control.alpha_to_coverage);
-    Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one,
-           multisample_control.alpha_to_one);
-}
-
-void OpenGLState::ApplyDepthClamp() {
-    if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
-        depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
-        return;
-    }
-    cur_state.depth_clamp = depth_clamp;
-
-    UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
-                         "Unimplemented Depth Clamp Separation!");
-
-    Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane);
-}
-
-void OpenGLState::ApplySRgb() {
-    if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled)
-        return;
-    cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled;
-    if (framebuffer_srgb.enabled) {
-        glEnable(GL_FRAMEBUFFER_SRGB);
-    } else {
-        glDisable(GL_FRAMEBUFFER_SRGB);
-    }
-}
-
-void OpenGLState::ApplyCulling() {
-    Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled);
-
-    if (UpdateValue(cur_state.cull.mode, cull.mode)) {
-        glCullFace(cull.mode);
-    }
-
-    if (UpdateValue(cur_state.cull.front_face, cull.front_face)) {
-        glFrontFace(cull.front_face);
-    }
-}
-
-void OpenGLState::ApplyRasterizerDiscard() {
-    Enable(GL_RASTERIZER_DISCARD, cur_state.rasterizer_discard, rasterizer_discard);
-}
-
-void OpenGLState::ApplyColorMask() {
-    if (!dirty.color_mask) {
-        return;
-    }
-    dirty.color_mask = false;
-
-    for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
-        const auto& updated = color_mask[i];
-        auto& current = cur_state.color_mask[i];
-        if (updated.red_enabled != current.red_enabled ||
-            updated.green_enabled != current.green_enabled ||
-            updated.blue_enabled != current.blue_enabled ||
-            updated.alpha_enabled != current.alpha_enabled) {
-            current = updated;
-            glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
-                         updated.blue_enabled, updated.alpha_enabled);
-        }
-    }
-}
-
-void OpenGLState::ApplyDepth() {
-    Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled);
-
-    if (cur_state.depth.test_func != depth.test_func) {
-        cur_state.depth.test_func = depth.test_func;
-        glDepthFunc(depth.test_func);
-    }
-
-    if (cur_state.depth.write_mask != depth.write_mask) {
-        cur_state.depth.write_mask = depth.write_mask;
-        glDepthMask(depth.write_mask);
-    }
-}
-
-void OpenGLState::ApplyPrimitiveRestart() {
-    Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled);
-
-    if (cur_state.primitive_restart.index != primitive_restart.index) {
-        cur_state.primitive_restart.index = primitive_restart.index;
-        glPrimitiveRestartIndex(primitive_restart.index);
-    }
-}
-
-void OpenGLState::ApplyStencilTest() {
-    if (!dirty.stencil_state) {
-        return;
-    }
-    dirty.stencil_state = false;
-
-    Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled);
-
-    const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) {
-        if (current.test_func != config.test_func || current.test_ref != config.test_ref ||
-            current.test_mask != config.test_mask) {
-            current.test_func = config.test_func;
-            current.test_ref = config.test_ref;
-            current.test_mask = config.test_mask;
-            glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask);
-        }
-        if (current.action_depth_fail != config.action_depth_fail ||
-            current.action_depth_pass != config.action_depth_pass ||
-            current.action_stencil_fail != config.action_stencil_fail) {
-            current.action_depth_fail = config.action_depth_fail;
-            current.action_depth_pass = config.action_depth_pass;
-            current.action_stencil_fail = config.action_stencil_fail;
-            glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail,
-                                config.action_depth_pass);
-        }
-        if (current.write_mask != config.write_mask) {
-            current.write_mask = config.write_mask;
-            glStencilMaskSeparate(face, config.write_mask);
-        }
-    };
-    ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front);
-    ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back);
-}
-
-void OpenGLState::ApplyViewport() {
-    for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) {
-        const auto& updated = viewports[i];
-        auto& current = cur_state.viewports[i];
-
-        if (current.x != updated.x || current.y != updated.y || current.width != updated.width ||
-            current.height != updated.height) {
-            current.x = updated.x;
-            current.y = updated.y;
-            current.width = updated.width;
-            current.height = updated.height;
-            glViewportIndexedf(i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y),
-                               static_cast<GLfloat>(updated.width),
-                               static_cast<GLfloat>(updated.height));
-        }
-        if (current.depth_range_near != updated.depth_range_near ||
-            current.depth_range_far != updated.depth_range_far) {
-            current.depth_range_near = updated.depth_range_near;
-            current.depth_range_far = updated.depth_range_far;
-            glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far);
-        }
-
-        Enable(GL_SCISSOR_TEST, i, current.scissor.enabled, updated.scissor.enabled);
-
-        if (current.scissor.x != updated.scissor.x || current.scissor.y != updated.scissor.y ||
-            current.scissor.width != updated.scissor.width ||
-            current.scissor.height != updated.scissor.height) {
-            current.scissor.x = updated.scissor.x;
-            current.scissor.y = updated.scissor.y;
-            current.scissor.width = updated.scissor.width;
-            current.scissor.height = updated.scissor.height;
-            glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width,
-                             updated.scissor.height);
-        }
-    }
-}
-
-void OpenGLState::ApplyGlobalBlending() {
-    const Blend& updated = blend[0];
-    Blend& current = cur_state.blend[0];
-
-    Enable(GL_BLEND, current.enabled, updated.enabled);
-
-    if (current.src_rgb_func != updated.src_rgb_func ||
-        current.dst_rgb_func != updated.dst_rgb_func || current.src_a_func != updated.src_a_func ||
-        current.dst_a_func != updated.dst_a_func) {
-        current.src_rgb_func = updated.src_rgb_func;
-        current.dst_rgb_func = updated.dst_rgb_func;
-        current.src_a_func = updated.src_a_func;
-        current.dst_a_func = updated.dst_a_func;
-        glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
-                            updated.dst_a_func);
-    }
-
-    if (current.rgb_equation != updated.rgb_equation || current.a_equation != updated.a_equation) {
-        current.rgb_equation = updated.rgb_equation;
-        current.a_equation = updated.a_equation;
-        glBlendEquationSeparate(updated.rgb_equation, updated.a_equation);
-    }
-}
-
-void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) {
-    const Blend& updated = blend[target];
-    Blend& current = cur_state.blend[target];
-
-    if (current.enabled != updated.enabled || force) {
-        current.enabled = updated.enabled;
-        Enable(GL_BLEND, static_cast<GLuint>(target), updated.enabled);
-    }
-
-    if (UpdateTie(std::tie(current.src_rgb_func, current.dst_rgb_func, current.src_a_func,
-                           current.dst_a_func),
-                  std::tie(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
-                           updated.dst_a_func))) {
-        glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func,
-                             updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func);
-    }
-
-    if (UpdateTie(std::tie(current.rgb_equation, current.a_equation),
-                  std::tie(updated.rgb_equation, updated.a_equation))) {
-        glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation,
-                                 updated.a_equation);
-    }
-}
-
-void OpenGLState::ApplyBlending() {
-    if (!dirty.blend_state) {
-        return;
-    }
-    dirty.blend_state = false;
-
-    if (independant_blend.enabled) {
-        const bool force = independant_blend.enabled != cur_state.independant_blend.enabled;
-        for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) {
-            ApplyTargetBlending(target, force);
-        }
-    } else {
-        ApplyGlobalBlending();
-    }
-    cur_state.independant_blend.enabled = independant_blend.enabled;
-
-    if (UpdateTie(
-            std::tie(cur_state.blend_color.red, cur_state.blend_color.green,
-                     cur_state.blend_color.blue, cur_state.blend_color.alpha),
-            std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha))) {
-        glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha);
-    }
-}
-
-void OpenGLState::ApplyLogicOp() {
-    Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled);
-
-    if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) {
-        glLogicOp(logic_op.operation);
-    }
-}
-
-void OpenGLState::ApplyPolygonOffset() {
-    if (!dirty.polygon_offset) {
-        return;
-    }
-    dirty.polygon_offset = false;
-
-    Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable,
-           polygon_offset.fill_enable);
-    Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable,
-           polygon_offset.line_enable);
-    Enable(GL_POLYGON_OFFSET_POINT, cur_state.polygon_offset.point_enable,
-           polygon_offset.point_enable);
-
-    if (UpdateTie(std::tie(cur_state.polygon_offset.factor, cur_state.polygon_offset.units,
-                           cur_state.polygon_offset.clamp),
-                  std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp))) {
-        if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) {
-            glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp);
-        } else {
-            UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0,
-                                 "Unimplemented Depth polygon offset clamp.");
-            glPolygonOffset(polygon_offset.factor, polygon_offset.units);
-        }
-    }
-}
-
-void OpenGLState::ApplyAlphaTest() {
-    Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled);
-    if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref),
-                  std::tie(alpha_test.func, alpha_test.ref))) {
-        glAlphaFunc(alpha_test.func, alpha_test.ref);
-    }
-}
-
-void OpenGLState::ApplyClipControl() {
-    if (UpdateTie(std::tie(cur_state.clip_control.origin, cur_state.clip_control.depth_mode),
-                  std::tie(clip_control.origin, clip_control.depth_mode))) {
-        glClipControl(clip_control.origin, clip_control.depth_mode);
-    }
-}
-
-void OpenGLState::ApplyRenderBuffer() {
-    if (cur_state.renderbuffer != renderbuffer) {
-        cur_state.renderbuffer = renderbuffer;
-        glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer);
-    }
-}
-
-void OpenGLState::ApplyTextures() {
-    const std::size_t size = std::size(textures);
-    for (std::size_t i = 0; i < size; ++i) {
-        if (UpdateValue(cur_state.textures[i], textures[i])) {
-            // BindTextureUnit doesn't support binding null textures, skip those binds.
-            // TODO(Rodrigo): Stop using null textures
-            if (textures[i] != 0) {
-                glBindTextureUnit(static_cast<GLuint>(i), textures[i]);
-            }
-        }
-    }
-}
-
-void OpenGLState::ApplySamplers() {
-    const std::size_t size = std::size(samplers);
-    for (std::size_t i = 0; i < size; ++i) {
-        if (UpdateValue(cur_state.samplers[i], samplers[i])) {
-            glBindSampler(static_cast<GLuint>(i), samplers[i]);
-        }
-    }
-}
-
-void OpenGLState::ApplyImages() {
-    if (const auto update = UpdateArray(cur_state.images, images)) {
-        glBindImageTextures(update->first, update->second, images.data() + update->first);
-    }
-}
-
-void OpenGLState::Apply() {
-    MICROPROFILE_SCOPE(OpenGL_State);
-    ApplyFramebufferState();
-    ApplyVertexArrayState();
-    ApplyShaderProgram();
-    ApplyProgramPipeline();
-    ApplyClipDistances();
-    ApplyPointSize();
-    ApplyFragmentColorClamp();
-    ApplyMultisample();
-    ApplyRasterizerDiscard();
-    ApplyColorMask();
-    ApplyDepthClamp();
-    ApplyViewport();
-    ApplyStencilTest();
-    ApplySRgb();
-    ApplyCulling();
-    ApplyDepth();
-    ApplyPrimitiveRestart();
-    ApplyBlending();
-    ApplyLogicOp();
-    ApplyTextures();
-    ApplySamplers();
-    ApplyImages();
-    ApplyPolygonOffset();
-    ApplyAlphaTest();
-    ApplyClipControl();
-    ApplyRenderBuffer();
-}
-
-void OpenGLState::EmulateViewportWithScissor() {
-    auto& current = viewports[0];
-    if (current.scissor.enabled) {
-        const GLint left = std::max(current.x, current.scissor.x);
-        const GLint right =
-            std::max(current.x + current.width, current.scissor.x + current.scissor.width);
-        const GLint bottom = std::max(current.y, current.scissor.y);
-        const GLint top =
-            std::max(current.y + current.height, current.scissor.y + current.scissor.height);
-        current.scissor.x = std::max(left, 0);
-        current.scissor.y = std::max(bottom, 0);
-        current.scissor.width = std::max(right - left, 0);
-        current.scissor.height = std::max(top - bottom, 0);
-    } else {
-        current.scissor.enabled = true;
-        current.scissor.x = current.x;
-        current.scissor.y = current.y;
-        current.scissor.width = current.width;
-        current.scissor.height = current.height;
-    }
-}
-
-OpenGLState& OpenGLState::UnbindTexture(GLuint handle) {
-    for (auto& texture : textures) {
-        if (texture == handle) {
-            texture = 0;
-        }
-    }
-    return *this;
-}
-
-OpenGLState& OpenGLState::ResetSampler(GLuint handle) {
-    for (auto& sampler : samplers) {
-        if (sampler == handle) {
-            sampler = 0;
-        }
-    }
-    return *this;
-}
-
-OpenGLState& OpenGLState::ResetProgram(GLuint handle) {
-    if (draw.shader_program == handle) {
-        draw.shader_program = 0;
-    }
-    return *this;
-}
-
-OpenGLState& OpenGLState::ResetPipeline(GLuint handle) {
-    if (draw.program_pipeline == handle) {
-        draw.program_pipeline = 0;
-    }
-    return *this;
-}
-
-OpenGLState& OpenGLState::ResetVertexArray(GLuint handle) {
-    if (draw.vertex_array == handle) {
-        draw.vertex_array = 0;
-    }
-    return *this;
-}
-
-OpenGLState& OpenGLState::ResetFramebuffer(GLuint handle) {
-    if (draw.read_framebuffer == handle) {
-        draw.read_framebuffer = 0;
-    }
-    if (draw.draw_framebuffer == handle) {
-        draw.draw_framebuffer = 0;
-    }
-    return *this;
-}
-
-OpenGLState& OpenGLState::ResetRenderbuffer(GLuint handle) {
-    if (renderbuffer == handle) {
-        renderbuffer = 0;
-    }
-    return *this;
-}
-
-} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -1,251 +0,0 @@
-// Copyright 2015 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <array>
-#include <type_traits>
-#include <glad/glad.h>
-#include "video_core/engines/maxwell_3d.h"
-
-namespace OpenGL {
-
-class OpenGLState {
-public:
-    struct {
-        bool enabled = false; // GL_FRAMEBUFFER_SRGB
-    } framebuffer_srgb;
-
-    struct {
-        bool alpha_to_coverage = false; // GL_ALPHA_TO_COVERAGE
-        bool alpha_to_one = false;      // GL_ALPHA_TO_ONE
-    } multisample_control;
-
-    struct {
-        bool enabled = false; // GL_CLAMP_FRAGMENT_COLOR_ARB
-    } fragment_color_clamp;
-
-    struct {
-        bool far_plane = false;
-        bool near_plane = false;
-    } depth_clamp; // GL_DEPTH_CLAMP
-
-    struct {
-        bool enabled = false;       // GL_CULL_FACE
-        GLenum mode = GL_BACK;      // GL_CULL_FACE_MODE
-        GLenum front_face = GL_CCW; // GL_FRONT_FACE
-    } cull;
-
-    struct {
-        bool test_enabled = false;      // GL_DEPTH_TEST
-        GLboolean write_mask = GL_TRUE; // GL_DEPTH_WRITEMASK
-        GLenum test_func = GL_LESS;     // GL_DEPTH_FUNC
-    } depth;
-
-    struct {
-        bool enabled = false;
-        GLuint index = 0;
-    } primitive_restart; // GL_PRIMITIVE_RESTART
-
-    bool rasterizer_discard = false; // GL_RASTERIZER_DISCARD
-
-    struct ColorMask {
-        GLboolean red_enabled = GL_TRUE;
-        GLboolean green_enabled = GL_TRUE;
-        GLboolean blue_enabled = GL_TRUE;
-        GLboolean alpha_enabled = GL_TRUE;
-    };
-    std::array<ColorMask, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets>
-        color_mask; // GL_COLOR_WRITEMASK
-
-    struct {
-        bool test_enabled = false; // GL_STENCIL_TEST
-        struct {
-            GLenum test_func = GL_ALWAYS;         // GL_STENCIL_FUNC
-            GLint test_ref = 0;                   // GL_STENCIL_REF
-            GLuint test_mask = 0xFFFFFFFF;        // GL_STENCIL_VALUE_MASK
-            GLuint write_mask = 0xFFFFFFFF;       // GL_STENCIL_WRITEMASK
-            GLenum action_stencil_fail = GL_KEEP; // GL_STENCIL_FAIL
-            GLenum action_depth_fail = GL_KEEP;   // GL_STENCIL_PASS_DEPTH_FAIL
-            GLenum action_depth_pass = GL_KEEP;   // GL_STENCIL_PASS_DEPTH_PASS
-        } front, back;
-    } stencil;
-
-    struct Blend {
-        bool enabled = false;              // GL_BLEND
-        GLenum rgb_equation = GL_FUNC_ADD; // GL_BLEND_EQUATION_RGB
-        GLenum a_equation = GL_FUNC_ADD;   // GL_BLEND_EQUATION_ALPHA
-        GLenum src_rgb_func = GL_ONE;      // GL_BLEND_SRC_RGB
-        GLenum dst_rgb_func = GL_ZERO;     // GL_BLEND_DST_RGB
-        GLenum src_a_func = GL_ONE;        // GL_BLEND_SRC_ALPHA
-        GLenum dst_a_func = GL_ZERO;       // GL_BLEND_DST_ALPHA
-    };
-    std::array<Blend, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> blend;
-
-    struct {
-        bool enabled = false;
-    } independant_blend;
-
-    struct {
-        GLclampf red = 0.0f;
-        GLclampf green = 0.0f;
-        GLclampf blue = 0.0f;
-        GLclampf alpha = 0.0f;
-    } blend_color; // GL_BLEND_COLOR
-
-    struct {
-        bool enabled = false; // GL_LOGIC_OP_MODE
-        GLenum operation = GL_COPY;
-    } logic_op;
-
-    static constexpr std::size_t NumSamplers = 32 * 5;
-    static constexpr std::size_t NumImages = 8 * 5;
-    std::array<GLuint, NumSamplers> textures = {};
-    std::array<GLuint, NumSamplers> samplers = {};
-    std::array<GLuint, NumImages> images = {};
-
-    struct {
-        GLuint read_framebuffer = 0; // GL_READ_FRAMEBUFFER_BINDING
-        GLuint draw_framebuffer = 0; // GL_DRAW_FRAMEBUFFER_BINDING
-        GLuint vertex_array = 0;     // GL_VERTEX_ARRAY_BINDING
-        GLuint shader_program = 0;   // GL_CURRENT_PROGRAM
-        GLuint program_pipeline = 0; // GL_PROGRAM_PIPELINE_BINDING
-    } draw;
-
-    struct Viewport {
-        GLint x = 0;
-        GLint y = 0;
-        GLint width = 0;
-        GLint height = 0;
-        GLfloat depth_range_near = 0.0f; // GL_DEPTH_RANGE
-        GLfloat depth_range_far = 1.0f;  // GL_DEPTH_RANGE
-        struct {
-            bool enabled = false; // GL_SCISSOR_TEST
-            GLint x = 0;
-            GLint y = 0;
-            GLsizei width = 0;
-            GLsizei height = 0;
-        } scissor;
-    };
-    std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports;
-
-    struct {
-        bool program_control = false; // GL_PROGRAM_POINT_SIZE
-        bool sprite = false;          // GL_POINT_SPRITE
-        GLfloat size = 1.0f;          // GL_POINT_SIZE
-    } point;
-
-    struct {
-        bool point_enable = false;
-        bool line_enable = false;
-        bool fill_enable = false;
-        GLfloat units = 0.0f;
-        GLfloat factor = 0.0f;
-        GLfloat clamp = 0.0f;
-    } polygon_offset;
-
-    struct {
-        bool enabled = false;    // GL_ALPHA_TEST
-        GLenum func = GL_ALWAYS; // GL_ALPHA_TEST_FUNC
-        GLfloat ref = 0.0f;      // GL_ALPHA_TEST_REF
-    } alpha_test;
-
-    std::array<bool, 8> clip_distance = {}; // GL_CLIP_DISTANCE
-
-    struct {
-        GLenum origin = GL_LOWER_LEFT;
-        GLenum depth_mode = GL_NEGATIVE_ONE_TO_ONE;
-    } clip_control;
-
-    GLuint renderbuffer{}; // GL_RENDERBUFFER_BINDING
-
-    OpenGLState();
-
-    /// Get the currently active OpenGL state
-    static OpenGLState GetCurState() {
-        return cur_state;
-    }
-
-    void SetDefaultViewports();
-    /// Apply this state as the current OpenGL state
-    void Apply();
-
-    void ApplyFramebufferState();
-    void ApplyVertexArrayState();
-    void ApplyShaderProgram();
-    void ApplyProgramPipeline();
-    void ApplyClipDistances();
-    void ApplyPointSize();
-    void ApplyFragmentColorClamp();
-    void ApplyMultisample();
-    void ApplySRgb();
-    void ApplyCulling();
-    void ApplyRasterizerDiscard();
-    void ApplyColorMask();
-    void ApplyDepth();
-    void ApplyPrimitiveRestart();
-    void ApplyStencilTest();
-    void ApplyViewport();
-    void ApplyTargetBlending(std::size_t target, bool force);
-    void ApplyGlobalBlending();
-    void ApplyBlending();
-    void ApplyLogicOp();
-    void ApplyTextures();
-    void ApplySamplers();
-    void ApplyImages();
-    void ApplyDepthClamp();
-    void ApplyPolygonOffset();
-    void ApplyAlphaTest();
-    void ApplyClipControl();
-    void ApplyRenderBuffer();
-
-    /// Resets any references to the given resource
-    OpenGLState& UnbindTexture(GLuint handle);
-    OpenGLState& ResetSampler(GLuint handle);
-    OpenGLState& ResetProgram(GLuint handle);
-    OpenGLState& ResetPipeline(GLuint handle);
-    OpenGLState& ResetVertexArray(GLuint handle);
-    OpenGLState& ResetFramebuffer(GLuint handle);
-    OpenGLState& ResetRenderbuffer(GLuint handle);
-
-    /// Viewport does not affects glClearBuffer so emulate viewport using scissor test
-    void EmulateViewportWithScissor();
-
-    void MarkDirtyBlendState() {
-        dirty.blend_state = true;
-    }
-
-    void MarkDirtyStencilState() {
-        dirty.stencil_state = true;
-    }
-
-    void MarkDirtyPolygonOffset() {
-        dirty.polygon_offset = true;
-    }
-
-    void MarkDirtyColorMask() {
-        dirty.color_mask = true;
-    }
-
-    void AllDirty() {
-        dirty.blend_state = true;
-        dirty.stencil_state = true;
-        dirty.polygon_offset = true;
-        dirty.color_mask = true;
-    }
-
-private:
-    static OpenGLState cur_state;
-
-    struct {
-        bool blend_state;
-        bool stencil_state;
-        bool viewport_state;
-        bool polygon_offset;
-        bool color_mask;
-    } dirty{};
-};
-static_assert(std::is_trivially_copyable_v<OpenGLState>);
-
-} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -0,0 +1,248 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/renderer_opengl/gl_state_tracker.h"
+
+#define OFF(field_name) MAXWELL3D_REG_INDEX(field_name)
+#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32))
+
+namespace OpenGL {
+
+namespace {
+
+using namespace Dirty;
+using namespace VideoCommon::Dirty;
+using Tegra::Engines::Maxwell3D;
+using Regs = Maxwell3D::Regs;
+using Tables = Maxwell3D::DirtyState::Tables;
+using Table = Maxwell3D::DirtyState::Table;
+
+void SetupDirtyColorMasks(Tables& tables) { // Registers color-mask words in both dirty tables
+    tables[0][OFF(color_mask_common)] = ColorMaskCommon;
+    for (std::size_t rt = 0; rt < Regs::NumRenderTargets; ++rt) { // table 0: per-target flag
+        const std::size_t offset = OFF(color_mask) + rt * NUM(color_mask[0]);
+        FillBlock(tables[0], offset, NUM(color_mask[0]), ColorMask0 + rt);
+    }
+
+    FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); // table 1: aggregate flag
+}
+
+void SetupDirtyVertexArrays(Tables& tables) { // Maps vertex array registers to dirty flags
+    static constexpr std::size_t num_array = 3; // leading words of an entry -> VertexBuffer flag
+    static constexpr std::size_t instance_base_offset = 3; // entry word -> VertexInstance flag
+    for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
+        const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
+        const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
+        // Fill only entry i's own words; a whole-array count would spill into later registers.
+        FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
+        FillBlock(tables, limit_offset, NUM(vertex_array_limit[0]), VertexBuffer0 + i,
+                  VertexBuffers);
+        const std::size_t instance_array_offset = array_offset + instance_base_offset;
+        tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i);
+        tables[1][instance_array_offset] = VertexInstances;
+
+        const std::size_t instance_offset = OFF(instanced_arrays) + i;
+        tables[0][instance_offset] = static_cast<u8>(VertexInstance0 + i);
+        tables[1][instance_offset] = VertexInstances;
+    }
+}
+
+void SetupDirtyVertexFormat(Tables& tables) { // Maps vertex attribute formats to dirty flags
+    for (std::size_t i = 0; i < Regs::NumVertexAttributes; ++i) { // table 0: per-attribute flag
+        const std::size_t offset = OFF(vertex_attrib_format) + i * NUM(vertex_attrib_format[0]);
+        FillBlock(tables[0], offset, NUM(vertex_attrib_format[0]), VertexFormat0 + i);
+    }
+    // NOTE(review): the count below assumes one word per attribute format - confirm.
+    FillBlock(tables[1], OFF(vertex_attrib_format), Regs::NumVertexAttributes, VertexFormats);
+}
+
+void SetupDirtyViewports(Tables& tables) { // Maps viewport registers to dirty flags
+    for (std::size_t i = 0; i < Regs::NumViewports; ++i) {
+        const std::size_t transf_offset = OFF(viewport_transform) + i * NUM(viewport_transform[0]);
+        const std::size_t viewport_offset = OFF(viewports) + i * NUM(viewports[0]);
+        // Both register groups of index i share the same per-viewport flag in table 0.
+        FillBlock(tables[0], transf_offset, NUM(viewport_transform[0]), Viewport0 + i);
+        FillBlock(tables[0], viewport_offset, NUM(viewports[0]), Viewport0 + i);
+    }
+    // Table 1 tags every word of both register groups with the aggregate Viewports flag.
+    FillBlock(tables[1], OFF(viewport_transform), NUM(viewport_transform), Viewports);
+    FillBlock(tables[1], OFF(viewports), NUM(viewports), Viewports);
+    // The transform-enable word gets its own table 0 flag plus the aggregate flag in table 1.
+    tables[0][OFF(viewport_transform_enabled)] = ViewportTransform;
+    tables[1][OFF(viewport_transform_enabled)] = Viewports;
+}
+
+void SetupDirtyScissors(Tables& tables) { // Maps scissor_test registers to dirty flags
+    for (std::size_t i = 0; i < Regs::NumViewports; ++i) { // table 0: per-index Scissor flag
+        const std::size_t offset = OFF(scissor_test) + i * NUM(scissor_test[0]);
+        FillBlock(tables[0], offset, NUM(scissor_test[0]), Scissor0 + i);
+    }
+    FillBlock(tables[1], OFF(scissor_test), NUM(scissor_test), Scissors); // table 1: aggregate
+}
+
+void SetupDirtyShaders(Tables& tables) { // Shader config words all share the Shaders flag
+    FillBlock(tables[0], OFF(shader_config[0]), NUM(shader_config[0]) * Regs::MaxShaderProgram,
+              Shaders);
+}
+
+void SetupDirtyPolygonModes(Tables& tables) { // Maps polygon mode registers to dirty flags
+    tables[0][OFF(polygon_mode_front)] = PolygonModeFront;
+    tables[0][OFF(polygon_mode_back)] = PolygonModeBack;
+    // Table 1 tags both faces with the aggregate flag; fill_rectangle uses it in table 0.
+    tables[1][OFF(polygon_mode_front)] = PolygonModes;
+    tables[1][OFF(polygon_mode_back)] = PolygonModes;
+    tables[0][OFF(fill_rectangle)] = PolygonModes;
+}
+
+void SetupDirtyDepthTest(Tables& tables) { // Maps depth registers to dirty flags in table 0
+    auto& table = tables[0];
+    table[OFF(depth_test_enable)] = DepthTest;
+    table[OFF(depth_write_enabled)] = DepthMask; // the write mask has its own flag
+    table[OFF(depth_test_func)] = DepthTest;
+}
+
+void SetupDirtyStencilTest(Tables& tables) { // Maps every listed stencil register to StencilTest
+    static constexpr std::array offsets = {
+        OFF(stencil_enable),          OFF(stencil_front_func_func), OFF(stencil_front_func_ref),
+        OFF(stencil_front_func_mask), OFF(stencil_front_op_fail),   OFF(stencil_front_op_zfail),
+        OFF(stencil_front_op_zpass),  OFF(stencil_front_mask),      OFF(stencil_two_side_enable),
+        OFF(stencil_back_func_func),  OFF(stencil_back_func_ref),   OFF(stencil_back_func_mask),
+        OFF(stencil_back_op_fail),    OFF(stencil_back_op_zfail),   OFF(stencil_back_op_zpass),
+        OFF(stencil_back_mask)};
+    for (const auto offset : offsets) { // front and back state share one flag in table 0
+        tables[0][offset] = StencilTest;
+    }
+}
+
+void SetupDirtyAlphaTest(Tables& tables) { // Maps the three alpha test registers to AlphaTest
+    auto& table = tables[0];
+    table[OFF(alpha_test_ref)] = AlphaTest;
+    table[OFF(alpha_test_func)] = AlphaTest;
+    table[OFF(alpha_test_enabled)] = AlphaTest;
+}
+
+void SetupDirtyBlend(Tables& tables) { // Maps blend registers to dirty flags
+    FillBlock(tables[0], OFF(blend_color), NUM(blend_color), BlendColor);
+
+    tables[0][OFF(independent_blend_enable)] = BlendIndependentEnabled;
+    // Table 0: per-target flag for each independent_blend entry and the word at blend.enable + i.
+    for (std::size_t i = 0; i < Regs::NumRenderTargets; ++i) {
+        const std::size_t offset = OFF(independent_blend) + i * NUM(independent_blend[0]);
+        FillBlock(tables[0], offset, NUM(independent_blend[0]), BlendState0 + i);
+
+        tables[0][OFF(blend.enable) + i] = static_cast<u8>(BlendState0 + i);
+    }
+    FillBlock(tables[1], OFF(independent_blend), NUM(independent_blend), BlendStates); // aggregate
+    FillBlock(tables[1], OFF(blend), NUM(blend), BlendStates);
+}
+
+void SetupDirtyPrimitiveRestart(Tables& tables) { // All primitive_restart words share one flag
+    FillBlock(tables[0], OFF(primitive_restart), NUM(primitive_restart), PrimitiveRestart);
+}
+
+void SetupDirtyPolygonOffset(Tables& tables) { // Enables and parameters all map to PolygonOffset
+    auto& table = tables[0];
+    table[OFF(polygon_offset_fill_enable)] = PolygonOffset;
+    table[OFF(polygon_offset_line_enable)] = PolygonOffset;
+    table[OFF(polygon_offset_point_enable)] = PolygonOffset;
+    table[OFF(polygon_offset_factor)] = PolygonOffset;
+    table[OFF(polygon_offset_units)] = PolygonOffset;
+    table[OFF(polygon_offset_clamp)] = PolygonOffset;
+}
+
+void SetupDirtyMultisampleControl(Tables& tables) { // All multisample_control words: one flag
+    FillBlock(tables[0], OFF(multisample_control), NUM(multisample_control), MultisampleControl);
+}
+
+void SetupDirtyRasterizeEnable(Tables& tables) { // rasterize_enable -> RasterizeEnable (table 0)
+    tables[0][OFF(rasterize_enable)] = RasterizeEnable;
+}
+
+void SetupDirtyFramebufferSRGB(Tables& tables) { // framebuffer_srgb -> FramebufferSRGB (table 0)
+    tables[0][OFF(framebuffer_srgb)] = FramebufferSRGB;
+}
+
+void SetupDirtyLogicOp(Tables& tables) { // All logic_op words share the LogicOp flag
+    FillBlock(tables[0], OFF(logic_op), NUM(logic_op), LogicOp);
+}
+
+void SetupDirtyFragmentClampColor(Tables& tables) { // frag_color_clamp -> FragmentClampColor
+    tables[0][OFF(frag_color_clamp)] = FragmentClampColor;
+}
+
+void SetupDirtyPointSize(Tables& tables) { // Three point-related registers share PointSize
+    tables[0][OFF(vp_point_size)] = PointSize;
+    tables[0][OFF(point_size)] = PointSize;
+    tables[0][OFF(point_sprite_enable)] = PointSize;
+}
+
+void SetupDirtyClipControl(Tables& tables) { // screen_y_control and depth_mode -> ClipControl
+    auto& table = tables[0];
+    table[OFF(screen_y_control)] = ClipControl;
+    table[OFF(depth_mode)] = ClipControl;
+}
+
+void SetupDirtyDepthClampEnabled(Tables& tables) { // view_volume_clip_control -> DepthClampEnabled
+    tables[0][OFF(view_volume_clip_control)] = DepthClampEnabled;
+}
+
+void SetupDirtyMisc(Tables& tables) { // Remaining single-register flags, all in table 0
+    auto& table = tables[0];
+
+    table[OFF(clip_distance_enabled)] = ClipDistances;
+
+    table[OFF(front_face)] = FrontFace;
+
+    table[OFF(cull_test_enabled)] = CullTest; // enable and face selection share CullTest
+    table[OFF(cull_face)] = CullTest;
+}
+
+} // Anonymous namespace
+
+StateTracker::StateTracker(Core::System& system) : system{system} {} // tables set in Initialize()
+
+void StateTracker::Initialize() { // Fills the Maxwell3D dirty tables and on-write stores
+    auto& dirty = system.GPU().Maxwell3D().dirty;
+    auto& tables = dirty.tables; // later Setup* calls overwrite earlier entries where they overlap
+    SetupDirtyRenderTargets(tables);
+    SetupDirtyColorMasks(tables);
+    SetupDirtyViewports(tables);
+    SetupDirtyScissors(tables);
+    SetupDirtyVertexArrays(tables);
+    SetupDirtyVertexFormat(tables);
+    SetupDirtyShaders(tables);
+    SetupDirtyPolygonModes(tables);
+    SetupDirtyDepthTest(tables);
+    SetupDirtyStencilTest(tables);
+    SetupDirtyAlphaTest(tables);
+    SetupDirtyBlend(tables);
+    SetupDirtyPrimitiveRestart(tables);
+    SetupDirtyPolygonOffset(tables);
+    SetupDirtyMultisampleControl(tables);
+    SetupDirtyRasterizeEnable(tables);
+    SetupDirtyFramebufferSRGB(tables);
+    SetupDirtyLogicOp(tables);
+    SetupDirtyFragmentClampColor(tables);
+    SetupDirtyPointSize(tables);
+    SetupDirtyClipControl(tables);
+    SetupDirtyDepthClampEnabled(tables);
+    SetupDirtyMisc(tables);
+    // Besides the common stores, set on_write_stores for the aggregate and per-buffer vertex flags.
+    auto& store = dirty.on_write_stores;
+    SetupCommonOnWriteStores(store);
+    store[VertexBuffers] = true;
+    for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
+        store[VertexBuffer0 + i] = true;
+    }
+}
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -0,0 +1,215 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <limits>
+
+#include <glad/glad.h>
+
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/dirty_flags.h"
+#include "video_core/engines/maxwell_3d.h"
+
+namespace Core {
+class System;
+}
+
+namespace OpenGL {
+
+namespace Dirty {
+
+enum : u8 { // OpenGL dirty flag ids, numbered from the end of the common VideoCommon entries
+    First = VideoCommon::Dirty::LastCommonEntry,
+
+    VertexFormats, // aggregate flag, followed by per-attribute flags 0..31
+    VertexFormat0,
+    VertexFormat31 = VertexFormat0 + 31,
+
+    VertexBuffers, // aggregate flag, followed by per-buffer flags 0..31
+    VertexBuffer0,
+    VertexBuffer31 = VertexBuffer0 + 31,
+
+    VertexInstances, // aggregate flag, followed by per-array instancing flags 0..31
+    VertexInstance0,
+    VertexInstance31 = VertexInstance0 + 31,
+
+    ViewportTransform,
+    Viewports, // aggregate flag, followed by per-viewport flags 0..15
+    Viewport0,
+    Viewport15 = Viewport0 + 15,
+
+    Scissors, // aggregate flag, followed by per-scissor flags 0..15
+    Scissor0,
+    Scissor15 = Scissor0 + 15,
+
+    ColorMaskCommon,
+    ColorMasks, // aggregate flag, followed by per-render-target flags 0..7
+    ColorMask0,
+    ColorMask7 = ColorMask0 + 7,
+
+    BlendColor,
+    BlendIndependentEnabled,
+    BlendStates, // aggregate flag, followed by per-render-target flags 0..7
+    BlendState0,
+    BlendState7 = BlendState0 + 7,
+
+    Shaders,
+    ClipDistances,
+
+    PolygonModes, // aggregate flag for the front/back flags below
+    PolygonModeFront,
+    PolygonModeBack,
+
+    ColorMask, // NOTE(review): not assigned by gl_state_tracker.cpp in this change - confirm use
+    FrontFace,
+    CullTest,
+    DepthMask,
+    DepthTest,
+    StencilTest,
+    AlphaTest,
+    PrimitiveRestart,
+    PolygonOffset,
+    MultisampleControl,
+    RasterizeEnable,
+    FramebufferSRGB,
+    LogicOp,
+    FragmentClampColor,
+    PointSize,
+    ClipControl,
+    DepthClampEnabled,
+
+    Last // one past the final flag id
+};
+static_assert(Last <= std::numeric_limits<u8>::max()); // every flag id must fit in a u8
+
+} // namespace Dirty
+
+class StateTracker { // Sets up OpenGL dirty tables and offers helpers to mark state dirty again
+public:
+    explicit StateTracker(Core::System& system);
+
+    void Initialize(); // fills the dirty tables; defined in gl_state_tracker.cpp
+
+    void BindIndexBuffer(GLuint new_index_buffer) { // cached GL_ELEMENT_ARRAY_BUFFER bind
+        if (index_buffer == new_index_buffer) {
+            return; // same handle as the cached one: skip the GL call
+        }
+        index_buffer = new_index_buffer;
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, new_index_buffer);
+    }
+
+    void NotifyScreenDrawVertexArray() { // marks formats 0-1, buffer 0 and instances 0-1 dirty
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::VertexFormats] = true;
+        flags[OpenGL::Dirty::VertexFormat0 + 0] = true;
+        flags[OpenGL::Dirty::VertexFormat0 + 1] = true;
+
+        flags[OpenGL::Dirty::VertexBuffers] = true;
+        flags[OpenGL::Dirty::VertexBuffer0] = true;
+
+        flags[OpenGL::Dirty::VertexInstances] = true;
+        flags[OpenGL::Dirty::VertexInstance0 + 0] = true;
+        flags[OpenGL::Dirty::VertexInstance0 + 1] = true;
+    }
+
+    void NotifyPolygonModes() { // marks the aggregate and both per-face flags dirty
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::PolygonModes] = true;
+        flags[OpenGL::Dirty::PolygonModeFront] = true;
+        flags[OpenGL::Dirty::PolygonModeBack] = true;
+    }
+
+    void NotifyViewport0() { // marks the aggregate flag and viewport 0 dirty
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::Viewports] = true;
+        flags[OpenGL::Dirty::Viewport0] = true;
+    }
+
+    void NotifyScissor0() { // marks the aggregate flag and scissor 0 dirty
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::Scissors] = true;
+        flags[OpenGL::Dirty::Scissor0] = true;
+    }
+
+    void NotifyColorMask0() { // marks the aggregate flag and color mask 0 dirty
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::ColorMasks] = true;
+        flags[OpenGL::Dirty::ColorMask0] = true;
+    }
+
+    void NotifyBlend0() { // marks the aggregate flag and blend state 0 dirty
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::BlendStates] = true;
+        flags[OpenGL::Dirty::BlendState0] = true;
+    }
+
+    void NotifyFramebuffer() { // uses the common RenderTargets flag, not an OpenGL-specific one
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[VideoCommon::Dirty::RenderTargets] = true;
+    }
+
+    void NotifyFrontFace() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::FrontFace] = true;
+    }
+
+    void NotifyCullTest() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::CullTest] = true;
+    }
+
+    void NotifyDepthMask() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::DepthMask] = true;
+    }
+
+    void NotifyDepthTest() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::DepthTest] = true;
+    }
+
+    void NotifyStencilTest() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::StencilTest] = true;
+    }
+
+    void NotifyPolygonOffset() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::PolygonOffset] = true;
+    }
+
+    void NotifyRasterizeEnable() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::RasterizeEnable] = true;
+    }
+
+    void NotifyFramebufferSRGB() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::FramebufferSRGB] = true;
+    }
+
+    void NotifyLogicOp() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::LogicOp] = true;
+    }
+
+    void NotifyClipControl() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::ClipControl] = true;
+    }
+
+    void NotifyAlphaTest() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::AlphaTest] = true;
+    }
+
+private:
+    Core::System& system;
+
+    GLuint index_buffer = 0; // handle last passed to BindIndexBuffer
+};
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -7,7 +7,6 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
-#include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"

 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -10,7 +10,7 @@
 #include "core/core.h"
 #include "video_core/morton.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
-#include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/gl_texture_cache.h"
 #include "video_core/renderer_opengl/utils.h"
 #include "video_core/texture_cache/surface_base.h"
@@ -53,6 +53,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
    {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false},                             // R8UI
    {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false},                                    // RGBA16F
    {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false},                                 // RGBA16U
+    {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false},                                    // RGBA16S
    {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false},                       // RGBA16UI
    {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false},            // R11FG11FB10F
    {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false},                         // RGBA32UI
@@ -397,6 +398,7 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
                                     const bool is_proxy)
    : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
    target = GetTextureTarget(params.target);
+    format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
    if (!is_proxy) {
        texture_view = CreateTextureView();
    }
@@ -467,25 +469,20 @@ void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_sou
 }

 OGLTextureView CachedSurfaceView::CreateTextureView() const {
-    const auto& owner_params = surface.GetSurfaceParams();
    OGLTextureView texture_view;
    texture_view.Create();

-    const GLuint handle{texture_view.handle};
-    const FormatTuple& tuple{GetFormatTuple(owner_params.pixel_format)};
-
-    glTextureView(handle, target, surface.texture.handle, tuple.internal_format, params.base_level,
+    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
                  params.num_levels, params.base_layer, params.num_layers);
-
-    ApplyTextureDefaults(owner_params, handle);
+    ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);

    return texture_view;
 }

 TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system,
                                       VideoCore::RasterizerInterface& rasterizer,
-                                       const Device& device)
-    : TextureCacheBase{system, rasterizer} {
+                                       const Device& device, StateTracker& state_tracker)
+    : TextureCacheBase{system, rasterizer}, state_tracker{state_tracker} {
    src_framebuffer.Create();
    dst_framebuffer.Create();
 }
@@ -519,25 +516,26 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
                                   const Tegra::Engines::Fermi2D::Config& copy_config) {
    const auto& src_params{src_view->GetSurfaceParams()};
    const auto& dst_params{dst_view->GetSurfaceParams()};
-
-    OpenGLState prev_state{OpenGLState::GetCurState()};
-    SCOPE_EXIT({
-        prev_state.AllDirty();
-        prev_state.Apply();
-    });
-
-    OpenGLState state;
-    state.draw.read_framebuffer = src_framebuffer.handle;
-    state.draw.draw_framebuffer = dst_framebuffer.handle;
-    state.framebuffer_srgb.enabled = dst_params.srgb_conversion;
-    state.AllDirty();
-    state.Apply();
-
-    u32 buffers{};
-
    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);

+    state_tracker.NotifyScissor0();
+    state_tracker.NotifyFramebuffer();
+    state_tracker.NotifyRasterizeEnable();
+    state_tracker.NotifyFramebufferSRGB();
+
+    if (dst_params.srgb_conversion) {
+        glEnable(GL_FRAMEBUFFER_SRGB);
+    } else {
+        glDisable(GL_FRAMEBUFFER_SRGB);
+    }
+    glDisable(GL_RASTERIZER_DISCARD);
+    glDisablei(GL_SCISSOR_TEST, 0);
+
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, src_framebuffer.handle);
+    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_framebuffer.handle);
+
+    GLenum buffers = 0;
    if (src_params.type == SurfaceType::ColorTexture) {
        src_view->Attach(GL_COLOR_ATTACHMENT0, GL_READ_FRAMEBUFFER);
        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -27,6 +27,7 @@ using VideoCommon::ViewParams;
 class CachedSurfaceView;
 class CachedSurface;
 class TextureCacheOpenGL;
+class StateTracker;

 using Surface = std::shared_ptr<CachedSurface>;
 using View = std::shared_ptr<CachedSurfaceView>;
@@ -96,6 +97,10 @@ public:
        return texture_view.handle;
    }

+    GLenum GetFormat() const {
+        return format;
+    }
+
    const SurfaceParams& GetSurfaceParams() const {
        return surface.GetSurfaceParams();
    }
@@ -113,6 +118,7 @@ private:

    CachedSurface& surface;
    GLenum target{};
+    GLenum format{};

    OGLTextureView texture_view;
    u32 swizzle{};
@@ -122,7 +128,7 @@ private:
 class TextureCacheOpenGL final : public TextureCacheBase {
 public:
    explicit TextureCacheOpenGL(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                                const Device& device);
+                                const Device& device, StateTracker& state_tracker);
    ~TextureCacheOpenGL();

 protected:
@@ -139,6 +145,8 @@ protected:
 private:
    GLuint FetchPBO(std::size_t buffer_size);

+    StateTracker& state_tracker;
+
    OGLFramebuffer src_framebuffer;
    OGLFramebuffer dst_framebuffer;
    std::unordered_map<u32, OGLBuffer> copy_pbo_cache;
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -92,8 +92,32 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
        }
    case Maxwell::VertexAttribute::Type::UnsignedScaled:
        switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
        case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
            return GL_UNSIGNED_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return GL_UNSIGNED_SHORT;
+        default:
+            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+            return {};
+        }
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+        switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return GL_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return GL_SHORT;
        default:
            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
            return {};
@@ -401,24 +425,24 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
    return GL_KEEP;
 }

-inline GLenum FrontFace(Maxwell::Cull::FrontFace front_face) {
+inline GLenum FrontFace(Maxwell::FrontFace front_face) {
    switch (front_face) {
-    case Maxwell::Cull::FrontFace::ClockWise:
+    case Maxwell::FrontFace::ClockWise:
        return GL_CW;
-    case Maxwell::Cull::FrontFace::CounterClockWise:
+    case Maxwell::FrontFace::CounterClockWise:
        return GL_CCW;
    }
    LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
    return GL_CCW;
 }

-inline GLenum CullFace(Maxwell::Cull::CullFace cull_face) {
+inline GLenum CullFace(Maxwell::CullFace cull_face) {
    switch (cull_face) {
-    case Maxwell::Cull::CullFace::Front:
+    case Maxwell::CullFace::Front:
        return GL_FRONT;
-    case Maxwell::Cull::CullFace::Back:
+    case Maxwell::CullFace::Back:
        return GL_BACK;
-    case Maxwell::Cull::CullFace::FrontAndBack:
+    case Maxwell::CullFace::FrontAndBack:
        return GL_FRONT_AND_BACK;
    }
    LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
@@ -464,5 +488,18 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
    return GL_COPY;
 }

+inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) { // Maxwell polygon mode -> GL enum
+    switch (polygon_mode) {
+    case Maxwell::PolygonMode::Point:
+        return GL_POINT;
+    case Maxwell::PolygonMode::Line:
+        return GL_LINE;
+    case Maxwell::PolygonMode::Fill:
+        return GL_FILL;
+    }
+    UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode));
+    return GL_FILL; // Fallback for values not handled by the switch above
+}
+
 } // namespace MaxwellToGL
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -5,8 +5,11 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdlib>
+#include <cstring>
 #include <memory>
+
 #include <glad/glad.h>
+
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
@@ -20,10 +23,13 @@
 #include "core/telemetry_session.h"
 #include "video_core/morton.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
+#include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"

 namespace OpenGL {

+namespace {
+
 // If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have
 // to wait on available presentation frames.
 constexpr std::size_t SWAP_CHAIN_SIZE = 3;
@@ -40,133 +46,13 @@ struct Frame {
    bool is_srgb{};                   /// Framebuffer is sRGB or RGB
 };

-/**
- * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
- * but also make sure that rendering happens at the pace that the frontend dictates. This is a
- * helper class that the renderer uses to sync frames between the render thread and the presentation
- * thread
- */
-class FrameMailbox {
-public:
-    std::mutex swap_chain_lock;
-    std::condition_variable present_cv;
-    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
-    std::queue<Frame*> free_queue;
-    std::deque<Frame*> present_queue;
-    Frame* previous_frame{};
-
-    FrameMailbox() {
-        for (auto& frame : swap_chain) {
-            free_queue.push(&frame);
-        }
-    }
-
-    ~FrameMailbox() {
-        // lock the mutex and clear out the present and free_queues and notify any people who are
-        // blocked to prevent deadlock on shutdown
-        std::scoped_lock lock{swap_chain_lock};
-        std::queue<Frame*>().swap(free_queue);
-        present_queue.clear();
-        present_cv.notify_all();
-    }
-
-    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
-        frame->present.Release();
-        frame->present.Create();
-        GLint previous_draw_fbo{};
-        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
-        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
-        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
-                                  frame->color.handle);
-        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
-            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
-        }
-        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
-        frame->color_reloaded = false;
-    }
-
-    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
-        OpenGLState prev_state = OpenGLState::GetCurState();
-        OpenGLState state = OpenGLState::GetCurState();
-
-        // Recreate the color texture attachment
-        frame->color.Release();
-        frame->color.Create();
-        state.renderbuffer = frame->color.handle;
-        state.Apply();
-        glRenderbufferStorage(GL_RENDERBUFFER, frame->is_srgb ? GL_SRGB8 : GL_RGB8, width, height);
-
-        // Recreate the FBO for the render target
-        frame->render.Release();
-        frame->render.Create();
-        state.draw.read_framebuffer = frame->render.handle;
-        state.draw.draw_framebuffer = frame->render.handle;
-        state.Apply();
-        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
-                                  frame->color.handle);
-        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
-            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
-        }
-        prev_state.Apply();
-        frame->width = width;
-        frame->height = height;
-        frame->color_reloaded = true;
-    }
-
-    Frame* GetRenderFrame() {
-        std::unique_lock lock{swap_chain_lock};
-
-        // If theres no free frames, we will reuse the oldest render frame
-        if (free_queue.empty()) {
-            auto frame = present_queue.back();
-            present_queue.pop_back();
-            return frame;
-        }
-
-        Frame* frame = free_queue.front();
-        free_queue.pop();
-        return frame;
-    }
-
-    void ReleaseRenderFrame(Frame* frame) {
-        std::unique_lock lock{swap_chain_lock};
-        present_queue.push_front(frame);
-        present_cv.notify_one();
-    }
-
-    Frame* TryGetPresentFrame(int timeout_ms) {
-        std::unique_lock lock{swap_chain_lock};
-        // wait for new entries in the present_queue
-        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
-                            [&] { return !present_queue.empty(); });
-        if (present_queue.empty()) {
-            // timed out waiting for a frame to draw so return the previous frame
-            return previous_frame;
-        }
-
-        // free the previous frame and add it back to the free queue
-        if (previous_frame) {
-            free_queue.push(previous_frame);
-        }
-
-        // the newest entries are pushed to the front of the queue
-        Frame* frame = present_queue.front();
-        present_queue.pop_front();
-        // remove all old entries from the present queue and move them back to the free_queue
-        for (auto f : present_queue) {
-            free_queue.push(f);
-        }
-        present_queue.clear();
-        previous_frame = frame;
-        return frame;
-    }
-};
-
-namespace {
-
-constexpr char vertex_shader[] = R"(
+constexpr char VERTEX_SHADER[] = R"(
 #version 430 core

+out gl_PerVertex {
+    vec4 gl_Position;
+};
+
 layout (location = 0) in vec2 vert_position;
 layout (location = 1) in vec2 vert_tex_coord;
 layout (location = 0) out vec2 frag_tex_coord;
@@ -187,7 +73,7 @@ void main() {
 }
 )";

-constexpr char fragment_shader[] = R"(
+constexpr char FRAGMENT_SHADER[] = R"(
 #version 430 core

 layout (location = 0) in vec2 frag_tex_coord;
@@ -196,7 +82,7 @@ layout (location = 0) out vec4 color;
 layout (binding = 0) uniform sampler2D color_texture;

 void main() {
-    color = texture(color_texture, frag_tex_coord);
+    color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f);
 }
 )";

@@ -205,13 +91,31 @@ constexpr GLint TexCoordLocation = 1;
 constexpr GLint ModelViewMatrixLocation = 0;

 struct ScreenRectVertex {
-    constexpr ScreenRectVertex(GLfloat x, GLfloat y, GLfloat u, GLfloat v)
-        : position{{x, y}}, tex_coord{{u, v}} {}
+    constexpr ScreenRectVertex(u32 x, u32 y, GLfloat u, GLfloat v)
+        : position{{static_cast<GLfloat>(x), static_cast<GLfloat>(y)}}, tex_coord{{u, v}} {}

    std::array<GLfloat, 2> position;
    std::array<GLfloat, 2> tex_coord;
 };

+/// Returns true if a debug tool is detected (Nsight environment variables or GL_EXT_debug_tool)
+bool HasDebugTool() {
+    const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
+    if (nsight) {
+        return true;
+    }
+
+    GLint num_extensions{}; // Stays zero if the query below fails, so the loop is skipped
+    glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
+    for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) {
+        const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index));
+        if (name && !std::strcmp(name, "GL_EXT_debug_tool")) { // name may be null on error
+            return true;
+        }
+    }
+    return false;
+}
+
 /**
 * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left
 * corner and (width, height) on the lower-bottom.
@@ -295,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit

 } // Anonymous namespace

+/**
+ * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
+ * but also make sure that rendering happens at the pace that the frontend dictates. This is a
+ * helper class that the renderer uses to sync frames between the render thread and the presentation
+ * thread. With a debug tool attached, both threads are additionally kept in lock-step.
+ */
+class FrameMailbox {
+public:
+    std::mutex swap_chain_lock; // Guards free_queue, present_queue and previous_frame
+    std::condition_variable present_cv; // Notified when a frame is pushed to present_queue
+    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; // Backing storage for every frame
+    std::queue<Frame*> free_queue; // Frames available to render into
+    std::deque<Frame*> present_queue; // Rendered frames awaiting presentation, newest in front
+    Frame* previous_frame{}; // Frame most recently handed out by TryGetPresentFrame
+
+    FrameMailbox() : has_debug_tool{HasDebugTool()} {
+        for (auto& frame : swap_chain) {
+            free_queue.push(&frame);
+        }
+    }
+
+    ~FrameMailbox() {
+        // Lock the mutex, clear the present and free queues, and wake every thread blocked on
+        // present_cv to prevent a deadlock on shutdown
+        std::scoped_lock lock{swap_chain_lock};
+        std::queue<Frame*>().swap(free_queue);
+        present_queue.clear();
+        present_cv.notify_all();
+    }
+
+    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { // height and width are unused
+        frame->present.Release();
+        frame->present.Create();
+        GLint previous_draw_fbo{};
+        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); // Saved to restore below
+        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
+        }
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
+        frame->color_reloaded = false; // Cleared here; set again by ReloadRenderFrame
+    }
+
+    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
+        // Recreate the color renderbuffer attachment
+        frame->color.Release();
+        frame->color.Create();
+        const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8;
+        glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height);
+
+        // Recreate the FBO for the render target
+        frame->render.Release();
+        frame->render.Create();
+        glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); // Binding is not restored here
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
+        }
+
+        frame->width = width;
+        frame->height = height;
+        frame->color_reloaded = true;
+    }
+
+    Frame* GetRenderFrame() {
+        std::unique_lock lock{swap_chain_lock};
+
+        // If there are no free frames, reuse the oldest frame still pending presentation
+        if (free_queue.empty()) {
+            auto frame = present_queue.back();
+            present_queue.pop_back();
+            return frame;
+        }
+
+        Frame* frame = free_queue.front();
+        free_queue.pop();
+        return frame;
+    }
+
+    void ReleaseRenderFrame(Frame* frame) {
+        std::unique_lock lock{swap_chain_lock};
+        present_queue.push_front(frame);
+        present_cv.notify_one();
+
+        DebugNotifyNextFrame(); // No-op unless a debug tool is attached
+    }
+
+    Frame* TryGetPresentFrame(int timeout_ms) {
+        DebugWaitForNextFrame(); // No-op unless a debug tool is attached
+
+        std::unique_lock lock{swap_chain_lock};
+        // wait for new entries in the present_queue
+        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
+                            [&] { return !present_queue.empty(); });
+        if (present_queue.empty()) {
+            // timed out waiting for a frame to draw so return the previous frame
+            return previous_frame;
+        }
+
+        // free the previous frame and add it back to the free queue
+        if (previous_frame) {
+            free_queue.push(previous_frame);
+        }
+
+        // the newest entries are pushed to the front of the queue
+        Frame* frame = present_queue.front();
+        present_queue.pop_front();
+        // remove all old entries from the present queue and move them back to the free_queue
+        for (auto f : present_queue) {
+            free_queue.push(f);
+        }
+        present_queue.clear();
+        previous_frame = frame;
+        return frame;
+    }
+
+private:
+    std::mutex debug_synch_mutex;
+    std::condition_variable debug_synch_condition;
+    std::atomic_int frame_for_debug{}; // Count of frames signaled by DebugNotifyNextFrame
+    const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step
+
+    /// Signal that a new frame is available (called from GPU thread)
+    void DebugNotifyNextFrame() {
+        if (!has_debug_tool) {
+            return;
+        }
+        frame_for_debug++;
+        std::lock_guard lock{debug_synch_mutex};
+        debug_synch_condition.notify_one();
+    }
+
+    /// Block until a new frame is signaled, with no timeout (called from presentation thread)
+    void DebugWaitForNextFrame() {
+        if (!has_debug_tool) {
+            return;
+        }
+        const int last_frame = frame_for_debug; // Snapshot; wait until the counter advances
+        std::unique_lock lock{debug_synch_mutex};
+        debug_synch_condition.wait(lock,
+                                   [this, last_frame] { return frame_for_debug > last_frame; });
+    }
+};
+
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},
      frame_mailbox{std::make_unique<FrameMailbox>()} {}
@@ -311,11 +362,6 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
        return;
    }

-    // Maintain the rasterizer's state as a priority
-    OpenGLState prev_state = OpenGLState::GetCurState();
-    state.AllDirty();
-    state.Apply();
-
    PrepareRendertarget(framebuffer);
    RenderScreenshot();

@@ -353,13 +399,12 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {

        // Recreate the frame if the size of the window has changed
        if (layout.width != frame->width || layout.height != frame->height ||
-            is_srgb != frame->is_srgb) {
+            screen_info.display_srgb != frame->is_srgb) {
            LOG_DEBUG(Render_OpenGL, "Reloading render frame");
-            is_srgb = frame->is_srgb = screen_info.display_srgb;
+            frame->is_srgb = screen_info.display_srgb;
            frame_mailbox->ReloadRenderFrame(frame, layout.width, layout.height);
        }
-        state.draw.draw_framebuffer = frame->render.handle;
-        state.Apply();
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, frame->render.handle);
        DrawScreen(layout);
        // Create a fence for the frontend to wait on and swap this frame to OffTex
        frame->render_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
@@ -368,10 +413,6 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
        m_current_frame++;
        rasterizer->TickFrame();
    }
-
-    // Restore the rasterizer state
-    prev_state.AllDirty();
-    prev_state.Apply();
 }

 void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) {
@@ -442,31 +483,24 @@ void RendererOpenGL::InitOpenGLObjects() {
    glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
                 0.0f);

-    // Link shaders and get variable locations
-    shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);
-    state.draw.shader_program = shader.handle;
-    state.AllDirty();
-    state.Apply();
+    // Create shader programs
+    OGLShader vertex_shader;
+    vertex_shader.Create(VERTEX_SHADER, GL_VERTEX_SHADER);
+
+    OGLShader fragment_shader;
+    fragment_shader.Create(FRAGMENT_SHADER, GL_FRAGMENT_SHADER);
+
+    vertex_program.Create(true, false, vertex_shader.handle);
+    fragment_program.Create(true, false, fragment_shader.handle);
+
+    // Create program pipeline
+    program_manager.Create();

    // Generate VBO handle for drawing
    vertex_buffer.Create();

-    // Generate VAO
-    vertex_array.Create();
-    state.draw.vertex_array = vertex_array.handle;
-
    // Attach vertex data to VAO
    glNamedBufferData(vertex_buffer.handle, sizeof(ScreenRectVertex) * 4, nullptr, GL_STREAM_DRAW);
-    glVertexArrayAttribFormat(vertex_array.handle, PositionLocation, 2, GL_FLOAT, GL_FALSE,
-                              offsetof(ScreenRectVertex, position));
-    glVertexArrayAttribFormat(vertex_array.handle, TexCoordLocation, 2, GL_FLOAT, GL_FALSE,
-                              offsetof(ScreenRectVertex, tex_coord));
-    glVertexArrayAttribBinding(vertex_array.handle, PositionLocation, 0);
-    glVertexArrayAttribBinding(vertex_array.handle, TexCoordLocation, 0);
-    glEnableVertexArrayAttrib(vertex_array.handle, PositionLocation);
-    glEnableVertexArrayAttrib(vertex_array.handle, TexCoordLocation);
-    glVertexArrayVertexBuffer(vertex_array.handle, 0, vertex_buffer.handle, 0,
-                              sizeof(ScreenRectVertex));

    // Allocate textures for the screen
    screen_info.texture.resource.Create(GL_TEXTURE_2D);
@@ -499,7 +533,8 @@ void RendererOpenGL::CreateRasterizer() {
    if (rasterizer) {
        return;
    }
-    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info);
+    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
+                                                    program_manager, state_tracker);
 }

 void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
@@ -538,8 +573,19 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
    glTextureStorage2D(texture.resource.handle, 1, internal_format, texture.width, texture.height);
 }

-void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w,
-                                         float h) {
+void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
+    if (renderer_settings.set_background_color) {
+        // Update background color before drawing
+        glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
+                     0.0f);
+    }
+
+    // Set projection matrix
+    const std::array ortho_matrix =
+        MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height));
+    glProgramUniformMatrix3x2fv(vertex_program.handle, ModelViewMatrixLocation, 1, GL_FALSE,
+                                std::data(ortho_matrix));
+
    const auto& texcoords = screen_info.display_texcoords;
    auto left = texcoords.left;
    auto right = texcoords.right;
@@ -571,46 +617,79 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
                  static_cast<f32>(screen_info.texture.height);
    }

-    const std::array vertices = {
-        ScreenRectVertex(x, y, texcoords.top * scale_u, left * scale_v),
-        ScreenRectVertex(x + w, y, texcoords.bottom * scale_u, left * scale_v),
-        ScreenRectVertex(x, y + h, texcoords.top * scale_u, right * scale_v),
-        ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v),
-    };
-
-    state.textures[0] = screen_info.display_texture;
-    state.framebuffer_srgb.enabled = screen_info.display_srgb;
-    state.AllDirty();
-    state.Apply();
-    glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices));
-    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-    // Restore default state
-    state.framebuffer_srgb.enabled = false;
-    state.textures[0] = 0;
-    state.AllDirty();
-    state.Apply();
-}
-
-void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
-    if (renderer_settings.set_background_color) {
-        // Update background color before drawing
-        glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
-                     0.0f);
-    }
-
    const auto& screen = layout.screen;
+    const std::array vertices = {
+        ScreenRectVertex(screen.left, screen.top, texcoords.top * scale_u, left * scale_v),
+        ScreenRectVertex(screen.right, screen.top, texcoords.bottom * scale_u, left * scale_v),
+        ScreenRectVertex(screen.left, screen.bottom, texcoords.top * scale_u, right * scale_v),
+        ScreenRectVertex(screen.right, screen.bottom, texcoords.bottom * scale_u, right * scale_v),
+    };
+    glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices));
+
+    // TODO: Signal state tracker about these changes
+    state_tracker.NotifyScreenDrawVertexArray();
+    state_tracker.NotifyPolygonModes();
+    state_tracker.NotifyViewport0();
+    state_tracker.NotifyScissor0();
+    state_tracker.NotifyColorMask0();
+    state_tracker.NotifyBlend0();
+    state_tracker.NotifyFramebuffer();
+    state_tracker.NotifyFrontFace();
+    state_tracker.NotifyCullTest();
+    state_tracker.NotifyDepthTest();
+    state_tracker.NotifyStencilTest();
+    state_tracker.NotifyPolygonOffset();
+    state_tracker.NotifyRasterizeEnable();
+    state_tracker.NotifyFramebufferSRGB();
+    state_tracker.NotifyLogicOp();
+    state_tracker.NotifyClipControl();
+    state_tracker.NotifyAlphaTest();
+
+    program_manager.UseVertexShader(vertex_program.handle);
+    program_manager.UseGeometryShader(0);
+    program_manager.UseFragmentShader(fragment_program.handle);
+    program_manager.BindGraphicsPipeline();
+
+    glEnable(GL_CULL_FACE);
+    if (screen_info.display_srgb) {
+        glEnable(GL_FRAMEBUFFER_SRGB);
+    } else {
+        glDisable(GL_FRAMEBUFFER_SRGB);
+    }
+    glDisable(GL_COLOR_LOGIC_OP);
+    glDisable(GL_DEPTH_TEST);
+    glDisable(GL_STENCIL_TEST);
+    glDisable(GL_POLYGON_OFFSET_FILL);
+    glDisable(GL_RASTERIZER_DISCARD);
+    glDisable(GL_ALPHA_TEST);
+    glDisablei(GL_BLEND, 0);
+    glDisablei(GL_SCISSOR_TEST, 0);
+    glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+    glCullFace(GL_BACK);
+    glFrontFace(GL_CW);
+    glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+    glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
+    glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width),
+                       static_cast<GLfloat>(layout.height));
+    glDepthRangeIndexed(0, 0.0, 0.0);
+
+    glEnableVertexAttribArray(PositionLocation);
+    glEnableVertexAttribArray(TexCoordLocation);
+    glVertexAttribDivisor(PositionLocation, 0);
+    glVertexAttribDivisor(TexCoordLocation, 0);
+    glVertexAttribFormat(PositionLocation, 2, GL_FLOAT, GL_FALSE,
+                         offsetof(ScreenRectVertex, position));
+    glVertexAttribFormat(TexCoordLocation, 2, GL_FLOAT, GL_FALSE,
+                         offsetof(ScreenRectVertex, tex_coord));
+    glVertexAttribBinding(PositionLocation, 0);
+    glVertexAttribBinding(TexCoordLocation, 0);
+    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+
+    glBindTextureUnit(0, screen_info.display_texture);
+    glBindSampler(0, 0);

-    glViewport(0, 0, layout.width, layout.height);
    glClear(GL_COLOR_BUFFER_BIT);
-
-    // Set projection matrix
-    const std::array ortho_matrix =
-        MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height));
-    glUniformMatrix3x2fv(ModelViewMatrixLocation, 1, GL_FALSE, ortho_matrix.data());
-
-    DrawScreenTriangles(screen_info, static_cast<float>(screen.left),
-                        static_cast<float>(screen.top), static_cast<float>(screen.GetWidth()),
-                        static_cast<float>(screen.GetHeight()));
+    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
 }

 void RendererOpenGL::TryPresent(int timeout_ms) {
@@ -653,13 +732,14 @@ void RendererOpenGL::RenderScreenshot() {
        return;
    }

+    GLint old_read_fb;
+    GLint old_draw_fb;
+    glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING, &old_read_fb);
+    glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb);
+
    // Draw the current frame to the screenshot framebuffer
    screenshot_framebuffer.Create();
-    GLuint old_read_fb = state.draw.read_framebuffer;
-    GLuint old_draw_fb = state.draw.draw_framebuffer;
-    state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle;
-    state.AllDirty();
-    state.Apply();
+    glBindFramebuffer(GL_FRAMEBUFFER, screenshot_framebuffer.handle);

    Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};

@@ -676,12 +756,11 @@ void RendererOpenGL::RenderScreenshot() {
                 renderer_settings.screenshot_bits);

    screenshot_framebuffer.Release();
-    state.draw.read_framebuffer = old_read_fb;
-    state.draw.draw_framebuffer = old_draw_fb;
-    state.AllDirty();
-    state.Apply();
    glDeleteRenderbuffers(1, &renderbuffer);

+    glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb);
+    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb);
+
    renderer_settings.screenshot_complete_callback();
    renderer_settings.screenshot_requested = false;
 }
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -10,7 +10,8 @@
 #include "common/math_util.h"
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
-#include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/renderer_opengl/gl_shader_manager.h"
+#include "video_core/renderer_opengl/gl_state_tracker.h"

 namespace Core {
 class System;
@@ -76,8 +77,6 @@ private:
    /// Draws the emulated screens to the emulator window.
    void DrawScreen(const Layout::FramebufferLayout& layout);

-    void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h);
-
    void RenderScreenshot();

    /// Loads framebuffer from emulated memory into the active OpenGL texture.
@@ -93,17 +92,20 @@ private:
    Core::Frontend::EmuWindow& emu_window;
    Core::System& system;

-    OpenGLState state;
+    StateTracker state_tracker{system};

    // OpenGL object IDs
-    OGLVertexArray vertex_array;
    OGLBuffer vertex_buffer;
-    OGLProgram shader;
+    OGLProgram vertex_program;
+    OGLProgram fragment_program;
    OGLFramebuffer screenshot_framebuffer;

    /// Display information for Switch screen
    ScreenInfo screen_info;

+    /// Global dummy shader pipeline
+    GLShader::ProgramManager program_manager;
+
    /// OpenGL framebuffer data
    std::vector<u8> gl_framebuffer_data;

@@ -111,9 +113,6 @@ private:
    Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags;
    Common::Rectangle<int> framebuffer_crop_rect;

-    /// Represents if the final render frame is sRGB
-    bool is_srgb{};
-
    /// Frame presentation mailbox
    std::unique_ptr<FrameMailbox> frame_mailbox;
 };
--- a/src/video_core/renderer_opengl/utils.cpp
+++ b/src/video_core/renderer_opengl/utils.cpp
@@ -9,6 +9,7 @@
 #include <glad/glad.h>

 #include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/utils.h"

 namespace OpenGL {
@@ -20,12 +21,12 @@ struct VertexArrayPushBuffer::Entry {
    GLsizei stride{};
 };

-VertexArrayPushBuffer::VertexArrayPushBuffer() = default;
+VertexArrayPushBuffer::VertexArrayPushBuffer(StateTracker& state_tracker)
+    : state_tracker{state_tracker} {}

 VertexArrayPushBuffer::~VertexArrayPushBuffer() = default;

-void VertexArrayPushBuffer::Setup(GLuint vao_) {
-    vao = vao_;
+void VertexArrayPushBuffer::Setup() {
    index_buffer = nullptr;
    vertex_buffers.clear();
 }
@@ -41,13 +42,11 @@ void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint*

 void VertexArrayPushBuffer::Bind() {
    if (index_buffer) {
-        glVertexArrayElementBuffer(vao, *index_buffer);
+        state_tracker.BindIndexBuffer(*index_buffer);
    }

-    // TODO(Rodrigo): Find a way to ARB_multi_bind this
    for (const auto& entry : vertex_buffers) {
-        glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset,
-                                  entry.stride);
+        glBindVertexBuffer(entry.binding_index, *entry.buffer, entry.offset, entry.stride);
    }
 }

--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -11,12 +11,14 @@

 namespace OpenGL {

+class StateTracker;
+
 class VertexArrayPushBuffer final {
 public:
-    explicit VertexArrayPushBuffer();
+    explicit VertexArrayPushBuffer(StateTracker& state_tracker);
    ~VertexArrayPushBuffer();

-    void Setup(GLuint vao_);
+    void Setup();

    void SetIndexBuffer(const GLuint* buffer);

@@ -28,7 +30,8 @@ public:
 private:
    struct Entry;

-    GLuint vao{};
+    StateTracker& state_tracker;
+
    const GLuint* index_buffer{};
    std::vector<Entry> vertex_buffers;
 };
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -112,19 +112,18 @@ constexpr FixedPipelineState::Rasterizer GetRasterizerState(const Maxwell& regs)
    const auto& clip = regs.view_volume_clip_control;
    const bool depth_clamp_enabled = clip.depth_clamp_near == 1 || clip.depth_clamp_far == 1;

-    Maxwell::Cull::FrontFace front_face = regs.cull.front_face;
+    Maxwell::FrontFace front_face = regs.front_face;
    if (regs.screen_y_control.triangle_rast_flip != 0 &&
        regs.viewport_transform[0].scale_y > 0.0f) {
-        if (front_face == Maxwell::Cull::FrontFace::CounterClockWise)
-            front_face = Maxwell::Cull::FrontFace::ClockWise;
-        else if (front_face == Maxwell::Cull::FrontFace::ClockWise)
-            front_face = Maxwell::Cull::FrontFace::CounterClockWise;
+        if (front_face == Maxwell::FrontFace::CounterClockWise)
+            front_face = Maxwell::FrontFace::ClockWise;
+        else if (front_face == Maxwell::FrontFace::ClockWise)
+            front_face = Maxwell::FrontFace::CounterClockWise;
    }

    const bool gl_ndc = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne;
-    return FixedPipelineState::Rasterizer(regs.cull.enabled, depth_bias_enabled,
-                                          depth_clamp_enabled, gl_ndc, regs.cull.cull_face,
-                                          front_face);
+    return FixedPipelineState::Rasterizer(regs.cull_test_enabled, depth_bias_enabled,
+                                          depth_clamp_enabled, gl_ndc, regs.cull_face, front_face);
 }

 } // Anonymous namespace
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
@@ -171,8 +171,8 @@ struct FixedPipelineState {

    struct Rasterizer {
        constexpr Rasterizer(bool cull_enable, bool depth_bias_enable, bool depth_clamp_enable,
-                             bool ndc_minus_one_to_one, Maxwell::Cull::CullFace cull_face,
-                             Maxwell::Cull::FrontFace front_face)
+                             bool ndc_minus_one_to_one, Maxwell::CullFace cull_face,
+                             Maxwell::FrontFace front_face)
            : cull_enable{cull_enable}, depth_bias_enable{depth_bias_enable},
              depth_clamp_enable{depth_clamp_enable}, ndc_minus_one_to_one{ndc_minus_one_to_one},
              cull_face{cull_face}, front_face{front_face} {}
@@ -182,8 +182,8 @@ struct FixedPipelineState {
        bool depth_bias_enable;
        bool depth_clamp_enable;
        bool ndc_minus_one_to_one;
-        Maxwell::Cull::CullFace cull_face;
-        Maxwell::Cull::FrontFace front_face;
+        Maxwell::CullFace cull_face;
+        Maxwell::FrontFace front_face;

        std::size_t Hash() const noexcept;

--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -120,11 +120,12 @@ struct FormatTuple {
    {vk::Format::eA8B8G8R8UintPack32, Attachable | Storage},     // ABGR8UI
    {vk::Format::eB5G6R5UnormPack16, {}},                        // B5G6R5U
    {vk::Format::eA2B10G10R10UnormPack32, Attachable | Storage}, // A2B10G10R10U
-    {vk::Format::eA1R5G5B5UnormPack16, Attachable | Storage},    // A1B5G5R5U (flipped with swizzle)
+    {vk::Format::eA1R5G5B5UnormPack16, Attachable},              // A1B5G5R5U (flipped with swizzle)
    {vk::Format::eR8Unorm, Attachable | Storage},                // R8U
    {vk::Format::eR8Uint, Attachable | Storage},                 // R8UI
    {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage},     // RGBA16F
    {vk::Format::eR16G16B16A16Unorm, Attachable | Storage},      // RGBA16U
+    {vk::Format::eR16G16B16A16Snorm, Attachable | Storage},      // RGBA16S
    {vk::Format::eR16G16B16A16Uint, Attachable | Storage},       // RGBA16UI
    {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage},  // R11FG11FB10F
    {vk::Format::eR32G32B32A32Uint, Attachable | Storage},       // RGBA32UI
@@ -331,6 +332,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
            return vk::Format::eR16G16B16Unorm;
        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
            return vk::Format::eR16G16B16A16Unorm;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return vk::Format::eA2B10G10R10UnormPack32;
        default:
            break;
        }
@@ -371,8 +374,22 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
        }
    case Maxwell::VertexAttribute::Type::UnsignedScaled:
        switch (size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+            return vk::Format::eR8Uscaled;
        case Maxwell::VertexAttribute::Size::Size_8_8:
            return vk::Format::eR8G8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
+            return vk::Format::eR8G8B8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+            return vk::Format::eR8G8B8A8Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return vk::Format::eR16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return vk::Format::eR16G16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return vk::Format::eR16G16B16Uscaled;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return vk::Format::eR16G16B16A16Uscaled;
        default:
            break;
        }
@@ -572,24 +589,24 @@ vk::BlendFactor BlendFactor(Maxwell::Blend::Factor factor) {
    return {};
 }

-vk::FrontFace FrontFace(Maxwell::Cull::FrontFace front_face) {
+vk::FrontFace FrontFace(Maxwell::FrontFace front_face) {
    switch (front_face) {
-    case Maxwell::Cull::FrontFace::ClockWise:
+    case Maxwell::FrontFace::ClockWise:
        return vk::FrontFace::eClockwise;
-    case Maxwell::Cull::FrontFace::CounterClockWise:
+    case Maxwell::FrontFace::CounterClockWise:
        return vk::FrontFace::eCounterClockwise;
    }
    UNIMPLEMENTED_MSG("Unimplemented front face={}", static_cast<u32>(front_face));
    return {};
 }

-vk::CullModeFlags CullFace(Maxwell::Cull::CullFace cull_face) {
+vk::CullModeFlags CullFace(Maxwell::CullFace cull_face) {
    switch (cull_face) {
-    case Maxwell::Cull::CullFace::Front:
+    case Maxwell::CullFace::Front:
        return vk::CullModeFlagBits::eFront;
-    case Maxwell::Cull::CullFace::Back:
+    case Maxwell::CullFace::Back:
        return vk::CullModeFlagBits::eBack;
-    case Maxwell::Cull::CullFace::FrontAndBack:
+    case Maxwell::CullFace::FrontAndBack:
        return vk::CullModeFlagBits::eFrontAndBack;
    }
    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.h
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h
@@ -54,9 +54,9 @@ vk::BlendOp BlendEquation(Maxwell::Blend::Equation equation);

 vk::BlendFactor BlendFactor(Maxwell::Blend::Factor factor);

-vk::FrontFace FrontFace(Maxwell::Cull::FrontFace front_face);
+vk::FrontFace FrontFace(Maxwell::FrontFace front_face);

-vk::CullModeFlags CullFace(Maxwell::Cull::CullFace cull_face);
+vk::CullModeFlags CullFace(Maxwell::CullFace cull_face);

 vk::ComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle);

--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -27,6 +27,7 @@
 #include "video_core/renderer_vulkan/vk_rasterizer.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_state_tracker.h"
 #include "video_core/renderer_vulkan/vk_swapchain.h"

 namespace Vulkan {
@@ -177,10 +178,13 @@ bool RendererVulkan::Init() {
    swapchain = std::make_unique<VKSwapchain>(surface, *device);
    swapchain->Create(framebuffer.width, framebuffer.height, false);

-    scheduler = std::make_unique<VKScheduler>(*device, *resource_manager);
+    state_tracker = std::make_unique<StateTracker>(system);
+
+    scheduler = std::make_unique<VKScheduler>(*device, *resource_manager, *state_tracker);

    rasterizer = std::make_unique<RasterizerVulkan>(system, render_window, screen_info, *device,
-                                                    *resource_manager, *memory_manager, *scheduler);
+                                                    *resource_manager, *memory_manager,
+                                                    *state_tracker, *scheduler);

    blit_screen = std::make_unique<VKBlitScreen>(system, render_window, *rasterizer, *device,
                                                 *resource_manager, *memory_manager, *swapchain,
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -4,8 +4,10 @@

 #pragma once

+#include <memory>
 #include <optional>
 #include <vector>
+
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_vulkan/declarations.h"

@@ -15,6 +17,7 @@ class System;

 namespace Vulkan {

+class StateTracker;
 class VKBlitScreen;
 class VKDevice;
 class VKFence;
@@ -61,6 +64,7 @@ private:
    std::unique_ptr<VKSwapchain> swapchain;
    std::unique_ptr<VKMemoryManager> memory_manager;
    std::unique_ptr<VKResourceManager> resource_manager;
+    std::unique_ptr<StateTracker> state_tracker;
    std::unique_ptr<VKScheduler> scheduler;
    std::unique_ptr<VKBlitScreen> blit_screen;
 };
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -73,7 +73,7 @@ UniqueDescriptorUpdateTemplate VKComputePipeline::CreateDescriptorUpdateTemplate
    std::vector<vk::DescriptorUpdateTemplateEntry> template_entries;
    u32 binding = 0;
    u32 offset = 0;
-    FillDescriptorUpdateTemplateEntries(device, entries, binding, offset, template_entries);
+    FillDescriptorUpdateTemplateEntries(entries, binding, offset, template_entries);
    if (template_entries.empty()) {
        // If the shader doesn't use descriptor sets, skip template creation.
        return UniqueDescriptorUpdateTemplate{};
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
    features.occlusionQueryPrecise = true;
    features.fragmentStoresAndAtomics = true;
    features.shaderImageGatherExtended = true;
-    features.shaderStorageImageReadWithoutFormat =
-        is_shader_storage_img_read_without_format_supported;
+    features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported;
    features.shaderStorageImageWriteWithoutFormat = true;
    features.textureCompressionASTC_LDR = is_optimal_astc_supported;

@@ -148,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
        LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes");
    }

+    vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback;
+    if (ext_transform_feedback) {
+        transform_feedback.transformFeedback = true;
+        transform_feedback.geometryStreams = true;
+        SetNext(next, transform_feedback);
+    } else {
+        LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks");
+    }
+
    if (!ext_depth_range_unrestricted) {
        LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
    }
@@ -385,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        }
    };

-    extensions.reserve(14);
+    extensions.reserve(15);
    extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
    extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
    extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -397,18 +405,22 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami

    [[maybe_unused]] const bool nsight =
        std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
-    bool khr_shader_float16_int8{};
-    bool ext_subgroup_size_control{};
+    bool has_khr_shader_float16_int8{};
+    bool has_ext_subgroup_size_control{};
+    bool has_ext_transform_feedback{};
    for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) {
        Test(extension, khr_uniform_buffer_standard_layout,
             VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true);
-        Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false);
+        Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
+             false);
        Test(extension, ext_depth_range_unrestricted,
             VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true);
        Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true);
        Test(extension, ext_shader_viewport_index_layer,
             VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true);
-        Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
+        Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
+             false);
+        Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME,
             false);
        if (Settings::values.renderer_debug) {
            Test(extension, nv_device_diagnostic_checkpoints,
@@ -416,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        }
    }

-    if (khr_shader_float16_int8) {
+    if (has_khr_shader_float16_int8) {
        is_float16_supported =
            GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16;
        extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
    }

-    if (ext_subgroup_size_control) {
+    if (has_ext_subgroup_size_control) {
        const auto features =
            GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi);
        const auto properties =
@@ -439,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        is_warp_potentially_bigger = true;
    }

+    if (has_ext_transform_feedback) {
+        const auto features =
+            GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi);
+        const auto properties =
+            GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi);
+
+        if (features.transformFeedback && features.geometryStreams &&
+            properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers &&
+            properties.transformFeedbackQueries && properties.transformFeedbackDraw) {
+            extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME);
+            ext_transform_feedback = true;
+        }
+    }
+
    return extensions;
 }

@@ -467,8 +493,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK

 void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
    const auto supported_features{physical.getFeatures(dldi)};
-    is_shader_storage_img_read_without_format_supported =
-        supported_features.shaderStorageImageReadWithoutFormat;
+    is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat;
    is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
 }

@@ -510,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                        vk::Format::eR32G32Sfloat,
                                        vk::Format::eR32G32Uint,
                                        vk::Format::eR16G16B16A16Uint,
+                                        vk::Format::eR16G16B16A16Snorm,
                                        vk::Format::eR16G16B16A16Unorm,
                                        vk::Format::eR16G16Unorm,
                                        vk::Format::eR16G16Snorm,
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -122,11 +122,6 @@ public:
        return properties.limits.maxPushConstantsSize;
    }

-    /// Returns true if Shader storage Image Read Without Format supported.
-    bool IsShaderStorageImageReadWithoutFormatSupported() const {
-        return is_shader_storage_img_read_without_format_supported;
-    }
-
    /// Returns true if ASTC is natively supported.
    bool IsOptimalAstcSupported() const {
        return is_optimal_astc_supported;
@@ -147,6 +142,11 @@ public:
        return (guest_warp_stages & stage) != vk::ShaderStageFlags{};
    }

+    /// Returns true if formatless image load is supported.
+    bool IsFormatlessImageLoadSupported() const {
+        return is_formatless_image_load_supported;
+    }
+
    /// Returns true if the device supports VK_EXT_scalar_block_layout.
    bool IsKhrUniformBufferStandardLayoutSupported() const {
        return khr_uniform_buffer_standard_layout;
@@ -167,6 +167,11 @@ public:
        return ext_shader_viewport_index_layer;
    }

+    /// Returns true if the device supports VK_EXT_transform_feedback.
+    bool IsExtTransformFeedbackSupported() const {
+        return ext_transform_feedback;
+    }
+
    /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints.
    bool IsNvDeviceDiagnosticCheckpoints() const {
        return nv_device_diagnostic_checkpoints;
@@ -214,26 +219,26 @@ private:
    static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties(
        const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical);

-    const vk::PhysicalDevice physical;         ///< Physical device.
-    vk::DispatchLoaderDynamic dld;             ///< Device function pointers.
-    vk::PhysicalDeviceProperties properties;   ///< Device properties.
-    UniqueDevice logical;                      ///< Logical device.
-    vk::Queue graphics_queue;                  ///< Main graphics queue.
-    vk::Queue present_queue;                   ///< Main present queue.
-    u32 graphics_family{};                     ///< Main graphics queue family index.
-    u32 present_family{};                      ///< Main present queue family index.
-    vk::DriverIdKHR driver_id{};               ///< Driver ID.
-    vk::ShaderStageFlags guest_warp_stages{};  ///< Stages where the guest warp size can be forced.
-    bool is_optimal_astc_supported{};          ///< Support for native ASTC.
-    bool is_float16_supported{};               ///< Support for float16 arithmetics.
-    bool is_warp_potentially_bigger{};         ///< Host warp size can be bigger than guest.
+    const vk::PhysicalDevice physical;        ///< Physical device.
+    vk::DispatchLoaderDynamic dld;            ///< Device function pointers.
+    vk::PhysicalDeviceProperties properties;  ///< Device properties.
+    UniqueDevice logical;                     ///< Logical device.
+    vk::Queue graphics_queue;                 ///< Main graphics queue.
+    vk::Queue present_queue;                  ///< Main present queue.
+    u32 graphics_family{};                    ///< Main graphics queue family index.
+    u32 present_family{};                     ///< Main present queue family index.
+    vk::DriverIdKHR driver_id{};              ///< Driver ID.
+    vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced.
+    bool is_optimal_astc_supported{};         ///< Support for native ASTC.
+    bool is_float16_supported{};              ///< Support for float16 arithmetics.
+    bool is_warp_potentially_bigger{};        ///< Host warp size can be bigger than guest.
+    bool is_formatless_image_load_supported{}; ///< Support for shader image read without format.
    bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs.
    bool ext_index_type_uint8{};               ///< Support for VK_EXT_index_type_uint8.
    bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
    bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
+    bool ext_transform_feedback{};             ///< Support for VK_EXT_transform_feedback.
    bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
-    bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
-                                                                ///< image read without format

    // Telemetry parameters
    std::string vendor_name;                      ///< Device's driver name.
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -97,8 +97,7 @@ UniqueDescriptorUpdateTemplate VKGraphicsPipeline::CreateDescriptorUpdateTemplat
    u32 offset = 0;
    for (const auto& stage : program) {
        if (stage) {
-            FillDescriptorUpdateTemplateEntries(device, stage->entries, binding, offset,
-                                                template_entries);
+            FillDescriptorUpdateTemplateEntries(stage->entries, binding, offset, template_entries);
        }
    }
    if (template_entries.empty()) {
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -36,6 +36,13 @@ using Tegra::Engines::ShaderType;

 namespace {

+// Shorthand aliases standing in for C++20's "using enum vk::DescriptorType".
+constexpr auto eUniformBuffer = vk::DescriptorType::eUniformBuffer;
+constexpr auto eStorageBuffer = vk::DescriptorType::eStorageBuffer;
+constexpr auto eUniformTexelBuffer = vk::DescriptorType::eUniformTexelBuffer;
+constexpr auto eCombinedImageSampler = vk::DescriptorType::eCombinedImageSampler;
+constexpr auto eStorageImage = vk::DescriptorType::eStorageImage;
+
 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
    VideoCommon::Shader::CompileDepth::FullDecompile};

@@ -119,23 +126,32 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) {
    }
 }

+/// Appends one descriptor set layout binding per element of `container`.
+/// @param bindings    Destination list of layout bindings.
+/// @param binding     Next free binding index; incremented once per element of `container`.
+/// @param stage_flags Shader stage flags stored in every emitted binding.
+/// @param container   Shader entries of the kind selected by `descriptor_type`.
+/// Each binding gets a descriptor count of 1, except combined image samplers, which use the
+/// value returned by the element's Size().
+template <vk::DescriptorType descriptor_type, class Container>
+void AddBindings(std::vector<vk::DescriptorSetLayoutBinding>& bindings, u32& binding,
+                 vk::ShaderStageFlags stage_flags, const Container& container) {
+    const u32 num_entries = static_cast<u32>(std::size(container));
+    for (std::size_t i = 0; i < num_entries; ++i) {
+        u32 count = 1;
+        if constexpr (descriptor_type == eCombinedImageSampler) {
+            // Combined image samplers can be arrayed.
+            count = container[i].Size();
+        }
+        bindings.emplace_back(binding++, descriptor_type, count, stage_flags, nullptr);
+    }
+}
+
 u32 FillDescriptorLayout(const ShaderEntries& entries,
                         std::vector<vk::DescriptorSetLayoutBinding>& bindings,
                         Maxwell::ShaderProgram program_type, u32 base_binding) {
    const ShaderType stage = GetStageFromProgram(program_type);
-    const vk::ShaderStageFlags stage_flags = MaxwellToVK::ShaderStage(stage);
+    const vk::ShaderStageFlags flags = MaxwellToVK::ShaderStage(stage);

    u32 binding = base_binding;
-    const auto AddBindings = [&](vk::DescriptorType descriptor_type, std::size_t num_entries) {
-        for (std::size_t i = 0; i < num_entries; ++i) {
-            bindings.emplace_back(binding++, descriptor_type, 1, stage_flags, nullptr);
-        }
-    };
-    AddBindings(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size());
-    AddBindings(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size());
-    AddBindings(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size());
-    AddBindings(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size());
-    AddBindings(vk::DescriptorType::eStorageImage, entries.images.size());
+    AddBindings<eUniformBuffer>(bindings, binding, flags, entries.const_buffers);
+    AddBindings<eStorageBuffer>(bindings, binding, flags, entries.global_buffers);
+    AddBindings<eUniformTexelBuffer>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<eCombinedImageSampler>(bindings, binding, flags, entries.samplers);
+    AddBindings<eStorageImage>(bindings, binding, flags, entries.images);
    return binding;
 }

@@ -145,8 +161,8 @@ CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stag
                           GPUVAddr gpu_addr, VAddr cpu_addr, u8* host_ptr,
                           ProgramCode program_code, u32 main_offset)
    : RasterizerCacheObject{host_ptr}, gpu_addr{gpu_addr}, cpu_addr{cpu_addr},
-      program_code{std::move(program_code)}, locker{stage, GetEngine(system, stage)},
-      shader_ir{this->program_code, main_offset, compiler_settings, locker},
+      program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)},
+      shader_ir{this->program_code, main_offset, compiler_settings, registry},
      entries{GenerateShaderEntries(shader_ir)} {}

 CachedShader::~CachedShader() = default;
@@ -172,11 +188,6 @@ VKPipelineCache::~VKPipelineCache() = default;

 std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
    const auto& gpu = system.GPU().Maxwell3D();
-    auto& dirty = system.GPU().Maxwell3D().dirty.shaders;
-    if (!dirty) {
-        return last_shaders;
-    }
-    dirty = false;

    std::array<Shader, Maxwell::MaxShaderProgram> shaders;
    for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -262,9 +273,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
    specialization.workgroup_size = key.workgroup_size;
    specialization.shared_memory_size = key.shared_memory_size;

-    const SPIRVShader spirv_shader{
-        Decompile(device, shader->GetIR(), ShaderType::Compute, specialization),
-        shader->GetEntries()};
+    const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute,
+                                             shader->GetRegistry(), specialization),
+                                   shader->GetEntries()};
    entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool,
                                                update_descriptor_queue, spirv_shader);
    return *entry;
@@ -313,8 +324,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
    const auto& gpu = system.GPU().Maxwell3D();

    Specialization specialization;
-    specialization.primitive_topology = fixed_state.input_assembly.topology;
-    if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) {
+    if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) {
        ASSERT(fixed_state.input_assembly.point_size != 0.0f);
        specialization.point_size = fixed_state.input_assembly.point_size;
    }
@@ -322,9 +332,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
        specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type;
    }
    specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
-    specialization.tessellation.primitive = fixed_state.tessellation.primitive;
-    specialization.tessellation.spacing = fixed_state.tessellation.spacing;
-    specialization.tessellation.clockwise = fixed_state.tessellation.clockwise;

    SPIRVProgram program;
    std::vector<vk::DescriptorSetLayoutBinding> bindings;
@@ -345,8 +352,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
        const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
        const auto program_type = GetShaderType(program_enum);
        const auto& entries = shader->GetEntries();
-        program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization),
-                          entries};
+        program[stage] = {
+            Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
+            entries};

        if (program_enum == Maxwell::ShaderProgram::VertexA) {
            // VertexB was combined with VertexA, so we skip the VertexB iteration
@@ -361,32 +369,45 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
    return {std::move(program), std::move(bindings)};
 }

-void FillDescriptorUpdateTemplateEntries(
-    const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset,
-    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) {
-    static constexpr auto entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry));
-    const auto AddEntry = [&](vk::DescriptorType descriptor_type, std::size_t count_) {
-        const u32 count = static_cast<u32>(count_);
-        if (descriptor_type == vk::DescriptorType::eUniformTexelBuffer &&
-            device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) {
-            // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-            // crash.
-            for (u32 i = 0; i < count; ++i) {
-                template_entries.emplace_back(binding + i, 0, 1, descriptor_type,
-                                              offset + i * entry_size, entry_size);
-            }
-        } else if (count != 0) {
-            template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size);
-        }
-        offset += count * entry_size;
-        binding += count;
-    };
+/// Appends descriptor update template entries for the elements of `container`.
+/// @param template_entries Destination list of template entries.
+/// @param binding          Next free binding index; advanced past the bindings consumed here.
+/// @param offset           Current offset passed to the emitted entries; advanced by
+///                         sizeof(DescriptorUpdateEntry) for every descriptor consumed here.
+/// @param container        Shader entries of the kind selected by `descriptor_type`.
+template <vk::DescriptorType descriptor_type, class Container>
+void AddEntry(std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries, u32& binding,
+              u32& offset, const Container& container) {
+    static constexpr u32 entry_size = static_cast<u32>(sizeof(DescriptorUpdateEntry));
+    const u32 count = static_cast<u32>(std::size(container));

-    AddEntry(vk::DescriptorType::eUniformBuffer, entries.const_buffers.size());
-    AddEntry(vk::DescriptorType::eStorageBuffer, entries.global_buffers.size());
-    AddEntry(vk::DescriptorType::eUniformTexelBuffer, entries.texel_buffers.size());
-    AddEntry(vk::DescriptorType::eCombinedImageSampler, entries.samplers.size());
-    AddEntry(vk::DescriptorType::eStorageImage, entries.images.size());
+    // Combined image samplers: one template entry per container element, covering Size()
+    // descriptors. The binding advances by one per element while the offset advances by the
+    // number of descriptors in that element.
+    if constexpr (descriptor_type == eCombinedImageSampler) {
+        for (u32 i = 0; i < count; ++i) {
+            const u32 num_samplers = container[i].Size();
+            template_entries.emplace_back(binding, 0, num_samplers, descriptor_type, offset,
+                                          entry_size);
+            ++binding;
+            offset += num_samplers * entry_size;
+        }
+        return;
+    }
+
+    // Uniform texel buffers are always emitted one descriptor per entry; this function has no
+    // driver check, so the workaround below applies to every driver.
+    if constexpr (descriptor_type == eUniformTexelBuffer) {
+        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
+        // crash.
+        for (u32 i = 0; i < count; ++i) {
+            template_entries.emplace_back(binding + i, 0, 1, descriptor_type,
+                                          offset + i * entry_size, entry_size);
+        }
+    } else if (count > 0) {
+        // Every other descriptor type is emitted as a single entry covering `count` descriptors.
+        template_entries.emplace_back(binding, 0, count, descriptor_type, offset, entry_size);
+    }
+    // Both counters advance by one slot per descriptor, even when nothing was emitted (count 0).
+    offset += count * entry_size;
+    binding += count;
+}
+
+/// Appends the descriptor update template entries for every resource kind in `entries`.
+/// Resource kinds are visited in the same order used by FillDescriptorLayout: const buffers,
+/// global buffers, texel buffers, samplers and images.
+/// @param entries          Shader entries describing the resources used by one shader.
+/// @param binding          Next free binding index; advanced by the bindings consumed.
+/// @param offset           Current offset passed to the emitted entries; advanced by the
+///                         descriptors consumed.
+/// @param template_entries Destination list of template entries.
+void FillDescriptorUpdateTemplateEntries(
+    const ShaderEntries& entries, u32& binding, u32& offset,
+    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries) {
+    // AddEntry expects (template_entries, binding, offset, container), in that order. Passing
+    // offset before binding only goes unnoticed while both counters hold equal values.
+    AddEntry<eUniformBuffer>(template_entries, binding, offset, entries.const_buffers);
+    AddEntry<eStorageBuffer>(template_entries, binding, offset, entries.global_buffers);
+    AddEntry<eUniformTexelBuffer>(template_entries, binding, offset, entries.texel_buffers);
+    AddEntry<eCombinedImageSampler>(template_entries, binding, offset, entries.samplers);
+    AddEntry<eStorageImage>(template_entries, binding, offset, entries.images);
 }

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -25,7 +25,7 @@
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_shader_decompiler.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 #include "video_core/surface.h"

@@ -132,6 +132,10 @@ public:
        return shader_ir;
    }

+    const VideoCommon::Shader::Registry& GetRegistry() const {
+        return registry;
+    }
+
    const VideoCommon::Shader::ShaderIR& GetIR() const {
        return shader_ir;
    }
@@ -147,7 +151,7 @@ private:
    GPUVAddr gpu_addr{};
    VAddr cpu_addr{};
    ProgramCode program_code;
-    VideoCommon::Shader::ConstBufferLocker locker;
+    VideoCommon::Shader::Registry registry;
    VideoCommon::Shader::ShaderIR shader_ir;
    ShaderEntries entries;
 };
@@ -194,7 +198,7 @@ private:
 };

 void FillDescriptorUpdateTemplateEntries(
-    const VKDevice& device, const ShaderEntries& entries, u32& binding, u32& offset,
+    const ShaderEntries& entries, u32& binding, u32& offset,
    std::vector<vk::DescriptorUpdateTemplateEntry>& template_entries);

 } // namespace Vulkan
--- a/Show More
+++ b/Show More