GPU: Flush commands on every DMA pusher step.

This commit ensures that the host GPU is constantly fed with commands to work on while the guest GPU keeps producing the rest of the commands. This reduces the time spent synchronizing the host and guest GPUs.
Merge pull request #2592 from FernandoS27/sync1
2019-07-26 16:54:22 -04:00 · 2019-07-26 14:26:44 -04:00 · 2019-07-25 13:04:56 -04:00 · 2019-07-25 12:41:52 -04:00 · 2019-07-25 12:35:07 -04:00 · 2019-07-25 12:34:36 -04:00
85 changed files with 1909 additions and 653 deletions
--- a/.ci/templates/build-single.yml
+++ b/.ci/templates/build-single.yml
@@ -14,7 +14,7 @@ steps:
    cacheHitVar: CACHE_RESTORED
 - script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh
  displayName: 'Build'
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh
+- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && RELEASE_NAME=$(BuildName) ./.ci/scripts/$(ScriptFolder)/upload.sh
  displayName: 'Package Artifacts'
 - publish: artifacts
  artifact: 'yuzu-$(BuildName)-$(BuildSuffix)'
--- a/.ci/templates/build-standard.yml
+++ b/.ci/templates/build-standard.yml
@@ -3,7 +3,7 @@ jobs:
  displayName: 'standard'
  pool:
    vmImage: ubuntu-latest
-  strategy: 
+  strategy:
    maxParallel: 10
    matrix:
      windows:
--- a/.ci/templates/build-testing.yml
+++ b/.ci/templates/build-testing.yml
@@ -3,19 +3,21 @@ jobs:
  displayName: 'testing'
  pool:
    vmImage: ubuntu-latest
-  strategy: 
-    maxParallel: 10
+  strategy:
+    maxParallel: 5
    matrix:
      windows:
        BuildSuffix: 'windows-testing'
        ScriptFolder: 'windows'
  steps:
+  - script: pip install requests urllib3
+    displayName: 'Prepare Environment'
  - task: PythonScript@0
    condition: eq(variables['Build.Reason'], 'PullRequest')
    displayName: 'Determine Testing Status'
    inputs:
      scriptSource: 'filePath'
-      scriptPath: '../scripts/merge/check-label-presence.py'
+      scriptPath: '.ci/scripts/merge/check-label-presence.py'
      arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build'
  - ${{ if eq(variables.enabletesting, 'true') }}:
    - template: ./sync-source.yml
@@ -27,4 +29,4 @@ jobs:
        matchLabel: 'testing-merge'
    - template: ./build-single.yml
      parameters:
-        artifactSource: 'false'
+        artifactSource: 'false'
--- a/.ci/templates/release.yml
+++ b/.ci/templates/release.yml
@@ -1,29 +0,0 @@
-steps:
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Windows Release'
-    inputs:
-      artifactName: 'yuzu-$(BuildName)-windows-mingw'
-      buildType: 'current'
-      targetPath: '$(Build.ArtifactStagingDirectory)'
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Linux Release'
-    inputs:
-      artifactName: 'yuzu-$(BuildName)-linux'
-      buildType: 'current'
-      targetPath: '$(Build.ArtifactStagingDirectory)'
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Release Point'
-    inputs:
-      artifactName: 'yuzu-$(BuildName)-release-point'
-      buildType: 'current'
-      targetPath: '$(Build.ArtifactStagingDirectory)'
-  - script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha
-    displayName: 'Calculate Release Point'
-  - task: GitHubRelease@0
-    inputs:
-      gitHubConnection: $(GitHubReleaseConnectionName)
-      repositoryName: '$(GitHubReleaseRepoName)'
-      action: 'create'
-      target: $(variables.tagcommit)
-      title: 'yuzu $(BuildName) #$(Build.BuildId)'
-      assets: '$(Build.ArtifactStagingDirectory)/*'
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@ yuzu emulator
 =============
 [![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu)
 [![AppVeyor CI Build Status](https://ci.appveyor.com/api/projects/status/77k97svb2usreu68?svg=true)](https://ci.appveyor.com/project/bunnei/yuzu)
+[![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/)

 yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).

--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -111,6 +111,8 @@ add_library(core STATIC
    frontend/scope_acquire_window_context.h
    gdbstub/gdbstub.cpp
    gdbstub/gdbstub.h
+    hardware_interrupt_manager.cpp
+    hardware_interrupt_manager.h
    hle/ipc.h
    hle/ipc_helpers.h
    hle/kernel/address_arbiter.cpp
@@ -372,6 +374,7 @@ add_library(core STATIC
    hle/service/nvdrv/devices/nvmap.h
    hle/service/nvdrv/interface.cpp
    hle/service/nvdrv/interface.h
+    hle/service/nvdrv/nvdata.h
    hle/service/nvdrv/nvdrv.cpp
    hle/service/nvdrv/nvdrv.h
    hle/service/nvdrv/nvmemp.cpp
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -19,6 +19,7 @@
 #include "core/file_sys/vfs_concat.h"
 #include "core/file_sys/vfs_real.h"
 #include "core/gdbstub/gdbstub.h"
+#include "core/hardware_interrupt_manager.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
@@ -151,7 +152,7 @@ struct System::Impl {
        if (!renderer->Init()) {
            return ResultStatus::ErrorVideoCore;
        }
-
+        interrupt_manager = std::make_unique<Core::Hardware::InterruptManager>(system);
        gpu_core = VideoCore::CreateGPU(system);

        is_powered_on = true;
@@ -298,6 +299,7 @@ struct System::Impl {
    std::unique_ptr<VideoCore::RendererBase> renderer;
    std::unique_ptr<Tegra::GPU> gpu_core;
    std::shared_ptr<Tegra::DebugContext> debug_context;
+    std::unique_ptr<Core::Hardware::InterruptManager> interrupt_manager;
    CpuCoreManager cpu_core_manager;
    bool is_powered_on = false;

@@ -444,6 +446,14 @@ const Tegra::GPU& System::GPU() const {
    return *impl->gpu_core;
 }

+Core::Hardware::InterruptManager& System::InterruptManager() {
+    return *impl->interrupt_manager; // NOTE(review): unique_ptr dereferenced without a null check
+}
+
+const Core::Hardware::InterruptManager& System::InterruptManager() const {
+    return *impl->interrupt_manager; // const overload; same unchecked dereference as above
+}
+
 VideoCore::RendererBase& System::Renderer() {
    return *impl->renderer;
 }
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -70,6 +70,10 @@ namespace Core::Timing {
 class CoreTiming;
 }

+namespace Core::Hardware {
+class InterruptManager;
+}
+
 namespace Core {

 class ARM_Interface;
@@ -234,6 +238,12 @@ public:
    /// Provides a constant reference to the core timing instance.
    const Timing::CoreTiming& CoreTiming() const;

+    /// Provides a reference to the interrupt manager instance.
+    Core::Hardware::InterruptManager& InterruptManager();
+
+    /// Provides a constant reference to the interrupt manager instance.
+    const Core::Hardware::InterruptManager& InterruptManager() const;
+
    /// Provides a reference to the kernel instance.
    Kernel::KernelCore& Kernel();

--- a/src/core/hardware_interrupt_manager.cpp
+++ b/src/core/hardware_interrupt_manager.cpp
@@ -0,0 +1,30 @@
+// Copyright 2019 Yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/hardware_interrupt_manager.h"
+#include "core/hle/service/nvdrv/interface.h"
+#include "core/hle/service/sm/sm.h"
+
+namespace Core::Hardware {
+
+InterruptManager::InterruptManager(Core::System& system_in) : system(system_in) {
+    gpu_interrupt_event = // callback unpacks the 64-bit message built by GPUInterruptSyncpt
+        system.CoreTiming().RegisterEvent("GPUInterrupt", [this](u64 message, s64) {
+            auto nvdrv = system.ServiceManager().GetService<Service::Nvidia::NVDRV>("nvdrv");
+            const u32 syncpt = static_cast<u32>(message >> 32); // high 32 bits: syncpoint id
+            const u32 value = static_cast<u32>(message); // low 32 bits: syncpoint value
+            nvdrv->SignalGPUInterruptSyncpt(syncpt, value); // NOTE(review): nvdrv is not null-checked
+        });
+}
+
+InterruptManager::~InterruptManager() = default;
+
+void InterruptManager::GPUInterruptSyncpt(const u32 syncpoint_id, const u32 value) {
+    const u64 msg = (static_cast<u64>(syncpoint_id) << 32ULL) | value; // id in high word, value in low word
+    system.CoreTiming().ScheduleEvent(10, gpu_interrupt_event, msg); // 10 = delay argument; unit presumably cycles — TODO confirm
+}
+
+} // namespace Core::Hardware
--- a/src/core/hardware_interrupt_manager.h
+++ b/src/core/hardware_interrupt_manager.h
@@ -0,0 +1,31 @@
+// Copyright 2019 Yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Core {
+class System;
+}
+
+namespace Core::Timing {
+struct EventType;
+}
+
+namespace Core::Hardware {
+
+class InterruptManager { // Delivers GPU syncpoint interrupts via a registered CoreTiming event.
+public:
+    explicit InterruptManager(Core::System& system);
+    ~InterruptManager();
+    /// Schedules the "GPUInterrupt" timing event carrying (syncpoint_id, value).
+    void GPUInterruptSyncpt(u32 syncpoint_id, u32 value);
+
+private:
+    Core::System& system;
+    Core::Timing::EventType* gpu_interrupt_event{}; // set in the constructor from RegisterEvent
+};
+
+} // namespace Core::Hardware
--- a/src/core/hle/service/nvdrv/devices/nvdevice.h
+++ b/src/core/hle/service/nvdrv/devices/nvdevice.h
@@ -8,6 +8,11 @@
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/swap.h"
+#include "core/hle/service/nvdrv/nvdata.h"
+
+namespace Core {
+class System;
+}

 namespace Service::Nvidia::Devices {

@@ -15,7 +20,7 @@ namespace Service::Nvidia::Devices {
 /// implement the ioctl interface.
 class nvdevice {
 public:
-    nvdevice() = default;
+    explicit nvdevice(Core::System& system) : system{system} {};
    virtual ~nvdevice() = default;
    union Ioctl {
        u32_le raw;
@@ -33,7 +38,11 @@ public:
     * @param output A buffer where the output data will be written to.
     * @returns The result code of the ioctl.
     */
-    virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) = 0;
+    virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                      IoctlCtrl& ctrl) = 0;
+
+protected:
+    Core::System& system;
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -13,10 +13,12 @@

 namespace Service::Nvidia::Devices {

-nvdisp_disp0::nvdisp_disp0(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {}
+nvdisp_disp0::nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
 nvdisp_disp0 ::~nvdisp_disp0() = default;

-u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                        IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl");
    return 0;
 }
@@ -34,9 +36,8 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3
        addr,      offset,   width, height, stride, static_cast<PixelFormat>(format),
        transform, crop_rect};

-    auto& instance = Core::System::GetInstance();
-    instance.GetPerfStats().EndGameFrame();
-    instance.GPU().SwapBuffers(framebuffer);
+    system.GetPerfStats().EndGameFrame();
+    system.GPU().SwapBuffers(framebuffer);
 }

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
@@ -17,10 +17,11 @@ class nvmap;

 class nvdisp_disp0 final : public nvdevice {
 public:
-    explicit nvdisp_disp0(std::shared_ptr<nvmap> nvmap_dev);
+    explicit nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvdisp_disp0() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

    /// Performs a screen flip, drawing the buffer pointed to by the handle.
    void flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height, u32 stride,
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -22,10 +22,12 @@ enum {
 };
 }

-nvhost_as_gpu::nvhost_as_gpu(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {}
+nvhost_as_gpu::nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
 nvhost_as_gpu::~nvhost_as_gpu() = default;

-u32 nvhost_as_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_as_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                         IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

@@ -65,7 +67,7 @@ u32 nvhost_as_gpu::AllocateSpace(const std::vector<u8>& input, std::vector<u8>&
    LOG_DEBUG(Service_NVDRV, "called, pages={:X}, page_size={:X}, flags={:X}", params.pages,
              params.page_size, params.flags);

-    auto& gpu = Core::System::GetInstance().GPU();
+    auto& gpu = system.GPU();
    const u64 size{static_cast<u64>(params.pages) * static_cast<u64>(params.page_size)};
    if (params.flags & 1) {
        params.offset = gpu.MemoryManager().AllocateSpace(params.offset, size, 1);
@@ -85,7 +87,7 @@ u32 nvhost_as_gpu::Remap(const std::vector<u8>& input, std::vector<u8>& output)
    std::vector<IoctlRemapEntry> entries(num_entries);
    std::memcpy(entries.data(), input.data(), input.size());

-    auto& gpu = Core::System::GetInstance().GPU();
+    auto& gpu = system.GPU();
    for (const auto& entry : entries) {
        LOG_WARNING(Service_NVDRV, "remap entry, offset=0x{:X} handle=0x{:X} pages=0x{:X}",
                    entry.offset, entry.nvmap_handle, entry.pages);
@@ -136,7 +138,7 @@ u32 nvhost_as_gpu::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& ou
    // case to prevent unexpected behavior.
    ASSERT(object->id == params.nvmap_handle);

-    auto& gpu = Core::System::GetInstance().GPU();
+    auto& gpu = system.GPU();

    if (params.flags & 1) {
        params.offset = gpu.MemoryManager().MapBufferEx(object->addr, params.offset, object->size);
@@ -173,8 +175,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
        return 0;
    }

-    params.offset = Core::System::GetInstance().GPU().MemoryManager().UnmapBuffer(params.offset,
-                                                                                  itr->second.size);
+    params.offset = system.GPU().MemoryManager().UnmapBuffer(params.offset, itr->second.size);
    buffer_mappings.erase(itr->second.offset);

    std::memcpy(output.data(), &params, output.size());
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
@@ -17,10 +17,11 @@ class nvmap;

 class nvhost_as_gpu final : public nvdevice {
 public:
-    explicit nvhost_as_gpu(std::shared_ptr<nvmap> nvmap_dev);
+    explicit nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_as_gpu() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
@@ -7,14 +7,20 @@

 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
+#include "core/hle/kernel/readable_event.h"
+#include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/nvdrv/devices/nvhost_ctrl.h"
+#include "video_core/gpu.h"

 namespace Service::Nvidia::Devices {

-nvhost_ctrl::nvhost_ctrl() = default;
+nvhost_ctrl::nvhost_ctrl(Core::System& system, EventInterface& events_interface)
+    : nvdevice(system), events_interface{events_interface} {}
 nvhost_ctrl::~nvhost_ctrl() = default;

-u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                       IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

@@ -22,11 +28,15 @@ u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<
    case IoctlCommand::IocGetConfigCommand:
        return NvOsGetConfigU32(input, output);
    case IoctlCommand::IocCtrlEventWaitCommand:
-        return IocCtrlEventWait(input, output, false);
+        return IocCtrlEventWait(input, output, false, ctrl);
    case IoctlCommand::IocCtrlEventWaitAsyncCommand:
-        return IocCtrlEventWait(input, output, true);
+        return IocCtrlEventWait(input, output, true, ctrl);
    case IoctlCommand::IocCtrlEventRegisterCommand:
        return IocCtrlEventRegister(input, output);
+    case IoctlCommand::IocCtrlEventUnregisterCommand:
+        return IocCtrlEventUnregister(input, output);
+    case IoctlCommand::IocCtrlEventSignalCommand:
+        return IocCtrlEventSignal(input, output);
    }
    UNIMPLEMENTED_MSG("Unimplemented ioctl");
    return 0;
@@ -41,23 +51,137 @@ u32 nvhost_ctrl::NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>&
 }

 u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output,
-                                  bool is_async) {
+                                  bool is_async, IoctlCtrl& ctrl) { // checks a syncpoint against params.threshold; if not reached, arms an event for it
    IocCtrlEventWaitParams params{};
    std::memcpy(&params, input.data(), sizeof(params));
-    LOG_WARNING(Service_NVDRV,
-                "(STUBBED) called, syncpt_id={}, threshold={}, timeout={}, is_async={}",
-                params.syncpt_id, params.threshold, params.timeout, is_async);
+    LOG_DEBUG(Service_NVDRV, "syncpt_id={}, threshold={}, timeout={}, is_async={}",
+              params.syncpt_id, params.threshold, params.timeout, is_async);

-    // TODO(Subv): Implement actual syncpt waiting.
-    params.value = 0;
+    if (params.syncpt_id >= MaxSyncPoints) {
+        return NvResult::BadParameter;
+    }
+
+    auto& gpu = system.GPU();
+    // This mostly accounts for unimplemented features: a synchronous GPU is
+    // always in sync, so the wait is reported as already satisfied.
+    if (!gpu.IsAsync()) {
+        return NvResult::Success;
+    }
+    auto lock = gpu.LockSync(); // presumably a scoped guard released on return — TODO confirm
+    const u32 current_syncpoint_value = gpu.GetSyncpointValue(params.syncpt_id);
+    const s32 diff = current_syncpoint_value - params.threshold; // u32 subtraction (wraps), read as signed distance
+    if (diff >= 0) { // threshold already reached: report the current value
+        params.value = current_syncpoint_value;
+        std::memcpy(output.data(), &params, sizeof(params));
+        return NvResult::Success;
+    }
+    const u32 target_value = current_syncpoint_value - diff; // equals params.threshold modulo 2^32
+
+    if (!is_async) {
+        params.value = 0;
+    }
+
+    if (params.timeout == 0) { // zero timeout: return Timeout without arming any event
+        std::memcpy(output.data(), &params, sizeof(params));
+        return NvResult::Timeout;
+    }
+
+    u32 event_id;
+    if (is_async) {
+        event_id = params.value & 0x00FF; // async path: event slot comes from the low byte of value
+        if (event_id >= MaxNvEvents) {
+            std::memcpy(output.data(), &params, sizeof(params));
+            return NvResult::BadParameter;
+        }
+    } else {
+        if (ctrl.fresh_call) {
+            const auto result = events_interface.GetFreeEvent();
+            if (result) {
+                event_id = *result;
+            } else {
+                LOG_CRITICAL(Service_NVDRV, "No Free Events available!");
+                event_id = params.value & 0x00FF; // params.value was zeroed above on this path, so this is 0
+            }
+        } else {
+            event_id = ctrl.event_id; // repeated call: reuse the slot recorded in ctrl
+        }
+    }
+    // NOTE(review): the range test below is OR-ed with the status tests; confirm "&&" was not meant.
+    EventState status = events_interface.status[event_id]; // NOTE(review): indexed before event_id is range-tested
+    if (event_id < MaxNvEvents || status == EventState::Free || status == EventState::Registered) {
+        events_interface.SetEventStatus(event_id, EventState::Waiting);
+        events_interface.assigned_syncpt[event_id] = params.syncpt_id;
+        events_interface.assigned_value[event_id] = target_value;
+        if (is_async) {
+            params.value = params.syncpt_id << 4;
+        } else {
+            params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000;
+        }
+        params.value |= event_id; // low bits of the returned value carry the event slot
+        events_interface.events[event_id].writable->Clear();
+        gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value);
+        if (!is_async && ctrl.fresh_call) {
+            ctrl.must_delay = true; // delay request handed back to the caller through ctrl
+            ctrl.timeout = params.timeout;
+            ctrl.event_id = event_id;
+            return NvResult::Timeout; // NOTE(review): returns without copying params to output
+        }
+        std::memcpy(output.data(), &params, sizeof(params));
+        return NvResult::Timeout;
+    }
    std::memcpy(output.data(), &params, sizeof(params));
-    return 0;
+    return NvResult::BadParameter;
 }

 u32 nvhost_ctrl::IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output) {
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    // TODO(bunnei): Implement this.
-    return 0;
+    IocCtrlEventRegisterParams params{};
+    std::memcpy(&params, input.data(), sizeof(params)); // NOTE(review): input.size() is not checked first
+    const u32 event_id = params.user_event_id & 0x00FF; // only the low byte selects the event slot
+    LOG_DEBUG(Service_NVDRV, " called, user_event_id: {:X}", event_id);
+    if (event_id >= MaxNvEvents) {
+        return NvResult::BadParameter;
+    }
+    if (events_interface.registered[event_id]) { // a slot that is already registered is rejected
+        return NvResult::BadParameter;
+    }
+    events_interface.RegisterEvent(event_id);
+    return NvResult::Success; // output is left untouched by this ioctl
+}
+
+u32 nvhost_ctrl::IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output) {
+    IocCtrlEventUnregisterParams params{};
+    std::memcpy(&params, input.data(), sizeof(params)); // NOTE(review): input.size() is not checked first
+    const u32 event_id = params.user_event_id & 0x00FF; // only the low byte selects the event slot
+    LOG_DEBUG(Service_NVDRV, " called, user_event_id: {:X}", event_id);
+    if (event_id >= MaxNvEvents) {
+        return NvResult::BadParameter;
+    }
+    if (!events_interface.registered[event_id]) { // a slot that is not registered is rejected
+        return NvResult::BadParameter;
+    }
+    events_interface.UnregisterEvent(event_id);
+    return NvResult::Success; // output is left untouched by this ioctl
+}
+
+u32 nvhost_ctrl::IocCtrlEventSignal(const std::vector<u8>& input, std::vector<u8>& output) {
+    IocCtrlEventSignalParams params{};
+    std::memcpy(&params, input.data(), sizeof(params));
+    // TODO(Blinkhawk): This is normally called when an NvEvent times out in WaitSynchronization.
+    // From reverse engineering it is believed to cancel the GPU event; more research is required.
+    u32 event_id = params.user_event_id & 0x00FF; // only the low byte selects the event slot
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called, user_event_id: {:X}", event_id);
+    if (event_id >= MaxNvEvents) {
+        return NvResult::BadParameter;
+    }
+    if (events_interface.status[event_id] == EventState::Waiting) { // only a waiting event is cancelled
+        auto& gpu = system.GPU();
+        if (gpu.CancelSyncptInterrupt(events_interface.assigned_syncpt[event_id],
+                                      events_interface.assigned_value[event_id])) {
+            events_interface.LiberateEvent(event_id);
+            events_interface.events[event_id].writable->Signal();
+        }
+    }
+    return NvResult::Success; // Success is returned whether or not anything was cancelled
 }

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
@@ -8,15 +8,17 @@
 #include <vector>
 #include "common/common_types.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
+#include "core/hle/service/nvdrv/nvdrv.h"

 namespace Service::Nvidia::Devices {

 class nvhost_ctrl final : public nvdevice {
 public:
-    nvhost_ctrl();
+    explicit nvhost_ctrl(Core::System& system, EventInterface& events_interface);
    ~nvhost_ctrl() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
@@ -132,9 +134,16 @@ private:

    u32 NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>& output);

-    u32 IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async);
+    u32 IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async,
+                         IoctlCtrl& ctrl);

    u32 IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output);
+
+    u32 IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output);
+
+    u32 IocCtrlEventSignal(const std::vector<u8>& input, std::vector<u8>& output);
+
+    EventInterface& events_interface;
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -12,10 +12,11 @@

 namespace Service::Nvidia::Devices {

-nvhost_ctrl_gpu::nvhost_ctrl_gpu() = default;
+nvhost_ctrl_gpu::nvhost_ctrl_gpu(Core::System& system) : nvdevice(system) {}
 nvhost_ctrl_gpu::~nvhost_ctrl_gpu() = default;

-u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                           IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

@@ -185,7 +186,7 @@ u32 nvhost_ctrl_gpu::GetGpuTime(const std::vector<u8>& input, std::vector<u8>& o

    IoctlGetGpuTime params{};
    std::memcpy(&params, input.data(), input.size());
-    const auto ns = Core::Timing::CyclesToNs(Core::System::GetInstance().CoreTiming().GetTicks());
+    const auto ns = Core::Timing::CyclesToNs(system.CoreTiming().GetTicks());
    params.gpu_time = static_cast<u64_le>(ns.count());
    std::memcpy(output.data(), &params, output.size());
    return 0;
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {

 class nvhost_ctrl_gpu final : public nvdevice {
 public:
-    nvhost_ctrl_gpu();
+    explicit nvhost_ctrl_gpu(Core::System& system);
    ~nvhost_ctrl_gpu() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -13,10 +13,12 @@

 namespace Service::Nvidia::Devices {

-nvhost_gpu::nvhost_gpu(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {}
+nvhost_gpu::nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
 nvhost_gpu::~nvhost_gpu() = default;

-u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                      IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

@@ -119,8 +121,10 @@ u32 nvhost_gpu::AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8>& ou
                params.num_entries, params.flags, params.unk0, params.unk1, params.unk2,
                params.unk3);

-    params.fence_out.id = 0;
-    params.fence_out.value = 0;
+    auto& gpu = system.GPU();
+    params.fence_out.id = assigned_syncpoints;
+    params.fence_out.value = gpu.GetSyncpointValue(assigned_syncpoints);
+    assigned_syncpoints++;
    std::memcpy(output.data(), &params, output.size());
    return 0;
 }
@@ -143,7 +147,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    IoctlSubmitGpfifo params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
-                params.address, params.num_entries, params.flags);
+                params.address, params.num_entries, params.flags.raw);

    ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) +
                                   params.num_entries * sizeof(Tegra::CommandListHeader),
@@ -153,10 +157,18 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(Tegra::CommandListHeader));

-    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
+    UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0);
+    UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0);
+
+    auto& gpu = system.GPU();
+    u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id);
+    if (params.flags.increment.Value()) {
+        params.fence_out.value += current_syncpoint_value;
+    } else {
+        params.fence_out.value = current_syncpoint_value;
+    }
+    gpu.PushGPUEntries(std::move(entries));

-    params.fence_out.id = 0;
-    params.fence_out.value = 0;
    std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo));
    return 0;
 }
@@ -168,16 +180,24 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
    IoctlSubmitGpfifo params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
-                params.address, params.num_entries, params.flags);
+                params.address, params.num_entries, params.flags.raw);

    Tegra::CommandList entries(params.num_entries);
    Memory::ReadBlock(params.address, entries.data(),
                      params.num_entries * sizeof(Tegra::CommandListHeader));

-    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
+    UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0);
+    UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0);
+
+    auto& gpu = system.GPU();
+    u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id);
+    if (params.flags.increment.Value()) {
+        params.fence_out.value += current_syncpoint_value;
+    } else {
+        params.fence_out.value = current_syncpoint_value;
+    }
+    gpu.PushGPUEntries(std::move(entries));

-    params.fence_out.id = 0;
-    params.fence_out.value = 0;
    std::memcpy(output.data(), &params, output.size());
    return 0;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@@ -10,6 +10,7 @@
 #include "common/common_types.h"
 #include "common/swap.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
+#include "core/hle/service/nvdrv/nvdata.h"

 namespace Service::Nvidia::Devices {

@@ -20,10 +21,11 @@ constexpr u32 NVGPU_IOCTL_CHANNEL_KICKOFF_PB(0x1b);

 class nvhost_gpu final : public nvdevice {
 public:
-    explicit nvhost_gpu(std::shared_ptr<nvmap> nvmap_dev);
+    explicit nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_gpu() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
@@ -113,11 +115,7 @@ private:
    static_assert(sizeof(IoctlGetErrorNotification) == 16,
                  "IoctlGetErrorNotification is incorrect size");

-    struct IoctlFence {
-        u32_le id;
-        u32_le value;
-    };
-    static_assert(sizeof(IoctlFence) == 8, "IoctlFence is incorrect size");
+    static_assert(sizeof(Fence) == 8, "Fence is incorrect size");

    struct IoctlAllocGpfifoEx {
        u32_le num_entries;
@@ -132,13 +130,13 @@ private:
    static_assert(sizeof(IoctlAllocGpfifoEx) == 32, "IoctlAllocGpfifoEx is incorrect size");

    struct IoctlAllocGpfifoEx2 {
-        u32_le num_entries;   // in
-        u32_le flags;         // in
-        u32_le unk0;          // in (1 works)
-        IoctlFence fence_out; // out
-        u32_le unk1;          // in
-        u32_le unk2;          // in
-        u32_le unk3;          // in
+        u32_le num_entries; // in
+        u32_le flags;       // in
+        u32_le unk0;        // in (1 works)
+        Fence fence_out;    // out
+        u32_le unk1;        // in
+        u32_le unk2;        // in
+        u32_le unk3;        // in
    };
    static_assert(sizeof(IoctlAllocGpfifoEx2) == 32, "IoctlAllocGpfifoEx2 is incorrect size");

@@ -153,10 +151,16 @@ private:
    struct IoctlSubmitGpfifo {
        u64_le address;     // pointer to gpfifo entry structs
        u32_le num_entries; // number of fence objects being submitted
-        u32_le flags;
-        IoctlFence fence_out; // returned new fence object for others to wait on
+        union {
+            u32_le raw;
+            BitField<0, 1, u32_le> add_wait;      // append a wait sync_point to the list
+            BitField<1, 1, u32_le> add_increment; // append an increment to the list
+            BitField<2, 1, u32_le> new_hw_format; // Mostly ignored
+            BitField<8, 1, u32_le> increment;     // increment the returned fence
+        } flags;
+        Fence fence_out; // returned new fence object for others to wait on
    };
-    static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(IoctlFence),
+    static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(Fence),
                  "IoctlSubmitGpfifo is incorrect size");

    struct IoctlGetWaitbase {
@@ -184,6 +188,7 @@ private:
    u32 ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& output);

    std::shared_ptr<nvmap> nvmap_dev;
+    u32 assigned_syncpoints{};
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -10,10 +10,11 @@

 namespace Service::Nvidia::Devices {

-nvhost_nvdec::nvhost_nvdec() = default;
+nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {}
 nvhost_nvdec::~nvhost_nvdec() = default;

-u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                        IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {

 class nvhost_nvdec final : public nvdevice {
 public:
-    nvhost_nvdec();
+    explicit nvhost_nvdec(Core::System& system);
    ~nvhost_nvdec() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
@@ -10,10 +10,11 @@

 namespace Service::Nvidia::Devices {

-nvhost_nvjpg::nvhost_nvjpg() = default;
+nvhost_nvjpg::nvhost_nvjpg(Core::System& system) : nvdevice(system) {}
 nvhost_nvjpg::~nvhost_nvjpg() = default;

-u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                        IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

--- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {

 class nvhost_nvjpg final : public nvdevice {
 public:
-    nvhost_nvjpg();
+    explicit nvhost_nvjpg(Core::System& system);
    ~nvhost_nvjpg() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -10,10 +10,11 @@

 namespace Service::Nvidia::Devices {

-nvhost_vic::nvhost_vic() = default;
+nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}
 nvhost_vic::~nvhost_vic() = default;

-u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                      IoctlCtrl& ctrl) {
    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
              command.raw, input.size(), output.size());

--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {

 class nvhost_vic final : public nvdevice {
 public:
-    nvhost_vic();
+    explicit nvhost_vic(Core::System& system);
    ~nvhost_vic() override;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

 private:
    enum class IoctlCommand : u32_le {
--- a/src/core/hle/service/nvdrv/devices/nvmap.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvmap.cpp
@@ -18,7 +18,7 @@ enum {
 };
 }

-nvmap::nvmap() = default;
+nvmap::nvmap(Core::System& system) : nvdevice(system) {}
 nvmap::~nvmap() = default;

 VAddr nvmap::GetObjectAddress(u32 handle) const {
@@ -28,7 +28,8 @@ VAddr nvmap::GetObjectAddress(u32 handle) const {
    return object->addr;
 }

-u32 nvmap::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvmap::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                 IoctlCtrl& ctrl) {
    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::Create:
        return IocCreate(input, output);
--- a/src/core/hle/service/nvdrv/devices/nvmap.h
+++ b/src/core/hle/service/nvdrv/devices/nvmap.h
@@ -16,13 +16,14 @@ namespace Service::Nvidia::Devices {

 class nvmap final : public nvdevice {
 public:
-    nvmap();
+    explicit nvmap(Core::System& system);
    ~nvmap() override;

    /// Returns the allocated address of an nvmap object given its handle.
    VAddr GetObjectAddress(u32 handle) const;

-    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl) override;

    /// Represents an nvmap object.
    struct Object {
--- a/src/core/hle/service/nvdrv/interface.cpp
+++ b/src/core/hle/service/nvdrv/interface.cpp
@@ -8,12 +8,18 @@
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/readable_event.h"
+#include "core/hle/kernel/thread.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/nvdrv/interface.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvdrv/nvdrv.h"

 namespace Service::Nvidia {

+void NVDRV::SignalGPUInterruptSyncpt(const u32 syncpoint_id, const u32 value) {
+    nvdrv->SignalSyncpt(syncpoint_id, value);
+}
+
 void NVDRV::Open(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_NVDRV, "called");

@@ -36,11 +42,31 @@ void NVDRV::Ioctl(Kernel::HLERequestContext& ctx) {

    std::vector<u8> output(ctx.GetWriteBufferSize());

+    IoctlCtrl ctrl{};
+
+    u32 result = nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output, ctrl);
+
+    if (ctrl.must_delay) {
+        ctrl.fresh_call = false;
+        ctx.SleepClientThread(
+            "NVServices::DelayedResponse", ctrl.timeout,
+            [=](Kernel::SharedPtr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx,
+                Kernel::ThreadWakeupReason reason) {
+                IoctlCtrl ctrl2{ctrl};
+                std::vector<u8> output2 = output;
+                u32 result = nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output2, ctrl2);
+                ctx.WriteBuffer(output2);
+                IPC::ResponseBuilder rb{ctx, 3};
+                rb.Push(RESULT_SUCCESS);
+                rb.Push(result);
+            },
+            nvdrv->GetEventWriteable(ctrl.event_id));
+    } else {
+        ctx.WriteBuffer(output);
+    }
    IPC::ResponseBuilder rb{ctx, 3};
    rb.Push(RESULT_SUCCESS);
-    rb.Push(nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output));
-
-    ctx.WriteBuffer(output);
+    rb.Push(result);
 }

 void NVDRV::Close(Kernel::HLERequestContext& ctx) {
@@ -66,13 +92,19 @@ void NVDRV::Initialize(Kernel::HLERequestContext& ctx) {
 void NVDRV::QueryEvent(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    u32 fd = rp.Pop<u32>();
-    u32 event_id = rp.Pop<u32>();
+    // TODO(Blinkhawk): Figure out the meaning of the flag at bit 16
+    u32 event_id = rp.Pop<u32>() & 0x000000FF;
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, fd={:X}, event_id={:X}", fd, event_id);

    IPC::ResponseBuilder rb{ctx, 3, 1};
    rb.Push(RESULT_SUCCESS);
-    rb.PushCopyObjects(query_event.readable);
-    rb.Push<u32>(0);
+    if (event_id < MaxNvEvents) {
+        rb.PushCopyObjects(nvdrv->GetEvent(event_id));
+        rb.Push<u32>(NvResult::Success);
+    } else {
+        rb.Push<u32>(0);
+        rb.Push<u32>(NvResult::BadParameter);
+    }
 }

 void NVDRV::SetClientPID(Kernel::HLERequestContext& ctx) {
@@ -127,10 +159,6 @@ NVDRV::NVDRV(std::shared_ptr<Module> nvdrv, const char* name)
        {13, &NVDRV::FinishInitialize, "FinishInitialize"},
    };
    RegisterHandlers(functions);
-
-    auto& kernel = Core::System::GetInstance().Kernel();
-    query_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                         "NVDRV::query_event");
 }

 NVDRV::~NVDRV() = default;
--- a/src/core/hle/service/nvdrv/interface.h
+++ b/src/core/hle/service/nvdrv/interface.h
@@ -19,6 +19,8 @@ public:
    NVDRV(std::shared_ptr<Module> nvdrv, const char* name);
    ~NVDRV() override;

+    void SignalGPUInterruptSyncpt(const u32 syncpoint_id, const u32 value);
+
 private:
    void Open(Kernel::HLERequestContext& ctx);
    void Ioctl(Kernel::HLERequestContext& ctx);
@@ -33,8 +35,6 @@ private:
    std::shared_ptr<Module> nvdrv;

    u64 pid{};
-
-    Kernel::EventPair query_event;
 };

 } // namespace Service::Nvidia
--- a/src/core/hle/service/nvdrv/nvdata.h
+++ b/src/core/hle/service/nvdrv/nvdata.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <array>
+#include "common/common_types.h"
+
+namespace Service::Nvidia {
+
+constexpr u32 MaxSyncPoints = 192;
+constexpr u32 MaxNvEvents = 64;
+
+struct Fence {
+    s32 id;
+    u32 value;
+};
+
+static_assert(sizeof(Fence) == 8, "Fence has wrong size");
+
+struct MultiFence {
+    u32 num_fences;
+    std::array<Fence, 4> fences;
+};
+
+enum NvResult : u32 {
+    Success = 0,
+    BadParameter = 4,
+    Timeout = 5,
+    ResourceError = 15,
+};
+
+enum class EventState {
+    Free = 0,
+    Registered = 1,
+    Waiting = 2,
+    Busy = 3,
+};
+
+struct IoctlCtrl {
+    // First call done to the service for services that call themselves again after a call.
+    bool fresh_call{true};
+    // Tells the Ioctl Wrapper that it must delay the IPC response and send the thread to sleep
+    bool must_delay{};
+    // Timeout for the delay
+    s64 timeout{};
+    // NV Event Id
+    s32 event_id{-1};
+};
+
+} // namespace Service::Nvidia
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@@ -4,7 +4,10 @@

 #include <utility>

+#include <fmt/format.h>
 #include "core/hle/ipc_helpers.h"
+#include "core/hle/kernel/readable_event.h"
+#include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
 #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h"
 #include "core/hle/service/nvdrv/devices/nvhost_as_gpu.h"
@@ -22,8 +25,9 @@

 namespace Service::Nvidia {

-void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger) {
-    auto module_ = std::make_shared<Module>();
+void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger,
+                       Core::System& system) {
+    auto module_ = std::make_shared<Module>(system);
    std::make_shared<NVDRV>(module_, "nvdrv")->InstallAsService(service_manager);
    std::make_shared<NVDRV>(module_, "nvdrv:a")->InstallAsService(service_manager);
    std::make_shared<NVDRV>(module_, "nvdrv:s")->InstallAsService(service_manager);
@@ -32,17 +36,25 @@ void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger
    nvflinger.SetNVDrvInstance(module_);
 }

-Module::Module() {
-    auto nvmap_dev = std::make_shared<Devices::nvmap>();
-    devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(nvmap_dev);
-    devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(nvmap_dev);
-    devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>();
+Module::Module(Core::System& system) {
+    auto& kernel = system.Kernel();
+    for (u32 i = 0; i < MaxNvEvents; i++) {
+        std::string event_label = fmt::format("NVDRV::NvEvent_{}", i);
+        events_interface.events[i] = Kernel::WritableEvent::CreateEventPair(
+            kernel, Kernel::ResetType::Automatic, event_label);
+        events_interface.status[i] = EventState::Free;
+        events_interface.registered[i] = false;
+    }
+    auto nvmap_dev = std::make_shared<Devices::nvmap>(system);
+    devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(system, nvmap_dev);
+    devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(system, nvmap_dev);
+    devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>(system);
    devices["/dev/nvmap"] = nvmap_dev;
-    devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(nvmap_dev);
-    devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>();
-    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>();
-    devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>();
-    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>();
+    devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
+    devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
+    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system);
+    devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
+    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system);
 }

 Module::~Module() = default;
@@ -59,12 +71,13 @@ u32 Module::Open(const std::string& device_name) {
    return fd;
 }

-u32 Module::Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output) {
+u32 Module::Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output,
+                  IoctlCtrl& ctrl) {
    auto itr = open_files.find(fd);
    ASSERT_MSG(itr != open_files.end(), "Tried to talk to an invalid device");

    auto& device = itr->second;
-    return device->ioctl({command}, input, output);
+    return device->ioctl({command}, input, output, ctrl);
 }

 ResultCode Module::Close(u32 fd) {
@@ -77,4 +90,22 @@ ResultCode Module::Close(u32 fd) {
    return RESULT_SUCCESS;
 }

+void Module::SignalSyncpt(const u32 syncpoint_id, const u32 value) {
+    for (u32 i = 0; i < MaxNvEvents; i++) {
+        if (events_interface.assigned_syncpt[i] == syncpoint_id &&
+            events_interface.assigned_value[i] == value) {
+            events_interface.LiberateEvent(i);
+            events_interface.events[i].writable->Signal();
+        }
+    }
+}
+
+Kernel::SharedPtr<Kernel::ReadableEvent> Module::GetEvent(const u32 event_id) const {
+    return events_interface.events[event_id].readable;
+}
+
+Kernel::SharedPtr<Kernel::WritableEvent> Module::GetEventWriteable(const u32 event_id) const {
+    return events_interface.events[event_id].writable;
+}
+
 } // namespace Service::Nvidia
--- a/src/core/hle/service/nvdrv/nvdrv.h
+++ b/src/core/hle/service/nvdrv/nvdrv.h
@@ -8,8 +8,14 @@
 #include <unordered_map>
 #include <vector>
 #include "common/common_types.h"
+#include "core/hle/kernel/writable_event.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/service.h"

+namespace Core {
+class System;
+}
+
 namespace Service::NVFlinger {
 class NVFlinger;
 }
@@ -20,16 +26,72 @@ namespace Devices {
 class nvdevice;
 }

-struct IoctlFence {
-    u32 id;
-    u32 value;
+struct EventInterface {
+    // Mask representing currently busy events
+    u64 events_mask{};
+    // Each kernel event associated to an NV event
+    std::array<Kernel::EventPair, MaxNvEvents> events;
+    // The status of the current NVEvent
+    std::array<EventState, MaxNvEvents> status{};
+    // Tells if an NVEvent is registered or not
+    std::array<bool, MaxNvEvents> registered{};
+    // When an NVEvent is waiting on GPU interrupt, this is the sync_point
+    // associated with it.
+    std::array<u32, MaxNvEvents> assigned_syncpt{};
+    // This is the value of the GPU interrupt for which the NVEvent is waiting
+    // for.
+    std::array<u32, MaxNvEvents> assigned_value{};
+    // Constant to denote an unassigned syncpoint.
+    static constexpr u32 unassigned_syncpt = 0xFFFFFFFF;
+    std::optional<u32> GetFreeEvent() const {
+        u64 mask = events_mask;
+        for (u32 i = 0; i < MaxNvEvents; i++) {
+            const bool is_free = (mask & 0x1) == 0;
+            if (is_free) {
+                if (status[i] == EventState::Registered || status[i] == EventState::Free) {
+                    return {i};
+                }
+            }
+            mask = mask >> 1;
+        }
+        return {};
+    }
+    void SetEventStatus(const u32 event_id, EventState new_status) {
+        EventState old_status = status[event_id];
+        if (old_status == new_status) {
+            return;
+        }
+        status[event_id] = new_status;
+        if (new_status == EventState::Registered) {
+            registered[event_id] = true;
+        }
+        if (new_status == EventState::Waiting || new_status == EventState::Busy) {
+            events_mask |= (1ULL << event_id);
+        }
+    }
+    void RegisterEvent(const u32 event_id) {
+        registered[event_id] = true;
+        if (status[event_id] == EventState::Free) {
+            status[event_id] = EventState::Registered;
+        }
+    }
+    void UnregisterEvent(const u32 event_id) {
+        registered[event_id] = false;
+        if (status[event_id] == EventState::Registered) {
+            status[event_id] = EventState::Free;
+        }
+    }
+    void LiberateEvent(const u32 event_id) {
+        status[event_id] = registered[event_id] ? EventState::Registered : EventState::Free;
+        events_mask &= ~(1ULL << event_id);
+        assigned_syncpt[event_id] = unassigned_syncpt;
+        assigned_value[event_id] = 0;
+    }
 };

-static_assert(sizeof(IoctlFence) == 8, "IoctlFence has wrong size");
-
 class Module final {
 public:
-    Module();
+    Module(Core::System& system);
    ~Module();

    /// Returns a pointer to one of the available devices, identified by its name.
@@ -44,10 +106,17 @@ public:
    /// Opens a device node and returns a file descriptor to it.
    u32 Open(const std::string& device_name);
    /// Sends an ioctl command to the specified file descriptor.
-    u32 Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output);
+    u32 Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output,
+              IoctlCtrl& ctrl);
    /// Closes a device file descriptor and returns operation success.
    ResultCode Close(u32 fd);

+    void SignalSyncpt(const u32 syncpoint_id, const u32 value);
+
+    Kernel::SharedPtr<Kernel::ReadableEvent> GetEvent(u32 event_id) const;
+
+    Kernel::SharedPtr<Kernel::WritableEvent> GetEventWriteable(u32 event_id) const;
+
 private:
    /// Id to use for the next open file descriptor.
    u32 next_fd = 1;
@@ -57,9 +126,12 @@ private:

    /// Mapping of device node names to their implementation.
    std::unordered_map<std::string, std::shared_ptr<Devices::nvdevice>> devices;
+
+    EventInterface events_interface;
 };

 /// Registers all NVDRV services with the specified service manager.
-void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger);
+void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger,
+                       Core::System& system);

 } // namespace Service::Nvidia
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -34,7 +34,8 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
    buffer_wait_event.writable->Signal();
 }

-std::optional<u32> BufferQueue::DequeueBuffer(u32 width, u32 height) {
+std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> BufferQueue::DequeueBuffer(u32 width,
+                                                                                       u32 height) {
    auto itr = std::find_if(queue.begin(), queue.end(), [&](const Buffer& buffer) {
        // Only consider free buffers. Buffers become free once again after they've been Acquired
        // and Released by the compositor, see the NVFlinger::Compose method.
@@ -51,7 +52,7 @@ std::optional<u32> BufferQueue::DequeueBuffer(u32 width, u32 height) {
    }

    itr->status = Buffer::Status::Dequeued;
-    return itr->slot;
+    return {{itr->slot, &itr->multi_fence}};
 }

 const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const {
@@ -63,7 +64,8 @@ const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const {
 }

 void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform,
-                              const Common::Rectangle<int>& crop_rect) {
+                              const Common::Rectangle<int>& crop_rect, u32 swap_interval,
+                              Service::Nvidia::MultiFence& multi_fence) {
    auto itr = std::find_if(queue.begin(), queue.end(),
                            [&](const Buffer& buffer) { return buffer.slot == slot; });
    ASSERT(itr != queue.end());
@@ -71,12 +73,21 @@ void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform,
    itr->status = Buffer::Status::Queued;
    itr->transform = transform;
    itr->crop_rect = crop_rect;
+    itr->swap_interval = swap_interval;
+    itr->multi_fence = multi_fence;
+    queue_sequence.push_back(slot);
 }

 std::optional<std::reference_wrapper<const BufferQueue::Buffer>> BufferQueue::AcquireBuffer() {
-    auto itr = std::find_if(queue.begin(), queue.end(), [](const Buffer& buffer) {
-        return buffer.status == Buffer::Status::Queued;
-    });
+    auto itr = queue.end();
+    // Pop slots in submission order until one refers to a buffer that is still queued.
+    while (itr == queue.end() && !queue_sequence.empty()) {
+        u32 slot = queue_sequence.front();
+        itr = std::find_if(queue.begin(), queue.end(), [&slot](const Buffer& buffer) {
+            return buffer.status == Buffer::Status::Queued && buffer.slot == slot;
+        });
+        queue_sequence.pop_front();
+    }
    if (itr == queue.end())
        return {};
    itr->status = Buffer::Status::Acquired;
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <list>
 #include <optional>
 #include <vector>

@@ -12,6 +13,7 @@
 #include "common/swap.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/writable_event.h"
+#include "core/hle/service/nvdrv/nvdata.h"

 namespace Service::NVFlinger {

@@ -68,13 +70,17 @@ public:
        IGBPBuffer igbp_buffer;
        BufferTransformFlags transform;
        Common::Rectangle<int> crop_rect;
+        u32 swap_interval;
+        Service::Nvidia::MultiFence multi_fence;
    };

    void SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer);
-    std::optional<u32> DequeueBuffer(u32 width, u32 height);
+    std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> DequeueBuffer(u32 width,
+                                                                              u32 height);
    const IGBPBuffer& RequestBuffer(u32 slot) const;
    void QueueBuffer(u32 slot, BufferTransformFlags transform,
-                     const Common::Rectangle<int>& crop_rect);
+                     const Common::Rectangle<int>& crop_rect, u32 swap_interval,
+                     Service::Nvidia::MultiFence& multi_fence);
    std::optional<std::reference_wrapper<const Buffer>> AcquireBuffer();
    void ReleaseBuffer(u32 slot);
    u32 Query(QueryType type);
@@ -92,6 +98,7 @@ private:
    u64 layer_id;

    std::vector<Buffer> queue;
+    std::list<u32> queue_sequence;
    Kernel::EventPair buffer_wait_event;
 };

--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -37,15 +37,14 @@ NVFlinger::NVFlinger(Core::Timing::CoreTiming& core_timing) : core_timing{core_t
    displays.emplace_back(4, "Null");

    // Schedule the screen composition events
-    const auto ticks = Settings::values.force_30fps_mode ? frame_ticks_30fps : frame_ticks;
+    composition_event = core_timing.RegisterEvent("ScreenComposition", [this](u64 userdata,
+                                                                              s64 cycles_late) {
+        Compose();
+        const auto ticks = Settings::values.force_30fps_mode ? frame_ticks_30fps : GetNextTicks();
+        this->core_timing.ScheduleEvent(std::max<s64>(0LL, ticks - cycles_late), composition_event);
+    });

-    composition_event = core_timing.RegisterEvent(
-        "ScreenComposition", [this, ticks](u64 userdata, s64 cycles_late) {
-            Compose();
-            this->core_timing.ScheduleEvent(ticks - cycles_late, composition_event);
-        });
-
-    core_timing.ScheduleEvent(ticks, composition_event);
+    core_timing.ScheduleEvent(frame_ticks, composition_event);
 }

 NVFlinger::~NVFlinger() {
@@ -206,8 +205,14 @@ void NVFlinger::Compose() {
                     igbp_buffer.width, igbp_buffer.height, igbp_buffer.stride,
                     buffer->get().transform, buffer->get().crop_rect);

+        swap_interval = buffer->get().swap_interval;
        buffer_queue.ReleaseBuffer(buffer->get().slot);
    }
 }

+s64 NVFlinger::GetNextTicks() const {
+    constexpr s64 max_hertz = 120LL;
+    return (Core::Timing::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz;
+}
+
 } // namespace Service::NVFlinger
--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -74,6 +74,8 @@ public:
    /// finished.
    void Compose();

+    s64 GetNextTicks() const;
+
 private:
    /// Finds the display identified by the specified ID.
    VI::Display* FindDisplay(u64 display_id);
@@ -98,6 +100,8 @@ private:
    /// layers.
    u32 next_buffer_queue_id = 1;

+    u32 swap_interval = 1;
+
    /// Event that handles screen composition.
    Core::Timing::EventType* composition_event;

--- a/src/core/hle/service/service.cpp
+++ b/src/core/hle/service/service.cpp
@@ -236,7 +236,7 @@ void Init(std::shared_ptr<SM::ServiceManager>& sm, Core::System& system) {
    NIM::InstallInterfaces(*sm);
    NPNS::InstallInterfaces(*sm);
    NS::InstallInterfaces(*sm);
-    Nvidia::InstallInterfaces(*sm, *nv_flinger);
+    Nvidia::InstallInterfaces(*sm, *nv_flinger, system);
    PCIe::InstallInterfaces(*sm);
    PCTL::InstallInterfaces(*sm);
    PCV::InstallInterfaces(*sm);
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -21,6 +21,7 @@
 #include "core/hle/kernel/readable_event.h"
 #include "core/hle/kernel/thread.h"
 #include "core/hle/kernel/writable_event.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvdrv/nvdrv.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "core/hle/service/nvflinger/nvflinger.h"
@@ -328,32 +329,22 @@ public:
    Data data;
 };

-struct BufferProducerFence {
-    u32 is_valid;
-    std::array<Nvidia::IoctlFence, 4> fences;
-};
-static_assert(sizeof(BufferProducerFence) == 36, "BufferProducerFence has wrong size");
-
 class IGBPDequeueBufferResponseParcel : public Parcel {
 public:
-    explicit IGBPDequeueBufferResponseParcel(u32 slot) : slot(slot) {}
+    explicit IGBPDequeueBufferResponseParcel(u32 slot, Service::Nvidia::MultiFence& multi_fence)
+        : slot(slot), multi_fence(multi_fence) {}
    ~IGBPDequeueBufferResponseParcel() override = default;

 protected:
    void SerializeData() override {
-        // TODO(Subv): Find out how this Fence is used.
-        BufferProducerFence fence = {};
-        fence.is_valid = 1;
-        for (auto& fence_ : fence.fences)
-            fence_.id = -1;
-
        Write(slot);
        Write<u32_le>(1);
-        WriteObject(fence);
+        WriteObject(multi_fence);
        Write<u32_le>(0);
    }

    u32_le slot;
+    Service::Nvidia::MultiFence multi_fence;
 };

 class IGBPRequestBufferRequestParcel : public Parcel {
@@ -400,12 +391,6 @@ public:
        data = Read<Data>();
    }

-    struct Fence {
-        u32_le id;
-        u32_le value;
-    };
-    static_assert(sizeof(Fence) == 8, "Fence has wrong size");
-
    struct Data {
        u32_le slot;
        INSERT_PADDING_WORDS(3);
@@ -418,15 +403,15 @@ public:
        s32_le scaling_mode;
        NVFlinger::BufferQueue::BufferTransformFlags transform;
        u32_le sticky_transform;
-        INSERT_PADDING_WORDS(2);
-        u32_le fence_is_valid;
-        std::array<Fence, 2> fences;
+        INSERT_PADDING_WORDS(1);
+        u32_le swap_interval;
+        Service::Nvidia::MultiFence multi_fence;

        Common::Rectangle<int> GetCropRect() const {
            return {crop_left, crop_top, crop_right, crop_bottom};
        }
    };
-    static_assert(sizeof(Data) == 80, "ParcelData has wrong size");
+    static_assert(sizeof(Data) == 96, "ParcelData has wrong size");

    Data data;
 };
@@ -547,11 +532,11 @@ private:
            IGBPDequeueBufferRequestParcel request{ctx.ReadBuffer()};
            const u32 width{request.data.width};
            const u32 height{request.data.height};
-            std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height);
+            auto result = buffer_queue.DequeueBuffer(width, height);

-            if (slot) {
+            if (result) {
                // Buffer is available
-                IGBPDequeueBufferResponseParcel response{*slot};
+                IGBPDequeueBufferResponseParcel response{result->first, *result->second};
                ctx.WriteBuffer(response.Serialize());
            } else {
                // Wait the current thread until a buffer becomes available
@@ -561,10 +546,10 @@ private:
                        Kernel::ThreadWakeupReason reason) {
                        // Repeat TransactParcel DequeueBuffer when a buffer is available
                        auto& buffer_queue = nv_flinger->FindBufferQueue(id);
-                        std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height);
-                        ASSERT_MSG(slot != std::nullopt, "Could not dequeue buffer.");
+                        auto result = buffer_queue.DequeueBuffer(width, height);
+                        ASSERT_MSG(result != std::nullopt, "Could not dequeue buffer.");

-                        IGBPDequeueBufferResponseParcel response{*slot};
+                        IGBPDequeueBufferResponseParcel response{result->first, *result->second};
                        ctx.WriteBuffer(response.Serialize());
                        IPC::ResponseBuilder rb{ctx, 2};
                        rb.Push(RESULT_SUCCESS);
@@ -582,7 +567,8 @@ private:
            IGBPQueueBufferRequestParcel request{ctx.ReadBuffer()};

            buffer_queue.QueueBuffer(request.data.slot, request.data.transform,
-                                     request.data.GetCropRect());
+                                     request.data.GetCropRect(), request.data.swap_interval,
+                                     request.data.multi_fence);

            IGBPQueueBufferResponseParcel response{1280, 720};
            ctx.WriteBuffer(response.Serialize());
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() {
    MICROPROFILE_SCOPE(DispatchCalls);

    // On entering GPU code, assume all memory may be touched by the ARM core.
-    gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
+    gpu.Maxwell3D().dirty.OnMemoryWrite();

    dma_pushbuffer_subindex = 0;

@@ -31,6 +31,7 @@ void DmaPusher::DispatchCalls() {
            break;
        }
    }
+    gpu.FlushCommands();
 }

 bool DmaPusher::Step() {
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
        }
        break;
    }
@@ -50,13 +50,14 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
 }

 void KeplerCompute::ProcessLaunch() {
-
    const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
    memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
                                   LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));

-    const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
-    LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
+    const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start;
+    LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr);
+
+    rasterizer.DispatchCompute(code_addr);
 }

 } // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
        }
        break;
    }
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste
                     MemoryManager& memory_manager)
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+    InitDirtySettings();
    InitializeRegisterDefaults();
 }

@@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.stencil_back_func_mask = 0xFFFFFFFF;
    regs.stencil_back_mask = 0xFFFFFFFF;

+    regs.depth_test_func = Regs::ComparisonOp::Always;
+    regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise;
+    regs.cull.cull_face = Regs::Cull::CullFace::Back;
+
    // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
    // register carrying a default value. Assume it's OpenGL's default (1).
    regs.point_size = 1.0f;
@@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.rt_separate_frag_data = 1;
 }

+#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))
+
+void Maxwell3D::InitDirtySettings() {
+    // Builds dirty_pointers: for each register index, the offset of the DirtyRegs flag raised
+    // when that register changes. Indices left at 0 (null_dirty) raise no flag.
+    // set_block tags `range` consecutive registers, starting at `start`, with flag `position`.
+    const auto set_block = [this](const u32 start, const u32 range, const u8 position) {
+        const auto first = dirty_pointers.begin() + start;
+        std::fill(first, first + range, position);
+    };
+    dirty.regs.fill(true);
+
+    // Init Render Targets
+    constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
+    constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
+    constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * Regs::NumRenderTargets;
+    u32 rt_dirty_reg = DIRTY_REGS_POS(render_target);
+    for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
+        set_block(rt_reg, registers_per_rt, rt_dirty_reg);
+        rt_dirty_reg++;
+    }
+    constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
+    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
+    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag;
+    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag;
+    constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
+    constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta);
+    set_block(zeta_reg, registers_in_zeta, depth_buffer_flag);
+
+    // Init Vertex Arrays
+    constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
+    constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
+    constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
+    u32 va_reg = DIRTY_REGS_POS(vertex_array);
+    u32 vi_reg = DIRTY_REGS_POS(vertex_instance);
+    for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
+         vertex_reg += vertex_array_size) {
+        set_block(vertex_reg, 3, va_reg);
+        // The divisor concerns vertex array instances
+        dirty_pointers[vertex_reg + 3] = vi_reg;
+        va_reg++;
+        vi_reg++;
+    }
+    constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
+    constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
+    constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
+    va_reg = DIRTY_REGS_POS(vertex_array);
+    for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
+         vertex_reg += vertex_limit_size) {
+        set_block(vertex_reg, vertex_limit_size, va_reg);
+        va_reg++;
+    }
+    constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
+    constexpr u32 vertex_instance_size =
+        sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
+    constexpr u32 vertex_instance_end =
+        vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
+    vi_reg = DIRTY_REGS_POS(vertex_instance);
+    for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
+         vertex_reg += vertex_instance_size) {
+        set_block(vertex_reg, vertex_instance_size, vi_reg);
+        vi_reg++;
+    }
+    set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
+              DIRTY_REGS_POS(vertex_attrib_format));
+
+    // Init Shaders
+    constexpr u32 shader_registers_count =
+        sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
+    set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count,
+              DIRTY_REGS_POS(shaders));
+
+    // Rasterizer state: viewport
+    constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
+    constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
+    constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
+    set_block(viewport_start, viewport_size, viewport_dirty_reg);
+    constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control);
+    constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32);
+    set_block(view_volume_start, view_volume_size, viewport_dirty_reg);
+
+    // Viewport transformation
+    constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform);
+    constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32);
+    set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform));
+
+    // Cullmode
+    constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull);
+    constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32);
+    set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode));
+
+    // Screen y control
+    dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control);
+
+    // Primitive Restart
+    constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart);
+    constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32);
+    set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));
+
+    // Depth Test
+    constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
+    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;
+
+    // Stencil Test
+    constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test);
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;
+
+    // Color Mask
+    constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
+    dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
+    set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
+              color_mask_dirty_reg);
+    // Blend State
+    constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
+    set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
+              blend_state_dirty_reg);
+    dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
+    set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg);
+    set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32),
+              blend_state_dirty_reg);
+
+    // Scissor State
+    constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
+    set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
+              scissor_test_dirty_reg);
+
+    // Polygon Offset
+    constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;
+}
+
 void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
    // Reset the current macro.
    executing_macro = 0;
@@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {

    const u32 method = method_call.method;

+    if (method == cb_data_state.current) {
+        regs.reg_array[method] = method_call.argument;
+        ProcessCBData(method_call.argument);
+        return;
+    } else if (cb_data_state.current != null_cb_data) {
+        FinishCBData();
+    }
+
    // It is an error to write to a register other than the current macro's ARG register before it
    // has finished execution.
    if (executing_macro != 0) {
@@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {

    if (regs.reg_array[method] != method_call.argument) {
        regs.reg_array[method] = method_call.argument;
-        // Color buffers
-        constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
-        constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
-        if (method >= first_rt_reg &&
-            method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
-            const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
-            dirty_flags.color_buffer.set(rt_index);
-        }
-
-        // Zeta buffer
-        constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
-        if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
-            method == MAXWELL3D_REG_INDEX(zeta_width) ||
-            method == MAXWELL3D_REG_INDEX(zeta_height) ||
-            (method >= MAXWELL3D_REG_INDEX(zeta) &&
-             method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
-            dirty_flags.zeta_buffer = true;
-        }
-
-        // Shader
-        constexpr u32 shader_registers_count =
-            sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
-        if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
-            method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
-            dirty_flags.shaders = true;
-        }
-
-        // Vertex format
-        if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
-            method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
-            dirty_flags.vertex_attrib_format = true;
-        }
-
-        // Vertex buffer
-        if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
-        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
-        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
+        const std::size_t dirty_reg = dirty_pointers[method];
+        if (dirty_reg) {
+            dirty.regs[dirty_reg] = true;
+            if (dirty_reg >= DIRTY_REGS_POS(vertex_array) &&
+                dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) {
+                dirty.vertex_array_buffers = true;
+            } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) &&
+                       dirty_reg < DIRTY_REGS_POS(vertex_instances)) {
+                dirty.vertex_instances = true;
+            } else if (dirty_reg >= DIRTY_REGS_POS(render_target) &&
+                       dirty_reg < DIRTY_REGS_POS(render_settings)) {
+                dirty.render_settings = true;
+            }
        }
    }

@@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
-        ProcessCBData(method_call.argument);
+        StartCBData(method);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): {
@@ -249,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        ProcessQueryGet();
        break;
    }
+    case MAXWELL3D_REG_INDEX(condition.mode): {
+        ProcessQueryCondition();
+        break;
+    }
    case MAXWELL3D_REG_INDEX(sync_info): {
        ProcessSyncPoint();
        break;
@@ -261,7 +401,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            dirty_flags.OnMemoryWrite();
+            dirty.OnMemoryWrite();
        }
        break;
    }
@@ -302,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() {
        result = regs.query.query_sequence;
        break;
    default:
+        result = 1;
        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
                          static_cast<u32>(regs.query.query_get.select.Value()));
    }
@@ -333,7 +474,6 @@ void Maxwell3D::ProcessQueryGet() {
            query_result.timestamp = system.CoreTiming().GetTicks();
            memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
        }
-        dirty_flags.OnMemoryWrite();
        break;
    }
    default:
@@ -342,12 +482,52 @@ void Maxwell3D::ProcessQueryGet() {
    }
 }

+void Maxwell3D::ProcessQueryCondition() {
+    // Recomputes execute_on (exposed through ShouldExecute()) from regs.condition.
+    // The comparing modes read a QueryCompare record at the condition address.
+    const GPUVAddr condition_address{regs.condition.Address()};
+    switch (regs.condition.mode) {
+    case Regs::ConditionMode::Always:
+        execute_on = true;
+        break;
+    case Regs::ConditionMode::Never:
+        execute_on = false;
+        break;
+    case Regs::ConditionMode::ResNonZero: {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
+        break;
+    }
+    case Regs::ConditionMode::Equal: {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        execute_on =
+            cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
+        break;
+    }
+    case Regs::ConditionMode::NotEqual: {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        execute_on =
+            cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
+        break;
+    }
+    default:
+        // Unknown mode: report it and fall back to executing.
+        UNIMPLEMENTED_MSG("Unimplemented Condition Mode!");
+        execute_on = true;
+        break;
+    }
+}
+
 void Maxwell3D::ProcessSyncPoint() {
    const u32 sync_point = regs.sync_info.sync_point.Value();
    const u32 increment = regs.sync_info.increment.Value();
    const u32 cache_flush = regs.sync_info.unknown.Value();
-    LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
-              cache_flush);
+    if (increment) {
+        system.GPU().IncrementSyncPoint(sync_point);
+    }
 }

 void Maxwell3D::DrawArrays() {
@@ -405,23 +585,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
 }

 void Maxwell3D::ProcessCBData(u32 value) {
+    const u32 id = cb_data_state.id;
+    cb_data_state.buffer[id][cb_data_state.counter] = value;
+    // Increment the current buffer position.
+    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+    cb_data_state.counter++;
+}
+
+void Maxwell3D::StartCBData(u32 method) {
+    constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+    cb_data_state.start_pos = regs.const_buffer.cb_pos;
+    cb_data_state.id = method - first_cb_data;
+    cb_data_state.current = method;
+    cb_data_state.counter = 0;
+    ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
+}
+
+void Maxwell3D::FinishCBData() {
    // Write the input value to the current const buffer at the current position.
    const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
    ASSERT(buffer_address != 0);

    // Don't allow writing past the end of the buffer.
-    ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size);
+    ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);

-    const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos};
+    const GPUVAddr address{buffer_address + cb_data_state.start_pos};
+    const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos;

-    u8* ptr{memory_manager.GetPointer(address)};
-    rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
-    memory_manager.Write<u32>(address, value);
+    const u32 id = cb_data_state.id;
+    memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
+    dirty.OnMemoryWrite();

-    dirty_flags.OnMemoryWrite();
-
-    // Increment the current buffer position.
-    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+    cb_data_state.id = null_cb_data;
+    cb_data_state.current = null_cb_data;
 }

 Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -90,6 +90,20 @@ public:

        enum class QuerySelect : u32 {
            Zero = 0,
+            TimeElapsed = 2,
+            TransformFeedbackPrimitivesGenerated = 11,
+            PrimitivesGenerated = 18,
+            SamplesPassed = 21,
+            TransformFeedbackUnknown = 26,
+        };
+
+        struct QueryCompare {
+            u32 initial_sequence;
+            u32 initial_mode;
+            u32 unknown1;
+            u32 unknown2;
+            u32 current_sequence;
+            u32 current_mode;
        };

        enum class QuerySyncCondition : u32 {
@@ -97,6 +111,14 @@ public:
            GreaterThan = 1,
        };

+        enum class ConditionMode : u32 {
+            Never = 0,
+            Always = 1,
+            ResNonZero = 2,
+            Equal = 3,
+            NotEqual = 4,
+        };
+
        enum class ShaderProgram : u32 {
            VertexA = 0,
            VertexB = 1,
@@ -815,7 +837,18 @@ public:
                    BitField<4, 1, u32> alpha_to_one;
                } multisample_control;

-                INSERT_PADDING_WORDS(0x7);
+                INSERT_PADDING_WORDS(0x4);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    ConditionMode mode;
+                    /// Joins address_high (shifted up 32 bits) with address_low.
+                    GPUVAddr Address() const {
+                        const GPUVAddr upper_bits = static_cast<GPUVAddr>(address_high) << 32;
+                        return upper_bits | address_low;
+                    }
+                } condition;

                struct {
                    u32 tsc_address_high;
@@ -1124,23 +1157,77 @@ public:

    State state{};

-    struct DirtyFlags {
-        std::bitset<8> color_buffer{0xFF};
-        std::bitset<32> vertex_array{0xFFFFFFFF};
+    struct DirtyRegs {
+        static constexpr std::size_t NUM_REGS = 256;
+        union {
+            struct {
+                bool null_dirty;

-        bool vertex_attrib_format = true;
-        bool zeta_buffer = true;
-        bool shaders = true;
+                // Vertex Attributes
+                bool vertex_attrib_format;
+
+                // Vertex Arrays
+                std::array<bool, 32> vertex_array;
+
+                bool vertex_array_buffers;
+
+                // Vertex Instances
+                std::array<bool, 32> vertex_instance;
+
+                bool vertex_instances;
+
+                // Render Targets
+                std::array<bool, 8> render_target;
+                bool depth_buffer;
+
+                bool render_settings;
+
+                // Shaders
+                bool shaders;
+
+                // Rasterizer State
+                bool viewport;
+                bool clip_coefficient;
+                bool cull_mode;
+                bool primitive_restart;
+                bool depth_test;
+                bool stencil_test;
+                bool blend_state;
+                bool scissor_test;
+                bool transform_feedback;
+                bool color_mask;
+                bool polygon_offset;
+
+                // Complementary
+                bool viewport_transform;
+                bool screen_y_control;
+
+                bool memory_general;
+            };
+            std::array<bool, NUM_REGS> regs;
+        };
+
+        void ResetVertexArrays() {
+            vertex_array.fill(true);
+            vertex_array_buffers = true;
+        }
+
+        void ResetRenderTargets() {
+            depth_buffer = true;
+            render_target.fill(true);
+            render_settings = true;
+        }

        void OnMemoryWrite() {
-            zeta_buffer = true;
            shaders = true;
-            color_buffer.set();
-            vertex_array.set();
+            memory_general = true;
+            ResetRenderTargets();
+            ResetVertexArrays();
        }
-    };

-    DirtyFlags dirty_flags;
+    } dirty{};
+
+    std::array<u8, Regs::NUM_REGS> dirty_pointers{};

    /// Reads a register value located at the input method address
    u32 GetRegisterValue(u32 method) const;
@@ -1169,6 +1256,10 @@ public:
        return macro_memory;
    }

+    bool ShouldExecute() const {
+        return execute_on;
+    }
+
 private:
    void InitializeRegisterDefaults();

@@ -1192,14 +1283,27 @@ private:
    /// Interpreter for the macro codes uploaded to the GPU.
    MacroInterpreter macro_interpreter;

+    static constexpr u32 null_cb_data = 0xFFFFFFFF;
+    struct {
+        std::array<std::array<u32, 0x4000>, 16> buffer;
+        u32 current{null_cb_data};
+        u32 id{null_cb_data};
+        u32 start_pos{};
+        u32 counter{};
+    } cb_data_state;
+
    Upload::State upload_state;

+    bool execute_on{true};
+
    /// Retrieves information about a specific TIC entry from the TIC buffer.
    Texture::TICEntry GetTICEntry(u32 tic_index) const;

    /// Retrieves information about a specific TSC entry from the TSC buffer.
    Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;

+    void InitDirtySettings();
+
    /**
     * Call a macro on this engine.
     * @param method Method to call
@@ -1219,11 +1323,16 @@ private:
    /// Handles a write to the QUERY_GET register.
    void ProcessQueryGet();

+    /// Handles a write to the condition mode register (conditional rendering).
+    void ProcessQueryCondition();
+
    /// Handles writes to syncing register.
    void ProcessSyncPoint();

    /// Handles a write to the CB_DATA[i] register.
+    void StartCBData(u32 method);
    void ProcessCBData(u32 value);
+    void FinishCBData();

    /// Handles a write to the CB_BIND register.
    void ProcessCBBind(Regs::ShaderStage stage);
@@ -1290,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
 ASSERT_REG_POSITION(point_size, 0x546);
 ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(multisample_control, 0x54F);
+ASSERT_REG_POSITION(condition, 0x554);
 ASSERT_REG_POSITION(tsc, 0x557);
 ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
 ASSERT_REG_POSITION(tic, 0x55D);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
 }

 void MaxwellDMA::HandleCopy() {
-    LOG_WARNING(HW_GPU, "Requested a DMA copy");
+    LOG_TRACE(HW_GPU, "Requested a DMA copy");

    const GPUVAddr source = regs.src_address.Address();
    const GPUVAddr dest = regs.dst_address.Address();
@@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() {
    }

    // All copies here update the main memory, so mark all rasterizer states as invalid.
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+    system.GPU().Maxwell3D().dirty.OnMemoryWrite();

    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -931,8 +931,6 @@ union Instruction {
    } csetp;

    union {
-        BitField<35, 4, PredCondition> cond;
-        BitField<49, 1, u64> h_and;
        BitField<6, 1, u64> ftz;
        BitField<45, 2, PredOperation> op;
        BitField<3, 3, u64> pred3;
@@ -940,9 +938,21 @@ union Instruction {
        BitField<43, 1, u64> negate_a;
        BitField<44, 1, u64> abs_a;
        BitField<47, 2, HalfType> type_a;
-        BitField<31, 1, u64> negate_b;
-        BitField<30, 1, u64> abs_b;
-        BitField<28, 2, HalfType> type_b;
+        union {
+            BitField<35, 4, PredCondition> cond;
+            BitField<49, 1, u64> h_and;
+            BitField<31, 1, u64> negate_b;
+            BitField<30, 1, u64> abs_b;
+            BitField<28, 2, HalfType> type_b;
+        } reg;
+        union {
+            BitField<56, 1, u64> negate_b;
+            BitField<54, 1, u64> abs_b;
+        } cbuf;
+        union {
+            BitField<49, 4, PredCondition> cond;
+            BitField<53, 1, u64> h_and;
+        } cbuf_and_imm;
        BitField<42, 1, u64> neg_pred;
        BitField<39, 3, u64> pred39;
    } hsetp2;
@@ -1548,7 +1558,9 @@ public:
        HFMA2_RC,
        HFMA2_RR,
        HFMA2_IMM_R,
+        HSETP2_C,
        HSETP2_R,
+        HSETP2_IMM,
        HSET2_R,
        POPC_C,
        POPC_R,
@@ -1831,7 +1843,9 @@ private:
            INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),
            INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),
            INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"),
-            INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"),
+            INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
+            INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
+            INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
            INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
            INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
            INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
    UNREACHABLE();
 }

-GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
+    : system{system}, renderer{renderer}, is_async{is_async} {
    auto& rasterizer{renderer.Rasterizer()};
    memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
@@ -50,6 +51,14 @@ const Engines::Maxwell3D& GPU::Maxwell3D() const {
    return *maxwell_3d;
 }

+// Returns the KeplerCompute engine referenced by kepler_compute.
+Engines::KeplerCompute& GPU::KeplerCompute() {
+    return *kepler_compute;
+}
+
+// Const overload of the accessor above.
+const Engines::KeplerCompute& GPU::KeplerCompute() const {
+    return *kepler_compute;
+}
+
 MemoryManager& GPU::MemoryManager() {
    return *memory_manager;
 }
@@ -66,6 +75,55 @@ const DmaPusher& GPU::DmaPusher() const {
    return *dma_pusher;
 }

+// Increments a syncpoint and fires a CPU interrupt for every registered threshold that the new
+// value has reached. Fired thresholds are removed from the pending list.
+void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+    // The counter is bumped before sync_mutex is taken; only the pending list is guarded here.
+    syncpoints[syncpoint_id]++;
+    std::lock_guard lock{sync_mutex};
+    auto& pending = syncpt_interrupts[syncpoint_id];
+    if (pending.empty()) {
+        return;
+    }
+    const u32 current = syncpoints[syncpoint_id].load();
+    for (auto iter = pending.begin(); iter != pending.end();) {
+        if (current < *iter) {
+            // Threshold not reached yet, keep it registered.
+            ++iter;
+            continue;
+        }
+        TriggerCpuInterrupt(syncpoint_id, *iter);
+        iter = pending.erase(iter);
+    }
+}
+
+// Returns the current counter value of the given syncpoint (atomic load, no lock taken).
+u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
+    return syncpoints[syncpoint_id].load();
+}
+
+// Adds `value` to the pending interrupt thresholds of a syncpoint unless it is already present.
+// NOTE(review): unlike IncrementSyncPoint/CancelSyncptInterrupt this does not acquire sync_mutex
+// itself; presumably callers are expected to hold LockSync() - confirm against the call sites.
+void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    const bool already_registered =
+        std::find(interrupt.begin(), interrupt.end(), value) != interrupt.end();
+    if (!already_registered) {
+        interrupt.emplace_back(value);
+    }
+}
+
+// Removes a pending interrupt threshold from a syncpoint.
+// Returns true when the threshold was registered (and has now been removed), false otherwise.
+bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    std::lock_guard lock{sync_mutex};
+    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    const auto match = std::find(interrupt.begin(), interrupt.end(), value);
+    const bool was_registered = match != interrupt.end();
+    if (was_registered) {
+        interrupt.erase(match);
+    }
+    return was_registered;
+}
+
+// Forwards the flush request to the renderer's rasterizer.
+void GPU::FlushCommands() {
+    renderer.Rasterizer().FlushCommands();
+}
+
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
    ASSERT(format != RenderTargetFormat::NONE);

@@ -143,12 +201,12 @@ enum class BufferMethods {
    NotifyIntr = 0x8,
    WrcacheFlush = 0x9,
    Unk28 = 0xA,
-    Unk2c = 0xB,
+    UnkCacheFlush = 0xB,
    RefCnt = 0x14,
    SemaphoreAcquire = 0x1A,
    SemaphoreRelease = 0x1B,
-    Unk70 = 0x1C,
-    Unk74 = 0x1D,
+    FenceValue = 0x1C,
+    FenceAction = 0x1D,
    Unk78 = 0x1E,
    Unk7c = 0x1F,
    Yield = 0x20,
@@ -194,6 +252,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
    case BufferMethods::SemaphoreAddressLow:
    case BufferMethods::SemaphoreSequence:
    case BufferMethods::RefCnt:
+    case BufferMethods::UnkCacheFlush:
+    case BufferMethods::WrcacheFlush:
+    case BufferMethods::FenceValue:
+    case BufferMethods::FenceAction:
        break;
    case BufferMethods::SemaphoreTrigger: {
        ProcessSemaphoreTriggerMethod();
@@ -204,21 +266,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
        LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
        break;
    }
-    case BufferMethods::WrcacheFlush: {
-        // TODO(Kmather73): Research and implement this method.
-        LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented");
-        break;
-    }
    case BufferMethods::Unk28: {
        // TODO(Kmather73): Research and implement this method.
        LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
        break;
    }
-    case BufferMethods::Unk2c: {
-        // TODO(Kmather73): Research and implement this method.
-        LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented");
-        break;
-    }
    case BufferMethods::SemaphoreAcquire: {
        ProcessSemaphoreAcquire();
        break;
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -5,8 +5,12 @@
 #pragma once

 #include <array>
+#include <atomic>
+#include <list>
 #include <memory>
+#include <mutex>
 #include "common/common_types.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/dma_pusher.h"

@@ -127,7 +131,7 @@ class MemoryManager;

 class GPU {
 public:
-    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);

    virtual ~GPU();

@@ -149,12 +153,20 @@ public:
    /// Calls a GPU method.
    void CallMethod(const MethodCall& method_call);

+    /// Asks the rasterizer to flush its commands (forwards to RasterizerInterface::FlushCommands).
+    void FlushCommands();
+
    /// Returns a reference to the Maxwell3D GPU engine.
    Engines::Maxwell3D& Maxwell3D();

    /// Returns a const reference to the Maxwell3D GPU engine.
    const Engines::Maxwell3D& Maxwell3D() const;

+    /// Returns a reference to the KeplerCompute GPU engine.
+    Engines::KeplerCompute& KeplerCompute();
+
+    /// Returns a const reference to the KeplerCompute GPU engine.
+    const Engines::KeplerCompute& KeplerCompute() const;
+
    /// Returns a reference to the GPU memory manager.
    Tegra::MemoryManager& MemoryManager();

@@ -164,6 +176,22 @@ public:
    /// Returns a reference to the GPU DMA pusher.
    Tegra::DmaPusher& DmaPusher();

+    /// Increments the given syncpoint and triggers a CPU interrupt for every registered
+    /// threshold that the new value has reached.
+    void IncrementSyncPoint(u32 syncpoint_id);
+
+    /// Returns the current value of the given syncpoint.
+    u32 GetSyncpointValue(u32 syncpoint_id) const;
+
+    /// Registers value as an interrupt threshold for the syncpoint; duplicates are ignored.
+    /// NOTE(review): the implementation does not lock sync_mutex itself - presumably the caller
+    /// must hold the lock returned by LockSync(); confirm.
+    void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    /// Removes a registered interrupt threshold. Returns true if it was registered.
+    bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    /// Locks sync_mutex and returns the lock that owns it.
+    std::unique_lock<std::mutex> LockSync() {
+        return std::unique_lock{sync_mutex};
+    }
+
+    /// Returns the is_async flag this GPU was constructed with.
+    bool IsAsync() const {
+        return is_async;
+    }
+
    /// Returns a const reference to the GPU DMA pusher.
    const Tegra::DmaPusher& DmaPusher() const;

@@ -194,7 +222,12 @@ public:

                u32 semaphore_acquire;
                u32 semaphore_release;
-                INSERT_PADDING_WORDS(0xE4);
+                u32 fence_value;
+                union {
+                    BitField<4, 4, u32> operation;
+                    BitField<8, 8, u32> id;
+                } fence_action;
+                INSERT_PADDING_WORDS(0xE2);

                // Puller state
                u32 acquire_mode;
@@ -228,6 +261,9 @@ public:
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;

+protected:
+    /// Called by IncrementSyncPoint for each registered threshold the syncpoint has reached.
+    virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
+
 private:
    void ProcessBindMethod(const MethodCall& method_call);
    void ProcessSemaphoreTriggerMethod();
@@ -246,6 +282,7 @@ private:
 protected:
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
    VideoCore::RendererBase& renderer;
+    Core::System& system;

 private:
    std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -262,6 +299,14 @@ private:
    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
    /// Inline memory engine
    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+    /// Current counter value of every syncpoint
+    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+    /// Per-syncpoint list of threshold values that still have to raise an interrupt
+    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+    /// Locked by IncrementSyncPoint, CancelSyncptInterrupt and LockSync
+    std::mutex sync_mutex;
+
+    /// Set at construction: GPUAsynch passes true, GPUSynch passes false
+    const bool is_async;
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
@@ -274,6 +319,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7);
 ASSERT_REG_POSITION(reference_count, 0x14);
 ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
 ASSERT_REG_POSITION(semaphore_release, 0x1B);
+ASSERT_REG_POSITION(fence_value, 0x1C);
+ASSERT_REG_POSITION(fence_action, 0x1D);

 ASSERT_REG_POSITION(acquire_mode, 0x100);
 ASSERT_REG_POSITION(acquire_source, 0x101);
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "core/core.h"
+#include "core/hardware_interrupt_manager.h"
 #include "video_core/gpu_asynch.h"
 #include "video_core/gpu_thread.h"
 #include "video_core/renderer_base.h"
@@ -9,7 +11,7 @@
 namespace VideoCommon {

 GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer), gpu_thread{system} {}
+    : GPU(system, renderer, true), gpu_thread{system} {}

 GPUAsynch::~GPUAsynch() = default;

@@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
    gpu_thread.FlushAndInvalidateRegion(addr, size);
 }

+// Hands the reached syncpoint threshold over to the system's interrupt manager.
+void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    system.InterruptManager().GPUInterruptSyncpt(syncpoint_id, value);
+}
+
 } // namespace VideoCommon
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -27,6 +27,9 @@ public:
    void InvalidateRegion(CacheAddr addr, u64 size) override;
    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;

+protected:
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
+
 private:
    GPUThread::ThreadManager gpu_thread;
 };
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -8,7 +8,7 @@
 namespace VideoCommon {

 GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer) {}
+    : GPU(system, renderer, false) {}

 GPUSynch::~GPUSynch() = default;

--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -25,6 +25,10 @@ public:
    void FlushRegion(CacheAddr addr, u64 size) override;
    void InvalidateRegion(CacheAddr addr, u64 size) override;
    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+
+protected:
+    // No-op: this implementation does nothing when a syncpoint threshold is reached.
+    void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
+                             [[maybe_unused]] u32 value) const override {}
 };

 } // namespace VideoCommon
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
    MicroProfileOnThreadCreate("GpuThread");

    // Wait for first GPU command before acquiring the window context
-    state.WaitForCommands();
+    while (state.queue.Empty())
+        ;

    // If emulation was stopped during disk shader loading, abort before trying to acquire context
    if (!state.is_running) {
@@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p

    CommandDataContainer next;
    while (state.is_running) {
-        state.WaitForCommands();
        while (!state.queue.Empty()) {
            state.queue.Pop(next);
            if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
@@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
            } else {
                UNREACHABLE();
            }
-            state.signaled_fence = next.fence;
-            state.TrySynchronize();
+            state.signaled_fence.store(next.fence);
        }
    }
 }
@@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
 }

 void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
-    if (state.queue.Empty()) {
-        // It's quicker to invalidate a single region on the CPU if the queue is already empty
-        system.Renderer().Rasterizer().InvalidateRegion(addr, size);
-    } else {
-        PushCommand(InvalidateRegionCommand(addr, size));
-    }
+    system.Renderer().Rasterizer().InvalidateRegion(addr, size);
 }

 void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
    const u64 fence{++state.last_fence};
    state.queue.Push(CommandDataContainer(std::move(command_data), fence));
-    state.SignalCommands();
    return fence;
 }

 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 void SynchState::WaitForSynchronization(u64 fence) {
-    if (signaled_fence >= fence) {
-        return;
-    }
-
-    // Wait for the GPU to be idle (all commands to be executed)
-    {
-        MICROPROFILE_SCOPE(GPU_wait);
-        std::unique_lock lock{synchronization_mutex};
-        synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
-    }
+    while (signaled_fence.load() < fence)
+        ;
 }

 } // namespace VideoCommon::GPUThread
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -88,41 +88,9 @@ struct CommandDataContainer {
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
    std::atomic_bool is_running{true};
-    std::atomic_int queued_frame_count{};
-    std::mutex synchronization_mutex;
-    std::mutex commands_mutex;
-    std::condition_variable commands_condition;
-    std::condition_variable synchronization_condition;
-
-    /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
-    /// synchronized. This is entirely empirical.
-    bool IsSynchronized() const {
-        constexpr std::size_t max_queue_gap{5};
-        return queue.Size() <= max_queue_gap;
-    }
-
-    void TrySynchronize() {
-        if (IsSynchronized()) {
-            std::lock_guard lock{synchronization_mutex};
-            synchronization_condition.notify_one();
-        }
-    }

    void WaitForSynchronization(u64 fence);

-    void SignalCommands() {
-        if (queue.Empty()) {
-            return;
-        }
-
-        commands_condition.notify_one();
-    }
-
-    void WaitForCommands() {
-        std::unique_lock lock{commands_mutex};
-        commands_condition.wait(lock, [this] { return !queue.Empty(); });
-    }
-
    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
    CommandQueue queue;
    u64 last_fence{};
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -34,6 +34,9 @@ public:
    /// Clear the current framebuffer
    virtual void Clear() = 0;

+    /// Dispatches a compute shader invocation
+    virtual void DispatchCompute(GPUVAddr code_addr) = 0;
+
    /// Notify rasterizer that all caches should be flushed to Switch memory
    virtual void FlushAll() = 0;

@@ -47,6 +50,9 @@ public:
    /// and invalidated
    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;

+    /// Notify the rasterizer to send all written commands to the host GPU.
+    virtual void FlushCommands() = 0;
+
    /// Notify rasterizer that a frame is about to finish
    virtual void TickFrame() = 0;

--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -4,6 +4,7 @@

 #include <algorithm>
 #include <array>
+#include <bitset>
 #include <memory>
 #include <string>
 #include <string_view>
@@ -19,6 +20,7 @@
 #include "core/core.h"
 #include "core/hle/kernel/process.h"
 #include "core/settings.h"
+#include "video_core/engines/kepler_compute.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -105,6 +107,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
    shader_program_manager = std::make_unique<GLShader::ProgramManager>();
    state.draw.shader_program = 0;
    state.Apply();
+    clear_framebuffer.Create();

    LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
    CheckExtensions();
@@ -124,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
    auto& gpu = system.GPU().Maxwell3D();
    const auto& regs = gpu.regs;

-    if (!gpu.dirty_flags.vertex_attrib_format) {
+    if (!gpu.dirty.vertex_attrib_format) {
        return state.draw.vertex_array;
    }
-    gpu.dirty_flags.vertex_attrib_format = false;
+    gpu.dirty.vertex_attrib_format = false;

    MICROPROFILE_SCOPE(OpenGL_VAO);

@@ -181,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
    }

    // Rebinding the VAO invalidates the vertex buffer bindings.
-    gpu.dirty_flags.vertex_array.set();
+    gpu.dirty.ResetVertexArrays();

    state.draw.vertex_array = vao_entry.handle;
    return vao_entry.handle;
@@ -189,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {

 void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
    auto& gpu = system.GPU().Maxwell3D();
-    const auto& regs = gpu.regs;
-
-    if (gpu.dirty_flags.vertex_array.none())
+    if (!gpu.dirty.vertex_array_buffers)
        return;
+    gpu.dirty.vertex_array_buffers = false;
+
+    const auto& regs = gpu.regs;

    MICROPROFILE_SCOPE(OpenGL_VB);

    // Upload all guest vertex arrays sequentially to our buffer
    for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
-        if (!gpu.dirty_flags.vertex_array[index])
+        if (!gpu.dirty.vertex_array[index])
            continue;
+        gpu.dirty.vertex_array[index] = false;
+        gpu.dirty.vertex_instance[index] = false;

        const auto& vertex_array = regs.vertex_array[index];
        if (!vertex_array.IsEnabled())
@@ -224,8 +230,32 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
            glVertexArrayBindingDivisor(vao, index, 0);
        }
    }
+}

-    gpu.dirty_flags.vertex_array.reset();
+// Re-applies the binding divisor of every vertex array whose instancing state is marked dirty.
+void RasterizerOpenGL::SetupVertexInstances(GLuint vao) {
+    auto& gpu = system.GPU().Maxwell3D();
+
+    if (!gpu.dirty.vertex_instances)
+        return;
+    gpu.dirty.vertex_instances = false;
+
+    const auto& regs = gpu.regs;
+    for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
+        if (!gpu.dirty.vertex_instance[index])
+            continue;
+
+        gpu.dirty.vertex_instance[index] = false;
+
+        // A divisor of 0 disables instancing for the binding; the guest divisor is only used
+        // when instancing is enabled for this array and the divisor is non-zero.
+        GLuint divisor = 0;
+        if (regs.instanced_arrays.IsInstancingEnabled(index) &&
+            regs.vertex_array[index].divisor != 0) {
+            divisor = regs.vertex_array[index].divisor;
+        }
+        glVertexArrayBindingDivisor(vao, index, divisor);
+    }
 }

 GLintptr RasterizerOpenGL::SetupIndexBuffer() {
@@ -298,9 +328,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {

        Shader shader{shader_cache.GetStageProgram(program)};

-        const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)};
+        const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
        SetupDrawConstBuffers(stage_enum, shader);
-        SetupGlobalRegions(stage_enum, shader);
+        SetupDrawGlobalMemory(stage_enum, shader);
        const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)};

        const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage};
@@ -341,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {

    SyncClipEnabled(clip_distances);

-    gpu.dirty_flags.shaders = false;
+    gpu.dirty.shaders = false;
 }

 std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -424,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(

    const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
                                                 single_color_target};
-    if (fb_config_state == current_framebuffer_config_state &&
-        gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) {
+    if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) {
        // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
        // single color targets). This is done because the guest registers may not change but the
        // host framebuffer may contain different attachments
        return current_depth_stencil_usage;
    }
+    gpu.dirty.render_settings = false;
    current_framebuffer_config_state = fb_config_state;

    texture_cache.GuardRenderTargets(true);
@@ -519,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
    return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable};
 }

+// Binds the dedicated clear framebuffer and attaches exactly the surfaces that are going to be
+// cleared.
+//   current_state    - state object whose draw framebuffer is switched to clear_framebuffer
+//   using_color_fb   - attach the color target selected by regs.clear_buffers.RT
+//   using_depth_fb   - attach the depth buffer
+//   using_stencil_fb - attach the depth buffer so its stencil plane can be cleared
+// Attachment points that are not used are explicitly detached, since the framebuffer object is
+// reused between clears.
+void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
+                                                 bool using_depth_fb, bool using_stencil_fb) {
+    auto& gpu = system.GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
+
+    texture_cache.GuardRenderTargets(true);
+    View color_surface{};
+    if (using_color_fb) {
+        color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false);
+    }
+    View depth_surface{};
+    if (using_depth_fb || using_stencil_fb) {
+        depth_surface = texture_cache.GetDepthBufferSurface(false);
+    }
+    texture_cache.GuardRenderTargets(false);
+
+    current_state.draw.draw_framebuffer = clear_framebuffer.handle;
+    current_state.ApplyFramebufferState();
+
+    if (color_surface) {
+        color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER);
+    } else {
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+    }
+
+    if (depth_surface) {
+        const auto& params = depth_surface->GetSurfaceParams();
+        switch (params.type) {
+        case VideoCore::Surface::SurfaceType::Depth: {
+            // Depth-only surface: make sure no stale stencil attachment is left behind.
+            depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+            break;
+        }
+        case VideoCore::Surface::SurfaceType::DepthStencil: {
+            // A combined surface has to populate both attachment points; attaching it to
+            // GL_DEPTH_ATTACHMENT alone would leave the stencil plane out of reach of the
+            // GL_DEPTH_STENCIL / GL_STENCIL clears issued by Clear().
+            depth_surface->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
+            break;
+        }
+        default: { UNIMPLEMENTED(); }
+        }
+    } else {
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+    }
+}
+
 void RasterizerOpenGL::Clear() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    const auto& maxwell3d = system.GPU().Maxwell3D();
+
+    if (!maxwell3d.ShouldExecute()) {
+        return;
+    }
+
+    const auto& regs = maxwell3d.regs;
    bool use_color{};
    bool use_depth{};
    bool use_stencil{};

-    OpenGLState clear_state;
+    OpenGLState prev_state{OpenGLState::GetCurState()};
+    SCOPE_EXIT({
+        prev_state.AllDirty();
+        prev_state.Apply();
+    });
+
+    OpenGLState clear_state{OpenGLState::GetCurState()};
+    clear_state.SetDefaultViewports();
    if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
        regs.clear_buffers.A) {
        use_color = true;
@@ -545,6 +633,7 @@ void RasterizerOpenGL::Clear() {
        // true.
        clear_state.depth.test_enabled = true;
        clear_state.depth.test_func = GL_ALWAYS;
+        clear_state.depth.write_mask = GL_TRUE;
    }
    if (regs.clear_buffers.S) {
        ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!");
@@ -581,8 +670,9 @@ void RasterizerOpenGL::Clear() {
        return;
    }

-    const auto [clear_depth, clear_stencil] = ConfigureFramebuffers(
-        clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value());
+    ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil);
+
+    SyncViewport(clear_state);
    if (regs.clear_flags.scissor) {
        SyncScissorTest(clear_state);
    }
@@ -591,21 +681,18 @@ void RasterizerOpenGL::Clear() {
        clear_state.EmulateViewportWithScissor();
    }

-    clear_state.ApplyColorMask();
-    clear_state.ApplyDepth();
-    clear_state.ApplyStencilTest();
-    clear_state.ApplyViewport();
-    clear_state.ApplyFramebufferState();
+    clear_state.AllDirty();
+    clear_state.Apply();

    if (use_color) {
-        glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
+        glClearBufferfv(GL_COLOR, 0, regs.clear_color);
    }

-    if (clear_depth && clear_stencil) {
+    if (use_depth && use_stencil) {
        glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
-    } else if (clear_depth) {
+    } else if (use_depth) {
        glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth);
-    } else if (clear_stencil) {
+    } else if (use_stencil) {
        glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
    }
 }
@@ -616,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() {

    MICROPROFILE_SCOPE(OpenGL_Drawing);
    auto& gpu = system.GPU().Maxwell3D();
+
+    if (!gpu.ShouldExecute()) {
+        return;
+    }
+
    const auto& regs = gpu.regs;

    SyncColorMask();
@@ -661,6 +753,7 @@ void RasterizerOpenGL::DrawArrays() {

    // Upload vertex and index data.
    SetupVertexBuffer(vao);
+    SetupVertexInstances(vao);
    const GLintptr index_buffer_offset = SetupIndexBuffer();

    // Setup draw parameters. It will automatically choose what glDraw* method to use.
@@ -687,7 +780,7 @@ void RasterizerOpenGL::DrawArrays() {

    if (invalidate) {
        // As all cached buffers are invalidated, we need to recheck their state.
-        gpu.dirty_flags.vertex_array.set();
+        gpu.dirty.ResetVertexArrays();
    }

    shader_program_manager->ApplyTo(state);
@@ -700,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() {
    params.DispatchDraw();

    accelerate_draw = AccelDraw::Disabled;
+    gpu.dirty.memory_general = false;
+}
+
+// Launches the compute kernel located at code_addr using the dimensions stored in the
+// KeplerCompute launch description. Requires GL_ARB_compute_variable_group_size; without it the
+// dispatch is logged and dropped.
+void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
+    if (!GLAD_GL_ARB_compute_variable_group_size) {
+        LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the "
+                                 "lack of GL_ARB_compute_variable_group_size");
+        return;
+    }
+
+    // Select the kernel's program; no separable program pipeline is bound for compute.
+    auto compute_shader = shader_cache.GetComputeKernel(code_addr);
+    const auto [program_handle, unused_bindings] = compute_shader->GetProgramHandle({});
+    state.draw.shader_program = program_handle;
+    state.draw.program_pipeline = 0;
+
+    // Map enough stream buffer space for every const buffer plus its alignment padding.
+    const std::size_t mapped_size =
+        Tegra::Engines::KeplerCompute::NumConstBuffers *
+        (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
+    buffer_cache.Map(mapped_size);
+
+    bind_ubo_pushbuffer.Setup(0);
+    bind_ssbo_pushbuffer.Setup(0);
+
+    SetupComputeConstBuffers(compute_shader);
+    SetupComputeGlobalMemory(compute_shader);
+
+    // TODO(Rodrigo): Bind images and samplers
+
+    buffer_cache.Unmap();
+
+    bind_ubo_pushbuffer.Bind();
+    bind_ssbo_pushbuffer.Bind();
+
+    state.ApplyShaderProgram();
+    state.ApplyProgramPipeline();
+
+    const auto& launch = system.GPU().KeplerCompute().launch_description;
+    glDispatchComputeGroupSizeARB(launch.grid_dim_x, launch.grid_dim_y, launch.grid_dim_z,
+                                  launch.block_dim_x, launch.block_dim_y, launch.block_dim_z);
 }

 void RasterizerOpenGL::FlushAll() {}
@@ -730,6 +863,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
    InvalidateRegion(addr, size);
 }

+// Issues glFlush() on the current OpenGL context.
+void RasterizerOpenGL::FlushCommands() {
+    glFlush();
+}
+
 void RasterizerOpenGL::TickFrame() {
    buffer_cache.TickFrame();
 }
@@ -775,12 +912,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
                                             const Shader& shader) {
    MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto stage_index = static_cast<std::size_t>(stage);
-    const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
-
-    // Upload only the enabled buffers from the 16 constbuffers of each shader stage
+    const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
+    const auto& shader_stage = stages[static_cast<std::size_t>(stage)];
    for (const auto& entry : shader->GetShaderEntries().const_buffers) {
-        SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
+        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
+        SetupConstBuffer(buffer, entry);
+    }
+}
+
+// Sets up every const buffer used by a compute kernel, taking address, size and enable bit from
+// the KeplerCompute launch description.
+void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+    MICROPROFILE_SCOPE(OpenGL_UBO);
+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    // The enable mask is the same for every entry, so it is read once before the loop.
+    const std::bitset<8> enable_mask =
+        launch_desc.memory_config.const_buffer_enable_mask.Value();
+    for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
+        const auto index = entry.GetIndex();
+        const auto& config = launch_desc.const_buffer_config[index];
+        Tegra::Engines::ConstBufferInfo buffer;
+        buffer.address = config.Address();
+        buffer.size = config.size;
+        buffer.enabled = enable_mask[index];
+        SetupConstBuffer(buffer, entry);
    }
 }

@@ -801,24 +951,39 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
    bind_ubo_pushbuffer.Push(cbuf, offset, size);
 }

-void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                          const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                             const Shader& shader) {
    auto& gpu{system.GPU()};
    auto& memory_manager{gpu.MemoryManager()};
    const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
-    const auto alignment{device.GetShaderStorageBufferAlignment()};
-
    for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
        const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
-        const auto actual_addr{memory_manager.Read<u64>(addr)};
+        const auto gpu_addr{memory_manager.Read<u64>(addr)};
        const auto size{memory_manager.Read<u32>(addr + 8)};
-
-        const auto [ssbo, buffer_offset] =
-            buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten());
-        bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
+        SetupGlobalMemory(entry, gpu_addr, size);
    }
 }

+// Sets up the global memory regions used by a compute kernel. Each region is described inside a
+// const buffer of the launch description: a 64-bit GPU address followed, 8 bytes later, by a
+// 32-bit size.
+void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+    auto& gpu = system.GPU();
+    auto& memory_manager = gpu.MemoryManager();
+    const auto& launch_desc = gpu.KeplerCompute().launch_description;
+    for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
+        const auto& cbuf = launch_desc.const_buffer_config[entry.GetCbufIndex()];
+        const auto descriptor_addr = cbuf.Address() + entry.GetCbufOffset();
+        const auto region_addr = memory_manager.Read<u64>(descriptor_addr);
+        const auto region_size = memory_manager.Read<u32>(descriptor_addr + 8);
+        SetupGlobalMemory(entry, region_addr, region_size);
+    }
+}
+
+// Uploads one global memory region through the buffer cache and queues it as an SSBO binding.
+void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry,
+                                         GPUVAddr gpu_addr, std::size_t size) {
+    const auto ssbo_alignment = device.GetShaderStorageBufferAlignment();
+    const auto [buffer, offset] =
+        buffer_cache.UploadMemory(gpu_addr, size, ssbo_alignment, true, entry.IsWritten());
+    const auto binding_size = static_cast<GLsizeiptr>(size);
+    bind_ssbo_pushbuffer.Push(buffer, offset, binding_size);
+}
+
 TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
                                                   BaseBindings base_bindings) {
    MICROPROFILE_SCOPE(OpenGL_Texture);
@@ -907,10 +1072,11 @@ void RasterizerOpenGL::SyncClipCoef() {
 }

 void RasterizerOpenGL::SyncCullMode() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+
+    const auto& regs = maxwell3d.regs;

    state.cull.enabled = regs.cull.enabled != 0;
-
    if (state.cull.enabled) {
        state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
        state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face);
@@ -943,16 +1109,21 @@ void RasterizerOpenGL::SyncDepthTestState() {
    state.depth.test_enabled = regs.depth_test_enable != 0;
    state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE;

-    if (!state.depth.test_enabled)
+    if (!state.depth.test_enabled) {
        return;
+    }

    state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func);
 }

 void RasterizerOpenGL::SyncStencilTestState() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
-    state.stencil.test_enabled = regs.stencil_enable != 0;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.stencil_test) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;

+    state.stencil.test_enabled = regs.stencil_enable != 0;
    if (!regs.stencil_enable) {
        return;
    }
@@ -981,10 +1152,17 @@ void RasterizerOpenGL::SyncStencilTestState() {
        state.stencil.back.action_depth_fail = GL_KEEP;
        state.stencil.back.action_depth_pass = GL_KEEP;
    }
+    state.MarkDirtyStencilState();
+    maxwell3d.dirty.stencil_test = false;
 }

 void RasterizerOpenGL::SyncColorMask() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.color_mask) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;
+
    const std::size_t count =
        regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1;
    for (std::size_t i = 0; i < count; i++) {
@@ -995,6 +1173,9 @@ void RasterizerOpenGL::SyncColorMask() {
        dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE;
        dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE;
    }
+
+    state.MarkDirtyColorMask();
+    maxwell3d.dirty.color_mask = false;
 }

 void RasterizerOpenGL::SyncMultiSampleState() {
@@ -1009,7 +1190,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() {
 }

 void RasterizerOpenGL::SyncBlendState() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.blend_state) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;

    state.blend_color.red = regs.blend_color.r;
    state.blend_color.green = regs.blend_color.g;
@@ -1032,6 +1217,8 @@ void RasterizerOpenGL::SyncBlendState() {
        for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
            state.blend[i].enabled = false;
        }
+        maxwell3d.dirty.blend_state = false;
+        state.MarkDirtyBlendState();
        return;
    }

@@ -1048,6 +1235,9 @@ void RasterizerOpenGL::SyncBlendState() {
        blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);
        blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);
    }
+
+    state.MarkDirtyBlendState();
+    maxwell3d.dirty.blend_state = false;
 }

 void RasterizerOpenGL::SyncLogicOpState() {
@@ -1099,13 +1289,21 @@ void RasterizerOpenGL::SyncPointState() {
 }

 void RasterizerOpenGL::SyncPolygonOffset() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.polygon_offset) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;
+
    state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0;
    state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0;
    state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0;
    state.polygon_offset.units = regs.polygon_offset_units;
    state.polygon_offset.factor = regs.polygon_offset_factor;
    state.polygon_offset.clamp = regs.polygon_offset_clamp;
+
+    state.MarkDirtyPolygonOffset();
+    maxwell3d.dirty.polygon_offset = false;
 }

 void RasterizerOpenGL::SyncAlphaTest() {
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -58,10 +58,12 @@ public:

    void DrawArrays() override;
    void Clear() override;
+    void DispatchCompute(GPUVAddr code_addr) override;
    void FlushAll() override;
    void FlushRegion(CacheAddr addr, u64 size) override;
    void InvalidateRegion(CacheAddr addr, u64 size) override;
    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+    void FlushCommands() override;
    void TickFrame() override;
    bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                               const Tegra::Engines::Fermi2D::Regs::Surface& dst,
@@ -108,17 +110,30 @@ private:
        OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,
        bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});

+    void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
+                                   bool using_depth_fb, bool using_stencil_fb);
+
    /// Configures the current constbuffers to use for the draw command.
    void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
                               const Shader& shader);

+    /// Configures the current constbuffers to use for the kernel invocation.
+    void SetupComputeConstBuffers(const Shader& kernel);
+
    /// Configures a constant buffer.
    void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
                          const GLShader::ConstBufferEntry& entry);

    /// Configures the current global memory entries to use for the draw command.
-    void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                            const Shader& shader);
+    void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                               const Shader& shader);
+
+    /// Configures the current global memory entries to use for the kernel invocation.
+    void SetupComputeGlobalMemory(const Shader& kernel);
+
+    /// Configures a global memory buffer.
+    void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
+                           std::size_t size);

    /// Configures the current textures to use for the draw command. Returns shaders texture buffer
    /// usage.
@@ -216,6 +231,7 @@ private:
    GLuint SetupVertexFormat();

    void SetupVertexBuffer(GLuint vao);
+    void SetupVertexInstances(GLuint vao);

    GLintptr SetupIndexBuffer();

@@ -226,6 +242,8 @@ private:
    enum class AccelDraw { Disabled, Arrays, Indexed };
    AccelDraw accelerate_draw = AccelDraw::Disabled;

+    OGLFramebuffer clear_framebuffer;
+
    using CachedPageMap = boost::icl::interval_map<u64, int>;
    CachedPageMap cached_pages;
 };
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -23,13 +23,13 @@ namespace OpenGL {

 using VideoCommon::Shader::ProgramCode;

-// One UBO is always reserved for emulation values
-constexpr u32 RESERVED_UBOS = 1;
+// One UBO is always reserved for emulation values on staged shaders
+constexpr u32 STAGE_RESERVED_UBOS = 1;

 struct UnspecializedShader {
    std::string code;
    GLShader::ShaderEntries entries;
-    Maxwell::ShaderProgram program_type;
+    ProgramType program_type;
 };

 namespace {
@@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g
 }

 /// Gets the shader type from a Maxwell program type
-constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) {
+constexpr GLenum GetShaderType(ProgramType program_type) {
    switch (program_type) {
-    case Maxwell::ShaderProgram::VertexA:
-    case Maxwell::ShaderProgram::VertexB:
+    case ProgramType::VertexA:
+    case ProgramType::VertexB:
        return GL_VERTEX_SHADER;
-    case Maxwell::ShaderProgram::Geometry:
+    case ProgramType::Geometry:
        return GL_GEOMETRY_SHADER;
-    case Maxwell::ShaderProgram::Fragment:
+    case ProgramType::Fragment:
        return GL_FRAGMENT_SHADER;
+    case ProgramType::Compute:
+        return GL_COMPUTE_SHADER;
    default:
        return GL_NONE;
    }
@@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen
    }
 }

+ProgramType GetProgramType(Maxwell::ShaderProgram program) {
+    switch (program) {
+    case Maxwell::ShaderProgram::VertexA:
+        return ProgramType::VertexA;
+    case Maxwell::ShaderProgram::VertexB:
+        return ProgramType::VertexB;
+    case Maxwell::ShaderProgram::TesselationControl:
+        return ProgramType::TessellationControl;
+    case Maxwell::ShaderProgram::TesselationEval:
+        return ProgramType::TessellationEval;
+    case Maxwell::ShaderProgram::Geometry:
+        return ProgramType::Geometry;
+    case Maxwell::ShaderProgram::Fragment:
+        return ProgramType::Fragment;
+    }
+    UNREACHABLE();
+    return {};
+}
+
 /// Calculates the size of a program stream
 std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
    constexpr std::size_t start_offset = 10;
@@ -128,13 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
 }

 /// Hashes one (or two) program streams
-u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
+u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code,
                        const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
    if (size_a == 0) {
        size_a = CalculateProgramSize(code);
    }
    u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
-    if (program_type != Maxwell::ShaderProgram::VertexA) {
+    if (program_type != ProgramType::VertexA) {
        return unique_identifier;
    }
    // VertexA programs include two programs
@@ -152,12 +173,12 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 }

 /// Creates an unspecialized program from code streams
-GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
+GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type,
                                      ProgramCode program_code, ProgramCode program_code_b) {
    GLShader::ShaderSetup setup(program_code);
    setup.program.size_a = CalculateProgramSize(program_code);
    setup.program.size_b = 0;
-    if (program_type == Maxwell::ShaderProgram::VertexA) {
+    if (program_type == ProgramType::VertexA) {
        // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
        // Conventional HW does not support this, so we combine VertexA and VertexB into one
        // stage here.
@@ -168,22 +189,23 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr
        program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);

    switch (program_type) {
-    case Maxwell::ShaderProgram::VertexA:
-    case Maxwell::ShaderProgram::VertexB:
+    case ProgramType::VertexA:
+    case ProgramType::VertexB:
        return GLShader::GenerateVertexShader(device, setup);
-    case Maxwell::ShaderProgram::Geometry:
+    case ProgramType::Geometry:
        return GLShader::GenerateGeometryShader(device, setup);
-    case Maxwell::ShaderProgram::Fragment:
+    case ProgramType::Fragment:
        return GLShader::GenerateFragmentShader(device, setup);
+    case ProgramType::Compute:
+        return GLShader::GenerateComputeShader(device, setup);
    default:
-        LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
-        UNREACHABLE();
+        UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type));
        return {};
    }
 }

 CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries,
-                               Maxwell::ShaderProgram program_type, const ProgramVariant& variant,
+                               ProgramType program_type, const ProgramVariant& variant,
                               bool hint_retrievable = false) {
    auto base_bindings{variant.base_bindings};
    const auto primitive_mode{variant.primitive_mode};
@@ -194,7 +216,14 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
    if (entries.shader_viewport_layer_array) {
        source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
    }
-    source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
+    if (program_type == ProgramType::Compute) {
+        source += "#extension GL_ARB_compute_variable_group_size : require\n";
+    }
+    source += '\n';
+
+    if (program_type != ProgramType::Compute) {
+        source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
+    }

    for (const auto& cbuf : entries.const_buffers) {
        source +=
@@ -221,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
        source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
    }

-    if (program_type == Maxwell::ShaderProgram::Geometry) {
+    if (program_type == ProgramType::Geometry) {
        const auto [glsl_topology, debug_name, max_vertices] =
            GetPrimitiveDescription(primitive_mode);

        source += "layout (" + std::string(glsl_topology) + ") in;\n";
        source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
    }
+    if (program_type == ProgramType::Compute) {
+        source += "layout (local_size_variable) in;\n";
+    }

    source += code;

@@ -255,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() {

 } // Anonymous namespace

-CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
+CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
                           GLShader::ProgramResult result)
    : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
      unique_identifier{params.unique_identifier}, program_type{program_type},
@@ -268,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
                                           ProgramCode&& program_code_b) {
    const auto code_size{CalculateProgramSize(program_code)};
    const auto code_size_b{CalculateProgramSize(program_code_b)};
-    auto result{CreateProgram(params.device, program_type, program_code, program_code_b)};
+    auto result{
+        CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)};
    if (result.first.empty()) {
        // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
        return {};
    }

    params.disk_cache.SaveRaw(ShaderDiskCacheRaw(
-        params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)),
-        static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code),
-        std::move(program_code_b)));
+        params.unique_identifier, GetProgramType(program_type),
+        static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)),
+        std::move(program_code), std::move(program_code_b)));

-    return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, GetProgramType(program_type), std::move(result)));
 }

 Shader CachedShader::CreateStageFromCache(const ShaderParameters& params,
                                          Maxwell::ShaderProgram program_type,
                                          GLShader::ProgramResult result) {
-    return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, GetProgramType(program_type), std::move(result)));
+}
+
+Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) {
+    auto result{CreateProgram(params.device, ProgramType::Compute, code, {})};
+
+    const auto code_size{CalculateProgramSize(code)};
+    params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute,
+                                                 static_cast<u32>(code_size / sizeof(u64)), 0,
+                                                 std::move(code), {}));
+
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, ProgramType::Compute, std::move(result)));
+}
+
+Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params,
+                                           GLShader::ProgramResult result) {
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, ProgramType::Compute, std::move(result)));
 }

 std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
    GLuint handle{};
-    if (program_type == Maxwell::ShaderProgram::Geometry) {
+    if (program_type == ProgramType::Geometry) {
        handle = GetGeometryShader(variant);
    } else {
        const auto [entry, is_cache_miss] = programs.try_emplace(variant);
@@ -308,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar
        handle = program->handle;
    }

-    auto base_bindings{variant.base_bindings};
-    base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS;
+    auto base_bindings = variant.base_bindings;
+    base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
+    if (program_type != ProgramType::Compute) {
+        base_bindings.cbuf += STAGE_RESERVED_UBOS;
+    }
    base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
    base_bindings.sampler += static_cast<u32>(entries.samplers.size());

@@ -572,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
 }

 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
-    if (!system.GPU().Maxwell3D().dirty_flags.shaders) {
+    if (!system.GPU().Maxwell3D().dirty.shaders) {
        return last_shaders[static_cast<std::size_t>(program)];
    }

@@ -589,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
    // No shader found - create a new one
    ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
    ProgramCode program_code_b;
-    if (program == Maxwell::ShaderProgram::VertexA) {
+    const bool is_program_a{program == Maxwell::ShaderProgram::VertexA};
+    if (is_program_a) {
        const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
        program_code_b = GetShaderCode(memory_manager, program_addr_b,
                                       memory_manager.GetPointer(program_addr_b));
    }

-    const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
+    const auto unique_identifier =
+        GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b);
    const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
    const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
                                  host_ptr,   unique_identifier};
@@ -612,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
    return last_shaders[static_cast<std::size_t>(program)] = shader;
 }

+Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+    auto& memory_manager{system.GPU().MemoryManager()};
+    const auto host_ptr{memory_manager.GetPointer(code_addr)};
+    auto kernel = TryGet(host_ptr);
+    if (kernel) {
+        return kernel;
+    }
+
+    // No kernel found - create a new one
+    auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
+    const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})};
+    const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
+    const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
+                                  host_ptr,   unique_identifier};
+
+    const auto found = precompiled_shaders.find(unique_identifier);
+    if (found == precompiled_shaders.end()) {
+        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+    } else {
+        kernel = CachedShader::CreateKernelFromCache(params, found->second);
+    }
+
+    Register(kernel);
+    return kernel;
+}
+
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -61,6 +61,11 @@ public:
                                       Maxwell::ShaderProgram program_type,
                                       GLShader::ProgramResult result);

+    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code);
+
+    static Shader CreateKernelFromCache(const ShaderParameters& params,
+                                        GLShader::ProgramResult result);
+
    VAddr GetCpuAddr() const override {
        return cpu_addr;
    }
@@ -78,7 +83,7 @@ public:
    std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);

 private:
-    explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
+    explicit CachedShader(const ShaderParameters& params, ProgramType program_type,
                          GLShader::ProgramResult result);

    // Geometry programs. These are needed because GLSL needs an input topology but it's not
@@ -104,7 +109,7 @@ private:
    u8* host_ptr{};
    VAddr cpu_addr{};
    u64 unique_identifier{};
-    Maxwell::ShaderProgram program_type{};
+    ProgramType program_type{};
    ShaderDiskCacheOpenGL& disk_cache;
    const PrecompiledPrograms& precompiled_programs;

@@ -132,6 +137,9 @@ public:
    /// Gets the current specified shader stage program
    Shader GetStageProgram(Maxwell::ShaderProgram program);

+    /// Gets the compute kernel located at the passed address
+    Shader GetComputeKernel(GPUVAddr code_addr);
+
 protected:
    // We do not have to flush this cache as things in it are never modified by us.
    void FlushObjectInner(const Shader& object) override {}
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,7 +37,6 @@ using namespace std::string_literals;
 using namespace VideoCommon::Shader;

 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
 using Operation = const OperationNode&;

 enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
@@ -162,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }

+constexpr bool IsVertexShader(ProgramType stage) {
+    return stage == ProgramType::VertexA || stage == ProgramType::VertexB;
+}
+
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage,
                            std::string suffix)
        : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}

@@ -248,25 +251,21 @@ public:
        }
        entries.clip_distances = ir.GetClipDistances();
        entries.shader_viewport_layer_array =
-            stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex());
+            IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex());
        entries.shader_length = ir.GetLength();
        return entries;
    }

 private:
-    using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation);
-    using OperationDecompilersArray =
-        std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
-
    void DeclareVertex() {
-        if (stage != ShaderStage::Vertex)
+        if (!IsVertexShader(stage))
            return;

        DeclareVertexRedeclarations();
    }

    void DeclareGeometry() {
-        if (stage != ShaderStage::Geometry) {
+        if (stage != ProgramType::Geometry) {
            return;
        }

@@ -297,14 +296,14 @@ private:
                break;
            }
        }
-        if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) {
+        if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) {
            if (ir.UsesLayer()) {
                code.AddLine("int gl_Layer;");
            }
            if (ir.UsesViewportIndex()) {
                code.AddLine("int gl_ViewportIndex;");
            }
-        } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex &&
+        } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) &&
                   !device.HasVertexViewportLayer()) {
            LOG_ERROR(
                Render_OpenGL,
@@ -341,11 +340,16 @@ private:
    }

    void DeclareLocalMemory() {
-        if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) {
-            const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
-            code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
-            code.AddNewLine();
+        // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at
+        // specialization time.
+        const u64 local_memory_size =
+            stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize();
+        if (local_memory_size == 0) {
+            return;
        }
+        const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
+        code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
+        code.AddNewLine();
    }

    void DeclareInternalFlags() {
@@ -399,12 +403,12 @@ private:
        const u32 location{GetGenericAttributeIndex(index)};

        std::string name{GetInputAttribute(index)};
-        if (stage == ShaderStage::Geometry) {
+        if (stage == ProgramType::Geometry) {
            name = "gs_" + name + "[]";
        }

        std::string suffix;
-        if (stage == ShaderStage::Fragment) {
+        if (stage == ProgramType::Fragment) {
            const auto input_mode{header.ps.GetAttributeUse(location)};
            if (skip_unused && input_mode == AttributeUse::Unused) {
                return;
@@ -416,7 +420,7 @@ private:
    }

    void DeclareOutputAttributes() {
-        if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) {
+        if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) {
            for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) {
                DeclareOutputAttribute(ToGenericAttribute(i));
            }
@@ -538,7 +542,7 @@ private:
                constexpr u32 element_stride{4};
                const u32 address{generic_base + index * generic_stride + element * element_stride};

-                const bool declared{stage != ShaderStage::Fragment ||
+                const bool declared{stage != ProgramType::Fragment ||
                                    header.ps.GetAttributeUse(index) != AttributeUse::Unused};
                const std::string value{declared ? ReadAttribute(attribute, element) : "0"};
                code.AddLine("case 0x{:x}: return {};", address, value);
@@ -642,7 +646,7 @@ private:
        }

        if (const auto abuf = std::get_if<AbufNode>(&*node)) {
-            UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
+            UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry,
                                 "Physical attributes in geometry shaders are not implemented");
            if (abuf->IsPhysicalBuffer()) {
                return fmt::format("readPhysicalAttribute(ftou({}))",
@@ -697,6 +701,9 @@ private:
        }

        if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+            if (stage == ProgramType::Compute) {
+                LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
+            }
            return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
        }

@@ -726,7 +733,7 @@ private:

    std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
        const auto GeometryPass = [&](std::string_view name) {
-            if (stage == ShaderStage::Geometry && buffer) {
+            if (stage == ProgramType::Geometry && buffer) {
                // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
                // set an 0x80000000 index for those and the shader fails to build. Find out why
                // this happens and what's its intent.
@@ -738,10 +745,10 @@ private:
        switch (attribute) {
        case Attribute::Index::Position:
            switch (stage) {
-            case ShaderStage::Geometry:
+            case ProgramType::Geometry:
                return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
                                   GetSwizzle(element));
-            case ShaderStage::Fragment:
+            case ProgramType::Fragment:
                return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
            default:
                UNREACHABLE();
@@ -762,7 +769,7 @@ private:
            // TODO(Subv): Find out what the values are for the first two elements when inside a
            // vertex shader, and what's the value of the fourth element when inside a Tess Eval
            // shader.
-            ASSERT(stage == ShaderStage::Vertex);
+            ASSERT(IsVertexShader(stage));
            switch (element) {
            case 2:
                // Config pack's first value is instance_id.
@@ -774,7 +781,7 @@ private:
            return "0";
        case Attribute::Index::FrontFacing:
            // TODO(Subv): Find out what the values are for the other elements.
-            ASSERT(stage == ShaderStage::Fragment);
+            ASSERT(stage == ProgramType::Fragment);
            switch (element) {
            case 3:
                return "itof(gl_FrontFacing ? -1 : 0)";
@@ -796,7 +803,7 @@ private:
            return value;
        }
        // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
-        const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
+        const std::string precise = stage != ProgramType::Fragment ? "precise " : "";

        const std::string temporary = code.GenerateTemporary();
        code.AddLine("{}float {} = {};", precise, temporary, value);
@@ -831,12 +838,12 @@ private:
                UNIMPLEMENTED();
                return {};
            case 1:
-                if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
+                if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                    return {};
                }
                return std::make_pair("gl_Layer", true);
            case 2:
-                if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
+                if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                    return {};
                }
                return std::make_pair("gl_ViewportIndex", true);
@@ -1073,6 +1080,9 @@ private:
            target = result->first;
            is_integer = result->second;
        } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+            if (stage == ProgramType::Compute) {
+                LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
+            }
            target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
        } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
            const std::string real = Visit(gmem->GetRealAddress());
@@ -1400,14 +1410,10 @@ private:
        return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
    }

-    std::string LogicalAll2(Operation operation) {
+    std::string LogicalAnd2(Operation operation) {
        return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);
    }

-    std::string LogicalAny2(Operation operation) {
-        return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
-    }
-
    template <bool with_nan>
    std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
        const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
@@ -1630,7 +1636,7 @@ private:
    }

    std::string Exit(Operation operation) {
-        if (stage != ShaderStage::Fragment) {
+        if (stage != ProgramType::Fragment) {
            code.AddLine("return;");
            return {};
        }
@@ -1681,7 +1687,7 @@ private:
    }

    std::string EmitVertex(Operation operation) {
-        ASSERT_MSG(stage == ShaderStage::Geometry,
+        ASSERT_MSG(stage == ProgramType::Geometry,
                   "EmitVertex is expected to be used in a geometry shader.");

        // If a geometry shader is attached, it will always flip (it's the last stage before
@@ -1692,7 +1698,7 @@ private:
    }

    std::string EndPrimitive(Operation operation) {
-        ASSERT_MSG(stage == ShaderStage::Geometry,
+        ASSERT_MSG(stage == ProgramType::Geometry,
                   "EndPrimitive is expected to be used in a geometry shader.");

        code.AddLine("EndPrimitive();");
@@ -1714,7 +1720,7 @@ private:
        return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
    }

-    static constexpr OperationDecompilersArray operation_decompilers = {
+    static constexpr std::array operation_decompilers = {
        &GLSLDecompiler::Assign,

        &GLSLDecompiler::Select,
@@ -1798,8 +1804,7 @@ private:
        &GLSLDecompiler::LogicalXor,
        &GLSLDecompiler::LogicalNegate,
        &GLSLDecompiler::LogicalPick2,
-        &GLSLDecompiler::LogicalAll2,
-        &GLSLDecompiler::LogicalAny2,
+        &GLSLDecompiler::LogicalAnd2,

        &GLSLDecompiler::LogicalLessThan<Type::Float>,
        &GLSLDecompiler::LogicalEqual<Type::Float>,
@@ -1863,6 +1868,7 @@ private:
        &GLSLDecompiler::WorkGroupId<1>,
        &GLSLDecompiler::WorkGroupId<2>,
    };
+    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

    std::string GetRegister(u32 index) const {
        return GetDeclarationWithSuffix(index, "gpr");
@@ -1927,7 +1933,7 @@ private:
    }

    u32 GetNumPhysicalInputAttributes() const {
-        return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
+        return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
    }

    u32 GetNumPhysicalAttributes() const {
@@ -1940,7 +1946,7 @@ private:

    const Device& device;
    const ShaderIR& ir;
-    const ShaderStage stage;
+    const ProgramType stage;
    const std::string suffix;
    const Header header;

@@ -1971,7 +1977,7 @@ std::string GetCommonDeclarations() {
        MAX_CONSTBUFFER_ELEMENTS);
 }

-ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
+ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
                        const std::string& suffix) {
    GLSLDecompiler decompiler(device, ir, stage, suffix);
    decompiler.Decompile();
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -12,14 +12,26 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/shader/shader_ir.h"

-namespace OpenGL {
-class Device;
-}
-
 namespace VideoCommon::Shader {
 class ShaderIR;
 }

+namespace OpenGL {
+
+class Device;
+
+enum class ProgramType : u32 { // Program kind passed to Decompile() and kept by ShaderDiskCacheRaw
+    VertexA = 0,             // Picked for the first program when the setup is a dual program
+    VertexB = 1,             // Picked for a lone vertex program and for the second of a dual setup
+    TessellationControl = 2,
+    TessellationEval = 3,
+    Geometry = 4,            // Decompiled with the "geometry" suffix
+    Fragment = 5,            // Decompiled with the "fragment" suffix
+    Compute = 6              // Decompiled with the "compute" suffix by GenerateComputeShader()
+};
+
+} // namespace OpenGL
+
 namespace OpenGL::GLShader {

 struct ShaderEntries;
@@ -85,6 +97,6 @@ struct ShaderEntries {
 std::string GetCommonDeclarations();

 ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
-                        Maxwell::ShaderStage stage, const std::string& suffix);
+                        ProgramType stage, const std::string& suffix);

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {

 } // namespace

-ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
+ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
                                       u32 program_code_size, u32 program_code_size_b,
                                       ProgramCode program_code, ProgramCode program_code_b)
    : unique_identifier{unique_identifier}, program_type{program_type},
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -18,7 +18,6 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "core/file_sys/vfs_vector.h"
-#include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"

 namespace Core {
@@ -34,14 +33,11 @@ namespace OpenGL {
 struct ShaderDiskCacheUsage;
 struct ShaderDiskCacheDump;

-using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
-
 using ProgramCode = std::vector<u64>;
-using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-
+using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
 using TextureBufferUsage = std::bitset<64>;

-/// Allocated bindings used by an OpenGL shader program.
+/// Allocated bindings used by an OpenGL shader program
 struct BaseBindings {
    u32 cbuf{};
    u32 gmem{};
@@ -126,7 +122,7 @@ namespace OpenGL {
 /// Describes a shader how it's used by the guest GPU
 class ShaderDiskCacheRaw {
 public:
-    explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
+    explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
                                u32 program_code_size, u32 program_code_size_b,
                                ProgramCode program_code, ProgramCode program_code_b);
    ShaderDiskCacheRaw();
@@ -141,30 +137,13 @@ public:
    }

    bool HasProgramA() const {
-        return program_type == Maxwell::ShaderProgram::VertexA;
+        return program_type == ProgramType::VertexA;
    }

-    Maxwell::ShaderProgram GetProgramType() const {
+    ProgramType GetProgramType() const {
        return program_type;
    }

-    Maxwell::ShaderStage GetProgramStage() const {
-        switch (program_type) {
-        case Maxwell::ShaderProgram::VertexA:
-        case Maxwell::ShaderProgram::VertexB:
-            return Maxwell::ShaderStage::Vertex;
-        case Maxwell::ShaderProgram::TesselationControl:
-            return Maxwell::ShaderStage::TesselationControl;
-        case Maxwell::ShaderProgram::TesselationEval:
-            return Maxwell::ShaderStage::TesselationEval;
-        case Maxwell::ShaderProgram::Geometry:
-            return Maxwell::ShaderStage::Geometry;
-        case Maxwell::ShaderProgram::Fragment:
-            return Maxwell::ShaderStage::Fragment;
-        }
-        UNREACHABLE();
-    }
-
    const ProgramCode& GetProgramCode() const {
        return program_code;
    }
@@ -175,7 +154,7 @@ public:

 private:
    u64 unique_identifier{};
-    Maxwell::ShaderProgram program_type{};
+    ProgramType program_type{};
    u32 program_code_size{};
    u32 program_code_size_b{};

--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D;
 using VideoCommon::Shader::ProgramCode;
 using VideoCommon::Shader::ShaderIR;

-static constexpr u32 PROGRAM_OFFSET{10};
+static constexpr u32 PROGRAM_OFFSET = 10;
+static constexpr u32 COMPUTE_OFFSET = 0;

 ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
@@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 };

 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
-    ProgramResult program =
-        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");

+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
+    const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB;
+    ProgramResult program = Decompile(device, program_ir, stage, "vertex");
    out += program.first;

    if (setup.IsDualProgram()) {
        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
-        ProgramResult program_b =
-            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
-
+        ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b");
        out += program_b.first;
    }

@@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 };

 )";
+
    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
-    ProgramResult program =
-        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
+    ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry");
    out += program.first;

    out += R"(
@@ -116,9 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {

 )";
    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
-    ProgramResult program =
-        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
-
+    ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment");
    out += program.first;

    out += R"(
@@ -130,4 +127,22 @@ void main() {
    return {std::move(out), std::move(program.second)};
 }

+ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) {
+    // Source layout: id header comment, common declarations, decompiled body, main() wrapper.
+    std::string out = "// Shader Unique Id: CS";
+    out += fmt::format("{:016x}", setup.program.unique_identifier);
+    out += "\n\n";
+    out += GetCommonDeclarations();
+    const ShaderIR compute_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a);
+    ProgramResult decompiled = Decompile(device, compute_ir, ProgramType::Compute, "compute");
+    out += decompiled.first;
+
+    out += R"(
+void main() {
+    execute_compute();
+}
+)";
+    return {std::move(out), std::move(decompiled.second)};
+}
+
 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -54,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se
 /// Generates the GLSL fragment shader program source code for the given FS program
 ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);

+/// Generates the GLSL compute shader program source code for the given CS program
+ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup);
+
 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -10,21 +10,25 @@

 namespace OpenGL::GLShader {

-GLuint LoadShader(const char* source, GLenum type) {
-    const char* debug_type;
+namespace {
+const char* GetStageDebugName(GLenum type) { // Names a GL shader type for LoadShader's log output
    switch (type) {
    case GL_VERTEX_SHADER:
-        debug_type = "vertex";
-        break;
+        return "vertex";
    case GL_GEOMETRY_SHADER:
-        debug_type = "geometry";
-        break;
+        return "geometry";
    case GL_FRAGMENT_SHADER:
-        debug_type = "fragment";
-        break;
-    default:
-        UNREACHABLE();
+        return "fragment";
+    case GL_COMPUTE_SHADER:
+        return "compute";
    }
+    UNIMPLEMENTED(); // Only reached when `type` matched none of the cases above
+    return "unknown"; // Fallback name so the caller still gets a valid string
+}
+} // Anonymous namespace
+
+GLuint LoadShader(const char* source, GLenum type) {
+    const char* debug_type = GetStageDebugName(type);
    const GLuint shader_id = glCreateShader(type);
    glShaderSource(shader_id, 1, &source, nullptr);
    LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -165,6 +165,25 @@ OpenGLState::OpenGLState() {
    alpha_test.ref = 0.0f;
 }

+void OpenGLState::SetDefaultViewports() {
+    // Depth clamp off on both planes; each viewport zeroed, depth range [0, 1], scissor disabled.
+    depth_clamp.near_plane = false;
+    depth_clamp.far_plane = false;
+    for (auto& viewport : viewports) {
+        viewport.scissor.enabled = false;
+        viewport.scissor.x = 0;
+        viewport.scissor.y = 0;
+        viewport.scissor.width = 0;
+        viewport.scissor.height = 0;
+        viewport.depth_range_near = 0.0f;
+        viewport.depth_range_far = 1.0f;
+        viewport.x = 0;
+        viewport.y = 0;
+        viewport.width = 0;
+        viewport.height = 0;
+    }
+}
+
 void OpenGLState::ApplyDefaultState() {
    glEnable(GL_BLEND);
    glDisable(GL_FRAMEBUFFER_SRGB);
@@ -526,7 +545,7 @@ void OpenGLState::ApplySamplers() const {
    }
 }

-void OpenGLState::Apply() const {
+void OpenGLState::Apply() {
    MICROPROFILE_SCOPE(OpenGL_State);
    ApplyFramebufferState();
    ApplyVertexArrayState();
@@ -536,19 +555,31 @@ void OpenGLState::Apply() const {
    ApplyPointSize();
    ApplyFragmentColorClamp();
    ApplyMultisample();
+    if (dirty.color_mask) {
+        ApplyColorMask();
+        dirty.color_mask = false;
+    }
    ApplyDepthClamp();
-    ApplyColorMask();
    ApplyViewport();
-    ApplyStencilTest();
+    if (dirty.stencil_state) {
+        ApplyStencilTest();
+        dirty.stencil_state = false;
+    }
    ApplySRgb();
    ApplyCulling();
    ApplyDepth();
    ApplyPrimitiveRestart();
-    ApplyBlending();
+    if (dirty.blend_state) {
+        ApplyBlending();
+        dirty.blend_state = false;
+    }
    ApplyLogicOp();
    ApplyTextures();
    ApplySamplers();
-    ApplyPolygonOffset();
+    if (dirty.polygon_offset) {
+        ApplyPolygonOffset();
+        dirty.polygon_offset = false;
+    }
    ApplyAlphaTest();
 }

--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -195,8 +195,9 @@ public:
        s_rgb_used = false;
    }

+    void SetDefaultViewports();
    /// Apply this state as the current OpenGL state
-    void Apply() const;
+    void Apply();

    void ApplyFramebufferState() const;
    void ApplyVertexArrayState() const;
@@ -237,11 +238,41 @@ public:
    /// Viewport does not affects glClearBuffer so emulate viewport using scissor test
    void EmulateViewportWithScissor();

+    void MarkDirtyBlendState() { // Makes the next Apply() call ApplyBlending()
+        dirty.blend_state = true;
+    }
+
+    void MarkDirtyStencilState() { // Makes the next Apply() call ApplyStencilTest()
+        dirty.stencil_state = true;
+    }
+
+    void MarkDirtyPolygonOffset() { // Makes the next Apply() call ApplyPolygonOffset()
+        dirty.polygon_offset = true;
+    }
+
+    void MarkDirtyColorMask() { // Makes the next Apply() call ApplyColorMask()
+        dirty.color_mask = true;
+    }
+
+    void AllDirty() { // Raises every flag that Apply() tests, so those states are all re-applied
+        dirty.blend_state = true;
+        dirty.stencil_state = true;
+        dirty.polygon_offset = true;
+        dirty.color_mask = true; // NOTE(review): dirty.viewport_state is not raised here - confirm
+    }
+
 private:
    static OpenGLState cur_state;

    // Workaround for sRGB problems caused by QT not supporting srgb output
    static bool s_rgb_used;
+    struct { // Flags consulted by Apply(); dirty{} value-initializes all of them to false
+        bool blend_state;    // Gates ApplyBlending()
+        bool stencil_state;  // Gates ApplyStencilTest()
+        bool viewport_state; // NOTE(review): no reader or setter visible in this change - confirm
+        bool polygon_offset; // Gates ApplyPolygonOffset()
+        bool color_mask;     // Gates ApplyColorMask()
+    } dirty{};
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
 const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
    ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
    const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]};
-    ASSERT(component_type == format.component_type);
    return format;
 }

@@ -485,11 +484,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
    const auto& dst_params{dst_view->GetSurfaceParams()};

    OpenGLState prev_state{OpenGLState::GetCurState()};
-    SCOPE_EXIT({ prev_state.Apply(); });
+    SCOPE_EXIT({
+        prev_state.AllDirty();
+        prev_state.Apply();
+    });

    OpenGLState state;
    state.draw.read_framebuffer = src_framebuffer.handle;
    state.draw.draw_framebuffer = dst_framebuffer.handle;
+    state.AllDirty();
    state.Apply();

    u32 buffers{};
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -108,6 +108,7 @@ void RendererOpenGL::SwapBuffers(

    // Maintain the rasterizer's state as a priority
    OpenGLState prev_state = OpenGLState::GetCurState();
+    state.AllDirty();
    state.Apply();

    if (framebuffer) {
@@ -140,6 +141,7 @@ void RendererOpenGL::SwapBuffers(
    system.GetPerfStats().BeginSystemFrame();

    // Restore the rasterizer state
+    prev_state.AllDirty();
    prev_state.Apply();
 }

@@ -206,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() {
    // Link shaders and get variable locations
    shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);
    state.draw.shader_program = shader.handle;
+    state.AllDirty();
    state.Apply();
    uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
    uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture");
@@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
    // Workaround brigthness problems in SMO by enabling sRGB in the final output
    // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987
    state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed();
+    state.AllDirty();
    state.Apply();
    glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
    // Restore default state
    state.framebuffer_srgb.enabled = false;
    state.texture_units[0].texture = 0;
+    state.AllDirty();
    state.Apply();
    // Clear sRGB state for the next frame
    OpenGLState::ClearsRGBUsed();
@@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() {
    GLuint old_read_fb = state.draw.read_framebuffer;
    GLuint old_draw_fb = state.draw.draw_framebuffer;
    state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle;
+    state.AllDirty();
    state.Apply();

    Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
@@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() {
    screenshot_framebuffer.Release();
    state.draw.read_framebuffer = old_read_fb;
    state.draw.draw_framebuffer = old_draw_fb;
+    state.AllDirty();
    state.Apply();
    glDeleteRenderbuffers(1, &renderbuffer);

--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -205,10 +205,6 @@ public:
    }

 private:
-    using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation);
-    using OperationDecompilersArray =
-        std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
-
    static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);

    void AllocateBindings() {
@@ -804,12 +800,7 @@ private:
        return {};
    }

-    Id LogicalAll2(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Id LogicalAny2(Operation operation) {
+    Id LogicalAnd2(Operation operation) {
        UNIMPLEMENTED();
        return {};
    }
@@ -1206,7 +1197,7 @@ private:
        return {};
    }

-    static constexpr OperationDecompilersArray operation_decompilers = {
+    static constexpr std::array operation_decompilers = {
        &SPIRVDecompiler::Assign,

        &SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float,
@@ -1291,8 +1282,7 @@ private:
        &SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,
        &SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,
        &SPIRVDecompiler::LogicalPick2,
-        &SPIRVDecompiler::LogicalAll2,
-        &SPIRVDecompiler::LogicalAny2,
+        &SPIRVDecompiler::LogicalAnd2,

        &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,
        &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>,
@@ -1357,6 +1347,7 @@ private:
        &SPIRVDecompiler::WorkGroupId<1>,
        &SPIRVDecompiler::WorkGroupId<2>,
    };
+    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

    const VKDevice& device;
    const ShaderIR& ir;
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -15,7 +15,7 @@
 #include "video_core/shader/shader_ir.h"

 namespace VideoCommon::Shader {
-
+namespace {
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;

@@ -29,8 +29,7 @@ struct Query {

 struct BlockStack {
    BlockStack() = default;
-    BlockStack(const BlockStack& b) = default;
-    BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
+    explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
    std::stack<u32> ssy_stack{};
    std::stack<u32> pbk_stack{};
 };
@@ -58,7 +57,7 @@ struct BlockInfo {
 struct CFGRebuildState {
    explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
                             const u32 start)
-        : program_code{program_code}, program_size{program_size}, start{start} {}
+        : start{start}, program_code{program_code}, program_size{program_size} {}

    u32 start{};
    std::vector<BlockInfo> block_info{};
@@ -85,7 +84,7 @@ std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address)
            return {BlockCollision::Inside, index};
        }
    }
-    return {BlockCollision::None, -1};
+    return {BlockCollision::None, 0xFFFFFFFF};
 }

 struct ParseInfo {
@@ -365,27 +364,29 @@ bool TryQuery(CFGRebuildState& state) {
        const auto gather_end = labels.upper_bound(block.end);
        while (gather_start != gather_end) {
            cc.push(gather_start->second);
-            gather_start++;
+            ++gather_start;
        }
    };
    if (state.queries.empty()) {
        return false;
    }
+
    Query& q = state.queries.front();
    const u32 block_index = state.registered[q.address];
    BlockInfo& block = state.block_info[block_index];
-    // If the block is visted, check if the stacks match, else gather the ssy/pbk
+    // If the block is visited, check if the stacks match, else gather the ssy/pbk
    // labels into the current stack and look if the branch at the end of the block
    // consumes a label. Schedule new queries accordingly
    if (block.visited) {
        BlockStack& stack = state.stacks[q.address];
-        const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) &&
-                              (stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack);
+        const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) &&
+                              (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack);
        state.queries.pop_front();
        return all_okay;
    }
    block.visited = true;
-    state.stacks[q.address] = BlockStack{q};
+    state.stacks.insert_or_assign(q.address, BlockStack{q});
+
    Query q2(q);
    state.queries.pop_front();
    gather_labels(q2.ssy_stack, state.ssy_labels, block);
@@ -394,6 +395,7 @@ bool TryQuery(CFGRebuildState& state) {
        q2.address = block.end + 1;
        state.queries.push_back(q2);
    }
+
    Query conditional_query{q2};
    if (block.branch.is_sync) {
        if (block.branch.address == unassigned_branch) {
@@ -408,13 +410,15 @@ bool TryQuery(CFGRebuildState& state) {
        conditional_query.pbk_stack.pop();
    }
    conditional_query.address = block.branch.address;
-    state.queries.push_back(conditional_query);
+    state.queries.push_back(std::move(conditional_query));
    return true;
 }
+} // Anonymous namespace

-std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
-                                              u32 start_address) {
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+                                              std::size_t program_size, u32 start_address) {
    CFGRebuildState state{program_code, program_size, start_address};
+
    // Inspect Code and generate blocks
    state.labels.clear();
    state.labels.emplace(start_address);
@@ -424,10 +428,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
            return {};
        }
    }
+
    // Decompile Stacks
-    Query start_query{};
-    start_query.address = state.start;
-    state.queries.push_back(start_query);
+    state.queries.push_back(Query{state.start, {}, {}});
    bool decompiled = true;
    while (!state.queries.empty()) {
        if (!TryQuery(state)) {
@@ -435,14 +438,15 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
            break;
        }
    }
+
    // Sort and organize results
    std::sort(state.block_info.begin(), state.block_info.end(),
-              [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; });
+              [](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; });
    ShaderCharacteristics result_out{};
    result_out.decompilable = decompiled;
    result_out.start = start_address;
    result_out.end = start_address;
-    for (auto& block : state.block_info) {
+    for (const auto& block : state.block_info) {
        ShaderBlock new_block{};
        new_block.start = block.start;
        new_block.end = block.end;
@@ -457,8 +461,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
    }
    if (result_out.decompilable) {
        result_out.labels = std::move(state.labels);
-        return {result_out};
+        return {std::move(result_out)};
    }
+
    // If it's not decompilable, merge the unlabelled blocks together
    auto back = result_out.blocks.begin();
    auto next = std::next(back);
@@ -469,8 +474,8 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
            continue;
        }
        back = next;
-        next++;
+        ++next;
    }
-    return {result_out};
+    return {std::move(result_out)};
 }
 } // namespace VideoCommon::Shader
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@@ -4,7 +4,6 @@

 #pragma once

-#include <cstring>
 #include <list>
 #include <optional>
 #include <unordered_set>
@@ -26,27 +25,44 @@ struct Condition {
    bool IsUnconditional() const {
        return predicate == Pred::UnusedIndex && cc == ConditionCode::T;
    }
+
    bool operator==(const Condition& other) const {
        return std::tie(predicate, cc) == std::tie(other.predicate, other.cc);
    }
+
+    bool operator!=(const Condition& other) const {
+        return !operator==(other);
+    }
 };

 struct ShaderBlock {
-    u32 start{};
-    u32 end{};
-    bool ignore_branch{};
    struct Branch {
        Condition cond{};
        bool kills{};
        s32 address{};
+
        bool operator==(const Branch& b) const {
            return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address);
        }
-    } branch{};
+
+        bool operator!=(const Branch& b) const {
+            return !operator==(b);
+        }
+    };
+
+    u32 start{};
+    u32 end{};
+    bool ignore_branch{};
+    Branch branch{};
+
    bool operator==(const ShaderBlock& sb) const {
        return std::tie(start, end, ignore_branch, branch) ==
               std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch);
    }
+
+    bool operator!=(const ShaderBlock& sb) const {
+        return !operator==(sb);
+    }
 };

 struct ShaderCharacteristics {
@@ -57,7 +73,7 @@ struct ShaderCharacteristics {
    std::unordered_set<u32> labels{};
 };

-std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
-                                              u32 start_address);
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+                                              std::size_t program_size, u32 start_address);

 } // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -47,14 +47,14 @@ void ShaderIR::Decode() {
        if (shader_info.decompilable) {
            disable_flow_stack = true;
            const auto insert_block = [this](NodeBlock& nodes, u32 label) {
-                if (label == exit_branch) {
+                if (label == static_cast<u32>(exit_branch)) {
                    return;
                }
                basic_blocks.insert({label, nodes});
            };
            const auto& blocks = shader_info.blocks;
            NodeBlock current_block;
-            u32 current_label = exit_branch;
+            u32 current_label = static_cast<u32>(exit_branch);
            for (auto& block : blocks) {
                if (shader_info.labels.count(block.start) != 0) {
                    insert_block(current_block, current_label);
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
    case OpCode::Id::FMUL_R:
    case OpCode::Id::FMUL_IMM: {
        // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
-        UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented",
-                             instr.fmul.tab5cb8_2.Value());
-        UNIMPLEMENTED_IF_MSG(
-            instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented",
-            instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default
+        if (instr.fmul.tab5cb8_2 != 0) { // Unknown field: warn and continue instead of asserting
+            LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
+                        instr.fmul.tab5cb8_2.Value());
+        }
+        if (instr.fmul.tab5c68_0 != 1) { // SMO typically sends 1 here, which seems to be the default
+            LOG_WARNING(HW_GPU, "FMUL tab5c68_0({}) is not implemented",
+                        instr.fmul.tab5c68_0.Value());
+        }

        op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);

--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
        }
    } else {
-        UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
+        if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
    }

    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
    const auto opcode = OpCode::Decode(instr);

    UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented");
-    UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented",
-                         instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO
-    UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented",
-                         instr.ffma.tab5980_1.Value());
+    if (instr.ffma.tab5980_0 != 1) {
+        LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
+    }
+    if (instr.ffma.tab5980_1 != 0) {
+        LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
+    }

    const Node op_a = GetRegister(instr.gpr8);

--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -18,43 +18,56 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
    const auto opcode = OpCode::Decode(instr);

-    UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
+    DEBUG_ASSERT(instr.hsetp2.ftz == 0);

    Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
    op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);

-    Node op_b = [&]() {
-        switch (opcode->get().GetId()) {
-        case OpCode::Id::HSETP2_R:
-            return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
-                                        instr.hsetp2.negate_b);
-        default:
-            UNREACHABLE();
-            return Immediate(0);
-        }
-    }();
-    op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
-
-    // We can't use the constant predicate as destination.
-    ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
-
-    const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0);
+    Tegra::Shader::PredCondition cond{};
+    bool h_and{};
+    Node op_b{};
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSETP2_C:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        h_and = instr.hsetp2.cbuf_and_imm.h_and;
+        op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset),
+                                    instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
+        break;
+    case OpCode::Id::HSETP2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        h_and = instr.hsetp2.cbuf_and_imm.h_and;
+        op_b = UnpackHalfImmediate(instr, true);
+        break;
+    case OpCode::Id::HSETP2_R:
+        cond = instr.hsetp2.reg.cond;
+        h_and = instr.hsetp2.reg.h_and;
+        op_b =
+            UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b,
+                                                 instr.hsetp2.reg.negate_b),
+                            instr.hsetp2.reg.type_b);
+        break;
+    default:
+        UNREACHABLE();
+        op_b = Immediate(0);
+    }

    const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
-    const OperationCode pair_combiner =
-        instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
+    const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);

-    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
-    const Node first_pred = Operation(pair_combiner, comparison);
+    const auto Write = [&](u64 dest, Node src) {
+        SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39));
+    };

-    // Set the primary predicate to the result of Predicate OP SecondPredicate
-    const Node value = Operation(combiner, first_pred, second_pred);
-    SetPredicate(bb, instr.hsetp2.pred3, value);
-
-    if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
-        // Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled
-        const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred);
-        SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred));
+    const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
+    const u64 first = instr.hsetp2.pred0;
+    const u64 second = instr.hsetp2.pred3;
+    if (h_and) {
+        const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
+        Write(first, joined);
+        Write(second, Operation(OperationCode::LogicalNegate, joined));
+    } else {
+        Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u)));
+        Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));
    }

    return pc;
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
    const auto opcode = OpCode::Decode(instr);

    if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) {
-        UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None);
+        DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None);
    } else {
-        UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None);
+        DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None);
    }

    constexpr auto identity = HalfType::H0_H1;
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -101,8 +101,7 @@ enum class OperationCode {
    LogicalXor,    /// (bool a, bool b) -> bool
    LogicalNegate, /// (bool a) -> bool
    LogicalPick2,  /// (bool2 pair, uint index) -> bool
-    LogicalAll2,   /// (bool2 a) -> bool
-    LogicalAny2,   /// (bool2 a) -> bool
+    LogicalAnd2,   /// (bool2 a) -> bool

    LogicalFLessThan,     /// (float a, float b) -> bool
    LogicalFEqual,        /// (float a, float b) -> bool
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
        return TrackCbuf(source, code, new_cursor);
    }
    if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
-        for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
-            if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
+        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
+            if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) {
                // Cbuf found in operand.
                return found;
            }
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -24,9 +24,8 @@ StagingCache::StagingCache() = default;
 StagingCache::~StagingCache() = default;

 SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params)
-    : params{params}, mipmap_sizes(params.num_levels),
-      mipmap_offsets(params.num_levels), gpu_addr{gpu_addr}, host_memory_size{
-                                                                 params.GetHostSizeInBytes()} {
+    : params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr},
+      mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) {
    std::size_t offset = 0;
    for (u32 level = 0; level < params.num_levels; ++level) {
        const std::size_t mipmap_size{params.GetGuestMipmapSize(level)};
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -116,10 +116,10 @@ public:
        std::lock_guard lock{mutex};
        auto& maxwell3d = system.GPU().Maxwell3D();

-        if (!maxwell3d.dirty_flags.zeta_buffer) {
+        if (!maxwell3d.dirty.depth_buffer) {
            return depth_buffer.view;
        }
-        maxwell3d.dirty_flags.zeta_buffer = false;
+        maxwell3d.dirty.depth_buffer = false;

        const auto& regs{maxwell3d.regs};
        const auto gpu_addr{regs.zeta.Address()};
@@ -145,10 +145,10 @@ public:
        std::lock_guard lock{mutex};
        ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
        auto& maxwell3d = system.GPU().Maxwell3D();
-        if (!maxwell3d.dirty_flags.color_buffer[index]) {
+        if (!maxwell3d.dirty.render_target[index]) {
            return render_targets[index].view;
        }
-        maxwell3d.dirty_flags.color_buffer.reset(index);
+        maxwell3d.dirty.render_target[index] = false;

        const auto& regs{maxwell3d.regs};
        if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 ||
@@ -274,10 +274,11 @@ protected:
        auto& maxwell3d = system.GPU().Maxwell3D();
        const u32 index = surface->GetRenderTarget();
        if (index == DEPTH_RT) {
-            maxwell3d.dirty_flags.zeta_buffer = true;
+            maxwell3d.dirty.depth_buffer = true;
        } else {
-            maxwell3d.dirty_flags.color_buffer.set(index, true);
+            maxwell3d.dirty.render_target[index] = true;
        }
+        maxwell3d.dirty.render_settings = true;
    }

    void Register(TSurface surface) {
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -1843,13 +1843,14 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det
           "data, or other bugs.");
    switch (result) {
    case Core::System::ResultStatus::ErrorSystemFiles: {
-        QString message = tr("yuzu was unable to locate a Switch system archive");
-        if (!details.empty()) {
-            message.append(tr(": %1. ").arg(QString::fromStdString(details)));
+        QString message;
+        if (details.empty()) {
+            message =
+                tr("yuzu was unable to locate a Switch system archive. %1").arg(common_message);
        } else {
-            message.append(tr(". "));
+            message = tr("yuzu was unable to locate a Switch system archive: %1. %2")
+                          .arg(QString::fromStdString(details), common_message);
        }
-        message.append(common_message);

        answer = QMessageBox::question(this, tr("System Archive Not Found"), message,
                                       QMessageBox::Yes | QMessageBox::No, QMessageBox::No);
@@ -1858,8 +1859,8 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det
    }

    case Core::System::ResultStatus::ErrorSharedFont: {
-        QString message = tr("yuzu was unable to locate the Switch shared fonts. ");
-        message.append(common_message);
+        const QString message =
+            tr("yuzu was unable to locate the Switch shared fonts. %1").arg(common_message);
        answer = QMessageBox::question(this, tr("Shared Fonts Not Found"), message,
                                       QMessageBox::Yes | QMessageBox::No, QMessageBox::No);
        status_message = tr("Shared Font Missing");
Author	SHA1	Message	Date
Fernando Sahmkow	e52c895559	GPU: Flush commands on every dma pusher step. This commit ensures that the host gpu is constantly fed with commands to work with, while the guest gpu keeps producing the rest of the commands. This reduces syncing time between host and guest gpu.	2019-07-26 16:54:22 -04:00
bunnei	52f54c728d	Merge pull request #2592 from FernandoS27/sync1 Implement GPU Synchronization Mechanisms & Correct NVFlinger	2019-07-26 14:26:44 -04:00
bunnei	b0ff3179ef	Merge pull request #2739 from lioncash/cflow video_core/control_flow: Minor changes/warning cleanup	2019-07-25 13:04:56 -04:00
bunnei	4d26550f5f	Merge pull request #2737 from FernandoS27/track-fix Shader_Ir: Correct tracking to track from right to left	2019-07-25 12:41:52 -04:00
bunnei	ccbc554949	Merge pull request #2689 from lioncash/tl yuzu/main: Make error messages within OnCoreError more localization-friendly	2019-07-25 12:35:07 -04:00
bunnei	31e8a61527	Merge pull request #2743 from FernandoS27/surpress-assert Downgrade and suppress a series of GPU asserts and debug messages.	2019-07-25 12:34:36 -04:00
bunnei	9be9600bdc	Merge pull request #2704 from FernandoS27/conditional maxwell3d: Implement Conditional Rendering	2019-07-24 17:07:57 -04:00
Zach Hilman	12514ccd35	Fix README change mistake (#2754 ) Fix README change mistake	2019-07-24 16:42:33 -04:00
bunnei	f601f25bcc	Merge pull request #2734 from ReinUsesLisp/compute-shaders gl_rasterizer: Implement compute shaders	2019-07-22 11:12:55 -04:00
bunnei	27e10e0442	Merge pull request #2735 from FernandoS27/pipeline-rework Rework Dirty Flags in GPU Pipeline, Optimize CBData and Redo Clearing mechanism	2019-07-21 00:59:52 -04:00
Zach Hilman	6738fb5fef	Update README.md	2019-07-20 21:03:30 -04:00
Fernando Sahmkow	0a67416971	Merge pull request #2693 from ReinUsesLisp/hsetp2 shader/half_set_predicate: Implement missing HSETP2 variants	2019-07-20 17:25:08 -04:00
Flame Sage	369be67039	Update README.md	2019-07-20 19:24:24 +00:00
Flame Sage	aa599ac709	Update README.md	2019-07-20 19:22:30 +00:00
Flame Sage	a2edb27158	Merge pull request #2752 from DarkLordZach/master azure: Fix clang-format and releases	2019-07-20 15:20:53 -04:00
Zach Hilman	f470bcb826	azure: Fix clang-format and releases	2019-07-20 15:19:25 -04:00
Fernando Sahmkow	7a35178ee2	Maxwell3D: Reorganize and address feedback	2019-07-20 10:18:35 -04:00
Fernando Sahmkow	1158777737	Shader_Ir: Change Debug Asserts for Log Warnings	2019-07-19 22:15:34 -04:00
ReinUsesLisp	45c162444d	shader/half_set_predicate: Fix HSETP2 implementation	2019-07-19 22:21:22 -03:00
ReinUsesLisp	6c4985edc9	shader/half_set_predicate: Implement missing HSETP2 variants	2019-07-19 22:20:47 -03:00
Lioncash	c1c89411da	video_core/control_flow: Provide operator!= for types with operator== Provides operational symmetry for the respective structures.	2019-07-18 21:03:31 -04:00
Lioncash	1780e0e3d0	video_core/control_flow: Prevent sign conversion in TryGetBlock() The return value is a u32, not an s32, so this would result in an implicit signedness conversion.	2019-07-18 21:03:31 -04:00
Lioncash	a162a844d2	video_core/control_flow: Remove unnecessary BlockStack copy constructor This is the default behavior of the copy constructor, so it doesn't need to be specified. While we're at it we can make the other non-default constructor explicit.	2019-07-18 21:03:30 -04:00
Lioncash	56bc11d952	video_core/control_flow: Use std::move where applicable Results in less work being done where avoidable.	2019-07-18 21:03:30 -04:00
Lioncash	e7b39f47f8	video_core/control_flow: Use the prefix variant of operator++ for iterators Same thing, but potentially allows a standard library implementation to pick a more efficient codepath.	2019-07-18 21:03:30 -04:00
Lioncash	6885e7e7ec	video_core/control_flow: Use empty() member function for checking emptiness It's what it's there for.	2019-07-18 21:03:30 -04:00
Lioncash	45fa12a05c	video_core: Resolve -Wreorder warnings Ensures that the constructor members are always initialized in the order that they're declared in.	2019-07-18 21:03:30 -04:00
Lioncash	47df844338	video_core/control_flow: Make program_size for ScanFlow() a std::size_t Prevents a truncation warning from occurring with MSVC. Also the internal data structures already treat it as a size_t, so this is just a discrepancy in the interface.	2019-07-18 21:03:29 -04:00
Lioncash	3df9558593	video_core/control_flow: Place all internally linked types/functions within an anonymous namespace Previously, quite a few functions were being linked with external linkage.	2019-07-18 21:03:29 -04:00
Lioncash	1109db86b7	video_core/shader/decode: Prevent sign-conversion warnings Makes it explicit that the conversions here are intentional.	2019-07-18 21:03:29 -04:00
Fernando Sahmkow	5a06e33859	Shader_Ir: correct clang format	2019-07-18 10:09:26 -04:00
Fernando Sahmkow	43f57d668c	GPU: Add missing puller methods. This adds some missing puller methods. We don't assert them as these are nop operations for us.	2019-07-18 08:54:42 -04:00
Fernando Sahmkow	3a3fee5abf	MaxwellDMA/KeplerCopy: Downgrade DMA log message to Trace. This log was just to know which games used DMA. It's no longer important.	2019-07-18 08:31:38 -04:00
Fernando Sahmkow	d3b71ff80d	Gl_Texture_Cache: Remove assert on component type in GetFormatTuple Textures can have different component types in different orders. This assert was completely imprecise and the effectiveness of such is better handled by case and within the texture cache.	2019-07-18 08:20:31 -04:00
Fernando Sahmkow	0b65e9335e	Shader_Ir: Downgrade precision and rounding asserts to debug asserts. This commit reduces the severity of asserts for FP precision and rounding as these are well known and have little to no consequences in gpu's accuracy.	2019-07-18 08:17:19 -04:00
Fernando Sahmkow	4be61013a1	GL_State: Feedback and fixes	2019-07-17 17:29:56 -04:00
Fernando Sahmkow	5ad889f6fd	Maxwell3D: Address Feedback	2019-07-17 17:29:55 -04:00
Fernando Sahmkow	7826f0afd9	Texture_Cache: Rebase Fixes	2019-07-17 17:29:54 -04:00
Fernando Sahmkow	8cdbfe69b1	GL_Rasterizer: Corrections to Clearing.	2019-07-17 17:29:54 -04:00
Fernando Sahmkow	0ff4a5fa39	Maxwell3D: Correct marking dirtiness on CB upload	2019-07-17 17:29:53 -04:00
Fernando Sahmkow	fec32fed18	GL_Rasterizer: Rework RenderTarget/DepthBuffer clearing	2019-07-17 17:29:52 -04:00
Fernando Sahmkow	a081dea8ab	Maxwell3D: Implement State Dirty Flags.	2019-07-17 17:29:51 -04:00
Fernando Sahmkow	0d3db58657	Maxwell3D: Rework CBData Upload	2019-07-17 17:29:50 -04:00
Fernando Sahmkow	f2e7b29c14	Maxwell3D: Rework the dirty system to be more consistent and scalable	2019-07-17 17:29:49 -04:00
Fernando Sahmkow	e42bcf2314	maxwell3d: Implement Conditional Rendering Conditional Rendering takes care of conditionally clearing or drawing depending on a set of queries. This PR implements the query checks to establish if things can be rendered or not.	2019-07-17 17:13:19 -04:00
Fernando Sahmkow	d614193e49	Shader_Ir: Correct tracking to track from right to left	2019-07-16 15:06:59 -04:00
ReinUsesLisp	2a4044a858	gl_shader_cache: Fix clang-format issues	2019-07-15 20:33:51 -03:00
ReinUsesLisp	6b0d017675	gl_shader_decompiler: Stub local memory size	2019-07-15 17:38:25 -03:00
ReinUsesLisp	56bca83bde	gl_shader_cache: Address review commentaries	2019-07-15 17:38:25 -03:00
ReinUsesLisp	bbecd13697	gl_shader_cache: Address CI issues	2019-07-15 17:38:25 -03:00
ReinUsesLisp	725ba6cf63	gl_rasterizer: Implement compute shaders	2019-07-15 17:38:25 -03:00
Lioncash	5085a16d78	yuzu/main: Make error messages within OnCoreError more localization-friendly Previously, a translated string was being appended onto another string in a manner that doesn't allow the translator to control where the appended text is placed. This can be a nuisance for languages where grammar and text ordering differs from English. We now append the strings via the format strings themselves, which allows translators to reorder where the text will be placed.	2019-07-07 11:02:05 -04:00
Fernando Sahmkow	0fc98958a3	NVServices: Correct delayed responses.	2019-07-05 15:49:35 -04:00
Fernando Sahmkow	8c91d5c166	Nv_Host_Ctrl: Correct difference calculation	2019-07-05 15:49:34 -04:00
Fernando Sahmkow	f3a39e0c9c	NVServices: Address Feedback	2019-07-05 15:49:33 -04:00
Fernando Sahmkow	d20ede40b1	NVServices: Styling, define constructors as explicit and corrections	2019-07-05 15:49:32 -04:00
Fernando Sahmkow	b391e5f638	NVFlinger: Correct GCC compile error	2019-07-05 15:49:31 -04:00
Fernando Sahmkow	0335a25d1f	NVServices: Make NVEvents Automatic according to documentation.	2019-07-05 15:49:29 -04:00
Fernando Sahmkow	b6844bec60	NVServices: Correct CtrlEventWaitSync to block the ipc until timeout.	2019-07-05 15:49:28 -04:00
Fernando Sahmkow	7d1b974bca	GPU: Correct Interrupts to interrupt on syncpt/value instead of event, mirroring hardware	2019-07-05 15:49:26 -04:00
Fernando Sahmkow	61697864c3	nvflinger: Make the force 30 fps still force 30 fps	2019-07-05 15:49:25 -04:00
Fernando Sahmkow	efdeab3a1d	nv_services: Fixes to event liberation.	2019-07-05 15:49:24 -04:00
Fernando Sahmkow	ea97589624	nvflinger: Acquire buffers in the same order as they were queued.	2019-07-05 15:49:23 -04:00
Fernando Sahmkow	24408cce9b	nv_services: Deglobalize NvServices	2019-07-05 15:49:22 -04:00
Fernando Sahmkow	f2e026a1d8	gpu_asynch: Simplify synchronization to a simpler consumer->producer scheme.	2019-07-05 15:49:20 -04:00
Fernando Sahmkow	0706d633bf	nv_host_ctrl: Make Sync GPU variant always return synced result.	2019-07-05 15:49:20 -04:00
Fernando Sahmkow	600dddf88d	Async GPU: do invalidate as synced operation Async GPU: Always invalidate synced.	2019-07-05 15:49:19 -04:00
Fernando Sahmkow	c13433aee4	Gpu: use an std mutex instead of a spin_lock to guard syncpoints	2019-07-05 15:49:18 -04:00
Fernando Sahmkow	78add28aab	nvhost_ctrl: Corrections to event handling	2019-07-05 15:49:17 -04:00
Fernando Sahmkow	eef55f493b	Gpu: Mark areas as protected.	2019-07-05 15:49:16 -04:00
Fernando Sahmkow	a45643cb3b	nv_services: Stub CtrlEventSignal	2019-07-05 15:49:15 -04:00
Fernando Sahmkow	8942047d41	Gpu: Implement Hardware Interrupt Manager and manage GPU interrupts	2019-07-05 15:49:14 -04:00
Fernando Sahmkow	e0027eba85	nv_services: Implement NvQueryEvent, NvCtrlEventWait, NvEventRegister, NvEventUnregister	2019-07-05 15:49:13 -04:00
Fernando Sahmkow	7039ece0a0	nv_services: Create GPU channels correctly	2019-07-05 15:49:12 -04:00
Fernando Sahmkow	82b829625b	video_core: Implement GPU side Syncpoints	2019-07-05 15:49:11 -04:00
Fernando Sahmkow	737e978f5b	nv_services: Correct buffer queue fencing and GPFifo fencing	2019-07-05 15:49:10 -04:00
Fernando Sahmkow	ceb5f5079c	nvflinger: Implement swap intervals	2019-07-05 15:49:08 -04:00