Adapt Bindless to work with AOFFI

Move ConstBufferAccessor to Maxwell3D, correct mistakes, and apply clang-format.
Fix bad rebase
2019-04-08 12:07:56 -04:00 · 2019-04-08 11:36:11 -04:00 · 2019-04-08 11:35:22 -04:00 · 2019-04-08 11:35:22 -04:00 · 2019-04-08 11:35:19 -04:00 · 2019-04-08 11:35:18 -04:00
33 changed files with 1052 additions and 587 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -40,3 +40,6 @@
 [submodule "Vulkan-Headers"]
    path = externals/Vulkan-Headers
    url = https://github.com/KhronosGroup/Vulkan-Headers.git
+[submodule "externals/zstd"]
+    path = externals/zstd
+    url = https://github.com/facebook/zstd
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -309,7 +309,7 @@ if (CLANG_FORMAT)
    set(CCOMMENT "Running clang format against all the .h and .cpp files in src/")
    if (WIN32)
        add_custom_target(clang-format
-            COMMAND powershell.exe -Command "Get-ChildItem ${SRCS}/* -Include *.cpp,*.h -Recurse | Foreach {${CLANG_FORMAT} -i $_.fullname}"
+            COMMAND powershell.exe -Command "Get-ChildItem '${SRCS}/*' -Include *.cpp,*.h -Recurse | Foreach {&'${CLANG_FORMAT}' -i $_.fullname}"
            COMMENT ${CCOMMENT})
    elseif(MINGW)
        add_custom_target(clang-format
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -49,6 +49,10 @@ add_subdirectory(open_source_archives EXCLUDE_FROM_ALL)
 add_library(unicorn-headers INTERFACE)
 target_include_directories(unicorn-headers INTERFACE ./unicorn/include)

+# Zstandard
+add_subdirectory(zstd/build/cmake EXCLUDE_FROM_ALL)
+target_include_directories(libzstd_static INTERFACE ./zstd/lib)
+
 # SoundTouch
 add_subdirectory(soundtouch)

--- a/externals/zstd
+++ b/externals/zstd
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -125,6 +125,8 @@ add_library(common STATIC
    uint128.h
    vector_math.h
    web_result.h
+    zstd_compression.cpp
+    zstd_compression.h
 )

 if(ARCHITECTURE_x86_64)
@@ -138,4 +140,4 @@ endif()
 create_target_directory_groups(common)

 target_link_libraries(common PUBLIC Boost::boost fmt microprofile)
-target_link_libraries(common PRIVATE lz4_static)
+target_link_libraries(common PRIVATE lz4_static libzstd_static)
--- a/src/common/assert.h
+++ b/src/common/assert.h
@@ -57,3 +57,21 @@ __declspec(noinline, noreturn)

 #define UNIMPLEMENTED_IF(cond) ASSERT_MSG(!(cond), "Unimplemented code!")
 #define UNIMPLEMENTED_IF_MSG(cond, ...) ASSERT_MSG(!(cond), __VA_ARGS__)
+
+// Asserts _a_, then runs _b_ if _a_ is false. _a_ is expanded twice, so avoid side effects in it.
+#define ASSERT_OR_EXECUTE(_a_, _b_)                                                                \
+    do {                                                                                           \
+        ASSERT(_a_);                                                                               \
+        if (!(_a_)) {                                                                              \
+            _b_                                                                                    \
+        }                                                                                          \
+    } while (0)
+
+// Asserts _a_ via ASSERT_MSG(_a_, ...), then runs _b_ if _a_ is false. _a_ is expanded twice.
+#define ASSERT_OR_EXECUTE_MSG(_a_, _b_, ...)                                                       \
+    do {                                                                                           \
+        ASSERT_MSG(_a_, __VA_ARGS__);                                                              \
+        if (!(_a_)) {                                                                              \
+            _b_                                                                                    \
+        }                                                                                          \
+    } while (0)
--- a/src/common/zstd_compression.cpp
+++ b/src/common/zstd_compression.cpp
@@ -0,0 +1,57 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <zstd.h>
+
+#include "common/assert.h"
+#include "common/zstd_compression.h"
+
+namespace Common::Compression {
+
+// Compresses the source region at compression_level, clamped to [1, ZSTD_maxCLevel()].
+std::vector<u8> CompressDataZSTD(const u8* source, std::size_t source_size, s32 compression_level) {
+    compression_level = std::clamp(compression_level, 1, ZSTD_maxCLevel());
+
+    // Allocate the worst-case output size up front so a single ZSTD_compress call suffices.
+    const std::size_t max_compressed_size = ZSTD_compressBound(source_size);
+    std::vector<u8> compressed(max_compressed_size);
+
+    const std::size_t compressed_size =
+        ZSTD_compress(compressed.data(), compressed.size(), source, source_size, compression_level);
+
+    if (ZSTD_isError(compressed_size)) {
+        // Compression failed
+        return {};
+    }
+
+    // Drop the unused tail of the worst-case allocation.
+    compressed.resize(compressed_size);
+
+    return compressed;
+}
+
+// Compresses the source region at ZSTD_CLEVEL_DEFAULT.
+std::vector<u8> CompressDataZSTDDefault(const u8* source, std::size_t source_size) {
+    return CompressDataZSTD(source, source_size, ZSTD_CLEVEL_DEFAULT);
+}
+
+// Decompresses a Zstandard frame; an empty vector signals failure.
+std::vector<u8> DecompressDataZSTD(const std::vector<u8>& compressed) {
+    // Size the output buffer from the value ZSTD_getDecompressedSize reports for this input.
+    const std::size_t decompressed_size =
+        ZSTD_getDecompressedSize(compressed.data(), compressed.size());
+    std::vector<u8> decompressed(decompressed_size);
+
+    const std::size_t uncompressed_result_size = ZSTD_decompress(
+        decompressed.data(), decompressed.size(), compressed.data(), compressed.size());
+
+    if (decompressed_size != uncompressed_result_size || ZSTD_isError(uncompressed_result_size)) {
+        // Decompression failed
+        return {};
+    }
+    return decompressed;
+}
+
+} // namespace Common::Compression
--- a/src/common/zstd_compression.h
+++ b/src/common/zstd_compression.h
@@ -0,0 +1,45 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace Common::Compression {
+
+/**
+ * Compresses a source memory region with Zstandard and returns the compressed data in a vector.
+ *
+ * @param source the uncompressed source memory region.
+ * @param source_size the size in bytes of the uncompressed source memory region.
+ * @param compression_level the compression level to use. Values outside the range
+ *                          [1, ZSTD_maxCLevel()] are clamped into it.
+ *
+ * @return the compressed data, or an empty vector if compression fails.
+ */
+std::vector<u8> CompressDataZSTD(const u8* source, std::size_t source_size, s32 compression_level);
+
+/**
+ * Compresses a source memory region with Zstandard with the default compression level and returns
+ * the compressed data in a vector.
+ *
+ * @param source the uncompressed source memory region.
+ * @param source_size the size in bytes of the uncompressed source memory region.
+ *
+ * @return the compressed data, or an empty vector if compression fails.
+ */
+std::vector<u8> CompressDataZSTDDefault(const u8* source, std::size_t source_size);
+
+/**
+ * Decompresses a source memory region with Zstandard and returns the uncompressed data in a vector.
+ *
+ * @param compressed the compressed source memory region.
+ *
+ * @return the decompressed data, or an empty vector if decompression fails.
+ */
+std::vector<u8> DecompressDataZSTD(const std::vector<u8>& compressed);
+
+} // namespace Common::Compression
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -163,7 +163,6 @@ MICROPROFILE_DEFINE(ARM_Jit_Dynarmic, "ARM JIT", "Dynarmic", MP_RGB(255, 64, 64)

 void ARM_Dynarmic::Run() {
    MICROPROFILE_SCOPE(ARM_Jit_Dynarmic);
-    ASSERT(Memory::GetCurrentPageTable() == current_page_table);

    jit->Run();
 }
@@ -278,7 +277,6 @@ void ARM_Dynarmic::ClearExclusiveState() {

 void ARM_Dynarmic::PageTableChanged() {
    jit = MakeJit();
-    current_page_table = Memory::GetCurrentPageTable();
 }

 DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(std::size_t core_count) : monitor(core_count) {}
--- a/src/core/arm/dynarmic/arm_dynarmic.h
+++ b/src/core/arm/dynarmic/arm_dynarmic.h
@@ -12,10 +12,6 @@
 #include "core/arm/exclusive_monitor.h"
 #include "core/arm/unicorn/arm_unicorn.h"

-namespace Common {
-struct PageTable;
-}
-
 namespace Core::Timing {
 class CoreTiming;
 }
@@ -69,8 +65,6 @@ private:
    std::size_t core_index;
    Timing::CoreTiming& core_timing;
    DynarmicExclusiveMonitor& exclusive_monitor;
-
-    Common::PageTable* current_page_table = nullptr;
 };

 class DynarmicExclusiveMonitor final : public ExclusiveMonitor {
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -21,6 +21,7 @@
 #include "core/hle/kernel/thread.h"
 #include "core/hle/lock.h"
 #include "core/hle/result.h"
+#include "core/memory.h"

 namespace Kernel {

@@ -181,6 +182,7 @@ void KernelCore::AppendNewProcess(SharedPtr<Process> process) {

 void KernelCore::MakeCurrentProcess(Process* process) {
    impl->current_process = process;
+    Memory::SetCurrentPageTable(&process->VMManager().page_table);
 }

 Process* KernelCore::CurrentProcess() {
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -32,9 +32,6 @@ namespace {
 * @param priority The priority to give the main thread
 */
 void SetupMainThread(Process& owner_process, KernelCore& kernel, VAddr entry_point, u32 priority) {
-    // Setup page table so we can write to memory
-    Memory::SetCurrentPageTable(&owner_process.VMManager().page_table);
-
    // Initialize new "main" thread
    const VAddr stack_top = owner_process.VMManager().GetTLSIORegionEndAddress();
    auto thread_res = Thread::Create(kernel, "main", entry_point, priority, 0,
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -101,7 +101,6 @@ void Scheduler::SwitchContext(Thread* new_thread) {
        auto* const thread_owner_process = current_thread->GetOwnerProcess();
        if (previous_process != thread_owner_process) {
            system.Kernel().MakeCurrentProcess(thread_owner_process);
-            Memory::SetCurrentPageTable(&thread_owner_process->VMManager().page_table);
        }

        cpu_core.LoadContext(new_thread->GetContext());
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -38,10 +38,6 @@ void SetCurrentPageTable(Common::PageTable* page_table) {
    }
 }

-Common::PageTable* GetCurrentPageTable() {
-    return current_page_table;
-}
-
 static void MapPages(Common::PageTable& page_table, VAddr base, u64 size, u8* memory,
                     Common::PageType type) {
    LOG_DEBUG(HW_Memory, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * PAGE_SIZE,
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -28,16 +28,6 @@ constexpr u64 PAGE_MASK = PAGE_SIZE - 1;

 /// Virtual user-space memory regions
 enum : VAddr {
-    /// Read-only page containing kernel and system configuration values.
-    CONFIG_MEMORY_VADDR = 0x1FF80000,
-    CONFIG_MEMORY_SIZE = 0x00001000,
-    CONFIG_MEMORY_VADDR_END = CONFIG_MEMORY_VADDR + CONFIG_MEMORY_SIZE,
-
-    /// Usually read-only page containing mostly values read from hardware.
-    SHARED_PAGE_VADDR = 0x1FF81000,
-    SHARED_PAGE_SIZE = 0x00001000,
-    SHARED_PAGE_VADDR_END = SHARED_PAGE_VADDR + SHARED_PAGE_SIZE,
-
    /// TLS (Thread-Local Storage) related.
    TLS_ENTRY_SIZE = 0x200,

@@ -50,9 +40,8 @@ enum : VAddr {
    KERNEL_REGION_END = KERNEL_REGION_VADDR + KERNEL_REGION_SIZE,
 };

-/// Currently active page table
+/// Changes the currently active page table.
 void SetCurrentPageTable(Common::PageTable* page_table);
-Common::PageTable* GetCurrentPageTable();

 /// Determines if the given VAddr is valid for the specified process.
 bool IsValidVirtualAddress(const Kernel::Process& process, VAddr vaddr);
--- a/src/tests/core/arm/arm_test_common.cpp
+++ b/src/tests/core/arm/arm_test_common.cpp
@@ -17,7 +17,6 @@ TestEnvironment::TestEnvironment(bool mutable_memory_)
    : mutable_memory(mutable_memory_),
      test_memory(std::make_shared<TestMemory>(this)), kernel{Core::System::GetInstance()} {
    auto process = Kernel::Process::Create(Core::System::GetInstance(), "");
-    kernel.MakeCurrentProcess(process.get());
    page_table = &process->VMManager().page_table;

    std::fill(page_table->pointers.begin(), page_table->pointers.end(), nullptr);
@@ -28,7 +27,7 @@ TestEnvironment::TestEnvironment(bool mutable_memory_)
    Memory::MapIoRegion(*page_table, 0x00000000, 0x80000000, test_memory);
    Memory::MapIoRegion(*page_table, 0x80000000, 0x80000000, test_memory);

-    Memory::SetCurrentPageTable(page_table);
+    kernel.MakeCurrentProcess(process.get());
 }

 TestEnvironment::~TestEnvironment() {
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -334,8 +334,8 @@ void Maxwell3D::ProcessSyncPoint() {
    const u32 sync_point = regs.sync_info.sync_point.Value();
    const u32 increment = regs.sync_info.increment.Value();
    const u32 cache_flush = regs.sync_info.unknown.Value();
-    UNIMPLEMENTED_MSG("Syncpoint Set {}, increment: {}, unk: {}", sync_point, increment,
-                      cache_flush);
+    LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
+              cache_flush);
 }

 void Maxwell3D::DrawArrays() {
@@ -482,19 +482,8 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
    return textures;
 }

-Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
-                                                    std::size_t offset) const {
-    auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
-    auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
-    ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
-
-    const GPUVAddr tex_info_address =
-        tex_info_buffer.address + offset * sizeof(Texture::TextureHandle);
-
-    ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
-
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
-
+Texture::FullTextureInfo Maxwell3D::GetTextureInfo(const Texture::TextureHandle tex_handle,
+                                                   std::size_t offset) const {
    Texture::FullTextureInfo tex_info{};
    tex_info.index = static_cast<u32>(offset);

@@ -511,6 +500,22 @@ Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
    return tex_info;
 }

+Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
+                                                    std::size_t offset) const {
+    const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
+    const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
+    ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
+
+    const GPUVAddr tex_info_address = // offset counts TextureHandle-sized entries, not bytes
+        tex_info_buffer.address + offset * sizeof(Texture::TextureHandle);
+
+    ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
+
+    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+
+    return GetTextureInfo(tex_handle, offset); // shared with callers that already hold a handle
+}
+
 u32 Maxwell3D::GetRegisterValue(u32 method) const {
    ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
    return regs.reg_array[method];
@@ -524,4 +529,12 @@ void Maxwell3D::ProcessClearBuffers() {
    rasterizer.Clear();
 }

+/// Reads the 32-bit word at byte offset `offset` of const buffer `const_buffer` in `stage`.
+u32 Maxwell3D::AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u64 offset) const {
+    const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)];
+    const auto& buffer = shader_stage.const_buffers[const_buffer];
+    // Use Read<u32>, like GetStageTexture, instead of memcpy from an unchecked host pointer.
+    return memory_manager.Read<u32>(buffer.address + offset);
+}
+
 } // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1131,12 +1131,18 @@ public:
    /// Write the value to the register identified by method.
    void CallMethod(const GPU::MethodCall& method_call);

+    /// Given a Texture Handle, returns the TSC and TIC entries.
+    Texture::FullTextureInfo GetTextureInfo(const Texture::TextureHandle tex_handle,
+                                            std::size_t offset) const;
+
    /// Returns a list of enabled textures for the specified shader stage.
    std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;

    /// Returns the texture information for a specific texture in a specific shader stage.
    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const;

+    u32 AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u64 offset) const;
+
    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
    /// we've seen used.
    using MacroMemory = std::array<u32, 0x40000>;
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -966,6 +966,38 @@ union Instruction {
        }
    } tex;

+    union {
+        BitField<28, 1, u64> array;
+        BitField<29, 2, TextureType> texture_type;
+        BitField<31, 4, u64> component_mask;
+        BitField<49, 1, u64> nodep_flag;
+        BitField<50, 1, u64> dc_flag;
+        BitField<36, 1, u64> aoffi_flag;
+        BitField<37, 3, TextureProcessMode> process_mode;
+
+        bool IsComponentEnabled(std::size_t component) const {
+            return ((1ULL << component) & component_mask) != 0; // tests bit `component` of the mask
+        }
+
+        TextureProcessMode GetTextureProcessMode() const {
+            return process_mode;
+        }
+
+        bool UsesMiscMode(TextureMiscMode mode) const {
+            switch (mode) {
+            case TextureMiscMode::DC:
+                return dc_flag != 0;
+            case TextureMiscMode::NODEP:
+                return nodep_flag != 0;
+            case TextureMiscMode::AOFFI:
+                return aoffi_flag != 0;
+            default: // every other misc mode is reported as not used
+                break;
+            }
+            return false;
+        }
+    } tex_b;
+
    union {
        BitField<22, 6, TextureQueryType> query_type;
        BitField<31, 4, u64> component_mask;
@@ -1309,7 +1341,9 @@ public:
        LDG, // Load from global memory
        STG, // Store in global memory
        TEX,
+        TEX_B,  // Texture Load Bindless
        TXQ,    // Texture Query
+        TXQ_B,  // Texture Query Bindless
        TEXS,   // Texture Fetch with scalar/non-vec4 source/destinations
        TLDS,   // Texture Load with scalar/non-vec4 source/destinations
        TLD4,   // Texture Load 4
@@ -1577,7 +1611,9 @@ private:
            INST("1110111011010---", Id::LDG, Type::Memory, "LDG"),
            INST("1110111011011---", Id::STG, Type::Memory, "STG"),
            INST("110000----111---", Id::TEX, Type::Texture, "TEX"),
+            INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"),
            INST("1101111101001---", Id::TXQ, Type::Texture, "TXQ"),
+            INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"),
            INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"),
            INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"),
            INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -31,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {

 GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
    auto& rasterizer{renderer.Rasterizer()};
-    memory_manager = std::make_unique<Tegra::MemoryManager>();
+    memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -5,16 +5,13 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
-#include "core/core.h"
 #include "core/memory.h"
-#include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
-#include "video_core/renderer_base.h"

 namespace Tegra {

-MemoryManager::MemoryManager() {
+MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {
    std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
    std::fill(page_table.attributes.begin(), page_table.attributes.end(),
              Common::PageType::Unmapped);
@@ -70,8 +67,7 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
    const u64 aligned_size{Common::AlignUp(size, page_size)};
    const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))};

-    Core::System::GetInstance().Renderer().Rasterizer().FlushAndInvalidateRegion(cache_addr,
-                                                                                 aligned_size);
+    rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
    UnmapRange(gpu_addr, aligned_size);

    return gpu_addr;
@@ -204,14 +200,85 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
 }

 void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const {
-    std::memcpy(dest_buffer, GetPointer(src_addr), size);
+    std::size_t remaining_size{size};
+    std::size_t page_index{src_addr >> page_bits};
+    std::size_t page_offset{src_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{ // never more than what is left of the current page
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+        switch (page_table.attributes[page_index]) {
+        case Common::PageType::Memory: {
+            const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+            rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); // flush before reading
+            std::memcpy(dest_buffer, src_ptr, copy_amount);
+            break;
+        }
+        default: // only PageType::Memory pages are handled
+            UNREACHABLE();
+        }
+
+        page_index++;
+        page_offset = 0; // every page after the first is read from its start
+        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
 }
+
 void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) {
-    std::memcpy(GetPointer(dest_addr), src_buffer, size);
+    std::size_t remaining_size{size};
+    std::size_t page_index{dest_addr >> page_bits};
+    std::size_t page_offset{dest_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{ // never more than what is left of the current page
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+        switch (page_table.attributes[page_index]) {
+        case Common::PageType::Memory: {
+            u8* dest_ptr{page_table.pointers[page_index] + page_offset};
+            rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); // before overwriting
+            std::memcpy(dest_ptr, src_buffer, copy_amount);
+            break;
+        }
+        default: // only PageType::Memory pages are handled
+            UNREACHABLE();
+        }
+
+        page_index++;
+        page_offset = 0; // every page after the first is written from its start
+        src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
 }

 void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) {
-    std::memcpy(GetPointer(dest_addr), GetPointer(src_addr), size);
+    std::size_t remaining_size{size};
+    std::size_t page_index{src_addr >> page_bits};
+    std::size_t page_offset{src_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{ // never more than what is left of the current source page
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+        switch (page_table.attributes[page_index]) {
+        case Common::PageType::Memory: {
+            const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+            rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
+            WriteBlock(dest_addr, src_ptr, copy_amount); // splits on destination pages itself
+            break;
+        }
+        default: // only PageType::Memory source pages are handled
+            UNREACHABLE();
+        }
+
+        page_index++;
+        page_offset = 0;
+        // The source position lives in page_index/page_offset; only dest_addr needs advancing.
+        dest_addr += static_cast<GPUVAddr>(copy_amount);
+        remaining_size -= copy_amount;
+    }
 }

 void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
@@ -351,7 +418,7 @@ MemoryManager::VMAIter MemoryManager::CarveVMA(GPUVAddr base, u64 size) {
    const VirtualMemoryArea& vma{vma_handle->second};
    if (vma.type == VirtualMemoryArea::Type::Mapped) {
        // Region is already allocated
-        return {};
+        return vma_handle;
    }

    const VAddr start_in_vma{base - vma.base};
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -10,6 +10,10 @@
 #include "common/common_types.h"
 #include "common/page_table.h"

+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra {

 /**
@@ -43,7 +47,7 @@ struct VirtualMemoryArea {

 class MemoryManager final {
 public:
-    MemoryManager();
+    MemoryManager(VideoCore::RasterizerInterface& rasterizer);

    GPUVAddr AllocateSpace(u64 size, u64 align);
    GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align);
@@ -144,6 +148,7 @@ private:

    Common::PageTable page_table{page_bits};
    VMAMap vma_map;
+    VideoCore::RasterizerInterface& rasterizer;
 };

 } // namespace Tegra
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -983,7 +983,15 @@ void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& s

    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
        const auto& entry = entries[bindpoint];
-        const auto texture = maxwell3d.GetStageTexture(stage, entry.GetOffset());
+        Tegra::Texture::FullTextureInfo texture;
+        if (entry.IsBindless()) {
+            const auto cbuf = entry.GetBindlessCBuf();
+            Tegra::Texture::TextureHandle tex_handle;
+            tex_handle.raw = maxwell3d.AccessConstBuffer32(stage, cbuf.first, cbuf.second);
+            texture = maxwell3d.GetTextureInfo(tex_handle, entry.GetOffset());
+        } else {
+            texture = maxwell3d.GetStageTexture(stage, entry.GetOffset());
+        }
        const u32 current_bindpoint = base_bindings.sampler + bindpoint;

        texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc);
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -39,6 +39,10 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
 /// Gets the shader program code from memory for the specified address
 ProgramCode GetShaderCode(const u8* host_ptr) {
    ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
+    ASSERT_OR_EXECUTE(host_ptr != nullptr, {
+        std::fill(program_code.begin(), program_code.end(), 0);
+        return program_code;
+    });
    std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64));
    return program_code;
 }
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -21,6 +21,8 @@

 namespace OpenGL::GLShader {

+namespace {
+
 using Tegra::Shader::Attribute;
 using Tegra::Shader::AttributeUse;
 using Tegra::Shader::Header;
@@ -34,14 +36,18 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
 using Operation = const OperationNode&;

+enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
+
+struct TextureAoffi {};
+using TextureArgument = std::pair<Type, Node>;
+using TextureIR = std::variant<TextureAoffi, TextureArgument>;
+
 enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
    static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
 constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
    static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float);

-enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
-
 class ShaderWriter {
 public:
    void AddExpression(std::string_view text) {
@@ -91,7 +97,7 @@ private:
 };

 /// Generates code to use for a swizzle operation.
-static std::string GetSwizzle(u32 elem) {
+std::string GetSwizzle(u32 elem) {
    ASSERT(elem <= 3);
    std::string swizzle = ".";
    swizzle += "xyzw"[elem];
@@ -99,7 +105,7 @@ static std::string GetSwizzle(u32 elem) {
 }

 /// Translate topology
-static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
+std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
    switch (topology) {
    case Tegra::Shader::OutputTopology::PointList:
        return "points";
@@ -114,7 +120,7 @@ static std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
 }

 /// Returns true if an object has to be treated as precise
-static bool IsPrecise(Operation operand) {
+bool IsPrecise(Operation operand) {
    const auto& meta = operand.GetMeta();

    if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) {
@@ -126,7 +132,7 @@ static bool IsPrecise(Operation operand) {
    return false;
 }

-static bool IsPrecise(Node node) {
+bool IsPrecise(Node node) {
    if (const auto operation = std::get_if<OperationNode>(node)) {
        return IsPrecise(*operation);
    }
@@ -723,8 +729,8 @@ private:
                                                         result_type));
    }

-    std::string GenerateTexture(Operation operation, const std::string& func,
-                                const std::vector<std::pair<Type, Node>>& extras) {
+    std::string GenerateTexture(Operation operation, const std::string& function_suffix,
+                                const std::vector<TextureIR>& extras) {
        constexpr std::array<const char*, 4> coord_constructors = {"float", "vec2", "vec3", "vec4"};

        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
@@ -734,11 +740,11 @@ private:
        const bool has_array = meta->sampler.IsArray();
        const bool has_shadow = meta->sampler.IsShadow();

-        std::string expr = func;
-        expr += '(';
-        expr += GetSampler(meta->sampler);
-        expr += ", ";
-
+        std::string expr = "texture" + function_suffix;
+        if (!meta->aoffi.empty()) {
+            expr += "Offset";
+        }
+        expr += '(' + GetSampler(meta->sampler) + ", ";
        expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1);
        expr += '(';
        for (std::size_t i = 0; i < count; ++i) {
@@ -756,38 +762,76 @@ private:
        }
        expr += ')';

-        for (const auto& extra_pair : extras) {
-            const auto [type, operand] = extra_pair;
-            if (operand == nullptr) {
-                continue;
-            }
-            expr += ", ";
-
-            switch (type) {
-            case Type::Int:
-                if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
-                    // Inline the string as an immediate integer in GLSL (some extra arguments are
-                    // required to be constant)
-                    expr += std::to_string(static_cast<s32>(immediate->GetValue()));
-                } else {
-                    expr += "ftoi(" + Visit(operand) + ')';
-                }
-                break;
-            case Type::Float:
-                expr += Visit(operand);
-                break;
-            default: {
-                const auto type_int = static_cast<u32>(type);
-                UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int);
-                expr += '0';
-                break;
-            }
+        for (const auto& variant : extras) {
+            if (const auto argument = std::get_if<TextureArgument>(&variant)) {
+                expr += GenerateTextureArgument(*argument);
+            } else if (std::get_if<TextureAoffi>(&variant)) {
+                expr += GenerateTextureAoffi(meta->aoffi);
+            } else {
+                UNREACHABLE();
            }
        }

        return expr + ')';
    }

+    /// Renders one optional extra argument of a GLSL texture call, ", " separator included.
+    /// Yields an empty string when the argument has no operand attached.
+    std::string GenerateTextureArgument(TextureArgument argument) {
+        const auto [type, operand] = argument;
+        if (operand == nullptr) {
+            return {};
+        }
+
+        if (type == Type::Float) {
+            return ", " + Visit(operand);
+        }
+        if (type == Type::Int) {
+            if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+                // Some extra arguments are required to be constant in GLSL, so immediates are
+                // written out as integer literals instead of going through Visit
+                return ", " + std::to_string(static_cast<s32>(immediate->GetValue()));
+            }
+            return ", ftoi(" + Visit(operand) + ')';
+        }
+
+        // Any other type is not handled: report it and emit a placeholder literal
+        const auto type_int = static_cast<u32>(type);
+        UNIMPLEMENTED_MSG("Unimplemented extra type={}", type_int);
+        return ", 0";
+    }
+
+    /// Renders the AOFFI offset as an extra ", int(...)" / ", ivecN(...)" texture call argument.
+    /// Yields an empty string when there are no offset components.
+    std::string GenerateTextureAoffi(const std::vector<Node>& aoffi) {
+        if (aoffi.empty()) {
+            return {};
+        }
+        // One constructor name per component count; .at() rejects more than three components
+        constexpr std::array<const char*, 3> coord_constructors = {"int", "ivec2", "ivec3"};
+        std::string expr = std::string(", ") + coord_constructors.at(aoffi.size() - 1) + '(';
+
+        bool needs_separator = false;
+        for (const auto operand : aoffi) {
+            if (needs_separator) {
+                expr += ", ";
+            }
+            needs_separator = true;
+
+            const auto immediate = std::get_if<ImmediateNode>(operand);
+            if (immediate != nullptr) {
+                // AOFFI arguments are required to be constant by the standard, so immediates are
+                // inlined as integer literals
+                expr += std::to_string(static_cast<s32>(immediate->GetValue()));
+            } else {
+                expr += "ftoi(" + Visit(operand) + ')';
+            }
+        }
+        expr += ')';
+
+        return expr;
+    }
+
    std::string Assign(Operation operation) {
        const Node dest = operation[0];
        const Node src = operation[1];
@@ -1164,7 +1208,8 @@ private:
        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
        ASSERT(meta);

-        std::string expr = GenerateTexture(operation, "texture", {{Type::Float, meta->bias}});
+        std::string expr = GenerateTexture(
+            operation, "", {TextureAoffi{}, TextureArgument{Type::Float, meta->bias}});
        if (meta->sampler.IsShadow()) {
            expr = "vec4(" + expr + ')';
        }
@@ -1175,7 +1220,8 @@ private:
        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
        ASSERT(meta);

-        std::string expr = GenerateTexture(operation, "textureLod", {{Type::Float, meta->lod}});
+        std::string expr = GenerateTexture(
+            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureAoffi{}});
        if (meta->sampler.IsShadow()) {
            expr = "vec4(" + expr + ')';
        }
@@ -1187,7 +1233,8 @@ private:
        ASSERT(meta);

        const auto type = meta->sampler.IsShadow() ? Type::Float : Type::Int;
-        return GenerateTexture(operation, "textureGather", {{type, meta->component}}) +
+        return GenerateTexture(operation, "Gather",
+                               {TextureArgument{type, meta->component}, TextureAoffi{}}) +
               GetSwizzle(meta->element);
    }

@@ -1217,8 +1264,8 @@ private:
        ASSERT(meta);

        if (meta->element < 2) {
-            return "itof(int((" + GenerateTexture(operation, "textureQueryLod", {}) +
-                   " * vec2(256))" + GetSwizzle(meta->element) + "))";
+            return "itof(int((" + GenerateTexture(operation, "QueryLod", {}) + " * vec2(256))" +
+                   GetSwizzle(meta->element) + "))";
        }
        return "0";
    }
@@ -1571,6 +1618,8 @@ private:
    ShaderWriter code;
 };

+} // Anonymous namespace
+
 std::string GetCommonDeclarations() {
    const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
    const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -58,6 +58,7 @@ private:
 struct ShaderEntries {
    std::vector<ConstBufferEntry> const_buffers;
    std::vector<SamplerEntry> samplers;
+    std::vector<SamplerEntry> bindless_samplers;
    std::vector<GlobalMemoryEntry> global_memory_entries;
    std::array<bool, Maxwell::NumClipDistances> clip_distances{};
    std::size_t shader_length{};
@@ -68,4 +69,4 @@ std::string GetCommonDeclarations();
 ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage,
                        const std::string& suffix);

-} // namespace OpenGL::GLShader
+} // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -10,8 +10,8 @@
 #include "common/common_types.h"
 #include "common/file_util.h"
 #include "common/logging/log.h"
-#include "common/lz4_compression.h"
 #include "common/scm_rev.h"
+#include "common/zstd_compression.h"

 #include "core/core.h"
 #include "core/hle/kernel/process.h"
@@ -259,7 +259,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
                return {};
            }

-            dump.binary = Common::Compression::DecompressDataLZ4(compressed_binary, binary_length);
+            dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary);
            if (dump.binary.empty()) {
                return {};
            }
@@ -288,7 +288,7 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
        return {};
    }

-    const std::vector<u8> code = Common::Compression::DecompressDataLZ4(compressed_code, code_size);
+    const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code);
    if (code.empty()) {
        return {};
    }
@@ -319,16 +319,19 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
        u32 type{};
        u8 is_array{};
        u8 is_shadow{};
+        u8 is_bindless{};
        if (file.ReadBytes(&offset, sizeof(u64)) != sizeof(u64) ||
            file.ReadBytes(&index, sizeof(u64)) != sizeof(u64) ||
            file.ReadBytes(&type, sizeof(u32)) != sizeof(u32) ||
            file.ReadBytes(&is_array, sizeof(u8)) != sizeof(u8) ||
-            file.ReadBytes(&is_shadow, sizeof(u8)) != sizeof(u8)) {
+            file.ReadBytes(&is_shadow, sizeof(u8)) != sizeof(u8) ||
+            file.ReadBytes(&is_bindless, sizeof(u8)) != sizeof(u8)) {
            return {};
        }
-        entry.entries.samplers.emplace_back(
-            static_cast<std::size_t>(offset), static_cast<std::size_t>(index),
-            static_cast<Tegra::Shader::TextureType>(type), is_array != 0, is_shadow != 0);
+        entry.entries.samplers.emplace_back(static_cast<std::size_t>(offset),
+                                            static_cast<std::size_t>(index),
+                                            static_cast<Tegra::Shader::TextureType>(type),
+                                            is_array != 0, is_shadow != 0, is_bindless != 0);
    }

    u32 global_memory_count{};
@@ -388,7 +391,8 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 uniqu
            file.WriteObject(static_cast<u64>(sampler.GetIndex())) != 1 ||
            file.WriteObject(static_cast<u32>(sampler.GetType())) != 1 ||
            file.WriteObject(static_cast<u8>(sampler.IsArray() ? 1 : 0)) != 1 ||
-            file.WriteObject(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) != 1) {
+            file.WriteObject(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) != 1 ||
+            file.WriteObject(static_cast<u8>(sampler.IsBindless() ? 1 : 0)) != 1) {
            return false;
        }
    }
@@ -474,8 +478,8 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str
    if (!IsUsable())
        return;

-    const std::vector<u8> compressed_code{Common::Compression::CompressDataLZ4HC(
-        reinterpret_cast<const u8*>(code.data()), code.size(), 9)};
+    const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault(
+        reinterpret_cast<const u8*>(code.data()), code.size())};
    if (compressed_code.empty()) {
        LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}",
                  unique_identifier);
@@ -506,7 +510,7 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
    glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());

    const std::vector<u8> compressed_binary =
-        Common::Compression::CompressDataLZ4HC(binary.data(), binary.size(), 9);
+        Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size());

    if (compressed_binary.empty()) {
        LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}",
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -62,7 +62,6 @@ public:
        UpdatePipeline();
        state.draw.shader_program = 0;
        state.draw.program_pipeline = pipeline.handle;
-        state.geometry_shaders.enabled = (gs != 0);
    }

 private:
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -10,16 +10,62 @@

 namespace OpenGL {

-OpenGLState OpenGLState::cur_state;
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;

+OpenGLState OpenGLState::cur_state;
 bool OpenGLState::s_rgb_used;

+namespace {
+
+/// Stores new_value into current_value and reports whether the two compared unequal beforehand.
+template <typename T>
+bool UpdateValue(T& current_value, const T new_value) {
+    const bool is_different = current_value != new_value;
+    current_value = new_value;
+    return is_different;
+}
+
+/// Same contract as UpdateValue for operands of two different types. current_value is taken by
+/// value, so the assignment only reaches the caller when T1 holds references, as the tuples
+/// produced by std::tie do.
+template <typename T1, typename T2>
+bool UpdateTie(T1 current_value, const T2 new_value) {
+    const bool is_different = current_value != new_value;
+    current_value = new_value;
+    return is_different;
+}
+
+/// Turns an OpenGL capability on or off.
+void Enable(GLenum cap, bool enable) {
+    if (!enable) {
+        glDisable(cap);
+        return;
+    }
+    glEnable(cap);
+}
+
+/// Turns an indexed OpenGL capability on or off for the given index.
+void Enable(GLenum cap, GLuint index, bool enable) {
+    if (!enable) {
+        glDisablei(cap, index);
+        return;
+    }
+    glEnablei(cap, index);
+}
+
+/// Updates the cached flag and only calls into OpenGL when the flag actually changed.
+void Enable(GLenum cap, bool& current_value, bool new_value) {
+    const bool changed = UpdateValue(current_value, new_value);
+    if (changed) {
+        Enable(cap, new_value);
+    }
+}
+
+/// Indexed variant of the cached toggle above.
+void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) {
+    const bool changed = UpdateValue(current_value, new_value);
+    if (changed) {
+        Enable(cap, index, new_value);
+    }
+}
+
+} // namespace
+
 OpenGLState::OpenGLState() {
    // These all match default OpenGL values
-    geometry_shaders.enabled = false;
    framebuffer_srgb.enabled = false;
+
    multisample_control.alpha_to_coverage = false;
    multisample_control.alpha_to_one = false;
+
    cull.enabled = false;
    cull.mode = GL_BACK;
    cull.front_face = GL_CCW;
@@ -30,14 +76,15 @@ OpenGLState::OpenGLState() {

    primitive_restart.enabled = false;
    primitive_restart.index = 0;
+
    for (auto& item : color_mask) {
        item.red_enabled = GL_TRUE;
        item.green_enabled = GL_TRUE;
        item.blue_enabled = GL_TRUE;
        item.alpha_enabled = GL_TRUE;
    }
-    stencil.test_enabled = false;
-    auto reset_stencil = [](auto& config) {
+
+    const auto ResetStencil = [](auto& config) {
        config.test_func = GL_ALWAYS;
        config.test_ref = 0;
        config.test_mask = 0xFFFFFFFF;
@@ -46,8 +93,10 @@ OpenGLState::OpenGLState() {
        config.action_depth_pass = GL_KEEP;
        config.action_stencil_fail = GL_KEEP;
    };
-    reset_stencil(stencil.front);
-    reset_stencil(stencil.back);
+    stencil.test_enabled = false;
+    ResetStencil(stencil.front);
+    ResetStencil(stencil.back);
+
    for (auto& item : viewports) {
        item.x = 0;
        item.y = 0;
@@ -61,6 +110,7 @@ OpenGLState::OpenGLState() {
        item.scissor.width = 0;
        item.scissor.height = 0;
    }
+
    for (auto& item : blend) {
        item.enabled = true;
        item.rgb_equation = GL_FUNC_ADD;
@@ -70,11 +120,14 @@ OpenGLState::OpenGLState() {
        item.src_a_func = GL_ONE;
        item.dst_a_func = GL_ZERO;
    }
+
    independant_blend.enabled = false;
+
    blend_color.red = 0.0f;
    blend_color.green = 0.0f;
    blend_color.blue = 0.0f;
    blend_color.alpha = 0.0f;
+
    logic_op.enabled = false;
    logic_op.operation = GL_COPY;

@@ -91,9 +144,12 @@ OpenGLState::OpenGLState() {
    clip_distance = {};

    point.size = 1;
+
    fragment_color_clamp.enabled = false;
+
    depth_clamp.far_plane = false;
    depth_clamp.near_plane = false;
+
    polygon_offset.fill_enable = false;
    polygon_offset.line_enable = false;
    polygon_offset.point_enable = false;
@@ -103,132 +159,380 @@ OpenGLState::OpenGLState() {
 }

 void OpenGLState::ApplyDefaultState() {
+    glEnable(GL_BLEND);
    glDisable(GL_FRAMEBUFFER_SRGB);
    glDisable(GL_CULL_FACE);
    glDisable(GL_DEPTH_TEST);
    glDisable(GL_PRIMITIVE_RESTART);
    glDisable(GL_STENCIL_TEST);
-    glEnable(GL_BLEND);
    glDisable(GL_COLOR_LOGIC_OP);
    glDisable(GL_SCISSOR_TEST);
 }

+/// Binds the read and draw framebuffers, each only when it differs from the value in cur_state.
+void OpenGLState::ApplyFramebufferState() const {
+    const bool read_changed = UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer);
+    if (read_changed) {
+        glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
+    }
+
+    const bool draw_changed = UpdateValue(cur_state.draw.draw_framebuffer, draw.draw_framebuffer);
+    if (draw_changed) {
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
+    }
+}
+
+/// Binds the vertex array object when it differs from the one recorded in cur_state.
+void OpenGLState::ApplyVertexArrayState() const {
+    if (!UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) {
+        return;
+    }
+    glBindVertexArray(draw.vertex_array);
+}
+
+/// Makes the shader program current when it differs from the one recorded in cur_state.
+void OpenGLState::ApplyShaderProgram() const {
+    if (!UpdateValue(cur_state.draw.shader_program, draw.shader_program)) {
+        return;
+    }
+    glUseProgram(draw.shader_program);
+}
+
+/// Binds the program pipeline when it differs from the one recorded in cur_state.
+void OpenGLState::ApplyProgramPipeline() const {
+    if (!UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) {
+        return;
+    }
+    glBindProgramPipeline(draw.program_pipeline);
+}
+
+/// Toggles each GL_CLIP_DISTANCEn capability whose flag differs from cur_state.
+void OpenGLState::ApplyClipDistances() const {
+    for (std::size_t plane = 0; plane < clip_distance.size(); ++plane) {
+        // The capability enums are consecutive, so the plane number is added to the first one
+        const GLenum cap = GL_CLIP_DISTANCE0 + static_cast<GLenum>(plane);
+        Enable(cap, cur_state.clip_distance[plane], clip_distance[plane]);
+    }
+}
+
+/// Sets the point size when it differs from the one recorded in cur_state.
+void OpenGLState::ApplyPointSize() const {
+    if (!UpdateValue(cur_state.point.size, point.size)) {
+        return;
+    }
+    glPointSize(point.size);
+}
+
+/// Updates fragment color clamping when the flag differs from the one recorded in cur_state.
+void OpenGLState::ApplyFragmentColorClamp() const {
+    if (!UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) {
+        return;
+    }
+    const GLenum clamp_mode = fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE;
+    glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB, clamp_mode);
+}
+
+/// Toggles alpha-to-coverage and alpha-to-one, each only when its flag differs from cur_state.
+void OpenGLState::ApplyMultisample() const {
+    auto& cached = cur_state.multisample_control;
+    Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cached.alpha_to_coverage,
+           multisample_control.alpha_to_coverage);
+    Enable(GL_SAMPLE_ALPHA_TO_ONE, cached.alpha_to_one, multisample_control.alpha_to_one);
+}
+
+/// Toggles GL_DEPTH_CLAMP when either clamp plane flag differs from cur_state.
+void OpenGLState::ApplyDepthClamp() const {
+    const bool far_unchanged = depth_clamp.far_plane == cur_state.depth_clamp.far_plane;
+    const bool near_unchanged = depth_clamp.near_plane == cur_state.depth_clamp.near_plane;
+    if (far_unchanged && near_unchanged) {
+        return;
+    }
+    cur_state.depth_clamp = depth_clamp;
+
+    // A single capability covers both planes, so differing flags cannot be expressed here
+    UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
+                         "Unimplemented Depth Clamp Separation!");
+
+    const bool any_plane_clamped = depth_clamp.far_plane || depth_clamp.near_plane;
+    Enable(GL_DEPTH_CLAMP, any_plane_clamped);
+}
+
 void OpenGLState::ApplySRgb() const {
-    if (framebuffer_srgb.enabled != cur_state.framebuffer_srgb.enabled) {
-        if (framebuffer_srgb.enabled) {
-            // Track if sRGB is used
-            s_rgb_used = true;
-            glEnable(GL_FRAMEBUFFER_SRGB);
-        } else {
-            glDisable(GL_FRAMEBUFFER_SRGB);
-        }
+    if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled)
+        return;
+    cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled;
+    if (framebuffer_srgb.enabled) {
+        // Track if sRGB is used
+        s_rgb_used = true;
+        glEnable(GL_FRAMEBUFFER_SRGB);
+    } else {
+        glDisable(GL_FRAMEBUFFER_SRGB);
    }
 }

 void OpenGLState::ApplyCulling() const {
-    if (cull.enabled != cur_state.cull.enabled) {
-        if (cull.enabled) {
-            glEnable(GL_CULL_FACE);
-        } else {
-            glDisable(GL_CULL_FACE);
-        }
-    }
+    Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled);

-    if (cull.mode != cur_state.cull.mode) {
+    if (UpdateValue(cur_state.cull.mode, cull.mode)) {
        glCullFace(cull.mode);
    }

-    if (cull.front_face != cur_state.cull.front_face) {
+    if (UpdateValue(cur_state.cull.front_face, cull.front_face)) {
        glFrontFace(cull.front_face);
    }
 }

 void OpenGLState::ApplyColorMask() const {
-    if (independant_blend.enabled) {
-        for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
-            const auto& updated = color_mask[i];
-            const auto& current = cur_state.color_mask[i];
-            if (updated.red_enabled != current.red_enabled ||
-                updated.green_enabled != current.green_enabled ||
-                updated.blue_enabled != current.blue_enabled ||
-                updated.alpha_enabled != current.alpha_enabled) {
-                glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
-                             updated.blue_enabled, updated.alpha_enabled);
-            }
-        }
-    } else {
-        const auto& updated = color_mask[0];
-        const auto& current = cur_state.color_mask[0];
+    for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
+        const auto& updated = color_mask[i];
+        auto& current = cur_state.color_mask[i];
        if (updated.red_enabled != current.red_enabled ||
            updated.green_enabled != current.green_enabled ||
            updated.blue_enabled != current.blue_enabled ||
            updated.alpha_enabled != current.alpha_enabled) {
-            glColorMask(updated.red_enabled, updated.green_enabled, updated.blue_enabled,
-                        updated.alpha_enabled);
+            current = updated;
+            glColorMaski(static_cast<GLuint>(i), updated.red_enabled, updated.green_enabled,
+                         updated.blue_enabled, updated.alpha_enabled);
        }
    }
 }

 void OpenGLState::ApplyDepth() const {
-    if (depth.test_enabled != cur_state.depth.test_enabled) {
-        if (depth.test_enabled) {
-            glEnable(GL_DEPTH_TEST);
-        } else {
-            glDisable(GL_DEPTH_TEST);
-        }
-    }
+    Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled);

-    if (depth.test_func != cur_state.depth.test_func) {
+    if (cur_state.depth.test_func != depth.test_func) {
+        cur_state.depth.test_func = depth.test_func;
        glDepthFunc(depth.test_func);
    }

-    if (depth.write_mask != cur_state.depth.write_mask) {
+    if (cur_state.depth.write_mask != depth.write_mask) {
+        cur_state.depth.write_mask = depth.write_mask;
        glDepthMask(depth.write_mask);
    }
 }

 void OpenGLState::ApplyPrimitiveRestart() const {
-    if (primitive_restart.enabled != cur_state.primitive_restart.enabled) {
-        if (primitive_restart.enabled) {
-            glEnable(GL_PRIMITIVE_RESTART);
-        } else {
-            glDisable(GL_PRIMITIVE_RESTART);
-        }
-    }
+    Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled);

-    if (primitive_restart.index != cur_state.primitive_restart.index) {
+    if (cur_state.primitive_restart.index != primitive_restart.index) {
+        cur_state.primitive_restart.index = primitive_restart.index;
        glPrimitiveRestartIndex(primitive_restart.index);
    }
 }

 void OpenGLState::ApplyStencilTest() const {
-    if (stencil.test_enabled != cur_state.stencil.test_enabled) {
-        if (stencil.test_enabled) {
-            glEnable(GL_STENCIL_TEST);
-        } else {
-            glDisable(GL_STENCIL_TEST);
-        }
-    }
+    Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled);

-    const auto ConfigStencil = [](GLenum face, const auto& config, const auto& prev_config) {
-        if (config.test_func != prev_config.test_func || config.test_ref != prev_config.test_ref ||
-            config.test_mask != prev_config.test_mask) {
+    const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) {
+        if (current.test_func != config.test_func || current.test_ref != config.test_ref ||
+            current.test_mask != config.test_mask) {
+            current.test_func = config.test_func;
+            current.test_ref = config.test_ref;
+            current.test_mask = config.test_mask;
            glStencilFuncSeparate(face, config.test_func, config.test_ref, config.test_mask);
        }
-        if (config.action_depth_fail != prev_config.action_depth_fail ||
-            config.action_depth_pass != prev_config.action_depth_pass ||
-            config.action_stencil_fail != prev_config.action_stencil_fail) {
+        if (current.action_depth_fail != config.action_depth_fail ||
+            current.action_depth_pass != config.action_depth_pass ||
+            current.action_stencil_fail != config.action_stencil_fail) {
+            current.action_depth_fail = config.action_depth_fail;
+            current.action_depth_pass = config.action_depth_pass;
+            current.action_stencil_fail = config.action_stencil_fail;
            glStencilOpSeparate(face, config.action_stencil_fail, config.action_depth_fail,
                                config.action_depth_pass);
        }
-        if (config.write_mask != prev_config.write_mask) {
+        if (current.write_mask != config.write_mask) {
+            current.write_mask = config.write_mask;
            glStencilMaskSeparate(face, config.write_mask);
        }
    };
    ConfigStencil(GL_FRONT, stencil.front, cur_state.stencil.front);
    ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back);
 }
-// Viewport does not affects glClearBuffer so emulate viewport using scissor test
+
+/// Sends the rectangle, depth range and scissor state of every viewport slot to OpenGL, touching
+/// only the pieces whose values differ from cur_state.
+void OpenGLState::ApplyViewport() const {
+    for (GLuint index = 0; index < static_cast<GLuint>(Maxwell::NumViewports); ++index) {
+        const auto& wanted = viewports[index];
+        auto& cached = cur_state.viewports[index];
+
+        const bool rect_changed = cached.x != wanted.x || cached.y != wanted.y ||
+                                  cached.width != wanted.width || cached.height != wanted.height;
+        if (rect_changed) {
+            cached.x = wanted.x;
+            cached.y = wanted.y;
+            cached.width = wanted.width;
+            cached.height = wanted.height;
+            glViewportIndexedf(index, static_cast<GLfloat>(wanted.x),
+                               static_cast<GLfloat>(wanted.y), static_cast<GLfloat>(wanted.width),
+                               static_cast<GLfloat>(wanted.height));
+        }
+
+        const bool depth_range_changed = cached.depth_range_near != wanted.depth_range_near ||
+                                         cached.depth_range_far != wanted.depth_range_far;
+        if (depth_range_changed) {
+            cached.depth_range_near = wanted.depth_range_near;
+            cached.depth_range_far = wanted.depth_range_far;
+            glDepthRangeIndexed(index, wanted.depth_range_near, wanted.depth_range_far);
+        }
+
+        Enable(GL_SCISSOR_TEST, index, cached.scissor.enabled, wanted.scissor.enabled);
+
+        const auto& wanted_scissor = wanted.scissor;
+        auto& cached_scissor = cached.scissor;
+        const bool scissor_changed =
+            cached_scissor.x != wanted_scissor.x || cached_scissor.y != wanted_scissor.y ||
+            cached_scissor.width != wanted_scissor.width ||
+            cached_scissor.height != wanted_scissor.height;
+        if (scissor_changed) {
+            cached_scissor.x = wanted_scissor.x;
+            cached_scissor.y = wanted_scissor.y;
+            cached_scissor.width = wanted_scissor.width;
+            cached_scissor.height = wanted_scissor.height;
+            glScissorIndexed(index, wanted_scissor.x, wanted_scissor.y, wanted_scissor.width,
+                             wanted_scissor.height);
+        }
+    }
+}
+
+/// Applies the blend configuration stored in slot 0 through the non-indexed OpenGL blend calls,
+/// skipping every piece that already matches cur_state.
+void OpenGLState::ApplyGlobalBlending() const {
+    const Blend& wanted = blend[0];
+    Blend& cached = cur_state.blend[0];
+
+    Enable(GL_BLEND, cached.enabled, wanted.enabled);
+
+    const bool funcs_changed = UpdateTie(
+        std::tie(cached.src_rgb_func, cached.dst_rgb_func, cached.src_a_func, cached.dst_a_func),
+        std::tie(wanted.src_rgb_func, wanted.dst_rgb_func, wanted.src_a_func, wanted.dst_a_func));
+    if (funcs_changed) {
+        glBlendFuncSeparate(wanted.src_rgb_func, wanted.dst_rgb_func, wanted.src_a_func,
+                            wanted.dst_a_func);
+    }
+
+    const bool equations_changed = UpdateTie(std::tie(cached.rgb_equation, cached.a_equation),
+                                             std::tie(wanted.rgb_equation, wanted.a_equation));
+    if (equations_changed) {
+        glBlendEquationSeparate(wanted.rgb_equation, wanted.a_equation);
+    }
+}
+
+/// Applies the blend configuration of one render target through the indexed OpenGL blend calls.
+/// @param target Index of the render target to update.
+/// @param force  When true the enable flag is sent even if it matches cur_state; the blend
+///               functions and equations are still only sent when they differ.
+void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
+    const auto buffer = static_cast<GLuint>(target);
+    const Blend& wanted = blend[target];
+    Blend& cached = cur_state.blend[target];
+
+    if (force || cached.enabled != wanted.enabled) {
+        cached.enabled = wanted.enabled;
+        Enable(GL_BLEND, buffer, wanted.enabled);
+    }
+
+    const bool funcs_changed = UpdateTie(
+        std::tie(cached.src_rgb_func, cached.dst_rgb_func, cached.src_a_func, cached.dst_a_func),
+        std::tie(wanted.src_rgb_func, wanted.dst_rgb_func, wanted.src_a_func, wanted.dst_a_func));
+    if (funcs_changed) {
+        glBlendFuncSeparatei(buffer, wanted.src_rgb_func, wanted.dst_rgb_func, wanted.src_a_func,
+                             wanted.dst_a_func);
+    }
+
+    const bool equations_changed = UpdateTie(std::tie(cached.rgb_equation, cached.a_equation),
+                                             std::tie(wanted.rgb_equation, wanted.a_equation));
+    if (equations_changed) {
+        glBlendEquationSeparatei(buffer, wanted.rgb_equation, wanted.a_equation);
+    }
+}
+
+/// Applies blending either per render target or globally, depending on independant_blend, and
+/// then the blend color.
+void OpenGLState::ApplyBlending() const {
+    if (!independant_blend.enabled) {
+        ApplyGlobalBlending();
+    } else {
+        // Independent blending is requested here, so the enable flags are forced exactly when
+        // cur_state still records it as disabled
+        const bool force = !cur_state.independant_blend.enabled;
+        for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) {
+            ApplyTargetBlending(target, force);
+        }
+    }
+    cur_state.independant_blend.enabled = independant_blend.enabled;
+
+    auto& cached_color = cur_state.blend_color;
+    const bool color_changed = UpdateTie(
+        std::tie(cached_color.red, cached_color.green, cached_color.blue, cached_color.alpha),
+        std::tie(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha));
+    if (color_changed) {
+        glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha);
+    }
+}
+
+/// Toggles the color logic op and selects its operation, each only when it differs from
+/// cur_state.
+void OpenGLState::ApplyLogicOp() const {
+    auto& cached = cur_state.logic_op;
+    Enable(GL_COLOR_LOGIC_OP, cached.enabled, logic_op.enabled);
+
+    const bool operation_changed = UpdateValue(cached.operation, logic_op.operation);
+    if (operation_changed) {
+        glLogicOp(logic_op.operation);
+    }
+}
+
+/// Toggles the fill/line/point polygon offset capabilities and updates the offset parameters,
+/// each only when it differs from cur_state.
+void OpenGLState::ApplyPolygonOffset() const {
+    auto& cached = cur_state.polygon_offset;
+    Enable(GL_POLYGON_OFFSET_FILL, cached.fill_enable, polygon_offset.fill_enable);
+    Enable(GL_POLYGON_OFFSET_LINE, cached.line_enable, polygon_offset.line_enable);
+    Enable(GL_POLYGON_OFFSET_POINT, cached.point_enable, polygon_offset.point_enable);
+
+    const bool parameters_changed =
+        UpdateTie(std::tie(cached.factor, cached.units, cached.clamp),
+                  std::tie(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp));
+    if (!parameters_changed) {
+        return;
+    }
+
+    const bool wants_clamp = polygon_offset.clamp != 0;
+    if (GLAD_GL_EXT_polygon_offset_clamp && wants_clamp) {
+        glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp);
+        return;
+    }
+    // Without the extension a non-zero clamp cannot be applied; report it and drop the clamp
+    UNIMPLEMENTED_IF_MSG(wants_clamp, "Unimplemented Depth polygon offset clamp.");
+    glPolygonOffset(polygon_offset.factor, polygon_offset.units);
+}
+
+/// Rebinds textures with a single glBindTextures call spanning the first to the last texture
+/// unit whose handle differs from cur_state.
+void OpenGLState::ApplyTextures() const {
+    std::array<GLuint, Maxwell::NumTextureSamplers> textures;
+    bool has_delta = false;
+    std::size_t first = 0;
+    std::size_t last = 0;
+
+    for (std::size_t unit = 0; unit < std::size(texture_units); ++unit) {
+        // Every handle is recorded, changed or not, because the bound range may span unchanged
+        // units lying between two changed ones
+        const auto handle = texture_units[unit].texture;
+        textures[unit] = handle;
+
+        auto& cached = cur_state.texture_units[unit];
+        if (cached.texture != handle) {
+            cached.texture = handle;
+            if (!has_delta) {
+                has_delta = true;
+                first = unit;
+            }
+            last = unit;
+        }
+    }
+
+    if (!has_delta) {
+        return;
+    }
+    glBindTextures(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
+                   textures.data() + first);
+}
+
+/// Rebinds samplers with a single glBindSamplers call spanning the first to the last texture
+/// unit whose sampler handle differs from cur_state.
+void OpenGLState::ApplySamplers() const {
+    bool has_delta{};
+    std::size_t first{};
+    std::size_t last{};
+    std::array<GLuint, Maxwell::NumTextureSamplers> samplers;
+
+    for (std::size_t i = 0; i < std::size(samplers); ++i) {
+        // Record every handle before the comparison: the bound range [first, last] may contain
+        // unchanged units, and their slots must not be left uninitialized
+        samplers[i] = texture_units[i].sampler;
+        if (cur_state.texture_units[i].sampler == samplers[i])
+            continue;
+        cur_state.texture_units[i].sampler = samplers[i];
+        if (!has_delta) {
+            first = i;
+            has_delta = true;
+        }
+        last = i;
+    }
+    if (has_delta) {
+        glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
+                       samplers.data() + first);
+    }
+}
+
+/// Applies this state to OpenGL by running each partial Apply* step in a fixed order.
+void OpenGLState::Apply() const {
+    // Object bindings
+    ApplyFramebufferState();
+    ApplyVertexArrayState();
+    ApplyShaderProgram();
+    ApplyProgramPipeline();
+    // Fixed-function state
+    ApplyClipDistances();
+    ApplyPointSize();
+    ApplyFragmentColorClamp();
+    ApplyMultisample();
+    ApplyDepthClamp();
+    ApplyColorMask();
+    ApplyViewport();
+    ApplyStencilTest();
+    ApplySRgb();
+    ApplyCulling();
+    ApplyDepth();
+    ApplyPrimitiveRestart();
+    ApplyBlending();
+    ApplyLogicOp();
+    // Texture unit bindings
+    ApplyTextures();
+    ApplySamplers();
+    ApplyPolygonOffset();
+}
+
 void OpenGLState::EmulateViewportWithScissor() {
    auto& current = viewports[0];
    if (current.scissor.enabled) {
@@ -251,332 +555,6 @@ void OpenGLState::EmulateViewportWithScissor() {
    }
 }

-void OpenGLState::ApplyViewport() const {
-    if (geometry_shaders.enabled) {
-        for (GLuint i = 0; i < static_cast<GLuint>(Tegra::Engines::Maxwell3D::Regs::NumViewports);
-             i++) {
-            const auto& current = cur_state.viewports[i];
-            const auto& updated = viewports[i];
-            if (updated.x != current.x || updated.y != current.y ||
-                updated.width != current.width || updated.height != current.height) {
-                glViewportIndexedf(
-                    i, static_cast<GLfloat>(updated.x), static_cast<GLfloat>(updated.y),
-                    static_cast<GLfloat>(updated.width), static_cast<GLfloat>(updated.height));
-            }
-            if (updated.depth_range_near != current.depth_range_near ||
-                updated.depth_range_far != current.depth_range_far) {
-                glDepthRangeIndexed(i, updated.depth_range_near, updated.depth_range_far);
-            }
-
-            if (updated.scissor.enabled != current.scissor.enabled) {
-                if (updated.scissor.enabled) {
-                    glEnablei(GL_SCISSOR_TEST, i);
-                } else {
-                    glDisablei(GL_SCISSOR_TEST, i);
-                }
-            }
-
-            if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y ||
-                updated.scissor.width != current.scissor.width ||
-                updated.scissor.height != current.scissor.height) {
-                glScissorIndexed(i, updated.scissor.x, updated.scissor.y, updated.scissor.width,
-                                 updated.scissor.height);
-            }
-        }
-    } else {
-        const auto& current = cur_state.viewports[0];
-        const auto& updated = viewports[0];
-        if (updated.x != current.x || updated.y != current.y || updated.width != current.width ||
-            updated.height != current.height) {
-            glViewport(updated.x, updated.y, updated.width, updated.height);
-        }
-
-        if (updated.depth_range_near != current.depth_range_near ||
-            updated.depth_range_far != current.depth_range_far) {
-            glDepthRange(updated.depth_range_near, updated.depth_range_far);
-        }
-
-        if (updated.scissor.enabled != current.scissor.enabled) {
-            if (updated.scissor.enabled) {
-                glEnable(GL_SCISSOR_TEST);
-            } else {
-                glDisable(GL_SCISSOR_TEST);
-            }
-        }
-
-        if (updated.scissor.x != current.scissor.x || updated.scissor.y != current.scissor.y ||
-            updated.scissor.width != current.scissor.width ||
-            updated.scissor.height != current.scissor.height) {
-            glScissor(updated.scissor.x, updated.scissor.y, updated.scissor.width,
-                      updated.scissor.height);
-        }
-    }
-}
-
-void OpenGLState::ApplyGlobalBlending() const {
-    const Blend& current = cur_state.blend[0];
-    const Blend& updated = blend[0];
-    if (updated.enabled != current.enabled) {
-        if (updated.enabled) {
-            glEnable(GL_BLEND);
-        } else {
-            glDisable(GL_BLEND);
-        }
-    }
-    if (!updated.enabled) {
-        return;
-    }
-    if (updated.src_rgb_func != current.src_rgb_func ||
-        updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func ||
-        updated.dst_a_func != current.dst_a_func) {
-        glBlendFuncSeparate(updated.src_rgb_func, updated.dst_rgb_func, updated.src_a_func,
-                            updated.dst_a_func);
-    }
-
-    if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) {
-        glBlendEquationSeparate(updated.rgb_equation, updated.a_equation);
-    }
-}
-
-void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
-    const Blend& updated = blend[target];
-    const Blend& current = cur_state.blend[target];
-    if (updated.enabled != current.enabled || force) {
-        if (updated.enabled) {
-            glEnablei(GL_BLEND, static_cast<GLuint>(target));
-        } else {
-            glDisablei(GL_BLEND, static_cast<GLuint>(target));
-        }
-    }
-
-    if (updated.src_rgb_func != current.src_rgb_func ||
-        updated.dst_rgb_func != current.dst_rgb_func || updated.src_a_func != current.src_a_func ||
-        updated.dst_a_func != current.dst_a_func) {
-        glBlendFuncSeparatei(static_cast<GLuint>(target), updated.src_rgb_func,
-                             updated.dst_rgb_func, updated.src_a_func, updated.dst_a_func);
-    }
-
-    if (updated.rgb_equation != current.rgb_equation || updated.a_equation != current.a_equation) {
-        glBlendEquationSeparatei(static_cast<GLuint>(target), updated.rgb_equation,
-                                 updated.a_equation);
-    }
-}
-
-void OpenGLState::ApplyBlending() const {
-    if (independant_blend.enabled) {
-        for (size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
-            ApplyTargetBlending(i,
-                                independant_blend.enabled != cur_state.independant_blend.enabled);
-        }
-    } else {
-        ApplyGlobalBlending();
-    }
-    if (blend_color.red != cur_state.blend_color.red ||
-        blend_color.green != cur_state.blend_color.green ||
-        blend_color.blue != cur_state.blend_color.blue ||
-        blend_color.alpha != cur_state.blend_color.alpha) {
-        glBlendColor(blend_color.red, blend_color.green, blend_color.blue, blend_color.alpha);
-    }
-}
-
-void OpenGLState::ApplyLogicOp() const {
-    if (logic_op.enabled != cur_state.logic_op.enabled) {
-        if (logic_op.enabled) {
-            glEnable(GL_COLOR_LOGIC_OP);
-        } else {
-            glDisable(GL_COLOR_LOGIC_OP);
-        }
-    }
-
-    if (logic_op.operation != cur_state.logic_op.operation) {
-        glLogicOp(logic_op.operation);
-    }
-}
-
-void OpenGLState::ApplyPolygonOffset() const {
-    const bool fill_enable_changed =
-        polygon_offset.fill_enable != cur_state.polygon_offset.fill_enable;
-    const bool line_enable_changed =
-        polygon_offset.line_enable != cur_state.polygon_offset.line_enable;
-    const bool point_enable_changed =
-        polygon_offset.point_enable != cur_state.polygon_offset.point_enable;
-    const bool factor_changed = polygon_offset.factor != cur_state.polygon_offset.factor;
-    const bool units_changed = polygon_offset.units != cur_state.polygon_offset.units;
-    const bool clamp_changed = polygon_offset.clamp != cur_state.polygon_offset.clamp;
-
-    if (fill_enable_changed) {
-        if (polygon_offset.fill_enable) {
-            glEnable(GL_POLYGON_OFFSET_FILL);
-        } else {
-            glDisable(GL_POLYGON_OFFSET_FILL);
-        }
-    }
-
-    if (line_enable_changed) {
-        if (polygon_offset.line_enable) {
-            glEnable(GL_POLYGON_OFFSET_LINE);
-        } else {
-            glDisable(GL_POLYGON_OFFSET_LINE);
-        }
-    }
-
-    if (point_enable_changed) {
-        if (polygon_offset.point_enable) {
-            glEnable(GL_POLYGON_OFFSET_POINT);
-        } else {
-            glDisable(GL_POLYGON_OFFSET_POINT);
-        }
-    }
-
-    if (factor_changed || units_changed || clamp_changed) {
-        if (GLAD_GL_EXT_polygon_offset_clamp && polygon_offset.clamp != 0) {
-            glPolygonOffsetClamp(polygon_offset.factor, polygon_offset.units, polygon_offset.clamp);
-        } else {
-            glPolygonOffset(polygon_offset.factor, polygon_offset.units);
-            UNIMPLEMENTED_IF_MSG(polygon_offset.clamp != 0,
-                                 "Unimplemented Depth polygon offset clamp.");
-        }
-    }
-}
-
-void OpenGLState::ApplyTextures() const {
-    bool has_delta{};
-    std::size_t first{};
-    std::size_t last{};
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures;
-
-    for (std::size_t i = 0; i < std::size(texture_units); ++i) {
-        const auto& texture_unit = texture_units[i];
-        const auto& cur_state_texture_unit = cur_state.texture_units[i];
-        textures[i] = texture_unit.texture;
-
-        if (textures[i] != cur_state_texture_unit.texture) {
-            if (!has_delta) {
-                first = i;
-                has_delta = true;
-            }
-            last = i;
-        }
-    }
-
-    if (has_delta) {
-        glBindTextures(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
-                       textures.data() + first);
-    }
-}
-
-void OpenGLState::ApplySamplers() const {
-    bool has_delta{};
-    std::size_t first{};
-    std::size_t last{};
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers;
-    for (std::size_t i = 0; i < std::size(samplers); ++i) {
-        samplers[i] = texture_units[i].sampler;
-        if (samplers[i] != cur_state.texture_units[i].sampler) {
-            if (!has_delta) {
-                first = i;
-                has_delta = true;
-            }
-            last = i;
-        }
-    }
-    if (has_delta) {
-        glBindSamplers(static_cast<GLuint>(first), static_cast<GLsizei>(last - first + 1),
-                       samplers.data() + first);
-    }
-}
-
-void OpenGLState::ApplyFramebufferState() const {
-    if (draw.read_framebuffer != cur_state.draw.read_framebuffer) {
-        glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
-    }
-    if (draw.draw_framebuffer != cur_state.draw.draw_framebuffer) {
-        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, draw.draw_framebuffer);
-    }
-}
-
-void OpenGLState::ApplyVertexArrayState() const {
-    if (draw.vertex_array != cur_state.draw.vertex_array) {
-        glBindVertexArray(draw.vertex_array);
-    }
-}
-
-void OpenGLState::ApplyDepthClamp() const {
-    if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
-        depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
-        return;
-    }
-    UNIMPLEMENTED_IF_MSG(depth_clamp.far_plane != depth_clamp.near_plane,
-                         "Unimplemented Depth Clamp Separation!");
-
-    if (depth_clamp.far_plane || depth_clamp.near_plane) {
-        glEnable(GL_DEPTH_CLAMP);
-    } else {
-        glDisable(GL_DEPTH_CLAMP);
-    }
-}
-
-void OpenGLState::Apply() const {
-    ApplyFramebufferState();
-    ApplyVertexArrayState();
-
-    // Shader program
-    if (draw.shader_program != cur_state.draw.shader_program) {
-        glUseProgram(draw.shader_program);
-    }
-
-    // Program pipeline
-    if (draw.program_pipeline != cur_state.draw.program_pipeline) {
-        glBindProgramPipeline(draw.program_pipeline);
-    }
-    // Clip distance
-    for (std::size_t i = 0; i < clip_distance.size(); ++i) {
-        if (clip_distance[i] != cur_state.clip_distance[i]) {
-            if (clip_distance[i]) {
-                glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
-            } else {
-                glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
-            }
-        }
-    }
-    // Point
-    if (point.size != cur_state.point.size) {
-        glPointSize(point.size);
-    }
-    if (fragment_color_clamp.enabled != cur_state.fragment_color_clamp.enabled) {
-        glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
-                     fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
-    }
-    if (multisample_control.alpha_to_coverage != cur_state.multisample_control.alpha_to_coverage) {
-        if (multisample_control.alpha_to_coverage) {
-            glEnable(GL_SAMPLE_ALPHA_TO_COVERAGE);
-        } else {
-            glDisable(GL_SAMPLE_ALPHA_TO_COVERAGE);
-        }
-    }
-    if (multisample_control.alpha_to_one != cur_state.multisample_control.alpha_to_one) {
-        if (multisample_control.alpha_to_one) {
-            glEnable(GL_SAMPLE_ALPHA_TO_ONE);
-        } else {
-            glDisable(GL_SAMPLE_ALPHA_TO_ONE);
-        }
-    }
-    ApplyDepthClamp();
-    ApplyColorMask();
-    ApplyViewport();
-    ApplyStencilTest();
-    ApplySRgb();
-    ApplyCulling();
-    ApplyDepth();
-    ApplyPrimitiveRestart();
-    ApplyBlending();
-    ApplyLogicOp();
-    ApplyTextures();
-    ApplySamplers();
-    ApplyPolygonOffset();
-    cur_state = *this;
-}
-
 OpenGLState& OpenGLState::UnbindTexture(GLuint handle) {
    for (auto& unit : texture_units) {
        if (unit.texture == handle) {
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -53,10 +53,6 @@ public:
        bool near_plane;
    } depth_clamp; // GL_DEPTH_CLAMP

-    struct {
-        bool enabled; // viewports arrays are only supported when geometry shaders are enabled.
-    } geometry_shaders;
-
    struct {
        bool enabled;      // GL_CULL_FACE
        GLenum mode;       // GL_CULL_FACE_MODE
@@ -184,34 +180,26 @@ public:
    static OpenGLState GetCurState() {
        return cur_state;
    }
+
    static bool GetsRGBUsed() {
        return s_rgb_used;
    }
+
    static void ClearsRGBUsed() {
        s_rgb_used = false;
    }
+
    /// Apply this state as the current OpenGL state
    void Apply() const;
-    /// Apply only the state affecting the framebuffer
-    void ApplyFramebufferState() const;
-    /// Apply only the state affecting the vertex array
-    void ApplyVertexArrayState() const;
-    /// Set the initial OpenGL state
-    static void ApplyDefaultState();
-    /// Resets any references to the given resource
-    OpenGLState& UnbindTexture(GLuint handle);
-    OpenGLState& ResetSampler(GLuint handle);
-    OpenGLState& ResetProgram(GLuint handle);
-    OpenGLState& ResetPipeline(GLuint handle);
-    OpenGLState& ResetVertexArray(GLuint handle);
-    OpenGLState& ResetFramebuffer(GLuint handle);
-    void EmulateViewportWithScissor();

-private:
-    static OpenGLState cur_state;
-    // Workaround for sRGB problems caused by
-    // QT not supporting srgb output
-    static bool s_rgb_used;
+    void ApplyFramebufferState() const;
+    void ApplyVertexArrayState() const;
+    void ApplyShaderProgram() const;
+    void ApplyProgramPipeline() const;
+    void ApplyClipDistances() const;
+    void ApplyPointSize() const;
+    void ApplyFragmentColorClamp() const;
+    void ApplyMultisample() const;
    void ApplySRgb() const;
    void ApplyCulling() const;
    void ApplyColorMask() const;
@@ -227,6 +215,26 @@ private:
    void ApplySamplers() const;
    void ApplyDepthClamp() const;
    void ApplyPolygonOffset() const;
+
+    /// Set the initial OpenGL state
+    static void ApplyDefaultState();
+
+    /// Resets any references to the given resource
+    OpenGLState& UnbindTexture(GLuint handle);
+    OpenGLState& ResetSampler(GLuint handle);
+    OpenGLState& ResetProgram(GLuint handle);
+    OpenGLState& ResetPipeline(GLuint handle);
+    OpenGLState& ResetVertexArray(GLuint handle);
+    OpenGLState& ResetFramebuffer(GLuint handle);
+
+    /// Viewport does not affect glClearBuffer, so emulate the viewport using the scissor test
+    void EmulateViewportWithScissor();
+
+private:
+    static OpenGLState cur_state;
+
+    // Workaround for sRGB problems caused by QT not supporting srgb output
+    static bool s_rgb_used;
 };

 } // namespace OpenGL
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -7,7 +7,9 @@
 #include <fmt/format.h>

 #include "common/assert.h"
+#include "common/bit_field.h"
 #include "common/common_types.h"
+#include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"

@@ -38,9 +40,24 @@ static std::size_t GetCoordCount(TextureType texture_type) {
 u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
    const auto opcode = OpCode::Decode(instr);
-
+    bool is_bindless = false;
    switch (opcode->get().GetId()) {
    case OpCode::Id::TEX: {
+        if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) {
+            LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete");
+        }
+
+        const TextureType texture_type{instr.tex.texture_type};
+        const bool is_array = instr.tex.array != 0;
+        const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI);
+        const bool depth_compare = instr.tex.UsesMiscMode(TextureMiscMode::DC);
+        const auto process_mode = instr.tex.GetTextureProcessMode();
+        WriteTexInstructionFloat(
+            bb, instr,
+            GetTexCode(instr, texture_type, process_mode, depth_compare, is_array, is_aoffi, {}));
+        break;
+    }
+    case OpCode::Id::TEX_B: {
        UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI),
                             "AOFFI is not implemented");

@@ -48,12 +65,14 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
            LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete");
        }

-        const TextureType texture_type{instr.tex.texture_type};
-        const bool is_array = instr.tex.array != 0;
-        const bool depth_compare = instr.tex.UsesMiscMode(TextureMiscMode::DC);
-        const auto process_mode = instr.tex.GetTextureProcessMode();
-        WriteTexInstructionFloat(
-            bb, instr, GetTexCode(instr, texture_type, process_mode, depth_compare, is_array));
+        const TextureType texture_type{instr.tex_b.texture_type};
+        const bool is_array = instr.tex_b.array != 0;
+        const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI);
+        const bool depth_compare = instr.tex_b.UsesMiscMode(TextureMiscMode::DC);
+        const auto process_mode = instr.tex_b.GetTextureProcessMode();
+        WriteTexInstructionFloat(bb, instr,
+                                 GetTexCode(instr, texture_type, process_mode, depth_compare,
+                                            is_array, is_aoffi, {instr.gpr20}));
        break;
    }
    case OpCode::Id::TEXS: {
@@ -78,8 +97,6 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
    }
    case OpCode::Id::TLD4: {
        ASSERT(instr.tld4.array == 0);
-        UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI),
-                             "AOFFI is not implemented");
        UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::NDV),
                             "NDV is not implemented");
        UNIMPLEMENTED_IF_MSG(instr.tld4.UsesMiscMode(TextureMiscMode::PTP),
@@ -92,8 +109,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
        const auto texture_type = instr.tld4.texture_type.Value();
        const bool depth_compare = instr.tld4.UsesMiscMode(TextureMiscMode::DC);
        const bool is_array = instr.tld4.array != 0;
-        WriteTexInstructionFloat(bb, instr,
-                                 GetTld4Code(instr, texture_type, depth_compare, is_array));
+        const bool is_aoffi = instr.tld4.UsesMiscMode(TextureMiscMode::AOFFI);
+        WriteTexInstructionFloat(
+            bb, instr, GetTld4Code(instr, texture_type, depth_compare, is_array, is_aoffi));
        break;
    }
    case OpCode::Id::TLD4S: {
@@ -127,13 +145,16 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
        Node4 values;
        for (u32 element = 0; element < values.size(); ++element) {
            auto coords_copy = coords;
-            MetaTexture meta{sampler, {}, {}, {}, {}, component, element};
+            MetaTexture meta{sampler, {}, {}, {}, {}, {}, component, element};
            values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
        }

        WriteTexsInstructionFloat(bb, instr, values);
        break;
    }
+    case OpCode::Id::TXQ_B:
+        is_bindless = true;
+        [[fallthrough]];
    case OpCode::Id::TXQ: {
        if (instr.txq.UsesMiscMode(TextureMiscMode::NODEP)) {
            LOG_WARNING(HW_GPU, "TXQ.NODEP implementation is incomplete");
@@ -143,7 +164,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
        // Sadly, not all texture instructions specify the type of texture their sampler
        // uses. This must be fixed at a later instance.
        const auto& sampler =
-            GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false);
+            is_bindless
+                ? GetBindlessSampler(instr.gpr8, Tegra::Shader::TextureType::Texture2D, false,
+                                     false)
+                : GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false);

        u32 indexer = 0;
        switch (instr.txq.query_type) {
@@ -152,9 +176,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
                if (!instr.txq.IsComponentEnabled(element)) {
                    continue;
                }
-                MetaTexture meta{sampler, {}, {}, {}, {}, {}, element};
+                MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
                const Node value =
-                    Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8));
+                    Operation(OperationCode::TextureQueryDimensions, meta,
+                              GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0)));
                SetTemporal(bb, indexer++, value);
            }
            for (u32 i = 0; i < indexer; ++i) {
@@ -168,6 +193,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
        }
        break;
    }
+    case OpCode::Id::TMML_B:
+        is_bindless = true;
+        [[fallthrough]];
    case OpCode::Id::TMML: {
        UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV),
                             "NDV is not implemented");
@@ -178,7 +206,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {

        auto texture_type = instr.tmml.texture_type.Value();
        const bool is_array = instr.tmml.array != 0;
-        const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, false);
+        const auto& sampler = is_bindless
+                                  ? GetBindlessSampler(instr.gpr20, texture_type, is_array, false)
+                                  : GetSampler(instr.sampler, texture_type, is_array, false);

        std::vector<Node> coords;

@@ -199,17 +229,19 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
            coords.push_back(GetRegister(instr.gpr8.Value() + 1));
            texture_type = TextureType::Texture2D;
        }
-
+        u32 indexer = 0;
        for (u32 element = 0; element < 2; ++element) {
+            if (!instr.tmml.IsComponentEnabled(element)) {
+                continue;
+            }
            auto params = coords;
-            MetaTexture meta{sampler, {}, {}, {}, {}, {}, element};
+            MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
            const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
-            SetTemporal(bb, element, value);
+            SetTemporal(bb, indexer++, value);
        }
-        for (u32 element = 0; element < 2; ++element) {
-            SetRegister(bb, instr.gpr0.Value() + element, GetTemporal(element));
+        for (u32 i = 0; i < indexer; ++i) {
+            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
        }
-
        break;
    }
    case OpCode::Id::TLDS: {
@@ -254,6 +286,34 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu
    return *used_samplers.emplace(entry).first;
 }

+/// Resolves the sampler used by a bindless texture instruction.
+/// The handle register is tracked back to the constant buffer read that produced it, and that
+/// (buffer index, immediate offset) pair identifies the sampler.
+/// @param reg       Register holding the bindless texture handle.
+/// @param type      Texture type the instruction samples with.
+/// @param is_array  Whether the instruction samples an array texture.
+/// @param is_shadow Whether the instruction performs a depth comparison.
+/// @return The already-registered sampler for this constant buffer location, or a newly
+///         registered one.
+const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type,
+                                            bool is_array, bool is_shadow) {
+    const Node sampler_register = GetRegister(reg);
+    const Node base_sampler =
+        TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
+    const auto cbuf = std::get_if<CbufNode>(base_sampler);
+    // std::get_if yields nullptr when the tracked node is not a constant buffer read; fail
+    // loudly here instead of dereferencing a null pointer below.
+    ASSERT(cbuf != nullptr);
+    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
+    ASSERT(cbuf_offset_imm != nullptr);
+    const auto cbuf_offset = cbuf_offset_imm->GetValue();
+    const auto cbuf_index = cbuf->GetIndex();
+    // Widen before shifting so the index really lands in the upper 32 bits of the key;
+    // shifting a 32-bit value by 32 would be undefined behaviour.
+    const u64 cbuf_key = (static_cast<u64>(cbuf_index) << 32) | cbuf_offset;
+
+    // If this sampler has already been used, return the existing mapping.
+    const auto itr =
+        std::find_if(used_samplers.begin(), used_samplers.end(),
+                     [&](const Sampler& entry) { return entry.GetOffset() == cbuf_key; });
+    if (itr != used_samplers.end()) {
+        ASSERT(itr->GetType() == type && itr->IsArray() == is_array &&
+               itr->IsShadow() == is_shadow);
+        return *itr;
+    }
+
+    // Otherwise create a new mapping for this sampler
+    const std::size_t next_index = used_samplers.size();
+    const Sampler entry{cbuf_index, cbuf_offset, next_index, type, is_array, is_shadow};
+    return *used_samplers.emplace(entry).first;
+}
+
 void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) {
    u32 dest_elem = 0;
    for (u32 elem = 0; elem < 4; ++elem) {
@@ -325,22 +385,28 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,

 Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
                               TextureProcessMode process_mode, std::vector<Node> coords,
-                               Node array, Node depth_compare, u32 bias_offset) {
+                               Node array, Node depth_compare, u32 bias_offset,
+                               std::vector<Node> aoffi,
+                               std::optional<Tegra::Shader::Register> bindless_reg) {
    const bool is_array = array;
    const bool is_shadow = depth_compare;
+    const bool is_bindless = bindless_reg.has_value();

    UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) ||
                             (texture_type == TextureType::TextureCube && is_array && is_shadow),
                         "This method is not supported.");

-    const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, is_shadow);
+    const auto& sampler = is_bindless
+                              ? GetBindlessSampler(*bindless_reg, texture_type, is_array, is_shadow)
+                              : GetSampler(instr.sampler, texture_type, is_array, is_shadow);

    const bool lod_needed = process_mode == TextureProcessMode::LZ ||
                            process_mode == TextureProcessMode::LL ||
                            process_mode == TextureProcessMode::LLA;

-    // LOD selection (either via bias or explicit textureLod) not supported in GL for
-    // sampler2DArrayShadow and samplerCubeArrayShadow.
+    // LOD selection (either via bias or explicit textureLod) not
+    // supported in GL for sampler2DArrayShadow and
+    // samplerCubeArrayShadow.
    const bool gl_lod_supported =
        !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) ||
          (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow));
@@ -358,8 +424,9 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
            lod = Immediate(0.0f);
            break;
        case TextureProcessMode::LB:
-            // If present, lod or bias are always stored in the register indexed by the gpr20
-            // field with an offset depending on the usage of the other registers
+            // If present, lod or bias are always stored in the register
+            // indexed by the gpr20 field with an offset depending on the
+            // usage of the other registers
            bias = GetRegister(instr.gpr20.Value() + bias_offset);
            break;
        case TextureProcessMode::LL:
@@ -374,7 +441,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
    Node4 values;
    for (u32 element = 0; element < values.size(); ++element) {
        auto copy_coords = coords;
-        MetaTexture meta{sampler, array, depth_compare, bias, lod, {}, element};
+        MetaTexture meta{sampler, array, depth_compare, aoffi, bias, lod, {}, element};
        values[element] = Operation(read_method, meta, std::move(copy_coords));
    }

@@ -382,9 +449,22 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
 }

 Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type,
-                           TextureProcessMode process_mode, bool depth_compare, bool is_array) {
-    const bool lod_bias_enabled =
-        (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ);
+                           TextureProcessMode process_mode, bool depth_compare, bool is_array,
+                           bool is_aoffi, std::optional<Tegra::Shader::Register> bindless_reg) {
+    const bool lod_bias_enabled{
+        (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ)};
+
+    const bool is_bindless = bindless_reg.has_value();
+
+    u64 parameter_register = instr.gpr20.Value();
+    if (is_bindless) {
+        ++parameter_register;
+    }
+
+    const u32 bias_lod_offset = (is_bindless ? 1 : 0);
+    if (lod_bias_enabled) {
+        ++parameter_register;
+    }

    const auto [coord_count, total_coord_count] = ValidateAndGetCoordinateElement(
        texture_type, depth_compare, is_array, lod_bias_enabled, 4, 5);
@@ -404,15 +484,20 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type,

    const Node array = is_array ? GetRegister(array_register) : nullptr;

+    std::vector<Node> aoffi;
+    if (is_aoffi) {
+        aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, false);
+    }
+
    Node dc{};
    if (depth_compare) {
        // Depth is always stored in the register signaled by gpr20 or in the next register if lod
        // or bias are used
-        const u64 depth_register = instr.gpr20.Value() + (lod_bias_enabled ? 1 : 0);
-        dc = GetRegister(depth_register);
+        dc = GetRegister(parameter_register++);
    }

-    return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0);
+    return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_lod_offset,
+                          aoffi, bindless_reg);
 }

 Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
@@ -448,11 +533,12 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
        dc = GetRegister(depth_register);
    }

-    return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset);
+    return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset, {},
+                          {});
 }

 Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare,
-                            bool is_array) {
+                            bool is_array, bool is_aoffi) {
    const std::size_t coord_count = GetCoordCount(texture_type);
    const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0);
    const std::size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0);
@@ -463,15 +549,27 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
    const u64 coord_register = array_register + (is_array ? 1 : 0);

    std::vector<Node> coords;
-    for (size_t i = 0; i < coord_count; ++i)
+    for (std::size_t i = 0; i < coord_count; ++i) {
        coords.push_back(GetRegister(coord_register + i));
+    }
+
+    u64 parameter_register = instr.gpr20.Value();
+    std::vector<Node> aoffi;
+    if (is_aoffi) {
+        aoffi = GetAoffiCoordinates(GetRegister(parameter_register++), coord_count, true);
+    }
+
+    Node dc{};
+    if (depth_compare) {
+        dc = GetRegister(parameter_register++);
+    }

    const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, depth_compare);

    Node4 values;
    for (u32 element = 0; element < values.size(); ++element) {
        auto coords_copy = coords;
-        MetaTexture meta{sampler, GetRegister(array_register), {}, {}, {}, {}, element};
+        MetaTexture meta{sampler, GetRegister(array_register), dc, aoffi, {}, {}, {}, element};
        values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
    }

@@ -507,7 +605,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
    Node4 values;
    for (u32 element = 0; element < values.size(); ++element) {
        auto coords_copy = coords;
-        MetaTexture meta{sampler, array, {}, {}, lod, {}, element};
+        MetaTexture meta{sampler, array, {}, {}, {}, lod, {}, element};
        values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy));
    }
    return values;
@@ -531,4 +629,45 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement(
    return {coord_count, total_coord_count};
 }

-} // namespace VideoCommon::Shader
+std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count,
+                                                bool is_tld4) { // Returns one offset node per coordinate
+    const auto [coord_offsets, size, wrap_value, // Field layout chosen by is_tld4
+                diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> {
+        if (is_tld4) {
+            return {{0, 8, 16}, 6, 32, 64}; // 6-bit fields at bits 0, 8 and 16
+        } else {
+            return {{0, 4, 8}, 4, 8, 16}; // 4-bit fields at bits 0, 4 and 8
+        }
+    }();
+    const u32 mask = (1U << size) - 1; // Selects the low `size` bits of a shifted field
+
+    std::vector<Node> aoffi;
+    aoffi.reserve(coord_count); // Exactly coord_count entries are pushed on either path
+
+    const auto aoffi_immediate{ // Holds a value only when the register tracks back to an immediate
+        TrackImmediate(aoffi_reg, global_code, static_cast<s64>(global_code.size()))};
+    if (!aoffi_immediate) {
+        // Variable access, not supported on AMD. The fields are decoded with IR operations instead.
+        LOG_WARNING(HW_GPU,
+                    "AOFFI constant folding failed, some hardware might have graphical issues");
+        for (std::size_t coord = 0; coord < coord_count; ++coord) { // .at() throws if coord >= 3
+            const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size);
+            const Node condition = // True when the extracted field is >= wrap_value
+                Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value));
+            const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value));
+            aoffi.push_back(Operation(OperationCode::Select, condition, negative, value));
+        }
+        return aoffi;
+    }
+
+    for (std::size_t coord = 0; coord < coord_count; ++coord) { // Constant path: fold to immediates
+        s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask;
+        if (value >= wrap_value) { // Fields at or above wrap_value map to value - diff_value
+            value -= diff_value;
+        }
+        aoffi.push_back(Immediate(value));
+    }
+    return aoffi;
+}
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <cstring>
 #include <map>
+#include <optional>
 #include <set>
 #include <string>
 #include <tuple>
@@ -195,9 +196,23 @@ enum class ExitMethod {

 class Sampler {
 public:
+    // Use this constructor for bounded Samplers
    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
                     bool is_array, bool is_shadow)
-        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow} {}
+        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
+          is_bindless{false} {}
+
+    // Use this constructor for bindless Samplers; offset packs cbuf_index (high 32) | cbuf_offset
+    explicit Sampler(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
+                     Tegra::Shader::TextureType type, bool is_array, bool is_shadow)
+        : offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
+          is_array{is_array}, is_shadow{is_shadow}, is_bindless{true} {}
+
+    // Use this only for serialization/deserialization; every field is stored exactly as given
+    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
+                     bool is_array, bool is_shadow, bool is_bindless)
+        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
+          is_bindless{is_bindless} {}

    std::size_t GetOffset() const {
        return offset;
@@ -219,6 +234,14 @@ public:
        return is_shadow;
    }

+    bool IsBindless() const { // Reports the is_bindless flag fixed at construction
+        return is_bindless;
+    }
+
+    std::pair<u32, u32> GetBindlessCBuf() const { // Splits offset into {high 32 bits, low 32 bits}
+        return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
+    } // NOTE(review): the shift assumes std::size_t is 64 bits wide; confirm for all targets
+
    bool operator<(const Sampler& rhs) const {
        return std::tie(offset, index, type, is_array, is_shadow) <
               std::tie(rhs.offset, rhs.index, rhs.type, rhs.is_array, rhs.is_shadow);
@@ -230,8 +253,9 @@ private:
    std::size_t offset{};
    std::size_t index{}; ///< Value used to index into the generated GLSL sampler array.
    Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
-    bool is_array{};  ///< Whether the texture is being sampled as an array texture or not.
-    bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not.
+    bool is_array{};    ///< Whether the texture is being sampled as an array texture or not.
+    bool is_shadow{};   ///< Whether the texture is being sampled as a depth texture or not.
+    bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not.
 };

 class ConstBuffer {
@@ -290,6 +314,7 @@ struct MetaTexture {
    const Sampler& sampler;
    Node array{};
    Node depth_compare{};
+    std::vector<Node> aoffi;
    Node bias{};
    Node lod{};
    Node component{};
@@ -728,6 +753,11 @@ private:
    const Sampler& GetSampler(const Tegra::Shader::Sampler& sampler,
                              Tegra::Shader::TextureType type, bool is_array, bool is_shadow);

+    // Accesses a texture sampler for a bindless texture.
+    const Sampler& GetBindlessSampler(const Tegra::Shader::Register& reg,
+                                      Tegra::Shader::TextureType type, bool is_array,
+                                      bool is_shadow);
+
    /// Extracts a sequence of bits from a node
    Node BitfieldExtract(Node value, u32 offset, u32 bits);

@@ -741,14 +771,15 @@ private:

    Node4 GetTexCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
                     Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
-                     bool is_array);
+                     bool is_array, bool is_aoffi,
+                     std::optional<Tegra::Shader::Register> bindless_reg);

    Node4 GetTexsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
                      Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
                      bool is_array);

    Node4 GetTld4Code(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
-                      bool depth_compare, bool is_array);
+                      bool depth_compare, bool is_array, bool is_aoffi);

    Node4 GetTldsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
                      bool is_array);
@@ -757,9 +788,12 @@ private:
        Tegra::Shader::TextureType texture_type, bool depth_compare, bool is_array,
        bool lod_bias_enabled, std::size_t max_coords, std::size_t max_inputs);

+    std::vector<Node> GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count, bool is_tld4);
+
    Node4 GetTextureCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
                         Tegra::Shader::TextureProcessMode process_mode, std::vector<Node> coords,
-                         Node array, Node depth_compare, u32 bias_offset);
+                         Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi,
+                         std::optional<Tegra::Shader::Register> bindless_reg);

    Node GetVideoOperand(Node op, bool is_chunk, bool is_signed, Tegra::Shader::VideoType type,
                         u64 byte_height);
@@ -773,6 +807,8 @@ private:

    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor);

+    std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor);
+
    std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor);

    template <typename... T>
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -6,6 +6,7 @@
 #include <utility>
 #include <variant>

+#include "common/common_types.h"
 #include "video_core/shader/shader_ir.h"

 namespace VideoCommon::Shader {
@@ -14,7 +15,7 @@ namespace {
 std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                   OperationCode operation_code) {
    for (; cursor >= 0; --cursor) {
-        const Node node = code[cursor];
+        const Node node = code.at(cursor);
        if (const auto operation = std::get_if<OperationNode>(node)) {
            if (operation->GetCode() == operation_code)
                return {node, cursor};
@@ -64,6 +65,20 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) {
    return nullptr;
 }

+std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) {
+    // Start one entry before the cursor to avoid infinite loops when the instruction sets the
+    // same register that it uses as an operand
+    const auto [found, found_cursor] = // NOTE(review): std::get presumably throws for non-GprNode
+        TrackRegister(&std::get<GprNode>(*tracked), code, cursor - 1);
+    if (!found) {
+        return {}; // TrackRegister produced no node
+    }
+    if (const auto immediate = std::get_if<ImmediateNode>(found)) {
+        return immediate->GetValue(); // The tracked node is an immediate: report its value
+    }
+    return {}; // A node was found but it is not an immediate
+}
+
 std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code,
                                             s64 cursor) {
    for (; cursor >= 0; --cursor) {
Author	SHA1	Message	Date
Fernando Sahmkow	ef8be408d3	Adapt Bindless to work with AOFFI	2019-04-08 12:07:56 -04:00
Fernando Sahmkow	492040bd9c	Move ConstBufferAccessor to Maxwell3d, correct mistakes and clang format.	2019-04-08 11:36:11 -04:00
Fernando Sahmkow	797e351bf8	Fix bad rebase	2019-04-08 11:35:22 -04:00
Fernando Sahmkow	c60b0b8432	Fix TMML	2019-04-08 11:35:22 -04:00
Fernando Sahmkow	a77e9a27b0	Simplify ConstBufferAccessor	2019-04-08 11:35:19 -04:00
Fernando Sahmkow	fd4e994de3	Refactor GetTextureCode and GetTexCode to use an optional instead of optional parameters	2019-04-08 11:35:18 -04:00
Fernando Sahmkow	4841440382	Implement TXQ_B	2019-04-08 11:29:52 -04:00
Fernando Sahmkow	189bd1980c	Implement TMML_B	2019-04-08 11:29:49 -04:00
Fernando Sahmkow	ac3ba9a33e	Corrections to TEX_B	2019-04-08 11:28:44 -04:00
Fernando Sahmkow	90d06acfed	Fixes to Const Buffer Accessor and Formatting	2019-04-08 11:23:47 -04:00
Fernando Sahmkow	7af82ca022	Implement Bindless Handling on SetupTexture	2019-04-08 11:23:46 -04:00
Fernando Sahmkow	fe392fff24	Unify both sampler types.	2019-04-08 11:23:45 -04:00
Fernando Sahmkow	e28fd3d0a5	Implement Bindless Samplers and TEX_B in the IR.	2019-04-08 11:23:42 -04:00
Fernando Sahmkow	c4ac05c82c	Implement Const Buffer Accessor	2019-04-08 11:19:34 -04:00
bunnei	f14328bf0a	Merge pull request #2300 from FernandoS27/null-shader shader_cache: Permit a Null Shader in case of a bad host_ptr.	2019-04-07 17:58:27 -04:00
bunnei	c2fee0e519	Merge pull request #2355 from ReinUsesLisp/sync-point maxwell_3d: Reduce severity of ProcessSyncPoint	2019-04-07 17:56:11 -04:00
bunnei	06ece52cfe	Merge pull request #2359 from FearlessTobi/port-2-prs Port citra-emu/citra#4718: "fix clang-format target when using a path with spaces on windows"	2019-04-07 17:54:57 -04:00
bunnei	8aaf418bd6	Merge pull request #2306 from ReinUsesLisp/aoffi shader_ir: Implement AOFFI for TEX and TLD4	2019-04-07 17:52:30 -04:00
bunnei	3c1ce290d0	Merge pull request #2361 from lioncash/pagetable core/memory: Minor simplifications to page table management	2019-04-07 17:50:31 -04:00
bunnei	6b18a1592f	Merge pull request #2321 from ReinUsesLisp/gl-state-rework gl_state: Rework to enable individual applies	2019-04-07 17:50:07 -04:00
bunnei	21a4e7deea	Merge pull request #2098 from FreddyFunk/disk-cache-zstd gl_shader_disk_cache: Use Zstandard for compression	2019-04-07 17:48:33 -04:00
bunnei	52ad5fa0e8	Merge pull request #2356 from lioncash/pair kernel/{server_port, server_session}: Return pairs instead of tuples from pair creation functions	2019-04-07 17:48:00 -04:00
bunnei	d9b1c24f4f	Merge pull request #2362 from lioncash/enum core/memory: Remove unused enum constants	2019-04-07 17:46:09 -04:00
bunnei	80162888e6	Merge pull request #2352 from bunnei/mem-manager-fixes memory_manager: Improved implementation of read/write/copy block.	2019-04-07 17:44:59 -04:00
Fernando Sahmkow	021cd56bc9	Permit a Null Shader in case of a bad host_ptr.	2019-04-07 07:52:01 -04:00
Lioncash	36a1e6a982	core/memory: Remove unused enum constants These are holdovers from Citra and can be removed.	2019-04-07 03:04:55 -04:00
Lioncash	abae7577d2	core/memory: Remove GetCurrentPageTable() Now that nothing actually touches the internal page table aside from the memory subsystem itself, we can remove the accessor to it.	2019-04-07 02:47:37 -04:00
Lioncash	a6a82bb004	arm/arm_dynarmic: Remove unnecessary current_page_table member Given the page table will always be guaranteed to be that of whatever the current process is, we no longer need to keep this around.	2019-04-07 02:43:51 -04:00
Lioncash	e779686a76	kernel: Handle page table switching within MakeCurrentProcess() Centralizes the page table switching to one spot, rather than making calling code deal with it everywhere.	2019-04-07 01:12:54 -04:00
khang06	945e39471d	fix clang-format target when using a path with spaces on windows	2019-04-07 02:10:01 +02:00
ReinUsesLisp	ddcb711ee8	maxwell_3d: Reduce severity of ProcessSyncPoint	2019-04-06 02:18:20 -03:00
bunnei	20be92d5e6	memory_manager: Improved implementation of read/write/copy block. - Fixes graphical issues with Chocobo's Mystery Dungeon EVERY BUDDY! - Fixes a crash with Mario Tennis Aces	2019-04-05 23:43:34 -04:00
ReinUsesLisp	78bd66d037	gl_state: Rework to enable individual applies	2019-04-03 20:26:27 -03:00
ReinUsesLisp	38658b38b4	gl_shader_decompiler: Hide local definitions inside an anonymous namespace	2019-03-31 00:26:34 -03:00
Mat M	da02946f4f	shader_ir/decode: Silent implicit sign conversion warning Co-Authored-By: ReinUsesLisp <reinuseslisp@airmail.cc>	2019-03-31 00:12:54 -03:00
ReinUsesLisp	e8abe4b77c	gl_shader_decompiler: Add AOFFI backing implementation	2019-03-30 02:55:18 -03:00
ReinUsesLisp	cb68ce7c2f	shader_ir/decode: Implement AOFFI for TEX and TLD4	2019-03-30 02:53:29 -03:00
ReinUsesLisp	cf4ecc1945	shader_ir: Implement immediate register tracking	2019-03-30 02:53:16 -03:00
unknown	eadc834bb3	gitmodules: Replace taps with spaces	2019-03-29 18:22:08 +01:00
unknown	b4857e326f	common/zstd_compression: simplify decompression interface	2019-03-29 18:22:08 +01:00
unknown	aa92da205e	gl_shader_disk_cache: Fixup clang format	2019-03-29 18:22:08 +01:00
unknown	35ebbbc167	gl_shader_disk_cache: Use Zstandard for compression	2019-03-29 18:22:08 +01:00
unknown	72477731ed	common/zstd_compression: Add Zstandard wrapper	2019-03-29 18:22:08 +01:00
unknown	ca82589350	common: Link libzstd_static	2019-03-29 18:22:07 +01:00
unknown	d85c1141b9	externals: Add libzstd_static to externals CMakeLists.txt	2019-03-29 18:22:07 +01:00
unknown	93de7a7b40	externals: Add Zstandard v1.3.8	2019-03-29 18:22:07 +01:00
unknown	a05f94dcc8	Addressed feedback	2019-03-29 18:22:07 +01:00
unknown	cec7da37b9	core: Do not link LZ4 to core. Use common/data_compression for nso segment decompression instead.	2019-03-29 18:20:48 +01:00