remove <f32>

We can remove this since it's already an f32 value.
explicitly represent 1 as a float (1.0f instead of 1)
2019-09-03 23:20:19 -04:00 · 2019-09-03 23:06:32 -04:00 · 2019-09-03 22:30:20 -04:00 · 2019-09-03 16:05:33 -04:00 · 2019-09-01 13:13:05 -04:00 · 2019-08-30 14:08:00 -04:00
46 changed files with 941 additions and 400 deletions
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -81,6 +81,7 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
    "${VIDEO_CORE}/shader/decode/shift.cpp"
    "${VIDEO_CORE}/shader/decode/video.cpp"
+    "${VIDEO_CORE}/shader/decode/warp.cpp"
    "${VIDEO_CORE}/shader/decode/xmad.cpp"
    "${VIDEO_CORE}/shader/control_flow.cpp"
    "${VIDEO_CORE}/shader/control_flow.h"
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
      "${VIDEO_CORE}/shader/decode/shift.cpp"
      "${VIDEO_CORE}/shader/decode/video.cpp"
+      "${VIDEO_CORE}/shader/decode/warp.cpp"
      "${VIDEO_CORE}/shader/decode/xmad.cpp"
      "${VIDEO_CORE}/shader/control_flow.cpp"
      "${VIDEO_CORE}/shader/control_flow.h"
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -165,13 +165,13 @@ public:
        static const FunctionInfo functions[] = {
            {0, &IAudioDevice::ListAudioDeviceName, "ListAudioDeviceName"},
            {1, &IAudioDevice::SetAudioDeviceOutputVolume, "SetAudioDeviceOutputVolume"},
-            {2, nullptr, "GetAudioDeviceOutputVolume"},
+            {2, &IAudioDevice::GetAudioDeviceOutputVolume, "GetAudioDeviceOutputVolume"},
            {3, &IAudioDevice::GetActiveAudioDeviceName, "GetActiveAudioDeviceName"},
            {4, &IAudioDevice::QueryAudioDeviceSystemEvent, "QueryAudioDeviceSystemEvent"},
            {5, &IAudioDevice::GetActiveChannelCount, "GetActiveChannelCount"},
            {6, &IAudioDevice::ListAudioDeviceName, "ListAudioDeviceNameAuto"},
            {7, &IAudioDevice::SetAudioDeviceOutputVolume, "SetAudioDeviceOutputVolumeAuto"},
-            {8, nullptr, "GetAudioDeviceOutputVolumeAuto"},
+            {8, &IAudioDevice::GetAudioDeviceOutputVolume, "GetAudioDeviceOutputVolumeAuto"},
            {10, &IAudioDevice::GetActiveAudioDeviceName, "GetActiveAudioDeviceNameAuto"},
            {11, nullptr, "QueryAudioDeviceInputEvent"},
            {12, &IAudioDevice::QueryAudioDeviceOutputEvent, "QueryAudioDeviceOutputEvent"},
@@ -246,6 +246,19 @@ private:
        rb.Push(RESULT_SUCCESS);
    }

+    void GetAudioDeviceOutputVolume(Kernel::HLERequestContext& ctx) {
+        IPC::RequestParser rp{ctx};
+
+        // The device name is taken from the request's read buffer; this stub only logs it.
+        const std::string name = Common::StringFromBuffer(ctx.ReadBuffer());
+        LOG_WARNING(Service_Audio, "(STUBBED) called. name={}", name);
+
+        // The reply carries the result code followed by a constant volume of 1.0f.
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push(1.0f);
+    }
+
    void GetActiveAudioDeviceName(Kernel::HLERequestContext& ctx) {
        LOG_WARNING(Service_Audio, "(STUBBED) called");

--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_library(video_core STATIC
-    buffer_cache.h
+    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.h
    dma_pusher.cpp
    dma_pusher.h
    debug_utils/debug_utils.cpp
@@ -100,6 +102,7 @@ add_library(video_core STATIC
    shader/decode/integer_set.cpp
    shader/decode/half_set.cpp
    shader/decode/video.cpp
+    shader/decode/warp.cpp
    shader/decode/xmad.cpp
    shader/decode/other.cpp
    shader/control_flow.cpp
--- a/src/video_core/buffer_cache.h
+++ b/src/video_core/buffer_cache.h
@@ -1,299 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <array>
-#include <memory>
-#include <mutex>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "common/alignment.h"
-#include "common/common_types.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_cache.h"
-
-namespace VideoCore {
-class RasterizerInterface;
-}
-
-namespace VideoCommon {
-
-template <typename BufferStorageType>
-class CachedBuffer final : public RasterizerCacheObject {
-public:
-    explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
-        : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
-    ~CachedBuffer() override = default;
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return size;
-    }
-
-    u8* GetWritableHostPtr() const {
-        return host_ptr;
-    }
-
-    std::size_t GetSize() const {
-        return size;
-    }
-
-    std::size_t GetCapacity() const {
-        return capacity;
-    }
-
-    bool IsInternalized() const {
-        return is_internal;
-    }
-
-    const BufferStorageType& GetBuffer() const {
-        return buffer;
-    }
-
-    void SetSize(std::size_t new_size) {
-        size = new_size;
-    }
-
-    void SetInternalState(bool is_internal_) {
-        is_internal = is_internal_;
-    }
-
-    BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
-        capacity = new_capacity;
-        std::swap(buffer, buffer_);
-        return buffer_;
-    }
-
-private:
-    u8* host_ptr{};
-    VAddr cpu_addr{};
-    std::size_t size{};
-    std::size_t capacity{};
-    bool is_internal{};
-    BufferStorageType buffer;
-};
-
-template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
-class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
-public:
-    using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
-    using BufferInfo = std::pair<const BufferType*, u64>;
-
-    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer)
-        : RasterizerCache<Buffer>{rasterizer}, system{system},
-          stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
-                                                       this->stream_buffer->GetHandle()} {}
-    ~BufferCache() = default;
-
-    void Unregister(const Buffer& entry) override {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-        if (entry->IsInternalized()) {
-            internalized_entries.erase(entry->GetCacheAddr());
-        }
-        ReserveBuffer(entry);
-        RasterizerCache<Buffer>::Unregister(entry);
-    }
-
-    void TickFrame() {
-        marked_for_destruction_index =
-            (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
-        MarkedForDestruction().clear();
-    }
-
-    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                            bool internalize = false, bool is_written = false) {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-
-        auto& memory_manager = system.GPU().MemoryManager();
-        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-        if (!host_ptr) {
-            return {GetEmptyBuffer(size), 0};
-        }
-        const auto cache_addr = ToCacheAddr(host_ptr);
-
-        // Cache management is a big overhead, so only cache entries with a given size.
-        // TODO: Figure out which size is the best for given games.
-        constexpr std::size_t max_stream_size = 0x800;
-        if (!internalize && size < max_stream_size &&
-            internalized_entries.find(cache_addr) == internalized_entries.end()) {
-            return StreamBufferUpload(host_ptr, size, alignment);
-        }
-
-        auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
-        if (!entry) {
-            return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
-        }
-
-        if (entry->GetSize() < size) {
-            IncreaseBufferSize(entry, size);
-        }
-        if (is_written) {
-            entry->MarkAsModified(true, *this);
-        }
-        return {ToHandle(entry->GetBuffer()), 0};
-    }
-
-    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
-    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
-                                std::size_t alignment = 4) {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
-    }
-
-    void Map(std::size_t max_size) {
-        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
-        buffer_offset = buffer_offset_base;
-    }
-
-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
-        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
-    }
-
-    virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
-
-protected:
-    void FlushObjectInner(const Buffer& entry) override {
-        DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
-    }
-
-    virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
-
-    virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
-
-    virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
-                                  std::size_t size, const u8* data) = 0;
-
-    virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
-                                    std::size_t size, u8* data) = 0;
-
-    virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
-                                std::size_t src_offset, std::size_t dst_offset,
-                                std::size_t size) = 0;
-
-private:
-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
-        AlignBuffer(alignment);
-        const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
-
-        buffer_ptr += size;
-        buffer_offset += size;
-        return {&stream_buffer_handle, uploaded_offset};
-    }
-
-    BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
-                                 bool internalize, bool is_written) {
-        auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        ASSERT(cpu_addr);
-
-        auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
-        entry->SetSize(size);
-        entry->SetInternalState(internalize);
-        RasterizerCache<Buffer>::Register(entry);
-
-        if (internalize) {
-            internalized_entries.emplace(ToCacheAddr(host_ptr));
-        }
-        if (is_written) {
-            entry->MarkAsModified(true, *this);
-        }
-
-        if (entry->GetCapacity() < size) {
-            MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
-        }
-
-        UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
-        return {ToHandle(entry->GetBuffer()), 0};
-    }
-
-    void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
-        const std::size_t old_size = entry->GetSize();
-        if (entry->GetCapacity() < new_size) {
-            const auto& old_buffer = entry->GetBuffer();
-            auto new_buffer = CreateBuffer(new_size);
-
-            // Copy bits from the old buffer to the new buffer.
-            CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
-            MarkedForDestruction().push_back(
-                entry->ExchangeBuffer(std::move(new_buffer), new_size));
-
-            // This buffer could have been used
-            invalidated = true;
-        }
-        // Upload the new bits.
-        const std::size_t size_diff = new_size - old_size;
-        UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
-
-        // Update entry's size in the object and in the cache.
-        Unregister(entry);
-
-        entry->SetSize(new_size);
-        RasterizerCache<Buffer>::Register(entry);
-    }
-
-    Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
-        if (auto entry = TryGetReservedBuffer(host_ptr)) {
-            return entry;
-        }
-        return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
-    }
-
-    Buffer TryGetReservedBuffer(u8* host_ptr) {
-        const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
-        if (it == buffer_reserve.end()) {
-            return {};
-        }
-        auto& reserve = it->second;
-        auto entry = reserve.back();
-        reserve.pop_back();
-        return entry;
-    }
-
-    void ReserveBuffer(Buffer entry) {
-        buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
-    }
-
-    void AlignBuffer(std::size_t alignment) {
-        // Align the offset, not the mapped pointer
-        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
-        buffer_ptr += offset_aligned - buffer_offset;
-        buffer_offset = offset_aligned;
-    }
-
-    std::vector<BufferStorageType>& MarkedForDestruction() {
-        return marked_for_destruction_ring_buffer[marked_for_destruction_index];
-    }
-
-    Core::System& system;
-
-    std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
-
-    u8* buffer_ptr = nullptr;
-    u64 buffer_offset = 0;
-    u64 buffer_offset_base = 0;
-
-    std::size_t marked_for_destruction_index = 0;
-    std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
-
-    std::unordered_set<CacheAddr> internalized_entries;
-    std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
-};
-
-} // namespace VideoCommon
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -0,0 +1,76 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_set>
+#include <utility>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+class BufferBlock { // Cache address range [cache_addr, cache_addr_end) plus size and epoch.
+public:
+    bool Overlaps(const CacheAddr start, const CacheAddr end) const {
+        return (cache_addr < end) && (cache_addr_end > start); // true if the ranges intersect
+    }
+    /// Returns true when the range other_start..other_end lies entirely within this block.
+    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
+        return cache_addr <= other_start && other_end <= cache_addr_end;
+    }
+    /// Returns the host pointer obtained from the block's starting cache address.
+    u8* GetWritableHostPtr() const {
+        return FromCacheAddr(cache_addr);
+    }
+    /// Returns the host pointer for the cache address `offset` past the start of the block.
+    u8* GetWritableHostPtr(std::size_t offset) const {
+        return FromCacheAddr(cache_addr + offset);
+    }
+    /// Returns the distance of `in_addr` from the block start. No bounds check is performed.
+    std::size_t GetOffset(const CacheAddr in_addr) const {
+        return static_cast<std::size_t>(in_addr - cache_addr);
+    }
+    /// Returns the first cache address covered by the block.
+    CacheAddr GetCacheAddr() const {
+        return cache_addr;
+    }
+    /// Returns the end of the block's range (start address + size).
+    CacheAddr GetCacheAddrEnd() const {
+        return cache_addr_end;
+    }
+    /// Moves the block to `new_addr`; the end address is recomputed from the unchanged size.
+    void SetCacheAddr(const CacheAddr new_addr) {
+        cache_addr = new_addr;
+        cache_addr_end = new_addr + size;
+    }
+    /// Returns the size the block was constructed with.
+    std::size_t GetSize() const {
+        return size;
+    }
+    /// Stores an epoch value for the block; its interpretation is left to the caller.
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
+    }
+    /// Returns the epoch last stored with SetEpoch (value-initialized until then).
+    u64 GetEpoch() const {
+        return epoch;
+    }
+    // Construction and destruction are protected: the type is only usable through a subclass.
+protected:
+    explicit BufferBlock(CacheAddr cache_addr, const std::size_t size) : size{size} {
+        SetCacheAddr(cache_addr); // member `size` is already set, so the end address is valid
+    }
+    ~BufferBlock() = default;
+
+private:
+    CacheAddr cache_addr{};
+    CacheAddr cache_addr_end{};
+    std::size_t size{};
+    u64 epoch{};
+};
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,447 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/buffer_cache/buffer_block.h"
+#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+using MapInterval = std::shared_ptr<MapIntervalBase>;
+
+template <typename TBuffer, typename TBufferType, typename StreamBuffer>
+class BufferCache {
+public:
+    using BufferInfo = std::pair<const TBufferType*, u64>;
+    /// Returns the {handle, offset} pair under which the data at `gpu_addr` can be used.
+    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+                            bool is_written = false) {
+        std::lock_guard lock{mutex};
+        // Without a host pointer for `gpu_addr`, fall back to the backend's empty buffer.
+        auto& memory_manager = system.GPU().MemoryManager();
+        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
+        if (!host_ptr) {
+            return {GetEmptyBuffer(size), 0};
+        }
+        const auto cache_addr = ToCacheAddr(host_ptr);
+        // Small read-only requests on pages not marked as written take the stream buffer path.
+        // Cache management is a big overhead, so only cache entries with a given size.
+        // TODO: Figure out which size is the best for given games.
+        constexpr std::size_t max_stream_size = 0x800;
+        if (size < max_stream_size) {
+            if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
+                return StreamBufferUpload(host_ptr, size, alignment);
+            }
+        }
+        // Everything else is served from a cached block tracked by a map interval.
+        auto block = GetBlock(cache_addr, size);
+        auto map = MapAddress(block, gpu_addr, cache_addr, size);
+        if (is_written) {
+            map->MarkAsModified(true, GetModifiedTicks());
+            if (!map->IsWritten()) {
+                map->MarkAsWritten(true);
+                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+            }
+        } else {
+            if (map->IsWritten()) {
+                WriteBarrier();
+            }
+        }
+        // The offset is measured from the start of the block that contains `cache_addr`.
+        const u64 offset = static_cast<u64>(block->GetOffset(cache_addr));
+
+        return {ToHandle(block), offset};
+    }
+
+    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
+    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                std::size_t alignment = 4) {
+        std::lock_guard lock{mutex};
+        return StreamBufferUpload(raw_pointer, size, alignment);
+    }
+    /// Maps the stream buffer (requesting `max_size`) and resets the running upload offset.
+    void Map(std::size_t max_size) {
+        std::lock_guard lock{mutex};
+        // The third value from Map() is kept in `invalidated` and handed out later by Unmap().
+        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
+        buffer_offset = buffer_offset_base;
+    }
+
+    /// Finishes the upload stream, returns true on bindings invalidation.
+    bool Unmap() {
+        std::lock_guard lock{mutex};
+        // Only the amount written since Map() is passed to the stream buffer's Unmap().
+        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
+        return std::exchange(invalidated, false);
+    }
+    /// Advances the epoch and drops queued blocks whose recorded epoch is older than the new one.
+    void TickFrame() {
+        ++epoch;
+        while (!pending_destruction.empty()) {
+            if (pending_destruction.front()->GetEpoch() + 1 > epoch) {
+                break;
+            }
+            pending_destruction.pop_front();
+        }
+    }
+
+    /// Write any cached resources overlapping the specified region back to memory
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+        // Flush in ascending modification-tick order, so the latest modification is written last.
+        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
+        std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
+            return a->GetModificationTick() < b->GetModificationTick();
+        });
+        for (auto& object : objects) {
+            if (object->IsModified() && object->IsRegistered()) {
+                FlushMap(object);
+            }
+        }
+    }
+
+    /// Mark the specified region as being invalidated
+    void InvalidateRegion(CacheAddr addr, u64 size) {
+        std::lock_guard lock{mutex};
+        // Every still-registered map found for the range is unregistered from the cache.
+        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
+        for (auto& object : objects) {
+            if (object->IsRegistered()) {
+                Unregister(object);
+            }
+        }
+    }
+    /// Backend hook: buffer returned by UploadMemory() when `gpu_addr` has no host pointer.
+    virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0;
+
+protected:
+    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                         std::unique_ptr<StreamBuffer> stream_buffer)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
+          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+    // Protected, non-virtual destructor: outside code cannot destroy a cache via this type.
+    ~BufferCache() = default;
+    /// Backend hook: converts a block into the handle type returned inside BufferInfo.
+    virtual const TBufferType* ToHandle(const TBuffer& storage) = 0;
+    /// Backend hook: called by UploadMemory() when a map marked as written is about to be read.
+    virtual void WriteBarrier() = 0;
+    /// Backend hook: creates a block of `size` whose range starts at `cache_addr`.
+    virtual TBuffer CreateBlock(CacheAddr cache_addr, std::size_t size) = 0;
+    /// Backend hook: uploads `size` bytes from `data` into `buffer` at `offset`.
+    virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
+                                 const u8* data) = 0;
+    /// Backend hook: downloads `size` bytes of `buffer` at `offset` into `data`.
+    virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
+                                   u8* data) = 0;
+    /// Backend hook: copies `size` bytes from `src` at `src_offset` to `dst` at `dst_offset`.
+    virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
+                           std::size_t dst_offset, std::size_t size) = 0;
+
+    /// Register an object into the cache
+    void Register(const MapInterval& new_map, bool inherit_written = false) {
+        const CacheAddr cache_ptr = new_map->GetStart();
+        const std::optional<VAddr> cpu_addr =
+            system.GPU().MemoryManager().GpuToCpuAddress(new_map->GetGpuAddress());
+        if (!cache_ptr || !cpu_addr) {
+            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
+                         new_map->GetGpuAddress());
+            return;
+        }
+        const std::size_t size = new_map->GetEnd() - new_map->GetStart();
+        new_map->SetCpuAddress(*cpu_addr);
+        new_map->MarkAsRegistered(true);
+        const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
+        mapped_addresses.insert({interval, new_map});
+        rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1);
+        if (inherit_written) {
+            MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
+            new_map->MarkAsWritten(true);
+        }
+    }
+
+    /// Unregisters an object from the cache
+    void Unregister(MapInterval& map) {
+        const std::size_t size = map->GetEnd() - map->GetStart();
+        rasterizer.UpdatePagesCachedCount(map->GetCpuAddress(), size, -1);
+        map->MarkAsRegistered(false);
+        if (map->IsWritten()) {
+            UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+        }
+        const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
+        mapped_addresses.erase(delete_interval);
+    }
+
+private:
+    MapInterval CreateMap(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) {
+        return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
+    }
+    /// Returns the map for the request: a containing map is reused, otherwise one is created.
+    MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr,
+                           const CacheAddr cache_addr, const std::size_t size) {
+        // No overlap at all: upload the whole range and register a fresh map for it.
+        std::vector<MapInterval> overlaps = GetMapsInRange(cache_addr, size);
+        if (overlaps.empty()) {
+            const CacheAddr cache_addr_end = cache_addr + size;
+            MapInterval new_map = CreateMap(cache_addr, cache_addr_end, gpu_addr);
+            u8* host_ptr = FromCacheAddr(cache_addr);
+            UploadBlockData(block, block->GetOffset(cache_addr), size, host_ptr);
+            Register(new_map);
+            return new_map;
+        }
+        // A single overlap that already contains the requested range is returned unchanged.
+        const CacheAddr cache_addr_end = cache_addr + size;
+        if (overlaps.size() == 1) {
+            MapInterval& current_map = overlaps[0];
+            if (current_map->IsInside(cache_addr, cache_addr_end)) {
+                return current_map;
+            }
+        }
+        CacheAddr new_start = cache_addr;
+        CacheAddr new_end = cache_addr_end;
+        bool write_inheritance = false;
+        bool modified_inheritance = false;
+        // Calculate new buffer parameters
+        for (auto& overlap : overlaps) {
+            new_start = std::min(overlap->GetStart(), new_start);
+            new_end = std::max(overlap->GetEnd(), new_end);
+            write_inheritance |= overlap->IsWritten();
+            modified_inheritance |= overlap->IsModified();
+        }
+        GPUVAddr new_gpu_addr = gpu_addr + new_start - cache_addr;
+        for (auto& overlap : overlaps) {
+            Unregister(overlap);
+        }
+        UpdateBlock(block, new_start, new_end, overlaps);
+        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
+        if (modified_inheritance) {
+            new_map->MarkAsModified(true, GetModifiedTicks());
+        }
+        Register(new_map, write_inheritance);
+        return new_map;
+    }
+    /// Uploads the parts of the start..end range that none of `overlaps` covers.
+    void UpdateBlock(const TBuffer& block, CacheAddr start, CacheAddr end,
+                     std::vector<MapInterval>& overlaps) {
+        const IntervalType base_interval{start, end};
+        IntervalSet interval_set{};
+        interval_set.add(base_interval);
+        for (auto& overlap : overlaps) {
+            const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
+            interval_set.subtract(subtract);
+        }
+        for (auto& interval : interval_set) {
+            std::size_t size = interval.upper() - interval.lower();
+            if (size > 0) {
+                u8* host_ptr = FromCacheAddr(interval.lower());
+                UploadBlockData(block, block->GetOffset(interval.lower()), size, host_ptr);
+            }
+        }
+    }
+    /// Collects the maps that `mapped_addresses` reports for the range addr..addr + size.
+    std::vector<MapInterval> GetMapsInRange(CacheAddr addr, std::size_t size) {
+        if (size == 0) {
+            return {};
+        }
+
+        std::vector<MapInterval> objects{};
+        const IntervalType interval{addr, addr + size};
+        for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
+            objects.push_back(pair.second);
+        }
+
+        return objects;
+    }
+
+    /// Returns a ticks counter used for tracking when cached objects were last modified
+    u64 GetModifiedTicks() {
+        return ++modified_ticks;
+    }
+    /// Downloads the map's range from its block to host memory and clears its modified state.
+    void FlushMap(MapInterval map) {
+        std::size_t size = map->GetEnd() - map->GetStart();
+        TBuffer block = blocks[map->GetStart() >> block_page_bits];
+        u8* host_ptr = FromCacheAddr(map->GetStart());
+        DownloadBlockData(block, block->GetOffset(map->GetStart()), size, host_ptr);
+        map->MarkAsModified(false, 0);
+    }
+    /// Copies `size` bytes to the mapped stream pointer; returns the stream handle and offset.
+    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
+                                  std::size_t alignment) {
+        AlignBuffer(alignment);
+        const std::size_t uploaded_offset = buffer_offset;
+        std::memcpy(buffer_ptr, raw_pointer, size);
+        // Advance the write pointer and offset past the bytes just copied.
+        buffer_ptr += size;
+        buffer_offset += size;
+        return {&stream_buffer_handle, uploaded_offset};
+    }
+    /// Rounds the upload offset up with Common::AlignUp and moves the pointer by the same amount.
+    void AlignBuffer(std::size_t alignment) {
+        // Align the offset, not the mapped pointer
+        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
+        buffer_ptr += offset_aligned - buffer_offset;
+        buffer_offset = offset_aligned;
+    }
+    /// Replaces `buffer` with a copy one block page larger and queues the old one for destruction.
+    TBuffer EnlargeBlock(TBuffer buffer) {
+        const std::size_t old_size = buffer->GetSize();
+        const std::size_t new_size = old_size + block_page_size;
+        const CacheAddr cache_addr = buffer->GetCacheAddr();
+        TBuffer new_buffer = CreateBlock(cache_addr, new_size);
+        CopyBlock(buffer, new_buffer, 0, 0, old_size);
+        buffer->SetEpoch(epoch);
+        pending_destruction.push_back(buffer);
+        const CacheAddr cache_addr_end = cache_addr + new_size - 1;
+        u64 page_start = cache_addr >> block_page_bits;
+        const u64 page_end = cache_addr_end >> block_page_bits;
+        while (page_start <= page_end) {
+            blocks[page_start] = new_buffer;
+            ++page_start;
+        }
+        return new_buffer;
+    }
+    /// Combines two blocks into one (sized as the sum of both) and queues the originals.
+    TBuffer MergeBlocks(TBuffer first, TBuffer second) {
+        const std::size_t size_1 = first->GetSize();
+        const std::size_t size_2 = second->GetSize();
+        const CacheAddr first_addr = first->GetCacheAddr();
+        const CacheAddr second_addr = second->GetCacheAddr();
+        const CacheAddr new_addr = std::min(first_addr, second_addr);
+        const std::size_t new_size = size_1 + size_2;
+        TBuffer new_buffer = CreateBlock(new_addr, new_size);
+        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
+        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
+        first->SetEpoch(epoch);
+        second->SetEpoch(epoch);
+        pending_destruction.push_back(first);
+        pending_destruction.push_back(second);
+        const CacheAddr cache_addr_end = new_addr + new_size - 1;
+        u64 page_start = new_addr >> block_page_bits;
+        const u64 page_end = cache_addr_end >> block_page_bits;
+        while (page_start <= page_end) {
+            blocks[page_start] = new_buffer;
+            ++page_start;
+        }
+        return new_buffer;
+    }
+    /// Returns one block spanning every page of the range, creating/enlarging/merging as needed.
+    TBuffer GetBlock(const CacheAddr cache_addr, const std::size_t size) {
+        TBuffer found{};
+        const CacheAddr cache_addr_end = cache_addr + size - 1;
+        u64 page_start = cache_addr >> block_page_bits;
+        const u64 page_end = cache_addr_end >> block_page_bits;
+        while (page_start <= page_end) {
+            auto it = blocks.find(page_start);
+            if (it == blocks.end()) {
+                if (found) {
+                    found = EnlargeBlock(found);
+                } else {
+                    const CacheAddr start_addr = (page_start << block_page_bits);
+                    found = CreateBlock(start_addr, block_page_size);
+                    blocks[page_start] = found;
+                }
+            } else {
+                if (found) {
+                    if (found == it->second) {
+                        ++page_start;
+                        continue;
+                    }
+                    found = MergeBlocks(found, it->second);
+                } else {
+                    found = it->second;
+                }
+            }
+            ++page_start;
+        }
+        return found;
+    }
+    /// Increments the counter of every write page touched by start..end (both inclusive).
+    void MarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
+        u64 page_start = start >> write_page_bit;
+        const u64 page_end = end >> write_page_bit;
+        while (page_start <= page_end) {
+            auto it = written_pages.find(page_start);
+            if (it != written_pages.end()) {
+                it->second = it->second + 1;
+            } else {
+                written_pages[page_start] = 1;
+            }
+            page_start++;
+        }
+    }
+    /// Decrements the per-page counters, erasing entries instead of letting them reach zero.
+    void UnmarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
+        u64 page_start = start >> write_page_bit;
+        const u64 page_end = end >> write_page_bit;
+        while (page_start <= page_end) {
+            auto it = written_pages.find(page_start);
+            if (it != written_pages.end()) {
+                if (it->second > 1) {
+                    it->second = it->second - 1;
+                } else {
+                    written_pages.erase(it);
+                }
+            }
+            page_start++;
+        }
+    }
+    /// Returns true if any write page touched by start..end (both inclusive) has an entry.
+    bool IsRegionWritten(const CacheAddr start, const CacheAddr end) const {
+        u64 page_start = start >> write_page_bit;
+        const u64 page_end = end >> write_page_bit;
+        while (page_start <= page_end) {
+            if (written_pages.count(page_start) > 0) {
+                return true;
+            }
+            page_start++;
+        }
+        return false;
+    }
+
+    VideoCore::RasterizerInterface& rasterizer;
+    Core::System& system;
+    std::unique_ptr<StreamBuffer> stream_buffer;
+
+    TBufferType stream_buffer_handle{};
+
+    bool invalidated = false;
+
+    u8* buffer_ptr = nullptr;
+    u64 buffer_offset = 0;
+    u64 buffer_offset_base = 0;
+    // NOTE(review): boost::icl, std::list, std::optional have no direct #include here - confirm.
+    using IntervalSet = boost::icl::interval_set<CacheAddr>;
+    using IntervalCache = boost::icl::interval_map<CacheAddr, MapInterval>;
+    using IntervalType = typename IntervalCache::interval_type;
+    IntervalCache mapped_addresses{};
+    // Written state is counted per page index (address >> write_page_bit).
+    static constexpr u64 write_page_bit{11};
+    std::unordered_map<u64, u32> written_pages{};
+    // Blocks are looked up by page index (address >> block_page_bits).
+    static constexpr u64 block_page_bits{21};
+    static constexpr u64 block_page_size{1 << block_page_bits};
+    std::unordered_map<u64, TBuffer> blocks{};
+    // Replaced blocks are kept here until TickFrame() drops them.
+    std::list<TBuffer> pending_destruction{};
+    u64 epoch{};
+    u64 modified_ticks{};
+    // NOTE(review): locked by every public entry point above except TickFrame() - confirm intent.
+    std::recursive_mutex mutex;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -0,0 +1,89 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+/// A [start, end] cache-address range with its GPU/CPU addresses and bookkeeping flags.
+class MapIntervalBase {
+public:
+    MapIntervalBase(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr)
+        : start{start}, end{end}, gpu_addr{gpu_addr} {}
+
+    void SetCpuAddress(VAddr new_cpu_addr) {
+        cpu_addr = new_cpu_addr;
+    }
+
+    VAddr GetCpuAddress() const {
+        return cpu_addr;
+    }
+
+    GPUVAddr GetGpuAddress() const {
+        return gpu_addr;
+    }
+    /// Returns true when [other_start, other_end] lies entirely within this interval.
+    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
+        return (start <= other_start && other_end <= end);
+    }
+    /// Two intervals are equal when their bounds match; addresses and flags are not compared.
+    bool operator==(const MapIntervalBase& rhs) const {
+        return start == rhs.start && end == rhs.end;
+    }
+    bool operator!=(const MapIntervalBase& rhs) const {
+        return !operator==(rhs);
+    }
+
+    void MarkAsRegistered(const bool registered) {
+        is_registered = registered;
+    }
+
+    bool IsRegistered() const {
+        return is_registered;
+    }
+
+    CacheAddr GetStart() const {
+        return start;
+    }
+
+    CacheAddr GetEnd() const {
+        return end;
+    }
+    /// Sets the modified flag and records the tick supplied with the change.
+    void MarkAsModified(const bool is_modified_, const u64 tick) {
+        is_modified = is_modified_;
+        ticks = tick;
+    }
+
+    bool IsModified() const {
+        return is_modified;
+    }
+
+    u64 GetModificationTick() const {
+        return ticks;
+    }
+
+    void MarkAsWritten(const bool is_written_) {
+        is_written = is_written_;
+    }
+
+    bool IsWritten() const {
+        return is_written;
+    }
+
+private:
+    CacheAddr start;
+    CacheAddr end;
+    GPUVAddr gpu_addr;
+    VAddr cpu_addr{};
+    bool is_written{};
+    bool is_modified{};
+    bool is_registered{};
+    u64 ticks{};
+};
+
+} // namespace VideoCommon
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -10,8 +10,7 @@

 namespace Tegra::Engines {

-Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
-    : rasterizer{rasterizer}, memory_manager{memory_manager} {}
+Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}

 void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
    ASSERT_MSG(method_call.method < Regs::NUM_REGS,
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -33,7 +33,7 @@ namespace Tegra::Engines {

 class Fermi2D final {
 public:
-    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);
    ~Fermi2D() = default;

    /// Write the value to the register identified by method.
@@ -145,7 +145,6 @@ public:

 private:
    VideoCore::RasterizerInterface& rasterizer;
-    MemoryManager& memory_manager;

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -15,7 +15,7 @@
 namespace Tegra::Engines {

 KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
-    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
+    : system{system}, upload_state{memory_manager, regs.upload} {}

 KeplerMemory::~KeplerMemory() = default;

--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -65,7 +65,6 @@ public:

 private:
    Core::System& system;
-    MemoryManager& memory_manager;
    Upload::State upload_state;
 };

--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -524,7 +524,7 @@ void Maxwell3D::ProcessQueryCondition() {
 void Maxwell3D::ProcessSyncPoint() {
    const u32 sync_point = regs.sync_info.sync_point.Value();
    const u32 increment = regs.sync_info.increment.Value();
-    const u32 cache_flush = regs.sync_info.unknown.Value();
+    [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
    if (increment) {
        system.GPU().IncrementSyncPoint(sync_point);
    }
@@ -626,10 +626,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
    Texture::TICEntry tic_entry;
    memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));

-    const auto r_type{tic_entry.r_type.Value()};
-    const auto g_type{tic_entry.g_type.Value()};
-    const auto b_type{tic_entry.b_type.Value()};
-    const auto a_type{tic_entry.a_type.Value()};
+    [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()};
+    [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()};
+    [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()};
+    [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};

    // TODO(Subv): Different data types for separate components are not supported
    DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,18 +5,17 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
+#include "core/settings.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
 #include "video_core/textures/decoders.h"

 namespace Tegra::Engines {

-MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                       MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
+MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
+    : system{system}, memory_manager{memory_manager} {}

 void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
    ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {
    ASSERT(regs.exec.enable_2d == 1);

    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.size_z == 1);
+        ASSERT(regs.src_params.BlockDepth() == 0);
        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;
        const std::size_t src_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
+            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
            regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());

+        const std::size_t src_layer_size = Texture::CalculateSize(
+            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1,
+            regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
+
        const std::size_t dst_size = regs.dst_pitch * regs.y_count;

        if (read_buffer.size() < src_size) {
@@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {
        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);

-        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
-                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
-                                  write_buffer.data(), regs.src_params.BlockHeight(),
-                                  regs.src_params.pos_x, regs.src_params.pos_y);
+        Texture::UnswizzleSubrect(
+            regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel,
+            read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(),
+            regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);

        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
    } else {
        ASSERT(regs.dst_params.BlockDepth() == 0);

-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
+        const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;

        const std::size_t dst_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
+            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
            regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());

        const std::size_t dst_layer_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
+            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
            regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());

        const std::size_t src_size = regs.src_pitch * regs.y_count;
@@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {
            write_buffer.resize(dst_size);
        }

-        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
-        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+        if (Settings::values.use_accurate_gpu_emulation) {
+            memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+            memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+        } else {
+            memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size);
+            memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size);
+        }

        // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
-                                src_bytes_per_pixel,
-                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
-                                read_buffer.data(), regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(
+            regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel,
+            write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(),
+            regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);

        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
    }
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -20,10 +20,6 @@ namespace Tegra {
 class MemoryManager;
 }

-namespace VideoCore {
-class RasterizerInterface;
-}
-
 namespace Tegra::Engines {

 /**
@@ -33,8 +29,7 @@ namespace Tegra::Engines {

 class MaxwellDMA final {
 public:
-    explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                        MemoryManager& memory_manager);
+    explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
    ~MaxwellDMA() = default;

    /// Write the value to the register identified by method.
@@ -180,8 +175,6 @@ public:
 private:
    Core::System& system;

-    VideoCore::RasterizerInterface& rasterizer;
-
    MemoryManager& memory_manager;

    std::vector<u8> read_buffer;
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {
    Output = 1,
 };

+enum class VoteOperation : u64 { // Value type of the 2-bit vote.operation instruction field.
+    All = 0, // GLSL counterpart: allThreadsNV
+    Any = 1, // GLSL counterpart: anyThreadNV
+    Eq = 2,  // GLSL counterpart: allThreadsEqualNV
+};
+
 union Instruction {
    Instruction& operator=(const Instruction& instr) {
        value = instr.value;
@@ -564,6 +570,13 @@ union Instruction {
        BitField<13, 1, u64> trigger;
    } nop;

+    union { // NOTE(review): presumably the operand fields of the VOTE opcode — confirm.
+        BitField<48, 2, VoteOperation> operation;
+        BitField<45, 3, u64> dest_pred;
+        BitField<39, 3, u64> value;
+        BitField<42, 1, u64> negate_value;
+    } vote;
+
    union {
        BitField<8, 8, Register> gpr;
        BitField<20, 24, s64> offset;
@@ -1487,6 +1500,7 @@ public:
        SYNC,
        BRK,
        DEPBAR,
+        VOTE,
        BFE_C,
        BFE_R,
        BFE_IMM,
@@ -1649,6 +1663,7 @@ public:
        Hfma2,
        Flow,
        Synch,
+        Warp,
        Memory,
        Texture,
        Image,
@@ -1775,6 +1790,7 @@ private:
            INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
            INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
            INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
+            INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
            INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
            INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
            INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
    memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
+    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
    kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
-    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
 }

--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -19,6 +19,10 @@ inline CacheAddr ToCacheAddr(const void* host_ptr) {
    return reinterpret_cast<CacheAddr>(host_ptr);
 }

+inline u8* FromCacheAddr(CacheAddr cache_addr) { // Inverse of ToCacheAddr.
+    return reinterpret_cast<u8*>(cache_addr);
+}
+
 namespace Core {
 class System;
 }
@@ -281,8 +285,8 @@ private:

 protected:
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
-    VideoCore::RendererBase& renderer;
    Core::System& system;
+    VideoCore::RendererBase& renderer;

 private:
    std::unique_ptr<Tegra::MemoryManager> memory_manager;
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -50,7 +50,7 @@ public:
    /// and invalidated
    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;

-    // Notify the rasterizer to send all written commands to the host GPU.
+    /// Notify the rasterizer to send all written commands to the host GPU.
    virtual void FlushCommands() = 0;

    /// Notify rasterizer that a frame is about to finish
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,28 +7,41 @@
 #include <glad/glad.h>

 #include "common/assert.h"
+#include "common/microprofile.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"

 namespace OpenGL {

+MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
+
+CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
+    : VideoCommon::BufferBlock{cache_addr, size} {
+    gl_buffer.Create(); // Storage is allocated below with no initial data (nullptr).
+    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+}
+
+CachedBufferBlock::~CachedBufferBlock() = default;
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                               std::size_t stream_size)
-    : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{
+    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
          rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}

 OGLBufferCache::~OGLBufferCache() = default;

-OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) {
-    OGLBuffer buffer;
-    buffer.Create();
-    glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
-    return buffer;
+Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
+    return std::make_shared<CachedBufferBlock>(cache_addr, size);
 }

-const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) {
-    return &buffer.handle;
+void OGLBufferCache::WriteBarrier() {
+    glMemoryBarrier(GL_ALL_BARRIER_BITS); // Requests every barrier bit at once.
+}
+
+const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) {
+    return buffer->GetHandle();
 }

 const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -36,23 +49,24 @@ const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
    return &null_buffer;
 }

-void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                                      const u8* data) {
-    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                                     const u8* data) {
+    glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
                         static_cast<GLsizeiptr>(size), data);
 }

-void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset,
-                                        std::size_t size, u8* data) {
-    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                                       u8* data) {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
                            static_cast<GLsizeiptr>(size), data);
 }

-void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst,
-                                    std::size_t src_offset, std::size_t dst_offset,
-                                    std::size_t size) {
-    glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
+                               std::size_t dst_offset, std::size_t size) {
+    glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(),
+                             static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset),
+                             static_cast<GLsizeiptr>(size));
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -7,7 +7,7 @@
 #include <memory>

 #include "common/common_types.h"
-#include "video_core/buffer_cache.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -21,7 +21,24 @@ namespace OpenGL {
 class OGLStreamBuffer;
 class RasterizerOpenGL;

-class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> {
+class CachedBufferBlock;
+
+using Buffer = std::shared_ptr<CachedBufferBlock>;
+
+class CachedBufferBlock : public VideoCommon::BufferBlock { // Block owning an OGLBuffer.
+public:
+    explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size);
+    ~CachedBufferBlock();
+    /// Returns the address of the owned buffer's GL handle.
+    const GLuint* GetHandle() const {
+        return &gl_buffer.handle;
+    }
+
+private:
+    OGLBuffer gl_buffer{}; // Created and given `size` bytes of storage by the constructor.
+};
+
+class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
 public:
    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                            std::size_t stream_size);
@@ -30,18 +47,20 @@ public:
    const GLuint* GetEmptyBuffer(std::size_t) override;

 protected:
-    OGLBuffer CreateBuffer(std::size_t size) override;
+    Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;

-    const GLuint* ToHandle(const OGLBuffer& buffer) override;
+    void WriteBarrier() override;

-    void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                          const u8* data) override;
+    const GLuint* ToHandle(const Buffer& buffer) override;

-    void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                            u8* data) override;
+    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                         const u8* data) override;

-    void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset,
-                        std::size_t dst_offset, std::size_t size) override;
+    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                           u8* data) override;
+
+    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
+                   std::size_t dst_offset, std::size_t size) override;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -27,6 +27,8 @@ Device::Device() {
    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
+                          GLAD_GL_NV_shader_thread_shuffle;
    has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
    has_variable_aoffi = TestVariableAoffi();
    has_component_indexing_bug = TestComponentIndexingBug();
@@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) {
    uniform_buffer_alignment = 0;
    max_vertex_attributes = 16;
    max_varyings = 15;
+    has_warp_intrinsics = true;
    has_vertex_viewport_layer = true;
    has_variable_aoffi = true;
    has_component_indexing_bug = false;
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,6 +30,10 @@ public:
        return max_varyings;
    }

+    bool HasWarpIntrinsics() const { // Flag is assigned in Device's constructors.
+        return has_warp_intrinsics;
+    }
+
    bool HasVertexViewportLayer() const {
        return has_vertex_viewport_layer;
    }
@@ -50,6 +54,7 @@ private:
    std::size_t shader_storage_alignment{};
    u32 max_vertex_attributes{};
    u32 max_varyings{};
+    bool has_warp_intrinsics{};
    bool has_vertex_viewport_layer{};
    bool has_variable_aoffi{};
    bool has_component_indexing_bug{};
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -708,8 +708,6 @@ void RasterizerOpenGL::DrawArrays() {
        return;
    }

-    const auto& regs = gpu.regs;
-
    SyncColorMask();
    SyncFragmentColorClampState();
    SyncMultiSampleState();
@@ -980,7 +978,7 @@ void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entr
                                         GPUVAddr gpu_addr, std::size_t size) {
    const auto alignment{device.GetShaderStorageBufferAlignment()};
    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten());
+        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
    bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
 }

--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
    const auto texture_buffer_usage{variant.texture_buffer_usage};

    std::string source = "#version 430 core\n"
-                         "#extension GL_ARB_separate_shader_objects : enable\n";
+                         "#extension GL_ARB_separate_shader_objects : enable\n"
+                         "#extension GL_NV_gpu_shader5 : enable\n"
+                         "#extension GL_NV_shader_thread_group : enable\n";
    if (entries.shader_viewport_layer_array) {
        source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
    }
@@ -247,20 +249,24 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
        if (!texture_buffer_usage.test(i)) {
            continue;
        }
-        source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
+        source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i);
+    }
+    if (texture_buffer_usage.any()) {
+        source += '\n';
    }

    if (program_type == ProgramType::Geometry) {
        const auto [glsl_topology, debug_name, max_vertices] =
            GetPrimitiveDescription(primitive_mode);

-        source += "layout (" + std::string(glsl_topology) + ") in;\n";
+        source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
        source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
    }
    if (program_type == ProgramType::Compute) {
        source += "layout (local_size_variable) in;\n";
    }

+    source += '\n';
    source += code;

    OGLShader shader;
@@ -289,7 +295,7 @@ std::set<GLenum> GetSupportedFormats() {

 CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
                           GLShader::ProgramResult result)
-    : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
+    : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},
      unique_identifier{params.unique_identifier}, program_type{program_type},
      disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},
      entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {}
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -106,7 +106,6 @@ private:

    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;

-    u8* host_ptr{};
    VAddr cpu_addr{};
    u64 unique_identifier{};
    ProgramType program_type{};
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -565,7 +565,7 @@ private:
                case Tegra::Shader::ImageType::Texture1D:
                    return "image1D";
                case Tegra::Shader::ImageType::TextureBuffer:
-                    return "bufferImage";
+                    return "imageBuffer";
                case Tegra::Shader::ImageType::Texture1DArray:
                    return "image1DArray";
                case Tegra::Shader::ImageType::Texture2D:
@@ -1735,6 +1735,48 @@ private:
        return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
    }

+    /// Emits a GLSL warp ballot of operand 0, with a stub when NV intrinsics are missing.
+    std::string BallotThread(Operation operation) {
+        const std::string predicate = VisitOperand(operation, 0, Type::Bool);
+        if (device.HasWarpIntrinsics()) {
+            return fmt::format("utof(ballotThreadNV({}))", predicate);
+        }
+        LOG_ERROR(Render_OpenGL,
+                  "Nvidia warp intrinsics are not available and its required by a shader");
+        // Fallback: pretend every thread in the warp voted like the current one.
+        return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", predicate);
+    }
+
+    /// Wraps operand 0 in the GLSL vote intrinsic `func`; without NV intrinsics emits it as-is.
+    std::string Vote(Operation operation, const char* func) {
+        const std::string predicate = VisitOperand(operation, 0, Type::Bool);
+        if (device.HasWarpIntrinsics()) {
+            return fmt::format("{}({})", func, predicate);
+        }
+        LOG_ERROR(Render_OpenGL,
+                  "Nvidia vote intrinsics are not available and its required by a shader");
+        return predicate; // A warp of one thread votes exactly what that thread voted.
+    }
+
+    std::string VoteAll(Operation operation) { // Forwards to Vote with allThreadsNV.
+        return Vote(operation, "allThreadsNV");
+    }
+
+    std::string VoteAny(Operation operation) { // Forwards to Vote with anyThreadNV.
+        return Vote(operation, "anyThreadNV");
+    }
+
+    /// Emits allThreadsEqualNV over operand 0, or the constant "true" without NV intrinsics.
+    std::string VoteEqual(Operation operation) {
+        if (device.HasWarpIntrinsics()) {
+            return Vote(operation, "allThreadsEqualNV");
+        }
+        LOG_ERROR(Render_OpenGL,
+                  "Nvidia vote intrinsics are not available and its required by a shader");
+        // A one-thread warp always agrees with itself, so the stub is constant true.
+        return "true";
+    }
+
    static constexpr std::array operation_decompilers = {
        &GLSLDecompiler::Assign,

@@ -1885,6 +1927,11 @@ private:
        &GLSLDecompiler::WorkGroupId<0>,
        &GLSLDecompiler::WorkGroupId<1>,
        &GLSLDecompiler::WorkGroupId<2>,
+
+        &GLSLDecompiler::BallotThread,
+        &GLSLDecompiler::VoteAll,
+        &GLSLDecompiler::VoteAny,
+        &GLSLDecompiler::VoteEqual,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -184,6 +184,9 @@ GLint GetSwizzleSource(SwizzleSource source) {
 }

 void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) {
+    if (params.IsBuffer()) {
+        return;
+    }
    glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
@@ -208,6 +211,7 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
        glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(),
                             nullptr, GL_DYNAMIC_STORAGE_BIT);
        glTextureBuffer(texture.handle, internal_format, texture_buffer.handle);
+        break;
    case SurfaceTarget::Texture2D:
    case SurfaceTarget::TextureCubemap:
        glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width,
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -51,7 +51,7 @@ public:
    }

 protected:
-    void DecorateSurfaceName();
+    void DecorateSurfaceName() override;

    View CreateView(const ViewParams& view_key) override;
    View CreateViewInner(const ViewParams& view_key, bool is_proxy);
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1072,6 +1072,26 @@ private:
        return {};
    }

+    Id BallotThread(Operation) { // Stub: hits UNIMPLEMENTED() and returns a default Id.
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAll(Operation) { // Stub: hits UNIMPLEMENTED() and returns a default Id.
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAny(Operation) { // Stub: hits UNIMPLEMENTED() and returns a default Id.
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteEqual(Operation) { // Stub: hits UNIMPLEMENTED() and returns a default Id.
+        UNIMPLEMENTED();
+        return {};
+    }
+
    Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                      const std::string& name) {
        const Id id = OpVariable(type, storage);
@@ -1364,6 +1384,11 @@ private:
        &SPIRVDecompiler::WorkGroupId<0>,
        &SPIRVDecompiler::WorkGroupId<1>,
        &SPIRVDecompiler::WorkGroupId<2>,
+
+        &SPIRVDecompiler::BallotThread,
+        &SPIRVDecompiler::VoteAll,
+        &SPIRVDecompiler::VoteAny,
+        &SPIRVDecompiler::VoteEqual,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
        {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
        {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
        {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
+        {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
        {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
        {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
        {OpCode::Type::Image, &ShaderIR::DecodeImage},
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;

 u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);

    const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,
                                            instr.fset.neg_a != 0);
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;

 u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);

    const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
                                            instr.fsetp.neg_a != 0);
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;

 u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);

    const Node op_a = GetRegister(instr.gpr8);
    const Node op_b = [&]() {
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;

 u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);

    const Node op_a = GetRegister(instr.gpr8);

--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -74,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
            case SystemVariable::InvocationInfo:
                LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
                return Immediate(0u);
+            case SystemVariable::Tid: {
+                Node value = Immediate(0);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5);
+                return value;
+            }
            case SystemVariable::TidX:
                return Operation(OperationCode::LocalInvocationIdX);
            case SystemVariable::TidY:
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;

 u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);

    UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                         "Condition codes generation in PSET is not implemented");
--- a/src/video_core/shader/decode/warp.cpp
+++ b/src/video_core/shader/decode/warp.cpp
@@ -0,0 +1,55 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+using Tegra::Shader::Pred;
+using Tegra::Shader::VoteOperation;
+
+namespace {
+OperationCode GetOperationCode(VoteOperation vote_op) {
+    switch (vote_op) {
+    case VoteOperation::All:
+        return OperationCode::VoteAll;
+    case VoteOperation::Any:
+        return OperationCode::VoteAny;
+    case VoteOperation::Eq:
+        return OperationCode::VoteEqual;
+    default:
+        UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
+        return OperationCode::VoteAll;
+    }
+}
+} // Anonymous namespace
+
+u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::VOTE: {
+        const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);
+        const Node active = Operation(OperationCode::BallotThread, value);
+        const Node vote = Operation(GetOperationCode(instr.vote.operation), value);
+        SetRegister(bb, instr.gpr0, active);
+        SetPredicate(bb, instr.vote.dest_pred, vote);
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
+        break;
+    }
+
+    return pc;
+}
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -168,6 +168,11 @@ enum class OperationCode {
    WorkGroupIdY,       /// () -> uint
    WorkGroupIdZ,       /// () -> uint

+    BallotThread, /// (bool) -> uint
+    VoteAll,      /// (bool) -> bool
+    VoteAny,      /// (bool) -> bool
+    VoteEqual,    /// (bool) -> bool
+
    Amount,
 };

--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -405,4 +405,9 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
                     Immediate(offset), Immediate(bits));
 }

+Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
+    return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, base, insert, Immediate(offset),
+                     Immediate(bits));
+}
+
 } // namespace VideoCommon::Shader
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -167,6 +167,7 @@ private:
    u32 DecodeFfma(NodeBlock& bb, u32 pc);
    u32 DecodeHfma2(NodeBlock& bb, u32 pc);
    u32 DecodeConversion(NodeBlock& bb, u32 pc);
+    u32 DecodeWarp(NodeBlock& bb, u32 pc);
    u32 DecodeMemory(NodeBlock& bb, u32 pc);
    u32 DecodeTexture(NodeBlock& bb, u32 pc);
    u32 DecodeImage(NodeBlock& bb, u32 pc);
@@ -279,6 +280,9 @@ private:
    /// Extracts a sequence of bits from a node
    Node BitfieldExtract(Node value, u32 offset, u32 bits);

+    /// Inserts a sequence of bits from one node into a base node
+    Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
+
    void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                  const Node4& components);

--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -58,7 +58,6 @@ public:
    std::size_t GetHostSizeInBytes() const {
        std::size_t host_size_in_bytes;
        if (GetCompressionType() == SurfaceCompression::Converted) {
-            constexpr std::size_t rgb8_bpp = 4ULL;
            // ASTC is uncompressed in software, in emulated as RGBA8
            host_size_in_bytes = 0;
            for (u32 level = 0; level < num_levels; ++level) {
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -308,8 +308,6 @@ protected:
        if (!guard_render_targets && surface->IsRenderTarget()) {
            ManageRenderTargetUnregister(surface);
        }
-        const GPUVAddr gpu_addr = surface->GetGpuAddr();
-        const CacheAddr cache_ptr = surface->GetCacheAddr();
        const std::size_t size = surface->GetSizeInBytes();
        const VAddr cpu_addr = surface->GetCpuAddr();
        rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1);
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -257,19 +257,21 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,

 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data,
-                    u32 block_height_bit) {
+                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
    const u32 block_height = 1U << block_height_bit;
    const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /
                                  gob_size_x};
    for (u32 line = 0; line < subrect_height; ++line) {
+        const u32 dst_y = line + offset_y;
        const u32 gob_address_y =
-            (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
-            ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size;
-        const auto& table = legacy_swizzle_table[line % gob_size_y];
+            (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
+            ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
+        const auto& table = legacy_swizzle_table[dst_y % gob_size_y];
        for (u32 x = 0; x < subrect_width; ++x) {
+            const u32 dst_x = x + offset_x;
            const u32 gob_address =
-                gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
-            const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x];
+                gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
+            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x];
            u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
            u8* dest_addr = swizzled_data + swizzled_offset;

--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -44,7 +44,8 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height

 /// Copies an untiled subrectangle into a tiled surface.
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
-                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height);
+                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
+                    u32 offset_x, u32 offset_y);

 /// Copies a tiled subrectangle into a linear surface.
 void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -213,7 +213,7 @@ struct TICEntry {
        if (header_version != TICHeaderVersion::OneDBuffer) {
            return width_minus_1 + 1;
        }
-        return (buffer_high_width_minus_one << 16) | buffer_low_width_minus_one;
+        return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1;
    }

    u32 Height() const {
Author	SHA1	Message	Date
Morph1984	ec95c73a12	remove <f32> We can remove this since its already a f32 value	2019-09-03 23:20:19 -04:00
Morph1984	58783b8a46	explicitly represent 1 as a float (1.0f instead of 1)	2019-09-03 23:06:32 -04:00
Morph1984	b1ca56bed2	Change u32 -> f32 Volume is a f32 value. (SwIPC describes it as a u32, but it is actually f32 as corroborated by switchbrew docs and SetAudioDeviceOutputVolume) ```cpp const f32 volume = rp.Pop<f32>(); ```	2019-09-03 22:30:20 -04:00
Morph1984	ba661c8d9a	service/audio/audren_u: Stub IAudioDevice::GetAudioDeviceOutputVolume	2019-09-03 16:05:33 -04:00
bunnei	50b5bb44a0	Merge pull request #2765 from FernandoS27/dma-fix MaxwellDMA: Fixes, corrections and relaxations.	2019-09-01 13:13:05 -04:00
Rodrigo Locatti	4d4f9cc104	video_core: Silent miscellaneous warnings (#2820) * texture_cache/surface_params: Remove unused local variable * rasterizer_interface: Add missing documentation commentary * maxwell_dma: Remove unused rasterizer reference * video_core/gpu: Sort member declaration order to silent -Wreorder warning * fermi_2d: Remove unused MemoryManager reference * video_core: Silent unused variable warnings * buffer_cache: Silent -Wreorder warnings * kepler_memory: Remove unused MemoryManager reference * gl_texture_cache: Add missing override * buffer_cache: Add missing include * shader/decode: Remove unused variables	2019-08-30 14:08:00 -04:00
Fernando Sahmkow	67cc2d5046	Merge pull request #2819 from ReinUsesLisp/fixup-clang gl_buffer_cache: Add missing include	2019-08-29 18:13:35 -04:00
ReinUsesLisp	878adee0a3	gl_buffer_cache: Add missing include RasterizerInterface was considered an incomplete object by clang.	2019-08-29 22:02:52 +00:00
bunnei	a67c4e6e02	Merge pull request #2742 from ReinUsesLisp/fix-texture-buffers gl_texture_cache: Miscellaneous texture buffer fixes	2019-08-29 15:59:17 -04:00
James Rowe	f0c75573b1	Revert "externals: Update FMT to 6.0.0" This reverts commit `ca4ca8a6dc`.	2019-08-29 12:23:34 -06:00
Ethan	ca4ca8a6dc	externals: Update FMT to 6.0.0	2019-08-29 19:37:46 +02:00
bunnei	e424615839	Merge pull request #2783 from FernandoS27/new-buffer-cache Implement a New LLE Buffer Cache	2019-08-29 13:07:01 -04:00
bunnei	f8cc5668f8	Merge pull request #2758 from ReinUsesLisp/packed-tid shader/decode: Implement S2R Tic	2019-08-29 12:58:43 -04:00
bunnei	680ab61327	Merge pull request #2786 from ReinUsesLisp/vote shader_ir: Implement VOTE on Nvidia drivers	2019-08-29 12:58:10 -04:00
ReinUsesLisp	4e35177e23	shader_ir: Implement VOTE Implement VOTE using Nvidia's intrinsics. Documentation about these can be found here https://developer.nvidia.com/reading-between-threads-shader-intrinsics Instead of using portable ARB instructions I opted to use Nvidia intrinsics because these are the closest we have to how Tegra X1 hardware renders. To stub VOTE on non-Nvidia drivers (including nouveau) this commit simulates a GPU with a warp size of one, returning what is meaningful for the instruction being emulated: * anyThreadNV(value) -> value * allThreadsNV(value) -> value * allThreadsEqualNV(value) -> true ballotARB, also known as "uint64_t(activeThreadsNV())", emits VOTE.ANY Rd, PT, PT; on nouveau's compiler. This doesn't match exactly to Nvidia's code VOTE.ALL Rd, PT, PT; Which is emulated with activeThreadsNV() by this commit. In theory this shouldn't really matter since .ANY, .ALL and .EQ affect the predicates (set to PT on those cases) and not the registers.	2019-08-21 14:50:38 -03:00
Fernando Sahmkow	83ec2091c1	Buffer Cache: Adress Feedback.	2019-08-21 12:14:27 -04:00
Fernando Sahmkow	6ce2c85047	Buffer_Cache: Implement flushing.	2019-08-21 12:14:26 -04:00
Fernando Sahmkow	de8ff8a1c6	Buffer_Cache: Implement barriers.	2019-08-21 12:14:25 -04:00
Fernando Sahmkow	286f4c446a	Buffer_Cache: Optimize and track written areas.	2019-08-21 12:14:25 -04:00
Fernando Sahmkow	5f4b746a1e	BufferCache: Rework mapping caching.	2019-08-21 12:14:24 -04:00
Fernando Sahmkow	86d8563314	Buffer_Cache: Fixes and optimizations.	2019-08-21 12:14:23 -04:00
Fernando Sahmkow	862bec001b	Video_Core: Implement a new Buffer Cache	2019-08-21 12:14:22 -04:00
Fernando Sahmkow	a452ff983d	MaxwellDMA: Fixes, corrections and relaxations. This commit fixes offsets on Linear -> Tiled copies, corrects z pos for tiled->linear copies, corrects bytes_per_pixel calculation in tiled -> linear copies and relaxes some limitations set by latest dma fixes refactors.	2019-07-25 20:41:42 -04:00
ReinUsesLisp	104641db07	shader/decode: Implement S2R Tic	2019-07-22 16:16:10 -03:00
ReinUsesLisp	74632c76ce	gl_shader_decompiler: Rename bufferImage to imageBuffer The online OpenGL documentation is wrong. The type definition is imageBuffer.	2019-07-18 01:16:44 -03:00
ReinUsesLisp	87909d327f	gl_shader_cache: Fix newline on buffer preprocessor definitions	2019-07-18 01:16:15 -03:00
ReinUsesLisp	e7bdf8b22a	textures: Fix texture buffer size calculation	2019-07-18 01:07:08 -03:00
ReinUsesLisp	84027f4808	gl_texture_cache: Do not set texture parameters to buffers	2019-07-18 01:06:26 -03:00
ReinUsesLisp	73b2dc6d4f	gl_texture_cache: Add missing break in CreateTexture	2019-07-18 01:04:18 -03:00