From 0bad8394e6900dac91fc7dc4e7aad9fa7e9db21b Mon Sep 17 00:00:00 2001 From: bunnei Date: Sat, 12 Jan 2019 01:28:16 -0500 Subject: [PATCH] gpu: Move flush and invalidate to GPU thread. --- .../service/nvdrv/devices/nvhost_as_gpu.cpp | 3 +- src/core/memory.cpp | 9 +++--- src/video_core/engines/kepler_memory.cpp | 2 +- src/video_core/engines/maxwell_dma.cpp | 4 +-- src/video_core/gpu.cpp | 14 +++++++-- src/video_core/gpu.h | 8 +++-- src/video_core/gpu_thread.cpp | 30 +++++++++++++++---- src/video_core/gpu_thread.h | 19 +++++++++--- src/video_core/rasterizer_interface.h | 4 --- .../renderer_opengl/gl_rasterizer.cpp | 18 ++++------- .../renderer_opengl/gl_rasterizer.h | 1 - 11 files changed, 71 insertions(+), 41 deletions(-) diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index 466db7ccd3..4edf6a817d 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp @@ -178,7 +178,8 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector& input, std::vector& ou auto& gpu = system_instance.GPU(); auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset); ASSERT(cpu_addr); - system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size); + gpu.FlushRegion(*cpu_addr, itr->second.size); + gpu.InvalidateRegion(*cpu_addr, itr->second.size); params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size); diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 3598a7381f..d02620d612 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -351,16 +351,17 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) { const VAddr overlap_end = std::min(end, region_end); const VAddr overlap_size = overlap_end - overlap_start; - auto& rasterizer = system_instance.Renderer().Rasterizer(); + auto& gpu = system_instance.GPU(); switch (mode) { case FlushMode::Flush: - 
rasterizer.FlushRegion(overlap_start, overlap_size); + gpu.FlushRegion(overlap_start, overlap_size); break; case FlushMode::Invalidate: - rasterizer.InvalidateRegion(overlap_start, overlap_size); + gpu.InvalidateRegion(overlap_start, overlap_size); break; case FlushMode::FlushAndInvalidate: - rasterizer.FlushAndInvalidateRegion(overlap_start, overlap_size); + gpu.FlushRegion(overlap_start, overlap_size); + gpu.InvalidateRegion(overlap_start, overlap_size); break; } }; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 4880191fc6..4c0e6fc789 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -46,7 +46,7 @@ void KeplerMemory::ProcessData(u32 data) { // We have to invalidate the destination region to evict any outdated surfaces from the cache. // We do this before actually writing the new data because the destination address might contain // a dirty surface that will have to be written back to memory. - rasterizer.InvalidateRegion(dest_address, sizeof(u32)); + Core::System::GetInstance().GPU().InvalidateRegion(dest_address, sizeof(u32)); Memory::Write32(dest_address, data); Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 06462f570d..ec916fa4c6 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -87,12 +87,12 @@ void MaxwellDMA::HandleCopy() { const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) { // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated // copying. - rasterizer.FlushRegion(source_cpu, src_size); + Core::System::GetInstance().GPU().FlushRegion(source_cpu, src_size); // We have to invalidate the destination region to evict any outdated surfaces from the // cache. 
We do this before actually writing the new data because the destination address // might contain a dirty surface that will have to be written back to memory. - rasterizer.InvalidateRegion(dest_cpu, dst_size); + Core::System::GetInstance().GPU().InvalidateRegion(dest_cpu, dst_size); }; if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 7947bbff39..3b482ece3a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -86,11 +86,19 @@ void GPU::SwapBuffers( } } -void GPU::WaitUntilIdle(std::function<void()> callback) { +void GPU::FlushRegion(VAddr addr, u64 size) { if (Settings::values.use_asynchronous_gpu_emulation) { - gpu_thread->WaitUntilIdle(std::move(callback)); + gpu_thread->FlushRegion(addr, size); } else { - callback(); + renderer.Rasterizer().FlushRegion(addr, size); + } +} + +void GPU::InvalidateRegion(VAddr addr, u64 size) { + if (Settings::values.use_asynchronous_gpu_emulation) { + gpu_thread->InvalidateRegion(addr, size); + } else { + renderer.Rasterizer().InvalidateRegion(addr, size); } } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 2a15a649f2..f65327f285 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include "common/common_types.h" @@ -165,8 +164,11 @@ public: void SwapBuffers( std::optional> framebuffer); - /// Waits the caller until the thread is idle, and then calls the callback - void WaitUntilIdle(std::function<void()> callback); + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory + void FlushRegion(VAddr addr, u64 size); + + /// Notify rasterizer that any caches of the specified region should be invalidated + void InvalidateRegion(VAddr addr, u64 size); private: std::unique_ptr dma_pusher; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 03e8a8370b..255617ff82 100644 --- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp @@ -39,10 +39,26 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p if (is_dma_pending) { // Process pending DMA pushbuffer commands - std::lock_guard<std::recursive_mutex> lock{state.running_mutex}; + std::lock_guard<std::mutex> lock{state.running_mutex}; dma_pusher.DispatchCalls(); } + { + // Cache management + std::lock_guard<std::recursive_mutex> lock{state.cache_mutex}; + + for (const auto& region : state.flush_regions) { + renderer.Rasterizer().FlushRegion(region.addr, region.size); + } + + for (const auto& region : state.invalidate_regions) { + renderer.Rasterizer().InvalidateRegion(region.addr, region.size); + } + + state.flush_regions.clear(); + state.invalidate_regions.clear(); + } + if (is_swapbuffers_pending) { // Process pending SwapBuffers renderer.SwapBuffers(state.pending_swapbuffers_config); @@ -106,10 +122,14 @@ void GPUThread::SwapBuffers( } } -void GPUThread::WaitUntilIdle(std::function<void()> callback) { - // Needs to be a recursive mutex, as this can be called by the GPU thread - std::unique_lock<std::recursive_mutex> lock{state.running_mutex}; - callback(); +void GPUThread::FlushRegion(VAddr addr, u64 size) { + std::lock_guard<std::recursive_mutex> lock{state.cache_mutex}; + state.flush_regions.push_back({addr, size}); +} + +void GPUThread::InvalidateRegion(VAddr addr, u64 size) { + std::lock_guard<std::recursive_mutex> lock{state.cache_mutex}; + state.invalidate_regions.push_back({addr, size}); } } // namespace VideoCore diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index c8876738cd..1a63345172 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include #include @@ -29,7 +28,16 @@ struct GPUThreadState final { std::condition_variable signal_condition; std::condition_variable running_condition; std::mutex signal_mutex; - std::recursive_mutex running_mutex; + std::mutex running_mutex; + std::recursive_mutex cache_mutex; + + struct MemoryRegion final { + const VAddr addr; + const u64 size; + }; + +
std::vector<MemoryRegion> flush_regions; + std::vector<MemoryRegion> invalidate_regions; }; class GPUThread final { @@ -44,8 +52,11 @@ public: void SwapBuffers( std::optional> framebuffer); - /// Waits the caller until the thread is idle, and then calls the callback - void WaitUntilIdle(std::function<void()> callback); + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory + void FlushRegion(VAddr addr, u64 size); + + /// Notify rasterizer that any caches of the specified region should be invalidated + void InvalidateRegion(VAddr addr, u64 size); private: GPUThreadState state; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 06fc59dbe8..8dcfa073bb 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -30,10 +30,6 @@ public: /// Notify rasterizer that any caches of the specified region should be invalidated virtual void InvalidateRegion(VAddr addr, u64 size) = 0; - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory - /// and invalidated - virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; - /// Attempt to use a faster method to perform a surface copy virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 64d882cfb5..4468840708 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -751,24 +751,16 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { if (Settings::values.use_accurate_gpu_emulation) { // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit - Core::System::GetInstance().GPU().WaitUntilIdle( - [this, addr, size]() { res_cache.FlushRegion(addr, size); }); + res_cache.FlushRegion(addr, size); } }
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - Core::System::GetInstance().GPU().WaitUntilIdle([this, addr, size]() { - res_cache.InvalidateRegion(addr, size); - shader_cache.InvalidateRegion(addr, size); - global_cache.InvalidateRegion(addr, size); - buffer_cache.InvalidateRegion(addr, size); - }); -} - -void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { - FlushRegion(addr, size); - InvalidateRegion(addr, size); + res_cache.InvalidateRegion(addr, size); + shader_cache.InvalidateRegion(addr, size); + global_cache.InvalidateRegion(addr, size); + buffer_cache.InvalidateRegion(addr, size); } bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index a53edee6d6..85f01d6a6e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -53,7 +53,6 @@ public: void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; - void FlushAndInvalidateRegion(VAddr addr, u64 size) override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, const Tegra::Engines::Fermi2D::Regs::Surface& dst) override; bool AccelerateFill(const void* config) override;