Compare commits

...

77 Commits

Author SHA1 Message Date
Fernando Sahmkow
e52c895559 GPU: Flush commands on every dma pusher step.
This commit ensures that the host gpu is constantly fed with commands to
work with, while the guest gpu keeps producing the rest of the commands.
This reduces syncing time between host and guest gpu.
2019-07-26 16:54:22 -04:00
bunnei
52f54c728d Merge pull request #2592 from FernandoS27/sync1
Implement GPU Synchronization Mechanisms & Correct NVFlinger
2019-07-26 14:26:44 -04:00
bunnei
b0ff3179ef Merge pull request #2739 from lioncash/cflow
video_core/control_flow: Minor changes/warning cleanup
2019-07-25 13:04:56 -04:00
bunnei
4d26550f5f Merge pull request #2737 from FernandoS27/track-fix
Shader_Ir: Correct tracking to track from right to left
2019-07-25 12:41:52 -04:00
bunnei
ccbc554949 Merge pull request #2689 from lioncash/tl
yuzu/main: Make error messages within OnCoreError more localization-friendly
2019-07-25 12:35:07 -04:00
bunnei
31e8a61527 Merge pull request #2743 from FernandoS27/surpress-assert
Downgrade and suppress a series of GPU asserts and debug messages.
2019-07-25 12:34:36 -04:00
bunnei
9be9600bdc Merge pull request #2704 from FernandoS27/conditional
maxwell3d: Implement Conditional Rendering
2019-07-24 17:07:57 -04:00
Zach Hilman
12514ccd35 Fix README change mistake (#2754)
Fix README change mistake
2019-07-24 16:42:33 -04:00
bunnei
f601f25bcc Merge pull request #2734 from ReinUsesLisp/compute-shaders
gl_rasterizer: Implement compute shaders
2019-07-22 11:12:55 -04:00
bunnei
27e10e0442 Merge pull request #2735 from FernandoS27/pipeline-rework
Rework Dirty Flags in GPU Pipeline, Optimize CBData and Redo Clearing mechanism
2019-07-21 00:59:52 -04:00
Zach Hilman
6738fb5fef Update README.md 2019-07-20 21:03:30 -04:00
Fernando Sahmkow
0a67416971 Merge pull request #2693 from ReinUsesLisp/hsetp2
shader/half_set_predicate: Implement missing HSETP2 variants
2019-07-20 17:25:08 -04:00
Flame Sage
369be67039 Update README.md 2019-07-20 19:24:24 +00:00
Flame Sage
aa599ac709 Update README.md 2019-07-20 19:22:30 +00:00
Flame Sage
a2edb27158 Merge pull request #2752 from DarkLordZach/master
azure: Fix clang-format and releases
2019-07-20 15:20:53 -04:00
Zach Hilman
f470bcb826 azure: Fix clang-format and releases 2019-07-20 15:19:25 -04:00
Fernando Sahmkow
7a35178ee2 Maxwell3D: Reorganize and address feedback 2019-07-20 10:18:35 -04:00
Fernando Sahmkow
1158777737 Shader_Ir: Change Debug Asserts for Log Warnings 2019-07-19 22:15:34 -04:00
ReinUsesLisp
45c162444d shader/half_set_predicate: Fix HSETP2 implementation 2019-07-19 22:21:22 -03:00
ReinUsesLisp
6c4985edc9 shader/half_set_predicate: Implement missing HSETP2 variants 2019-07-19 22:20:47 -03:00
Lioncash
c1c89411da video_core/control_flow: Provide operator!= for types with operator==
Provides operational symmetry for the respective structures.
2019-07-18 21:03:31 -04:00
Lioncash
1780e0e3d0 video_core/control_flow: Prevent sign conversion in TryGetBlock()
The return value is a u32, not an s32, so this would result in an
implicit signedness conversion.
2019-07-18 21:03:31 -04:00
Lioncash
a162a844d2 video_core/control_flow: Remove unnecessary BlockStack copy constructor
This is the default behavior of the copy constructor, so it doesn't need
to be specified.

While we're at it we can make the other non-default constructor
explicit.
2019-07-18 21:03:30 -04:00
Lioncash
56bc11d952 video_core/control_flow: Use std::move where applicable
Results in less work being done where avoidable.
2019-07-18 21:03:30 -04:00
Lioncash
e7b39f47f8 video_core/control_flow: Use the prefix variant of operator++ for iterators
Same thing, but potentially allows a standard library implementation to
pick a more efficient codepath.
2019-07-18 21:03:30 -04:00
Lioncash
6885e7e7ec video_core/control_flow: Use empty() member function for checking emptiness
It's what it's there for.
2019-07-18 21:03:30 -04:00
Lioncash
45fa12a05c video_core: Resolve -Wreorder warnings
Ensures that the constructor members are always initialized in the order
that they're declared in.
2019-07-18 21:03:30 -04:00
Lioncash
47df844338 video_core/control_flow: Make program_size for ScanFlow() a std::size_t
Prevents a truncation warning from occurring with MSVC. Also the
internal data structures already treat it as a size_t, so this is just a
discrepancy in the interface.
2019-07-18 21:03:29 -04:00
Lioncash
3df9558593 video_core/control_flow: Place all internally linked types/functions within an anonymous namespace
Previously, quite a few functions were being linked with external
linkage.
2019-07-18 21:03:29 -04:00
Lioncash
1109db86b7 video_core/shader/decode: Prevent sign-conversion warnings
Makes it explicit that the conversions here are intentional.
2019-07-18 21:03:29 -04:00
Fernando Sahmkow
5a06e33859 Shader_Ir: correct clang format 2019-07-18 10:09:26 -04:00
Fernando Sahmkow
43f57d668c GPU: Add missing puller methods.
This adds some missing puller methods. We don't assert them as these are 
nop operations for us.
2019-07-18 08:54:42 -04:00
Fernando Sahmkow
3a3fee5abf MaxwellDMA/KeplerCopy: Downgrade DMA log message to Trace.
This log was just to know which games used DMA. It's no longer 
important.
2019-07-18 08:31:38 -04:00
Fernando Sahmkow
d3b71ff80d Gl_Texture_Cache: Remove assert on component type in GetFormatTuple
Textures can have different components types in different orders. This 
assert was completely inprecise and the effectiveness of such is better 
handled by case and within the texture cache.
2019-07-18 08:20:31 -04:00
Fernando Sahmkow
0b65e9335e Shader_Ir: Downgrade precision and rounding asserts to debug asserts.
This commit reduces the sevirity of asserts for FP precision and 
rounding as this are well known and have little to no consequences in 
gpu's accuracy.
2019-07-18 08:17:19 -04:00
Fernando Sahmkow
4be61013a1 GL_State: Feedback and fixes 2019-07-17 17:29:56 -04:00
Fernando Sahmkow
5ad889f6fd Maxwell3D: Address Feedback 2019-07-17 17:29:55 -04:00
Fernando Sahmkow
7826f0afd9 Texture_Cache: Rebase Fixes 2019-07-17 17:29:54 -04:00
Fernando Sahmkow
8cdbfe69b1 GL_Rasterizer: Corrections to Clearing. 2019-07-17 17:29:54 -04:00
Fernando Sahmkow
0ff4a5fa39 Maxwell3D: Correct marking dirtiness on CB upload 2019-07-17 17:29:53 -04:00
Fernando Sahmkow
fec32fed18 GL_Rasterizer: Rework RenderTarget/DepthBuffer clearing 2019-07-17 17:29:52 -04:00
Fernando Sahmkow
a081dea8ab Maxwell3D: Implement State Dirty Flags. 2019-07-17 17:29:51 -04:00
Fernando Sahmkow
0d3db58657 Maxwell3D: Rework CBData Upload 2019-07-17 17:29:50 -04:00
Fernando Sahmkow
f2e7b29c14 Maxwell3D: Rework the dirty system to be more consistant and scaleable 2019-07-17 17:29:49 -04:00
Fernando Sahmkow
e42bcf2314 maxwell3d: Implement Conditional Rendering
Conditional Rendering takes care of conditionaly clearing or drawing
depending on a set of queries. This PR implements the query checks to
stablish if things can be rendered or not.
2019-07-17 17:13:19 -04:00
Fernando Sahmkow
d614193e49 Shader_Ir: Correct tracking to track from right to left 2019-07-16 15:06:59 -04:00
ReinUsesLisp
2a4044a858 gl_shader_cache: Fix clang-format issues 2019-07-15 20:33:51 -03:00
ReinUsesLisp
6b0d017675 gl_shader_decompiler: Stub local memory size 2019-07-15 17:38:25 -03:00
ReinUsesLisp
56bca83bde gl_shader_cache: Address review commentaries 2019-07-15 17:38:25 -03:00
ReinUsesLisp
bbecd13697 gl_shader_cache: Address CI issues 2019-07-15 17:38:25 -03:00
ReinUsesLisp
725ba6cf63 gl_rasterizer: Implement compute shaders 2019-07-15 17:38:25 -03:00
Lioncash
5085a16d78 yuzu/main: Make error messages within OnCoreError more localization-friendly
Previously, a translated string was being appended onto another string
in a manner that doesn't allow the translator to control where the
appended text is placed. This can be a nuisance for languages where
grammar and text ordering differs from English.

We now append the strings via the format strings themselves, which
allows translators to reorder where the text will be placed.
2019-07-07 11:02:05 -04:00
Fernando Sahmkow
0fc98958a3 NVServices: Correct delayed responses. 2019-07-05 15:49:35 -04:00
Fernando Sahmkow
8c91d5c166 Nv_Host_Ctrl: Correct difference calculation 2019-07-05 15:49:34 -04:00
Fernando Sahmkow
f3a39e0c9c NVServices: Address Feedback 2019-07-05 15:49:33 -04:00
Fernando Sahmkow
d20ede40b1 NVServices: Styling, define constructors as explicit and corrections 2019-07-05 15:49:32 -04:00
Fernando Sahmkow
b391e5f638 NVFlinger: Correct GCC compile error 2019-07-05 15:49:31 -04:00
Fernando Sahmkow
0335a25d1f NVServices: Make NVEvents Automatic according to documentation. 2019-07-05 15:49:29 -04:00
Fernando Sahmkow
b6844bec60 NVServices: Correct CtrlEventWaitSync to block the ipc until timeout. 2019-07-05 15:49:28 -04:00
Fernando Sahmkow
7d1b974bca GPU: Correct Interrupts to interrupt on syncpt/value instead of event, mirroring hardware 2019-07-05 15:49:26 -04:00
Fernando Sahmkow
61697864c3 nvflinger: Make the force 30 fps still force 30 fps 2019-07-05 15:49:25 -04:00
Fernando Sahmkow
efdeab3a1d nv_services: Fixes to event liberation. 2019-07-05 15:49:24 -04:00
Fernando Sahmkow
ea97589624 nvflinger: Acquire buffers in the same order as they were queued. 2019-07-05 15:49:23 -04:00
Fernando Sahmkow
24408cce9b nv_services: Deglobalize NvServices 2019-07-05 15:49:22 -04:00
Fernando Sahmkow
f2e026a1d8 gpu_asynch: Simplify synchronization to a simpler consumer->producer scheme. 2019-07-05 15:49:20 -04:00
Fernando Sahmkow
0706d633bf nv_host_ctrl: Make Sync GPU variant always return synced result. 2019-07-05 15:49:20 -04:00
Fernando Sahmkow
600dddf88d Async GPU: do invalidate as synced operation
Async GPU: Always invalidate synced.
2019-07-05 15:49:19 -04:00
Fernando Sahmkow
c13433aee4 Gpu: use an std mutex instead of a spin_lock to guard syncpoints 2019-07-05 15:49:18 -04:00
Fernando Sahmkow
78add28aab nvhost_ctrl: Corrections to event handling 2019-07-05 15:49:17 -04:00
Fernando Sahmkow
eef55f493b Gpu: Mark areas as protected. 2019-07-05 15:49:16 -04:00
Fernando Sahmkow
a45643cb3b nv_services: Stub CtrlEventSignal 2019-07-05 15:49:15 -04:00
Fernando Sahmkow
8942047d41 Gpu: Implement Hardware Interrupt Manager and manage GPU interrupts 2019-07-05 15:49:14 -04:00
Fernando Sahmkow
e0027eba85 nv_services: Implement NvQueryEvent, NvCtrlEventWait, NvEventRegister, NvEventUnregister 2019-07-05 15:49:13 -04:00
Fernando Sahmkow
7039ece0a0 nv_services: Create GPU channels correctly 2019-07-05 15:49:12 -04:00
Fernando Sahmkow
82b829625b video_core: Implement GPU side Syncpoints 2019-07-05 15:49:11 -04:00
Fernando Sahmkow
737e978f5b nv_services: Correct buffer queue fencing and GPFifo fencing 2019-07-05 15:49:10 -04:00
Fernando Sahmkow
ceb5f5079c nvflinger: Implement swap intervals 2019-07-05 15:49:08 -04:00
85 changed files with 1909 additions and 653 deletions

View File

@@ -14,7 +14,7 @@ steps:
cacheHitVar: CACHE_RESTORED
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh
displayName: 'Build'
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && RELEASE_NAME=$(BuildName) ./.ci/scripts/$(ScriptFolder)/upload.sh
displayName: 'Package Artifacts'
- publish: artifacts
artifact: 'yuzu-$(BuildName)-$(BuildSuffix)'

View File

@@ -3,7 +3,7 @@ jobs:
displayName: 'standard'
pool:
vmImage: ubuntu-latest
strategy:
strategy:
maxParallel: 10
matrix:
windows:

View File

@@ -3,19 +3,21 @@ jobs:
displayName: 'testing'
pool:
vmImage: ubuntu-latest
strategy:
maxParallel: 10
strategy:
maxParallel: 5
matrix:
windows:
BuildSuffix: 'windows-testing'
ScriptFolder: 'windows'
steps:
- script: pip install requests urllib3
displayName: 'Prepare Environment'
- task: PythonScript@0
condition: eq(variables['Build.Reason'], 'PullRequest')
displayName: 'Determine Testing Status'
inputs:
scriptSource: 'filePath'
scriptPath: '../scripts/merge/check-label-presence.py'
scriptPath: '.ci/scripts/merge/check-label-presence.py'
arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build'
- ${{ if eq(variables.enabletesting, 'true') }}:
- template: ./sync-source.yml
@@ -27,4 +29,4 @@ jobs:
matchLabel: 'testing-merge'
- template: ./build-single.yml
parameters:
artifactSource: 'false'
artifactSource: 'false'

View File

@@ -1,29 +0,0 @@
steps:
- task: DownloadPipelineArtifact@2
displayName: 'Download Windows Release'
inputs:
artifactName: 'yuzu-$(BuildName)-windows-mingw'
buildType: 'current'
targetPath: '$(Build.ArtifactStagingDirectory)'
- task: DownloadPipelineArtifact@2
displayName: 'Download Linux Release'
inputs:
artifactName: 'yuzu-$(BuildName)-linux'
buildType: 'current'
targetPath: '$(Build.ArtifactStagingDirectory)'
- task: DownloadPipelineArtifact@2
displayName: 'Download Release Point'
inputs:
artifactName: 'yuzu-$(BuildName)-release-point'
buildType: 'current'
targetPath: '$(Build.ArtifactStagingDirectory)'
- script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha
displayName: 'Calculate Release Point'
- task: GitHubRelease@0
inputs:
gitHubConnection: $(GitHubReleaseConnectionName)
repositoryName: '$(GitHubReleaseRepoName)'
action: 'create'
target: $(variables.tagcommit)
title: 'yuzu $(BuildName) #$(Build.BuildId)'
assets: '$(Build.ArtifactStagingDirectory)/*'

View File

@@ -2,6 +2,7 @@ yuzu emulator
=============
[![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu)
[![AppVeyor CI Build Status](https://ci.appveyor.com/api/projects/status/77k97svb2usreu68?svg=true)](https://ci.appveyor.com/project/bunnei/yuzu)
[![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/)
yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).

View File

@@ -111,6 +111,8 @@ add_library(core STATIC
frontend/scope_acquire_window_context.h
gdbstub/gdbstub.cpp
gdbstub/gdbstub.h
hardware_interrupt_manager.cpp
hardware_interrupt_manager.h
hle/ipc.h
hle/ipc_helpers.h
hle/kernel/address_arbiter.cpp
@@ -372,6 +374,7 @@ add_library(core STATIC
hle/service/nvdrv/devices/nvmap.h
hle/service/nvdrv/interface.cpp
hle/service/nvdrv/interface.h
hle/service/nvdrv/nvdata.h
hle/service/nvdrv/nvdrv.cpp
hle/service/nvdrv/nvdrv.h
hle/service/nvdrv/nvmemp.cpp

View File

@@ -19,6 +19,7 @@
#include "core/file_sys/vfs_concat.h"
#include "core/file_sys/vfs_real.h"
#include "core/gdbstub/gdbstub.h"
#include "core/hardware_interrupt_manager.h"
#include "core/hle/kernel/client_port.h"
#include "core/hle/kernel/kernel.h"
#include "core/hle/kernel/process.h"
@@ -151,7 +152,7 @@ struct System::Impl {
if (!renderer->Init()) {
return ResultStatus::ErrorVideoCore;
}
interrupt_manager = std::make_unique<Core::Hardware::InterruptManager>(system);
gpu_core = VideoCore::CreateGPU(system);
is_powered_on = true;
@@ -298,6 +299,7 @@ struct System::Impl {
std::unique_ptr<VideoCore::RendererBase> renderer;
std::unique_ptr<Tegra::GPU> gpu_core;
std::shared_ptr<Tegra::DebugContext> debug_context;
std::unique_ptr<Core::Hardware::InterruptManager> interrupt_manager;
CpuCoreManager cpu_core_manager;
bool is_powered_on = false;
@@ -444,6 +446,14 @@ const Tegra::GPU& System::GPU() const {
return *impl->gpu_core;
}
Core::Hardware::InterruptManager& System::InterruptManager() {
return *impl->interrupt_manager;
}
const Core::Hardware::InterruptManager& System::InterruptManager() const {
return *impl->interrupt_manager;
}
VideoCore::RendererBase& System::Renderer() {
return *impl->renderer;
}

View File

@@ -70,6 +70,10 @@ namespace Core::Timing {
class CoreTiming;
}
namespace Core::Hardware {
class InterruptManager;
}
namespace Core {
class ARM_Interface;
@@ -234,6 +238,12 @@ public:
/// Provides a constant reference to the core timing instance.
const Timing::CoreTiming& CoreTiming() const;
/// Provides a reference to the interrupt manager instance.
Core::Hardware::InterruptManager& InterruptManager();
/// Provides a constant reference to the interrupt manager instance.
const Core::Hardware::InterruptManager& InterruptManager() const;
/// Provides a reference to the kernel instance.
Kernel::KernelCore& Kernel();

View File

@@ -0,0 +1,30 @@
// Copyright 2019 Yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "core/core.h"
#include "core/core_timing.h"
#include "core/hardware_interrupt_manager.h"
#include "core/hle/service/nvdrv/interface.h"
#include "core/hle/service/sm/sm.h"
namespace Core::Hardware {
InterruptManager::InterruptManager(Core::System& system_in) : system(system_in) {
gpu_interrupt_event =
system.CoreTiming().RegisterEvent("GPUInterrupt", [this](u64 message, s64) {
auto nvdrv = system.ServiceManager().GetService<Service::Nvidia::NVDRV>("nvdrv");
const u32 syncpt = static_cast<u32>(message >> 32);
const u32 value = static_cast<u32>(message);
nvdrv->SignalGPUInterruptSyncpt(syncpt, value);
});
}
InterruptManager::~InterruptManager() = default;
void InterruptManager::GPUInterruptSyncpt(const u32 syncpoint_id, const u32 value) {
const u64 msg = (static_cast<u64>(syncpoint_id) << 32ULL) | value;
system.CoreTiming().ScheduleEvent(10, gpu_interrupt_event, msg);
}
} // namespace Core::Hardware

View File

@@ -0,0 +1,31 @@
// Copyright 2019 Yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace Core {
class System;
}
namespace Core::Timing {
struct EventType;
}
namespace Core::Hardware {
class InterruptManager {
public:
explicit InterruptManager(Core::System& system);
~InterruptManager();
void GPUInterruptSyncpt(u32 syncpoint_id, u32 value);
private:
Core::System& system;
Core::Timing::EventType* gpu_interrupt_event{};
};
} // namespace Core::Hardware

View File

@@ -8,6 +8,11 @@
#include "common/bit_field.h"
#include "common/common_types.h"
#include "common/swap.h"
#include "core/hle/service/nvdrv/nvdata.h"
namespace Core {
class System;
}
namespace Service::Nvidia::Devices {
@@ -15,7 +20,7 @@ namespace Service::Nvidia::Devices {
/// implement the ioctl interface.
class nvdevice {
public:
nvdevice() = default;
explicit nvdevice(Core::System& system) : system{system} {};
virtual ~nvdevice() = default;
union Ioctl {
u32_le raw;
@@ -33,7 +38,11 @@ public:
* @param output A buffer where the output data will be written to.
* @returns The result code of the ioctl.
*/
virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) = 0;
virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) = 0;
protected:
Core::System& system;
};
} // namespace Service::Nvidia::Devices

View File

@@ -13,10 +13,12 @@
namespace Service::Nvidia::Devices {
nvdisp_disp0::nvdisp_disp0(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {}
nvdisp_disp0::nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
: nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
nvdisp_disp0 ::~nvdisp_disp0() = default;
u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
UNIMPLEMENTED_MSG("Unimplemented ioctl");
return 0;
}
@@ -34,9 +36,8 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3
addr, offset, width, height, stride, static_cast<PixelFormat>(format),
transform, crop_rect};
auto& instance = Core::System::GetInstance();
instance.GetPerfStats().EndGameFrame();
instance.GPU().SwapBuffers(framebuffer);
system.GetPerfStats().EndGameFrame();
system.GPU().SwapBuffers(framebuffer);
}
} // namespace Service::Nvidia::Devices

View File

@@ -17,10 +17,11 @@ class nvmap;
class nvdisp_disp0 final : public nvdevice {
public:
explicit nvdisp_disp0(std::shared_ptr<nvmap> nvmap_dev);
explicit nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
~nvdisp_disp0() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
/// Performs a screen flip, drawing the buffer pointed to by the handle.
void flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height, u32 stride,

View File

@@ -22,10 +22,12 @@ enum {
};
}
nvhost_as_gpu::nvhost_as_gpu(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {}
nvhost_as_gpu::nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
: nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
nvhost_as_gpu::~nvhost_as_gpu() = default;
u32 nvhost_as_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_as_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());
@@ -65,7 +67,7 @@ u32 nvhost_as_gpu::AllocateSpace(const std::vector<u8>& input, std::vector<u8>&
LOG_DEBUG(Service_NVDRV, "called, pages={:X}, page_size={:X}, flags={:X}", params.pages,
params.page_size, params.flags);
auto& gpu = Core::System::GetInstance().GPU();
auto& gpu = system.GPU();
const u64 size{static_cast<u64>(params.pages) * static_cast<u64>(params.page_size)};
if (params.flags & 1) {
params.offset = gpu.MemoryManager().AllocateSpace(params.offset, size, 1);
@@ -85,7 +87,7 @@ u32 nvhost_as_gpu::Remap(const std::vector<u8>& input, std::vector<u8>& output)
std::vector<IoctlRemapEntry> entries(num_entries);
std::memcpy(entries.data(), input.data(), input.size());
auto& gpu = Core::System::GetInstance().GPU();
auto& gpu = system.GPU();
for (const auto& entry : entries) {
LOG_WARNING(Service_NVDRV, "remap entry, offset=0x{:X} handle=0x{:X} pages=0x{:X}",
entry.offset, entry.nvmap_handle, entry.pages);
@@ -136,7 +138,7 @@ u32 nvhost_as_gpu::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& ou
// case to prevent unexpected behavior.
ASSERT(object->id == params.nvmap_handle);
auto& gpu = Core::System::GetInstance().GPU();
auto& gpu = system.GPU();
if (params.flags & 1) {
params.offset = gpu.MemoryManager().MapBufferEx(object->addr, params.offset, object->size);
@@ -173,8 +175,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
return 0;
}
params.offset = Core::System::GetInstance().GPU().MemoryManager().UnmapBuffer(params.offset,
itr->second.size);
params.offset = system.GPU().MemoryManager().UnmapBuffer(params.offset, itr->second.size);
buffer_mappings.erase(itr->second.offset);
std::memcpy(output.data(), &params, output.size());

View File

@@ -17,10 +17,11 @@ class nvmap;
class nvhost_as_gpu final : public nvdevice {
public:
explicit nvhost_as_gpu(std::shared_ptr<nvmap> nvmap_dev);
explicit nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
~nvhost_as_gpu() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {

View File

@@ -7,14 +7,20 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/kernel/readable_event.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/devices/nvhost_ctrl.h"
#include "video_core/gpu.h"
namespace Service::Nvidia::Devices {
nvhost_ctrl::nvhost_ctrl() = default;
nvhost_ctrl::nvhost_ctrl(Core::System& system, EventInterface& events_interface)
: nvdevice(system), events_interface{events_interface} {}
nvhost_ctrl::~nvhost_ctrl() = default;
u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());
@@ -22,11 +28,15 @@ u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<
case IoctlCommand::IocGetConfigCommand:
return NvOsGetConfigU32(input, output);
case IoctlCommand::IocCtrlEventWaitCommand:
return IocCtrlEventWait(input, output, false);
return IocCtrlEventWait(input, output, false, ctrl);
case IoctlCommand::IocCtrlEventWaitAsyncCommand:
return IocCtrlEventWait(input, output, true);
return IocCtrlEventWait(input, output, true, ctrl);
case IoctlCommand::IocCtrlEventRegisterCommand:
return IocCtrlEventRegister(input, output);
case IoctlCommand::IocCtrlEventUnregisterCommand:
return IocCtrlEventUnregister(input, output);
case IoctlCommand::IocCtrlEventSignalCommand:
return IocCtrlEventSignal(input, output);
}
UNIMPLEMENTED_MSG("Unimplemented ioctl");
return 0;
@@ -41,23 +51,137 @@ u32 nvhost_ctrl::NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>&
}
u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output,
bool is_async) {
bool is_async, IoctlCtrl& ctrl) {
IocCtrlEventWaitParams params{};
std::memcpy(&params, input.data(), sizeof(params));
LOG_WARNING(Service_NVDRV,
"(STUBBED) called, syncpt_id={}, threshold={}, timeout={}, is_async={}",
params.syncpt_id, params.threshold, params.timeout, is_async);
LOG_DEBUG(Service_NVDRV, "syncpt_id={}, threshold={}, timeout={}, is_async={}",
params.syncpt_id, params.threshold, params.timeout, is_async);
// TODO(Subv): Implement actual syncpt waiting.
params.value = 0;
if (params.syncpt_id >= MaxSyncPoints) {
return NvResult::BadParameter;
}
auto& gpu = system.GPU();
// This is mostly to take into account unimplemented features. As synced
// gpu is always synced.
if (!gpu.IsAsync()) {
return NvResult::Success;
}
auto lock = gpu.LockSync();
const u32 current_syncpoint_value = gpu.GetSyncpointValue(params.syncpt_id);
const s32 diff = current_syncpoint_value - params.threshold;
if (diff >= 0) {
params.value = current_syncpoint_value;
std::memcpy(output.data(), &params, sizeof(params));
return NvResult::Success;
}
const u32 target_value = current_syncpoint_value - diff;
if (!is_async) {
params.value = 0;
}
if (params.timeout == 0) {
std::memcpy(output.data(), &params, sizeof(params));
return NvResult::Timeout;
}
u32 event_id;
if (is_async) {
event_id = params.value & 0x00FF;
if (event_id >= MaxNvEvents) {
std::memcpy(output.data(), &params, sizeof(params));
return NvResult::BadParameter;
}
} else {
if (ctrl.fresh_call) {
const auto result = events_interface.GetFreeEvent();
if (result) {
event_id = *result;
} else {
LOG_CRITICAL(Service_NVDRV, "No Free Events available!");
event_id = params.value & 0x00FF;
}
} else {
event_id = ctrl.event_id;
}
}
EventState status = events_interface.status[event_id];
if (event_id < MaxNvEvents || status == EventState::Free || status == EventState::Registered) {
events_interface.SetEventStatus(event_id, EventState::Waiting);
events_interface.assigned_syncpt[event_id] = params.syncpt_id;
events_interface.assigned_value[event_id] = target_value;
if (is_async) {
params.value = params.syncpt_id << 4;
} else {
params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000;
}
params.value |= event_id;
events_interface.events[event_id].writable->Clear();
gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value);
if (!is_async && ctrl.fresh_call) {
ctrl.must_delay = true;
ctrl.timeout = params.timeout;
ctrl.event_id = event_id;
return NvResult::Timeout;
}
std::memcpy(output.data(), &params, sizeof(params));
return NvResult::Timeout;
}
std::memcpy(output.data(), &params, sizeof(params));
return 0;
return NvResult::BadParameter;
}
u32 nvhost_ctrl::IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output) {
LOG_WARNING(Service_NVDRV, "(STUBBED) called");
// TODO(bunnei): Implement this.
return 0;
IocCtrlEventRegisterParams params{};
std::memcpy(&params, input.data(), sizeof(params));
const u32 event_id = params.user_event_id & 0x00FF;
LOG_DEBUG(Service_NVDRV, " called, user_event_id: {:X}", event_id);
if (event_id >= MaxNvEvents) {
return NvResult::BadParameter;
}
if (events_interface.registered[event_id]) {
return NvResult::BadParameter;
}
events_interface.RegisterEvent(event_id);
return NvResult::Success;
}
u32 nvhost_ctrl::IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output) {
IocCtrlEventUnregisterParams params{};
std::memcpy(&params, input.data(), sizeof(params));
const u32 event_id = params.user_event_id & 0x00FF;
LOG_DEBUG(Service_NVDRV, " called, user_event_id: {:X}", event_id);
if (event_id >= MaxNvEvents) {
return NvResult::BadParameter;
}
if (!events_interface.registered[event_id]) {
return NvResult::BadParameter;
}
events_interface.UnregisterEvent(event_id);
return NvResult::Success;
}
u32 nvhost_ctrl::IocCtrlEventSignal(const std::vector<u8>& input, std::vector<u8>& output) {
IocCtrlEventSignalParams params{};
std::memcpy(&params, input.data(), sizeof(params));
// TODO(Blinkhawk): This is normally called when an NvEvents timeout on WaitSynchronization
// It is believed from RE to cancel the GPU Event. However, better research is required
u32 event_id = params.user_event_id & 0x00FF;
LOG_WARNING(Service_NVDRV, "(STUBBED) called, user_event_id: {:X}", event_id);
if (event_id >= MaxNvEvents) {
return NvResult::BadParameter;
}
if (events_interface.status[event_id] == EventState::Waiting) {
auto& gpu = system.GPU();
if (gpu.CancelSyncptInterrupt(events_interface.assigned_syncpt[event_id],
events_interface.assigned_value[event_id])) {
events_interface.LiberateEvent(event_id);
events_interface.events[event_id].writable->Signal();
}
}
return NvResult::Success;
}
} // namespace Service::Nvidia::Devices

View File

@@ -8,15 +8,17 @@
#include <vector>
#include "common/common_types.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
#include "core/hle/service/nvdrv/nvdrv.h"
namespace Service::Nvidia::Devices {
class nvhost_ctrl final : public nvdevice {
public:
nvhost_ctrl();
explicit nvhost_ctrl(Core::System& system, EventInterface& events_interface);
~nvhost_ctrl() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {
@@ -132,9 +134,16 @@ private:
u32 NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>& output);
u32 IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async);
u32 IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async,
IoctlCtrl& ctrl);
u32 IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output);
u32 IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output);
u32 IocCtrlEventSignal(const std::vector<u8>& input, std::vector<u8>& output);
EventInterface& events_interface;
};
} // namespace Service::Nvidia::Devices

View File

@@ -12,10 +12,11 @@
namespace Service::Nvidia::Devices {
nvhost_ctrl_gpu::nvhost_ctrl_gpu() = default;
nvhost_ctrl_gpu::nvhost_ctrl_gpu(Core::System& system) : nvdevice(system) {}
nvhost_ctrl_gpu::~nvhost_ctrl_gpu() = default;
u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());
@@ -185,7 +186,7 @@ u32 nvhost_ctrl_gpu::GetGpuTime(const std::vector<u8>& input, std::vector<u8>& o
IoctlGetGpuTime params{};
std::memcpy(&params, input.data(), input.size());
const auto ns = Core::Timing::CyclesToNs(Core::System::GetInstance().CoreTiming().GetTicks());
const auto ns = Core::Timing::CyclesToNs(system.CoreTiming().GetTicks());
params.gpu_time = static_cast<u64_le>(ns.count());
std::memcpy(output.data(), &params, output.size());
return 0;

View File

@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {
class nvhost_ctrl_gpu final : public nvdevice {
public:
nvhost_ctrl_gpu();
explicit nvhost_ctrl_gpu(Core::System& system);
~nvhost_ctrl_gpu() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {

View File

@@ -13,10 +13,12 @@
namespace Service::Nvidia::Devices {
nvhost_gpu::nvhost_gpu(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {}
nvhost_gpu::nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
: nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
nvhost_gpu::~nvhost_gpu() = default;
u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());
@@ -119,8 +121,10 @@ u32 nvhost_gpu::AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8>& ou
params.num_entries, params.flags, params.unk0, params.unk1, params.unk2,
params.unk3);
params.fence_out.id = 0;
params.fence_out.value = 0;
auto& gpu = system.GPU();
params.fence_out.id = assigned_syncpoints;
params.fence_out.value = gpu.GetSyncpointValue(assigned_syncpoints);
assigned_syncpoints++;
std::memcpy(output.data(), &params, output.size());
return 0;
}
@@ -143,7 +147,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
IoctlSubmitGpfifo params{};
std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
params.address, params.num_entries, params.flags);
params.address, params.num_entries, params.flags.raw);
ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) +
params.num_entries * sizeof(Tegra::CommandListHeader),
@@ -153,10 +157,18 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
params.num_entries * sizeof(Tegra::CommandListHeader));
Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0);
UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0);
auto& gpu = system.GPU();
u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id);
if (params.flags.increment.Value()) {
params.fence_out.value += current_syncpoint_value;
} else {
params.fence_out.value = current_syncpoint_value;
}
gpu.PushGPUEntries(std::move(entries));
params.fence_out.id = 0;
params.fence_out.value = 0;
std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo));
return 0;
}
@@ -168,16 +180,24 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
IoctlSubmitGpfifo params{};
std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
params.address, params.num_entries, params.flags);
params.address, params.num_entries, params.flags.raw);
Tegra::CommandList entries(params.num_entries);
Memory::ReadBlock(params.address, entries.data(),
params.num_entries * sizeof(Tegra::CommandListHeader));
Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0);
UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0);
auto& gpu = system.GPU();
u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id);
if (params.flags.increment.Value()) {
params.fence_out.value += current_syncpoint_value;
} else {
params.fence_out.value = current_syncpoint_value;
}
gpu.PushGPUEntries(std::move(entries));
params.fence_out.id = 0;
params.fence_out.value = 0;
std::memcpy(output.data(), &params, output.size());
return 0;
}

View File

@@ -10,6 +10,7 @@
#include "common/common_types.h"
#include "common/swap.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
#include "core/hle/service/nvdrv/nvdata.h"
namespace Service::Nvidia::Devices {
@@ -20,10 +21,11 @@ constexpr u32 NVGPU_IOCTL_CHANNEL_KICKOFF_PB(0x1b);
class nvhost_gpu final : public nvdevice {
public:
explicit nvhost_gpu(std::shared_ptr<nvmap> nvmap_dev);
explicit nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
~nvhost_gpu() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {
@@ -113,11 +115,7 @@ private:
static_assert(sizeof(IoctlGetErrorNotification) == 16,
"IoctlGetErrorNotification is incorrect size");
struct IoctlFence {
u32_le id;
u32_le value;
};
static_assert(sizeof(IoctlFence) == 8, "IoctlFence is incorrect size");
static_assert(sizeof(Fence) == 8, "Fence is incorrect size");
struct IoctlAllocGpfifoEx {
u32_le num_entries;
@@ -132,13 +130,13 @@ private:
static_assert(sizeof(IoctlAllocGpfifoEx) == 32, "IoctlAllocGpfifoEx is incorrect size");
struct IoctlAllocGpfifoEx2 {
u32_le num_entries; // in
u32_le flags; // in
u32_le unk0; // in (1 works)
IoctlFence fence_out; // out
u32_le unk1; // in
u32_le unk2; // in
u32_le unk3; // in
u32_le num_entries; // in
u32_le flags; // in
u32_le unk0; // in (1 works)
Fence fence_out; // out
u32_le unk1; // in
u32_le unk2; // in
u32_le unk3; // in
};
static_assert(sizeof(IoctlAllocGpfifoEx2) == 32, "IoctlAllocGpfifoEx2 is incorrect size");
@@ -153,10 +151,16 @@ private:
struct IoctlSubmitGpfifo {
u64_le address; // pointer to gpfifo entry structs
u32_le num_entries; // number of fence objects being submitted
u32_le flags;
IoctlFence fence_out; // returned new fence object for others to wait on
union {
u32_le raw;
BitField<0, 1, u32_le> add_wait; // append a wait sync_point to the list
BitField<1, 1, u32_le> add_increment; // append an increment to the list
BitField<2, 1, u32_le> new_hw_format; // Mostly ignored
BitField<8, 1, u32_le> increment; // increment the returned fence
} flags;
Fence fence_out; // returned new fence object for others to wait on
};
static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(IoctlFence),
static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(Fence),
"IoctlSubmitGpfifo is incorrect size");
struct IoctlGetWaitbase {
@@ -184,6 +188,7 @@ private:
u32 ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& output);
std::shared_ptr<nvmap> nvmap_dev;
u32 assigned_syncpoints{};
};
} // namespace Service::Nvidia::Devices

View File

@@ -10,10 +10,11 @@
namespace Service::Nvidia::Devices {
nvhost_nvdec::nvhost_nvdec() = default;
nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {}
nvhost_nvdec::~nvhost_nvdec() = default;
u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());

View File

@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {
class nvhost_nvdec final : public nvdevice {
public:
nvhost_nvdec();
explicit nvhost_nvdec(Core::System& system);
~nvhost_nvdec() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {

View File

@@ -10,10 +10,11 @@
namespace Service::Nvidia::Devices {
nvhost_nvjpg::nvhost_nvjpg() = default;
nvhost_nvjpg::nvhost_nvjpg(Core::System& system) : nvdevice(system) {}
nvhost_nvjpg::~nvhost_nvjpg() = default;
u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());

View File

@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {
class nvhost_nvjpg final : public nvdevice {
public:
nvhost_nvjpg();
explicit nvhost_nvjpg(Core::System& system);
~nvhost_nvjpg() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {

View File

@@ -10,10 +10,11 @@
namespace Service::Nvidia::Devices {
nvhost_vic::nvhost_vic() = default;
nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}
nvhost_vic::~nvhost_vic() = default;
u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
command.raw, input.size(), output.size());

View File

@@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices {
class nvhost_vic final : public nvdevice {
public:
nvhost_vic();
explicit nvhost_vic(Core::System& system);
~nvhost_vic() override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
private:
enum class IoctlCommand : u32_le {

View File

@@ -18,7 +18,7 @@ enum {
};
}
nvmap::nvmap() = default;
nvmap::nvmap(Core::System& system) : nvdevice(system) {}
nvmap::~nvmap() = default;
VAddr nvmap::GetObjectAddress(u32 handle) const {
@@ -28,7 +28,8 @@ VAddr nvmap::GetObjectAddress(u32 handle) const {
return object->addr;
}
u32 nvmap::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 nvmap::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
switch (static_cast<IoctlCommand>(command.raw)) {
case IoctlCommand::Create:
return IocCreate(input, output);

View File

@@ -16,13 +16,14 @@ namespace Service::Nvidia::Devices {
class nvmap final : public nvdevice {
public:
nvmap();
explicit nvmap(Core::System& system);
~nvmap() override;
/// Returns the allocated address of an nvmap object given its handle.
VAddr GetObjectAddress(u32 handle) const;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) override;
/// Represents an nvmap object.
struct Object {

View File

@@ -8,12 +8,18 @@
#include "core/hle/ipc_helpers.h"
#include "core/hle/kernel/kernel.h"
#include "core/hle/kernel/readable_event.h"
#include "core/hle/kernel/thread.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/interface.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvdrv/nvdrv.h"
namespace Service::Nvidia {
void NVDRV::SignalGPUInterruptSyncpt(const u32 syncpoint_id, const u32 value) {
nvdrv->SignalSyncpt(syncpoint_id, value);
}
void NVDRV::Open(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_NVDRV, "called");
@@ -36,11 +42,31 @@ void NVDRV::Ioctl(Kernel::HLERequestContext& ctx) {
std::vector<u8> output(ctx.GetWriteBufferSize());
IoctlCtrl ctrl{};
u32 result = nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output, ctrl);
if (ctrl.must_delay) {
ctrl.fresh_call = false;
ctx.SleepClientThread(
"NVServices::DelayedResponse", ctrl.timeout,
[=](Kernel::SharedPtr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx,
Kernel::ThreadWakeupReason reason) {
IoctlCtrl ctrl2{ctrl};
std::vector<u8> output2 = output;
u32 result = nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output2, ctrl2);
ctx.WriteBuffer(output2);
IPC::ResponseBuilder rb{ctx, 3};
rb.Push(RESULT_SUCCESS);
rb.Push(result);
},
nvdrv->GetEventWriteable(ctrl.event_id));
} else {
ctx.WriteBuffer(output);
}
IPC::ResponseBuilder rb{ctx, 3};
rb.Push(RESULT_SUCCESS);
rb.Push(nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output));
ctx.WriteBuffer(output);
rb.Push(result);
}
void NVDRV::Close(Kernel::HLERequestContext& ctx) {
@@ -66,13 +92,19 @@ void NVDRV::Initialize(Kernel::HLERequestContext& ctx) {
void NVDRV::QueryEvent(Kernel::HLERequestContext& ctx) {
IPC::RequestParser rp{ctx};
u32 fd = rp.Pop<u32>();
u32 event_id = rp.Pop<u32>();
// TODO(Blinkhawk): Figure the meaning of the flag at bit 16
u32 event_id = rp.Pop<u32>() & 0x000000FF;
LOG_WARNING(Service_NVDRV, "(STUBBED) called, fd={:X}, event_id={:X}", fd, event_id);
IPC::ResponseBuilder rb{ctx, 3, 1};
rb.Push(RESULT_SUCCESS);
rb.PushCopyObjects(query_event.readable);
rb.Push<u32>(0);
if (event_id < MaxNvEvents) {
rb.PushCopyObjects(nvdrv->GetEvent(event_id));
rb.Push<u32>(NvResult::Success);
} else {
rb.Push<u32>(0);
rb.Push<u32>(NvResult::BadParameter);
}
}
void NVDRV::SetClientPID(Kernel::HLERequestContext& ctx) {
@@ -127,10 +159,6 @@ NVDRV::NVDRV(std::shared_ptr<Module> nvdrv, const char* name)
{13, &NVDRV::FinishInitialize, "FinishInitialize"},
};
RegisterHandlers(functions);
auto& kernel = Core::System::GetInstance().Kernel();
query_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
"NVDRV::query_event");
}
NVDRV::~NVDRV() = default;

View File

@@ -19,6 +19,8 @@ public:
NVDRV(std::shared_ptr<Module> nvdrv, const char* name);
~NVDRV() override;
void SignalGPUInterruptSyncpt(const u32 syncpoint_id, const u32 value);
private:
void Open(Kernel::HLERequestContext& ctx);
void Ioctl(Kernel::HLERequestContext& ctx);
@@ -33,8 +35,6 @@ private:
std::shared_ptr<Module> nvdrv;
u64 pid{};
Kernel::EventPair query_event;
};
} // namespace Service::Nvidia

View File

@@ -0,0 +1,48 @@
#pragma once
#include <array>
#include "common/common_types.h"
namespace Service::Nvidia {
constexpr u32 MaxSyncPoints = 192;
constexpr u32 MaxNvEvents = 64;
struct Fence {
s32 id;
u32 value;
};
static_assert(sizeof(Fence) == 8, "Fence has wrong size");
struct MultiFence {
u32 num_fences;
std::array<Fence, 4> fences;
};
enum NvResult : u32 {
Success = 0,
BadParameter = 4,
Timeout = 5,
ResourceError = 15,
};
enum class EventState {
Free = 0,
Registered = 1,
Waiting = 2,
Busy = 3,
};
struct IoctlCtrl {
// First call done to the servioce for services that call itself again after a call.
bool fresh_call{true};
// Tells the Ioctl Wrapper that it must delay the IPC response and send the thread to sleep
bool must_delay{};
// Timeout for the delay
s64 timeout{};
// NV Event Id
s32 event_id{-1};
};
} // namespace Service::Nvidia

View File

@@ -4,7 +4,10 @@
#include <utility>
#include <fmt/format.h>
#include "core/hle/ipc_helpers.h"
#include "core/hle/kernel/readable_event.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
#include "core/hle/service/nvdrv/devices/nvdisp_disp0.h"
#include "core/hle/service/nvdrv/devices/nvhost_as_gpu.h"
@@ -22,8 +25,9 @@
namespace Service::Nvidia {
void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger) {
auto module_ = std::make_shared<Module>();
void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger,
Core::System& system) {
auto module_ = std::make_shared<Module>(system);
std::make_shared<NVDRV>(module_, "nvdrv")->InstallAsService(service_manager);
std::make_shared<NVDRV>(module_, "nvdrv:a")->InstallAsService(service_manager);
std::make_shared<NVDRV>(module_, "nvdrv:s")->InstallAsService(service_manager);
@@ -32,17 +36,25 @@ void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger
nvflinger.SetNVDrvInstance(module_);
}
Module::Module() {
auto nvmap_dev = std::make_shared<Devices::nvmap>();
devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(nvmap_dev);
devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(nvmap_dev);
devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>();
Module::Module(Core::System& system) {
auto& kernel = system.Kernel();
for (u32 i = 0; i < MaxNvEvents; i++) {
std::string event_label = fmt::format("NVDRV::NvEvent_{}", i);
events_interface.events[i] = Kernel::WritableEvent::CreateEventPair(
kernel, Kernel::ResetType::Automatic, event_label);
events_interface.status[i] = EventState::Free;
events_interface.registered[i] = false;
}
auto nvmap_dev = std::make_shared<Devices::nvmap>(system);
devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(system, nvmap_dev);
devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(system, nvmap_dev);
devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>(system);
devices["/dev/nvmap"] = nvmap_dev;
devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(nvmap_dev);
devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>();
devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>();
devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>();
devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>();
devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system);
devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system);
}
Module::~Module() = default;
@@ -59,12 +71,13 @@ u32 Module::Open(const std::string& device_name) {
return fd;
}
u32 Module::Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output) {
u32 Module::Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl) {
auto itr = open_files.find(fd);
ASSERT_MSG(itr != open_files.end(), "Tried to talk to an invalid device");
auto& device = itr->second;
return device->ioctl({command}, input, output);
return device->ioctl({command}, input, output, ctrl);
}
ResultCode Module::Close(u32 fd) {
@@ -77,4 +90,22 @@ ResultCode Module::Close(u32 fd) {
return RESULT_SUCCESS;
}
void Module::SignalSyncpt(const u32 syncpoint_id, const u32 value) {
for (u32 i = 0; i < MaxNvEvents; i++) {
if (events_interface.assigned_syncpt[i] == syncpoint_id &&
events_interface.assigned_value[i] == value) {
events_interface.LiberateEvent(i);
events_interface.events[i].writable->Signal();
}
}
}
Kernel::SharedPtr<Kernel::ReadableEvent> Module::GetEvent(const u32 event_id) const {
return events_interface.events[event_id].readable;
}
Kernel::SharedPtr<Kernel::WritableEvent> Module::GetEventWriteable(const u32 event_id) const {
return events_interface.events[event_id].writable;
}
} // namespace Service::Nvidia

View File

@@ -8,8 +8,14 @@
#include <unordered_map>
#include <vector>
#include "common/common_types.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/service.h"
namespace Core {
class System;
}
namespace Service::NVFlinger {
class NVFlinger;
}
@@ -20,16 +26,72 @@ namespace Devices {
class nvdevice;
}
struct IoctlFence {
u32 id;
u32 value;
struct EventInterface {
// Mask representing currently busy events
u64 events_mask{};
// Each kernel event associated to an NV event
std::array<Kernel::EventPair, MaxNvEvents> events;
// The status of the current NVEvent
std::array<EventState, MaxNvEvents> status{};
// Tells if an NVEvent is registered or not
std::array<bool, MaxNvEvents> registered{};
// When an NVEvent is waiting on GPU interrupt, this is the sync_point
// associated with it.
std::array<u32, MaxNvEvents> assigned_syncpt{};
// This is the value of the GPU interrupt for which the NVEvent is waiting
// for.
std::array<u32, MaxNvEvents> assigned_value{};
// Constant to denote an unasigned syncpoint.
static constexpr u32 unassigned_syncpt = 0xFFFFFFFF;
std::optional<u32> GetFreeEvent() const {
u64 mask = events_mask;
for (u32 i = 0; i < MaxNvEvents; i++) {
const bool is_free = (mask & 0x1) == 0;
if (is_free) {
if (status[i] == EventState::Registered || status[i] == EventState::Free) {
return {i};
}
}
mask = mask >> 1;
}
return {};
}
void SetEventStatus(const u32 event_id, EventState new_status) {
EventState old_status = status[event_id];
if (old_status == new_status) {
return;
}
status[event_id] = new_status;
if (new_status == EventState::Registered) {
registered[event_id] = true;
}
if (new_status == EventState::Waiting || new_status == EventState::Busy) {
events_mask |= (1ULL << event_id);
}
}
void RegisterEvent(const u32 event_id) {
registered[event_id] = true;
if (status[event_id] == EventState::Free) {
status[event_id] = EventState::Registered;
}
}
void UnregisterEvent(const u32 event_id) {
registered[event_id] = false;
if (status[event_id] == EventState::Registered) {
status[event_id] = EventState::Free;
}
}
void LiberateEvent(const u32 event_id) {
status[event_id] = registered[event_id] ? EventState::Registered : EventState::Free;
events_mask &= ~(1ULL << event_id);
assigned_syncpt[event_id] = unassigned_syncpt;
assigned_value[event_id] = 0;
}
};
static_assert(sizeof(IoctlFence) == 8, "IoctlFence has wrong size");
class Module final {
public:
Module();
Module(Core::System& system);
~Module();
/// Returns a pointer to one of the available devices, identified by its name.
@@ -44,10 +106,17 @@ public:
/// Opens a device node and returns a file descriptor to it.
u32 Open(const std::string& device_name);
/// Sends an ioctl command to the specified file descriptor.
u32 Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output);
u32 Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output,
IoctlCtrl& ctrl);
/// Closes a device file descriptor and returns operation success.
ResultCode Close(u32 fd);
void SignalSyncpt(const u32 syncpoint_id, const u32 value);
Kernel::SharedPtr<Kernel::ReadableEvent> GetEvent(u32 event_id) const;
Kernel::SharedPtr<Kernel::WritableEvent> GetEventWriteable(u32 event_id) const;
private:
/// Id to use for the next open file descriptor.
u32 next_fd = 1;
@@ -57,9 +126,12 @@ private:
/// Mapping of device node names to their implementation.
std::unordered_map<std::string, std::shared_ptr<Devices::nvdevice>> devices;
EventInterface events_interface;
};
/// Registers all NVDRV services with the specified service manager.
void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger);
void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger,
Core::System& system);
} // namespace Service::Nvidia

View File

@@ -34,7 +34,8 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
buffer_wait_event.writable->Signal();
}
std::optional<u32> BufferQueue::DequeueBuffer(u32 width, u32 height) {
std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> BufferQueue::DequeueBuffer(u32 width,
u32 height) {
auto itr = std::find_if(queue.begin(), queue.end(), [&](const Buffer& buffer) {
// Only consider free buffers. Buffers become free once again after they've been Acquired
// and Released by the compositor, see the NVFlinger::Compose method.
@@ -51,7 +52,7 @@ std::optional<u32> BufferQueue::DequeueBuffer(u32 width, u32 height) {
}
itr->status = Buffer::Status::Dequeued;
return itr->slot;
return {{itr->slot, &itr->multi_fence}};
}
const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const {
@@ -63,7 +64,8 @@ const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const {
}
void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform,
const Common::Rectangle<int>& crop_rect) {
const Common::Rectangle<int>& crop_rect, u32 swap_interval,
Service::Nvidia::MultiFence& multi_fence) {
auto itr = std::find_if(queue.begin(), queue.end(),
[&](const Buffer& buffer) { return buffer.slot == slot; });
ASSERT(itr != queue.end());
@@ -71,12 +73,21 @@ void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform,
itr->status = Buffer::Status::Queued;
itr->transform = transform;
itr->crop_rect = crop_rect;
itr->swap_interval = swap_interval;
itr->multi_fence = multi_fence;
queue_sequence.push_back(slot);
}
std::optional<std::reference_wrapper<const BufferQueue::Buffer>> BufferQueue::AcquireBuffer() {
auto itr = std::find_if(queue.begin(), queue.end(), [](const Buffer& buffer) {
return buffer.status == Buffer::Status::Queued;
});
auto itr = queue.end();
// Iterate to find a queued buffer matching the requested slot.
while (itr == queue.end() && !queue_sequence.empty()) {
u32 slot = queue_sequence.front();
itr = std::find_if(queue.begin(), queue.end(), [&slot](const Buffer& buffer) {
return buffer.status == Buffer::Status::Queued && buffer.slot == slot;
});
queue_sequence.pop_front();
}
if (itr == queue.end())
return {};
itr->status = Buffer::Status::Acquired;

View File

@@ -4,6 +4,7 @@
#pragma once
#include <list>
#include <optional>
#include <vector>
@@ -12,6 +13,7 @@
#include "common/swap.h"
#include "core/hle/kernel/object.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/nvdata.h"
namespace Service::NVFlinger {
@@ -68,13 +70,17 @@ public:
IGBPBuffer igbp_buffer;
BufferTransformFlags transform;
Common::Rectangle<int> crop_rect;
u32 swap_interval;
Service::Nvidia::MultiFence multi_fence;
};
void SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer);
std::optional<u32> DequeueBuffer(u32 width, u32 height);
std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> DequeueBuffer(u32 width,
u32 height);
const IGBPBuffer& RequestBuffer(u32 slot) const;
void QueueBuffer(u32 slot, BufferTransformFlags transform,
const Common::Rectangle<int>& crop_rect);
const Common::Rectangle<int>& crop_rect, u32 swap_interval,
Service::Nvidia::MultiFence& multi_fence);
std::optional<std::reference_wrapper<const Buffer>> AcquireBuffer();
void ReleaseBuffer(u32 slot);
u32 Query(QueryType type);
@@ -92,6 +98,7 @@ private:
u64 layer_id;
std::vector<Buffer> queue;
std::list<u32> queue_sequence;
Kernel::EventPair buffer_wait_event;
};

View File

@@ -37,15 +37,14 @@ NVFlinger::NVFlinger(Core::Timing::CoreTiming& core_timing) : core_timing{core_t
displays.emplace_back(4, "Null");
// Schedule the screen composition events
const auto ticks = Settings::values.force_30fps_mode ? frame_ticks_30fps : frame_ticks;
composition_event = core_timing.RegisterEvent("ScreenComposition", [this](u64 userdata,
s64 cycles_late) {
Compose();
const auto ticks = Settings::values.force_30fps_mode ? frame_ticks_30fps : GetNextTicks();
this->core_timing.ScheduleEvent(std::max<s64>(0LL, ticks - cycles_late), composition_event);
});
composition_event = core_timing.RegisterEvent(
"ScreenComposition", [this, ticks](u64 userdata, s64 cycles_late) {
Compose();
this->core_timing.ScheduleEvent(ticks - cycles_late, composition_event);
});
core_timing.ScheduleEvent(ticks, composition_event);
core_timing.ScheduleEvent(frame_ticks, composition_event);
}
NVFlinger::~NVFlinger() {
@@ -206,8 +205,14 @@ void NVFlinger::Compose() {
igbp_buffer.width, igbp_buffer.height, igbp_buffer.stride,
buffer->get().transform, buffer->get().crop_rect);
swap_interval = buffer->get().swap_interval;
buffer_queue.ReleaseBuffer(buffer->get().slot);
}
}
s64 NVFlinger::GetNextTicks() const {
constexpr s64 max_hertz = 120LL;
return (Core::Timing::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz;
}
} // namespace Service::NVFlinger

View File

@@ -74,6 +74,8 @@ public:
/// finished.
void Compose();
s64 GetNextTicks() const;
private:
/// Finds the display identified by the specified ID.
VI::Display* FindDisplay(u64 display_id);
@@ -98,6 +100,8 @@ private:
/// layers.
u32 next_buffer_queue_id = 1;
u32 swap_interval = 1;
/// Event that handles screen composition.
Core::Timing::EventType* composition_event;

View File

@@ -236,7 +236,7 @@ void Init(std::shared_ptr<SM::ServiceManager>& sm, Core::System& system) {
NIM::InstallInterfaces(*sm);
NPNS::InstallInterfaces(*sm);
NS::InstallInterfaces(*sm);
Nvidia::InstallInterfaces(*sm, *nv_flinger);
Nvidia::InstallInterfaces(*sm, *nv_flinger, system);
PCIe::InstallInterfaces(*sm);
PCTL::InstallInterfaces(*sm);
PCV::InstallInterfaces(*sm);

View File

@@ -21,6 +21,7 @@
#include "core/hle/kernel/readable_event.h"
#include "core/hle/kernel/thread.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvdrv/nvdrv.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "core/hle/service/nvflinger/nvflinger.h"
@@ -328,32 +329,22 @@ public:
Data data;
};
struct BufferProducerFence {
u32 is_valid;
std::array<Nvidia::IoctlFence, 4> fences;
};
static_assert(sizeof(BufferProducerFence) == 36, "BufferProducerFence has wrong size");
class IGBPDequeueBufferResponseParcel : public Parcel {
public:
explicit IGBPDequeueBufferResponseParcel(u32 slot) : slot(slot) {}
explicit IGBPDequeueBufferResponseParcel(u32 slot, Service::Nvidia::MultiFence& multi_fence)
: slot(slot), multi_fence(multi_fence) {}
~IGBPDequeueBufferResponseParcel() override = default;
protected:
void SerializeData() override {
// TODO(Subv): Find out how this Fence is used.
BufferProducerFence fence = {};
fence.is_valid = 1;
for (auto& fence_ : fence.fences)
fence_.id = -1;
Write(slot);
Write<u32_le>(1);
WriteObject(fence);
WriteObject(multi_fence);
Write<u32_le>(0);
}
u32_le slot;
Service::Nvidia::MultiFence multi_fence;
};
class IGBPRequestBufferRequestParcel : public Parcel {
@@ -400,12 +391,6 @@ public:
data = Read<Data>();
}
struct Fence {
u32_le id;
u32_le value;
};
static_assert(sizeof(Fence) == 8, "Fence has wrong size");
struct Data {
u32_le slot;
INSERT_PADDING_WORDS(3);
@@ -418,15 +403,15 @@ public:
s32_le scaling_mode;
NVFlinger::BufferQueue::BufferTransformFlags transform;
u32_le sticky_transform;
INSERT_PADDING_WORDS(2);
u32_le fence_is_valid;
std::array<Fence, 2> fences;
INSERT_PADDING_WORDS(1);
u32_le swap_interval;
Service::Nvidia::MultiFence multi_fence;
Common::Rectangle<int> GetCropRect() const {
return {crop_left, crop_top, crop_right, crop_bottom};
}
};
static_assert(sizeof(Data) == 80, "ParcelData has wrong size");
static_assert(sizeof(Data) == 96, "ParcelData has wrong size");
Data data;
};
@@ -547,11 +532,11 @@ private:
IGBPDequeueBufferRequestParcel request{ctx.ReadBuffer()};
const u32 width{request.data.width};
const u32 height{request.data.height};
std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height);
auto result = buffer_queue.DequeueBuffer(width, height);
if (slot) {
if (result) {
// Buffer is available
IGBPDequeueBufferResponseParcel response{*slot};
IGBPDequeueBufferResponseParcel response{result->first, *result->second};
ctx.WriteBuffer(response.Serialize());
} else {
// Wait the current thread until a buffer becomes available
@@ -561,10 +546,10 @@ private:
Kernel::ThreadWakeupReason reason) {
// Repeat TransactParcel DequeueBuffer when a buffer is available
auto& buffer_queue = nv_flinger->FindBufferQueue(id);
std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height);
ASSERT_MSG(slot != std::nullopt, "Could not dequeue buffer.");
auto result = buffer_queue.DequeueBuffer(width, height);
ASSERT_MSG(result != std::nullopt, "Could not dequeue buffer.");
IGBPDequeueBufferResponseParcel response{*slot};
IGBPDequeueBufferResponseParcel response{result->first, *result->second};
ctx.WriteBuffer(response.Serialize());
IPC::ResponseBuilder rb{ctx, 2};
rb.Push(RESULT_SUCCESS);
@@ -582,7 +567,8 @@ private:
IGBPQueueBufferRequestParcel request{ctx.ReadBuffer()};
buffer_queue.QueueBuffer(request.data.slot, request.data.transform,
request.data.GetCropRect());
request.data.GetCropRect(), request.data.swap_interval,
request.data.multi_fence);
IGBPQueueBufferResponseParcel response{1280, 720};
ctx.WriteBuffer(response.Serialize());

View File

@@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() {
MICROPROFILE_SCOPE(DispatchCalls);
// On entering GPU code, assume all memory may be touched by the ARM core.
gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
gpu.Maxwell3D().dirty.OnMemoryWrite();
dma_pushbuffer_subindex = 0;
@@ -31,6 +31,7 @@ void DmaPusher::DispatchCalls() {
break;
}
}
gpu.FlushCommands();
}
bool DmaPusher::Step() {

View File

@@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
const bool is_last_call = method_call.IsLastCall();
upload_state.ProcessData(method_call.argument, is_last_call);
if (is_last_call) {
system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
system.GPU().Maxwell3D().dirty.OnMemoryWrite();
}
break;
}
@@ -50,13 +50,14 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
}
void KeplerCompute::ProcessLaunch() {
const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start;
LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr);
rasterizer.DispatchCompute(code_addr);
}
} // namespace Tegra::Engines

View File

@@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
const bool is_last_call = method_call.IsLastCall();
upload_state.ProcessData(method_call.argument, is_last_call);
if (is_last_call) {
system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
system.GPU().Maxwell3D().dirty.OnMemoryWrite();
}
break;
}

View File

@@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste
MemoryManager& memory_manager)
: system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
InitDirtySettings();
InitializeRegisterDefaults();
}
@@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() {
regs.stencil_back_func_mask = 0xFFFFFFFF;
regs.stencil_back_mask = 0xFFFFFFFF;
regs.depth_test_func = Regs::ComparisonOp::Always;
regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise;
regs.cull.cull_face = Regs::Cull::CullFace::Back;
// TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
// register carrying a default value. Assume it's OpenGL's default (1).
regs.point_size = 1.0f;
@@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() {
regs.rt_separate_frag_data = 1;
}
#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))
void Maxwell3D::InitDirtySettings() {
const auto set_block = [this](const u32 start, const u32 range, const u8 position) {
const auto start_itr = dirty_pointers.begin() + start;
const auto end_itr = start_itr + range;
std::fill(start_itr, end_itr, position);
};
dirty.regs.fill(true);
// Init Render Targets
constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8;
u32 rt_dirty_reg = DIRTY_REGS_POS(render_target);
for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
set_block(rt_reg, registers_per_rt, rt_dirty_reg);
rt_dirty_reg++;
}
constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag;
dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag;
constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta);
set_block(zeta_reg, registers_in_zeta, depth_buffer_flag);
// Init Vertex Arrays
constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
u32 va_reg = DIRTY_REGS_POS(vertex_array);
u32 vi_reg = DIRTY_REGS_POS(vertex_instance);
for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
vertex_reg += vertex_array_size) {
set_block(vertex_reg, 3, va_reg);
// The divisor concerns vertex array instances
dirty_pointers[vertex_reg + 3] = vi_reg;
va_reg++;
vi_reg++;
}
constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
va_reg = DIRTY_REGS_POS(vertex_array);
for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
vertex_reg += vertex_limit_size) {
set_block(vertex_reg, vertex_limit_size, va_reg);
va_reg++;
}
constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
constexpr u32 vertex_instance_size =
sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
constexpr u32 vertex_instance_end =
vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
vi_reg = DIRTY_REGS_POS(vertex_instance);
for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
vertex_reg += vertex_instance_size) {
set_block(vertex_reg, vertex_instance_size, vi_reg);
vi_reg++;
}
set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
DIRTY_REGS_POS(vertex_attrib_format));
// Init Shaders
constexpr u32 shader_registers_count =
sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count,
DIRTY_REGS_POS(shaders));
// State
// Viewport
constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
set_block(viewport_start, viewport_size, viewport_dirty_reg);
constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control);
constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32);
set_block(view_volume_start, view_volume_size, viewport_dirty_reg);
// Viewport transformation
constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform);
constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32);
set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform));
// Cullmode
constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull);
constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32);
set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode));
// Screen y control
dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control);
// Primitive Restart
constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart);
constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32);
set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));
// Depth Test
constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;
// Stencil Test
constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test);
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;
// Color Mask
constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
color_mask_dirty_reg);
// Blend State
constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
blend_state_dirty_reg);
dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg);
set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32),
blend_state_dirty_reg);
// Scissor State
constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
scissor_test_dirty_reg);
// Polygon Offset
constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg;
dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;
}
void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
// Reset the current macro.
executing_macro = 0;
@@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
const u32 method = method_call.method;
if (method == cb_data_state.current) {
regs.reg_array[method] = method_call.argument;
ProcessCBData(method_call.argument);
return;
} else if (cb_data_state.current != null_cb_data) {
FinishCBData();
}
// It is an error to write to a register other than the current macro's ARG register before it
// has finished execution.
if (executing_macro != 0) {
@@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
if (regs.reg_array[method] != method_call.argument) {
regs.reg_array[method] = method_call.argument;
// Color buffers
constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
if (method >= first_rt_reg &&
method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
dirty_flags.color_buffer.set(rt_index);
}
// Zeta buffer
constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
method == MAXWELL3D_REG_INDEX(zeta_width) ||
method == MAXWELL3D_REG_INDEX(zeta_height) ||
(method >= MAXWELL3D_REG_INDEX(zeta) &&
method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
dirty_flags.zeta_buffer = true;
}
// Shader
constexpr u32 shader_registers_count =
sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
dirty_flags.shaders = true;
}
// Vertex format
if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
dirty_flags.vertex_attrib_format = true;
}
// Vertex buffer
if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
} else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
} else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
const std::size_t dirty_reg = dirty_pointers[method];
if (dirty_reg) {
dirty.regs[dirty_reg] = true;
if (dirty_reg >= DIRTY_REGS_POS(vertex_array) &&
dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) {
dirty.vertex_array_buffers = true;
} else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) &&
dirty_reg < DIRTY_REGS_POS(vertex_instances)) {
dirty.vertex_instances = true;
} else if (dirty_reg >= DIRTY_REGS_POS(render_target) &&
dirty_reg < DIRTY_REGS_POS(render_settings)) {
dirty.render_settings = true;
}
}
}
@@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
ProcessCBData(method_call.argument);
StartCBData(method);
break;
}
case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): {
@@ -249,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
ProcessQueryGet();
break;
}
case MAXWELL3D_REG_INDEX(condition.mode): {
ProcessQueryCondition();
break;
}
case MAXWELL3D_REG_INDEX(sync_info): {
ProcessSyncPoint();
break;
@@ -261,7 +401,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
const bool is_last_call = method_call.IsLastCall();
upload_state.ProcessData(method_call.argument, is_last_call);
if (is_last_call) {
dirty_flags.OnMemoryWrite();
dirty.OnMemoryWrite();
}
break;
}
@@ -302,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() {
result = regs.query.query_sequence;
break;
default:
result = 1;
UNIMPLEMENTED_MSG("Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
}
@@ -333,7 +474,6 @@ void Maxwell3D::ProcessQueryGet() {
query_result.timestamp = system.CoreTiming().GetTicks();
memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
}
dirty_flags.OnMemoryWrite();
break;
}
default:
@@ -342,12 +482,52 @@ void Maxwell3D::ProcessQueryGet() {
}
}
void Maxwell3D::ProcessQueryCondition() {
const GPUVAddr condition_address{regs.condition.Address()};
switch (regs.condition.mode) {
case Regs::ConditionMode::Always: {
execute_on = true;
break;
}
case Regs::ConditionMode::Never: {
execute_on = false;
break;
}
case Regs::ConditionMode::ResNonZero: {
Regs::QueryCompare cmp;
memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
break;
}
case Regs::ConditionMode::Equal: {
Regs::QueryCompare cmp;
memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
execute_on =
cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
break;
}
case Regs::ConditionMode::NotEqual: {
Regs::QueryCompare cmp;
memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
execute_on =
cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
break;
}
default: {
UNIMPLEMENTED_MSG("Uninplemented Condition Mode!");
execute_on = true;
break;
}
}
}
void Maxwell3D::ProcessSyncPoint() {
const u32 sync_point = regs.sync_info.sync_point.Value();
const u32 increment = regs.sync_info.increment.Value();
const u32 cache_flush = regs.sync_info.unknown.Value();
LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
cache_flush);
if (increment) {
system.GPU().IncrementSyncPoint(sync_point);
}
}
void Maxwell3D::DrawArrays() {
@@ -405,23 +585,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
}
void Maxwell3D::ProcessCBData(u32 value) {
const u32 id = cb_data_state.id;
cb_data_state.buffer[id][cb_data_state.counter] = value;
// Increment the current buffer position.
regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
cb_data_state.counter++;
}
void Maxwell3D::StartCBData(u32 method) {
constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
cb_data_state.start_pos = regs.const_buffer.cb_pos;
cb_data_state.id = method - first_cb_data;
cb_data_state.current = method;
cb_data_state.counter = 0;
ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
}
void Maxwell3D::FinishCBData() {
// Write the input value to the current const buffer at the current position.
const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
ASSERT(buffer_address != 0);
// Don't allow writing past the end of the buffer.
ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size);
ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);
const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos};
const GPUVAddr address{buffer_address + cb_data_state.start_pos};
const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos;
u8* ptr{memory_manager.GetPointer(address)};
rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
memory_manager.Write<u32>(address, value);
const u32 id = cb_data_state.id;
memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
dirty.OnMemoryWrite();
dirty_flags.OnMemoryWrite();
// Increment the current buffer position.
regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
cb_data_state.id = null_cb_data;
cb_data_state.current = null_cb_data;
}
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {

View File

@@ -90,6 +90,20 @@ public:
enum class QuerySelect : u32 {
Zero = 0,
TimeElapsed = 2,
TransformFeedbackPrimitivesGenerated = 11,
PrimitivesGenerated = 18,
SamplesPassed = 21,
TransformFeedbackUnknown = 26,
};
struct QueryCompare {
u32 initial_sequence;
u32 initial_mode;
u32 unknown1;
u32 unknown2;
u32 current_sequence;
u32 current_mode;
};
enum class QuerySyncCondition : u32 {
@@ -97,6 +111,14 @@ public:
GreaterThan = 1,
};
enum class ConditionMode : u32 {
Never = 0,
Always = 1,
ResNonZero = 2,
Equal = 3,
NotEqual = 4,
};
enum class ShaderProgram : u32 {
VertexA = 0,
VertexB = 1,
@@ -815,7 +837,18 @@ public:
BitField<4, 1, u32> alpha_to_one;
} multisample_control;
INSERT_PADDING_WORDS(0x7);
INSERT_PADDING_WORDS(0x4);
struct {
u32 address_high;
u32 address_low;
ConditionMode mode;
GPUVAddr Address() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low);
}
} condition;
struct {
u32 tsc_address_high;
@@ -1124,23 +1157,77 @@ public:
State state{};
struct DirtyFlags {
std::bitset<8> color_buffer{0xFF};
std::bitset<32> vertex_array{0xFFFFFFFF};
struct DirtyRegs {
static constexpr std::size_t NUM_REGS = 256;
union {
struct {
bool null_dirty;
bool vertex_attrib_format = true;
bool zeta_buffer = true;
bool shaders = true;
// Vertex Attributes
bool vertex_attrib_format;
// Vertex Arrays
std::array<bool, 32> vertex_array;
bool vertex_array_buffers;
// Vertex Instances
std::array<bool, 32> vertex_instance;
bool vertex_instances;
// Render Targets
std::array<bool, 8> render_target;
bool depth_buffer;
bool render_settings;
// Shaders
bool shaders;
// Rasterizer State
bool viewport;
bool clip_coefficient;
bool cull_mode;
bool primitive_restart;
bool depth_test;
bool stencil_test;
bool blend_state;
bool scissor_test;
bool transform_feedback;
bool color_mask;
bool polygon_offset;
// Complementary
bool viewport_transform;
bool screen_y_control;
bool memory_general;
};
std::array<bool, NUM_REGS> regs;
};
void ResetVertexArrays() {
vertex_array.fill(true);
vertex_array_buffers = true;
}
void ResetRenderTargets() {
depth_buffer = true;
render_target.fill(true);
render_settings = true;
}
void OnMemoryWrite() {
zeta_buffer = true;
shaders = true;
color_buffer.set();
vertex_array.set();
memory_general = true;
ResetRenderTargets();
ResetVertexArrays();
}
};
DirtyFlags dirty_flags;
} dirty{};
std::array<u8, Regs::NUM_REGS> dirty_pointers{};
/// Reads a register value located at the input method address
u32 GetRegisterValue(u32 method) const;
@@ -1169,6 +1256,10 @@ public:
return macro_memory;
}
bool ShouldExecute() const {
return execute_on;
}
private:
void InitializeRegisterDefaults();
@@ -1192,14 +1283,27 @@ private:
/// Interpreter for the macro codes uploaded to the GPU.
MacroInterpreter macro_interpreter;
static constexpr u32 null_cb_data = 0xFFFFFFFF;
struct {
std::array<std::array<u32, 0x4000>, 16> buffer;
u32 current{null_cb_data};
u32 id{null_cb_data};
u32 start_pos{};
u32 counter{};
} cb_data_state;
Upload::State upload_state;
bool execute_on{true};
/// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const;
/// Retrieves information about a specific TSC entry from the TSC buffer.
Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
void InitDirtySettings();
/**
* Call a macro on this engine.
* @param method Method to call
@@ -1219,11 +1323,16 @@ private:
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
// Handles Conditional Rendering
void ProcessQueryCondition();
/// Handles writes to syncing register.
void ProcessSyncPoint();
/// Handles a write to the CB_DATA[i] register.
void StartCBData(u32 method);
void ProcessCBData(u32 value);
void FinishCBData();
/// Handles a write to the CB_BIND register.
void ProcessCBBind(Regs::ShaderStage stage);
@@ -1290,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
ASSERT_REG_POSITION(point_size, 0x546);
ASSERT_REG_POSITION(zeta_enable, 0x54E);
ASSERT_REG_POSITION(multisample_control, 0x54F);
ASSERT_REG_POSITION(condition, 0x554);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
ASSERT_REG_POSITION(tic, 0x55D);

View File

@@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
}
void MaxwellDMA::HandleCopy() {
LOG_WARNING(HW_GPU, "Requested a DMA copy");
LOG_TRACE(HW_GPU, "Requested a DMA copy");
const GPUVAddr source = regs.src_address.Address();
const GPUVAddr dest = regs.dst_address.Address();
@@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() {
}
// All copies here update the main memory, so mark all rasterizer states as invalid.
system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
system.GPU().Maxwell3D().dirty.OnMemoryWrite();
if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
// When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D

View File

@@ -931,8 +931,6 @@ union Instruction {
} csetp;
union {
BitField<35, 4, PredCondition> cond;
BitField<49, 1, u64> h_and;
BitField<6, 1, u64> ftz;
BitField<45, 2, PredOperation> op;
BitField<3, 3, u64> pred3;
@@ -940,9 +938,21 @@ union Instruction {
BitField<43, 1, u64> negate_a;
BitField<44, 1, u64> abs_a;
BitField<47, 2, HalfType> type_a;
BitField<31, 1, u64> negate_b;
BitField<30, 1, u64> abs_b;
BitField<28, 2, HalfType> type_b;
union {
BitField<35, 4, PredCondition> cond;
BitField<49, 1, u64> h_and;
BitField<31, 1, u64> negate_b;
BitField<30, 1, u64> abs_b;
BitField<28, 2, HalfType> type_b;
} reg;
union {
BitField<56, 1, u64> negate_b;
BitField<54, 1, u64> abs_b;
} cbuf;
union {
BitField<49, 4, PredCondition> cond;
BitField<53, 1, u64> h_and;
} cbuf_and_imm;
BitField<42, 1, u64> neg_pred;
BitField<39, 3, u64> pred39;
} hsetp2;
@@ -1548,7 +1558,9 @@ public:
HFMA2_RC,
HFMA2_RR,
HFMA2_IMM_R,
HSETP2_C,
HSETP2_R,
HSETP2_IMM,
HSET2_R,
POPC_C,
POPC_R,
@@ -1831,7 +1843,9 @@ private:
INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),
INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),
INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"),
INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"),
INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),

View File

@@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
UNREACHABLE();
}
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
: system{system}, renderer{renderer}, is_async{is_async} {
auto& rasterizer{renderer.Rasterizer()};
memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
@@ -50,6 +51,14 @@ const Engines::Maxwell3D& GPU::Maxwell3D() const {
return *maxwell_3d;
}
Engines::KeplerCompute& GPU::KeplerCompute() {
return *kepler_compute;
}
const Engines::KeplerCompute& GPU::KeplerCompute() const {
return *kepler_compute;
}
MemoryManager& GPU::MemoryManager() {
return *memory_manager;
}
@@ -66,6 +75,55 @@ const DmaPusher& GPU::DmaPusher() const {
return *dma_pusher;
}
void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
syncpoints[syncpoint_id]++;
std::lock_guard lock{sync_mutex};
if (!syncpt_interrupts[syncpoint_id].empty()) {
u32 value = syncpoints[syncpoint_id].load();
auto it = syncpt_interrupts[syncpoint_id].begin();
while (it != syncpt_interrupts[syncpoint_id].end()) {
if (value >= *it) {
TriggerCpuInterrupt(syncpoint_id, *it);
it = syncpt_interrupts[syncpoint_id].erase(it);
continue;
}
it++;
}
}
}
u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
return syncpoints[syncpoint_id].load();
}
void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
auto& interrupt = syncpt_interrupts[syncpoint_id];
bool contains = std::any_of(interrupt.begin(), interrupt.end(),
[value](u32 in_value) { return in_value == value; });
if (contains) {
return;
}
syncpt_interrupts[syncpoint_id].emplace_back(value);
}
bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
std::lock_guard lock{sync_mutex};
auto& interrupt = syncpt_interrupts[syncpoint_id];
const auto iter =
std::find_if(interrupt.begin(), interrupt.end(),
[value](u32 interrupt_value) { return value == interrupt_value; });
if (iter == interrupt.end()) {
return false;
}
interrupt.erase(iter);
return true;
}
void GPU::FlushCommands() {
renderer.Rasterizer().FlushCommands();
}
u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
ASSERT(format != RenderTargetFormat::NONE);
@@ -143,12 +201,12 @@ enum class BufferMethods {
NotifyIntr = 0x8,
WrcacheFlush = 0x9,
Unk28 = 0xA,
Unk2c = 0xB,
UnkCacheFlush = 0xB,
RefCnt = 0x14,
SemaphoreAcquire = 0x1A,
SemaphoreRelease = 0x1B,
Unk70 = 0x1C,
Unk74 = 0x1D,
FenceValue = 0x1C,
FenceAction = 0x1D,
Unk78 = 0x1E,
Unk7c = 0x1F,
Yield = 0x20,
@@ -194,6 +252,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
case BufferMethods::SemaphoreAddressLow:
case BufferMethods::SemaphoreSequence:
case BufferMethods::RefCnt:
case BufferMethods::UnkCacheFlush:
case BufferMethods::WrcacheFlush:
case BufferMethods::FenceValue:
case BufferMethods::FenceAction:
break;
case BufferMethods::SemaphoreTrigger: {
ProcessSemaphoreTriggerMethod();
@@ -204,21 +266,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
break;
}
case BufferMethods::WrcacheFlush: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented");
break;
}
case BufferMethods::Unk28: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
break;
}
case BufferMethods::Unk2c: {
// TODO(Kmather73): Research and implement this method.
LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented");
break;
}
case BufferMethods::SemaphoreAcquire: {
ProcessSemaphoreAcquire();
break;

View File

@@ -5,8 +5,12 @@
#pragma once
#include <array>
#include <atomic>
#include <list>
#include <memory>
#include <mutex>
#include "common/common_types.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/dma_pusher.h"
@@ -127,7 +131,7 @@ class MemoryManager;
class GPU {
public:
explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);
virtual ~GPU();
@@ -149,12 +153,20 @@ public:
/// Calls a GPU method.
void CallMethod(const MethodCall& method_call);
void FlushCommands();
/// Returns a reference to the Maxwell3D GPU engine.
Engines::Maxwell3D& Maxwell3D();
/// Returns a const reference to the Maxwell3D GPU engine.
const Engines::Maxwell3D& Maxwell3D() const;
/// Returns a reference to the KeplerCompute GPU engine.
Engines::KeplerCompute& KeplerCompute();
/// Returns a reference to the KeplerCompute GPU engine.
const Engines::KeplerCompute& KeplerCompute() const;
/// Returns a reference to the GPU memory manager.
Tegra::MemoryManager& MemoryManager();
@@ -164,6 +176,22 @@ public:
/// Returns a reference to the GPU DMA pusher.
Tegra::DmaPusher& DmaPusher();
void IncrementSyncPoint(u32 syncpoint_id);
u32 GetSyncpointValue(u32 syncpoint_id) const;
void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
std::unique_lock<std::mutex> LockSync() {
return std::unique_lock{sync_mutex};
}
bool IsAsync() const {
return is_async;
}
/// Returns a const reference to the GPU DMA pusher.
const Tegra::DmaPusher& DmaPusher() const;
@@ -194,7 +222,12 @@ public:
u32 semaphore_acquire;
u32 semaphore_release;
INSERT_PADDING_WORDS(0xE4);
u32 fence_value;
union {
BitField<4, 4, u32> operation;
BitField<8, 8, u32> id;
} fence_action;
INSERT_PADDING_WORDS(0xE2);
// Puller state
u32 acquire_mode;
@@ -228,6 +261,9 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
protected:
virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
private:
void ProcessBindMethod(const MethodCall& method_call);
void ProcessSemaphoreTriggerMethod();
@@ -246,6 +282,7 @@ private:
protected:
std::unique_ptr<Tegra::DmaPusher> dma_pusher;
VideoCore::RendererBase& renderer;
Core::System& system;
private:
std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -262,6 +299,14 @@ private:
std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
/// Inline memory engine
std::unique_ptr<Engines::KeplerMemory> kepler_memory;
std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
std::mutex sync_mutex;
const bool is_async;
};
#define ASSERT_REG_POSITION(field_name, position) \
@@ -274,6 +319,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7);
ASSERT_REG_POSITION(reference_count, 0x14);
ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
ASSERT_REG_POSITION(semaphore_release, 0x1B);
ASSERT_REG_POSITION(fence_value, 0x1C);
ASSERT_REG_POSITION(fence_action, 0x1D);
ASSERT_REG_POSITION(acquire_mode, 0x100);
ASSERT_REG_POSITION(acquire_source, 0x101);

View File

@@ -2,6 +2,8 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "core/core.h"
#include "core/hardware_interrupt_manager.h"
#include "video_core/gpu_asynch.h"
#include "video_core/gpu_thread.h"
#include "video_core/renderer_base.h"
@@ -9,7 +11,7 @@
namespace VideoCommon {
GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
: GPU(system, renderer), gpu_thread{system} {}
: GPU(system, renderer, true), gpu_thread{system} {}
GPUAsynch::~GPUAsynch() = default;
@@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
gpu_thread.FlushAndInvalidateRegion(addr, size);
}
void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
auto& interrupt_manager = system.InterruptManager();
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
}
} // namespace VideoCommon

View File

@@ -27,6 +27,9 @@ public:
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
protected:
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
private:
GPUThread::ThreadManager gpu_thread;
};

View File

@@ -8,7 +8,7 @@
namespace VideoCommon {
GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
: GPU(system, renderer) {}
: GPU(system, renderer, false) {}
GPUSynch::~GPUSynch() = default;

View File

@@ -25,6 +25,10 @@ public:
void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
protected:
void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
[[maybe_unused]] u32 value) const override {}
};
} // namespace VideoCommon

View File

@@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
MicroProfileOnThreadCreate("GpuThread");
// Wait for first GPU command before acquiring the window context
state.WaitForCommands();
while (state.queue.Empty())
;
// If emulation was stopped during disk shader loading, abort before trying to acquire context
if (!state.is_running) {
@@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
CommandDataContainer next;
while (state.is_running) {
state.WaitForCommands();
while (!state.queue.Empty()) {
state.queue.Pop(next);
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
@@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
} else {
UNREACHABLE();
}
state.signaled_fence = next.fence;
state.TrySynchronize();
state.signaled_fence.store(next.fence);
}
}
}
@@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
}
void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
if (state.queue.Empty()) {
// It's quicker to invalidate a single region on the CPU if the queue is already empty
system.Renderer().Rasterizer().InvalidateRegion(addr, size);
} else {
PushCommand(InvalidateRegionCommand(addr, size));
}
system.Renderer().Rasterizer().InvalidateRegion(addr, size);
}
void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
u64 ThreadManager::PushCommand(CommandData&& command_data) {
const u64 fence{++state.last_fence};
state.queue.Push(CommandDataContainer(std::move(command_data), fence));
state.SignalCommands();
return fence;
}
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
void SynchState::WaitForSynchronization(u64 fence) {
if (signaled_fence >= fence) {
return;
}
// Wait for the GPU to be idle (all commands to be executed)
{
MICROPROFILE_SCOPE(GPU_wait);
std::unique_lock lock{synchronization_mutex};
synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
}
while (signaled_fence.load() < fence)
;
}
} // namespace VideoCommon::GPUThread

View File

@@ -88,41 +88,9 @@ struct CommandDataContainer {
/// Struct used to synchronize the GPU thread
struct SynchState final {
std::atomic_bool is_running{true};
std::atomic_int queued_frame_count{};
std::mutex synchronization_mutex;
std::mutex commands_mutex;
std::condition_variable commands_condition;
std::condition_variable synchronization_condition;
/// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
/// synchronized. This is entirely empirical.
bool IsSynchronized() const {
constexpr std::size_t max_queue_gap{5};
return queue.Size() <= max_queue_gap;
}
void TrySynchronize() {
if (IsSynchronized()) {
std::lock_guard lock{synchronization_mutex};
synchronization_condition.notify_one();
}
}
void WaitForSynchronization(u64 fence);
void SignalCommands() {
if (queue.Empty()) {
return;
}
commands_condition.notify_one();
}
void WaitForCommands() {
std::unique_lock lock{commands_mutex};
commands_condition.wait(lock, [this] { return !queue.Empty(); });
}
using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
CommandQueue queue;
u64 last_fence{};

View File

@@ -34,6 +34,9 @@ public:
/// Clear the current framebuffer
virtual void Clear() = 0;
/// Dispatches a compute shader invocation
virtual void DispatchCompute(GPUVAddr code_addr) = 0;
/// Notify rasterizer that all caches should be flushed to Switch memory
virtual void FlushAll() = 0;
@@ -47,6 +50,9 @@ public:
/// and invalidated
virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
// Notify the rasterizer to send all written commands to the host GPU.
virtual void FlushCommands() = 0;
/// Notify rasterizer that a frame is about to finish
virtual void TickFrame() = 0;

View File

@@ -4,6 +4,7 @@
#include <algorithm>
#include <array>
#include <bitset>
#include <memory>
#include <string>
#include <string_view>
@@ -19,6 +20,7 @@
#include "core/core.h"
#include "core/hle/kernel/process.h"
#include "core/settings.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -105,6 +107,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
state.draw.shader_program = 0;
state.Apply();
clear_framebuffer.Create();
LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
CheckExtensions();
@@ -124,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
if (!gpu.dirty_flags.vertex_attrib_format) {
if (!gpu.dirty.vertex_attrib_format) {
return state.draw.vertex_array;
}
gpu.dirty_flags.vertex_attrib_format = false;
gpu.dirty.vertex_attrib_format = false;
MICROPROFILE_SCOPE(OpenGL_VAO);
@@ -181,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
}
// Rebinding the VAO invalidates the vertex buffer bindings.
gpu.dirty_flags.vertex_array.set();
gpu.dirty.ResetVertexArrays();
state.draw.vertex_array = vao_entry.handle;
return vao_entry.handle;
@@ -189,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
if (gpu.dirty_flags.vertex_array.none())
if (!gpu.dirty.vertex_array_buffers)
return;
gpu.dirty.vertex_array_buffers = false;
const auto& regs = gpu.regs;
MICROPROFILE_SCOPE(OpenGL_VB);
// Upload all guest vertex arrays sequentially to our buffer
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
if (!gpu.dirty_flags.vertex_array[index])
if (!gpu.dirty.vertex_array[index])
continue;
gpu.dirty.vertex_array[index] = false;
gpu.dirty.vertex_instance[index] = false;
const auto& vertex_array = regs.vertex_array[index];
if (!vertex_array.IsEnabled())
@@ -224,8 +230,32 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
glVertexArrayBindingDivisor(vao, index, 0);
}
}
}
gpu.dirty_flags.vertex_array.reset();
void RasterizerOpenGL::SetupVertexInstances(GLuint vao) {
auto& gpu = system.GPU().Maxwell3D();
if (!gpu.dirty.vertex_instances)
return;
gpu.dirty.vertex_instances = false;
const auto& regs = gpu.regs;
// Upload all guest vertex arrays sequentially to our buffer
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
if (!gpu.dirty.vertex_instance[index])
continue;
gpu.dirty.vertex_instance[index] = false;
if (regs.instanced_arrays.IsInstancingEnabled(index) &&
regs.vertex_array[index].divisor != 0) {
// Enable vertex buffer instancing with the specified divisor.
glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor);
} else {
// Disable the vertex buffer instancing.
glVertexArrayBindingDivisor(vao, index, 0);
}
}
}
GLintptr RasterizerOpenGL::SetupIndexBuffer() {
@@ -298,9 +328,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
Shader shader{shader_cache.GetStageProgram(program)};
const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)};
const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
SetupDrawConstBuffers(stage_enum, shader);
SetupGlobalRegions(stage_enum, shader);
SetupDrawGlobalMemory(stage_enum, shader);
const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)};
const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage};
@@ -341,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
SyncClipEnabled(clip_distances);
gpu.dirty_flags.shaders = false;
gpu.dirty.shaders = false;
}
std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -424,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
single_color_target};
if (fb_config_state == current_framebuffer_config_state &&
gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) {
if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) {
// Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
// single color targets). This is done because the guest registers may not change but the
// host framebuffer may contain different attachments
return current_depth_stencil_usage;
}
gpu.dirty.render_settings = false;
current_framebuffer_config_state = fb_config_state;
texture_cache.GuardRenderTargets(true);
@@ -519,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable};
}
void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
bool using_depth_fb, bool using_stencil_fb) {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
texture_cache.GuardRenderTargets(true);
View color_surface{};
if (using_color_fb) {
color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false);
}
View depth_surface{};
if (using_depth_fb || using_stencil_fb) {
depth_surface = texture_cache.GetDepthBufferSurface(false);
}
texture_cache.GuardRenderTargets(false);
current_state.draw.draw_framebuffer = clear_framebuffer.handle;
current_state.ApplyFramebufferState();
if (color_surface) {
color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER);
} else {
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
}
if (depth_surface) {
const auto& params = depth_surface->GetSurfaceParams();
switch (params.type) {
case VideoCore::Surface::SurfaceType::Depth: {
depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
break;
}
case VideoCore::Surface::SurfaceType::DepthStencil: {
depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
break;
}
default: { UNIMPLEMENTED(); }
}
} else {
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
0);
}
}
void RasterizerOpenGL::Clear() {
const auto& regs = system.GPU().Maxwell3D().regs;
const auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.ShouldExecute()) {
return;
}
const auto& regs = maxwell3d.regs;
bool use_color{};
bool use_depth{};
bool use_stencil{};
OpenGLState clear_state;
OpenGLState prev_state{OpenGLState::GetCurState()};
SCOPE_EXIT({
prev_state.AllDirty();
prev_state.Apply();
});
OpenGLState clear_state{OpenGLState::GetCurState()};
clear_state.SetDefaultViewports();
if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
regs.clear_buffers.A) {
use_color = true;
@@ -545,6 +633,7 @@ void RasterizerOpenGL::Clear() {
// true.
clear_state.depth.test_enabled = true;
clear_state.depth.test_func = GL_ALWAYS;
clear_state.depth.write_mask = GL_TRUE;
}
if (regs.clear_buffers.S) {
ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!");
@@ -581,8 +670,9 @@ void RasterizerOpenGL::Clear() {
return;
}
const auto [clear_depth, clear_stencil] = ConfigureFramebuffers(
clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value());
ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil);
SyncViewport(clear_state);
if (regs.clear_flags.scissor) {
SyncScissorTest(clear_state);
}
@@ -591,21 +681,18 @@ void RasterizerOpenGL::Clear() {
clear_state.EmulateViewportWithScissor();
}
clear_state.ApplyColorMask();
clear_state.ApplyDepth();
clear_state.ApplyStencilTest();
clear_state.ApplyViewport();
clear_state.ApplyFramebufferState();
clear_state.AllDirty();
clear_state.Apply();
if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
glClearBufferfv(GL_COLOR, 0, regs.clear_color);
}
if (clear_depth && clear_stencil) {
if (use_depth && use_stencil) {
glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
} else if (clear_depth) {
} else if (use_depth) {
glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth);
} else if (clear_stencil) {
} else if (use_stencil) {
glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
}
}
@@ -616,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() {
MICROPROFILE_SCOPE(OpenGL_Drawing);
auto& gpu = system.GPU().Maxwell3D();
if (!gpu.ShouldExecute()) {
return;
}
const auto& regs = gpu.regs;
SyncColorMask();
@@ -661,6 +753,7 @@ void RasterizerOpenGL::DrawArrays() {
// Upload vertex and index data.
SetupVertexBuffer(vao);
SetupVertexInstances(vao);
const GLintptr index_buffer_offset = SetupIndexBuffer();
// Setup draw parameters. It will automatically choose what glDraw* method to use.
@@ -687,7 +780,7 @@ void RasterizerOpenGL::DrawArrays() {
if (invalidate) {
// As all cached buffers are invalidated, we need to recheck their state.
gpu.dirty_flags.vertex_array.set();
gpu.dirty.ResetVertexArrays();
}
shader_program_manager->ApplyTo(state);
@@ -700,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() {
params.DispatchDraw();
accelerate_draw = AccelDraw::Disabled;
gpu.dirty.memory_general = false;
}
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
if (!GLAD_GL_ARB_compute_variable_group_size) {
LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the "
"lack of GL_ARB_compute_variable_group_size");
return;
}
auto kernel = shader_cache.GetComputeKernel(code_addr);
const auto [program, next_bindings] = kernel->GetProgramHandle({});
state.draw.shader_program = program;
state.draw.program_pipeline = 0;
const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
buffer_cache.Map(buffer_size);
bind_ubo_pushbuffer.Setup(0);
bind_ssbo_pushbuffer.Setup(0);
SetupComputeConstBuffers(kernel);
SetupComputeGlobalMemory(kernel);
// TODO(Rodrigo): Bind images and samplers
buffer_cache.Unmap();
bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();
state.ApplyShaderProgram();
state.ApplyProgramPipeline();
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y,
launch_desc.grid_dim_z, launch_desc.block_dim_x,
launch_desc.block_dim_y, launch_desc.block_dim_z);
}
void RasterizerOpenGL::FlushAll() {}
@@ -730,6 +863,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
InvalidateRegion(addr, size);
}
void RasterizerOpenGL::FlushCommands() {
glFlush();
}
void RasterizerOpenGL::TickFrame() {
buffer_cache.TickFrame();
}
@@ -775,12 +912,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto stage_index = static_cast<std::size_t>(stage);
const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
// Upload only the enabled buffers from the 16 constbuffers of each shader stage
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
const auto& shader_stage = stages[static_cast<std::size_t>(stage)];
for (const auto& entry : shader->GetShaderEntries().const_buffers) {
SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
SetupConstBuffer(buffer, entry);
}
}
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.memory_config.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
SetupConstBuffer(buffer, entry);
}
}
@@ -801,24 +951,39 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
bind_ubo_pushbuffer.Push(cbuf, offset, size);
}
void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
const auto alignment{device.GetShaderStorageBufferAlignment()};
for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
const auto actual_addr{memory_manager.Read<u64>(addr)};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
const auto [ssbo, buffer_offset] =
buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten());
bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
SetupGlobalMemory(entry, gpu_addr, size);
}
}
void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(entry, gpu_addr, size);
}
}
void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size) {
const auto alignment{device.GetShaderStorageBufferAlignment()};
const auto [ssbo, buffer_offset] =
buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten());
bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
}
TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
BaseBindings base_bindings) {
MICROPROFILE_SCOPE(OpenGL_Texture);
@@ -907,10 +1072,11 @@ void RasterizerOpenGL::SyncClipCoef() {
}
void RasterizerOpenGL::SyncCullMode() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
const auto& regs = maxwell3d.regs;
state.cull.enabled = regs.cull.enabled != 0;
if (state.cull.enabled) {
state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face);
@@ -943,16 +1109,21 @@ void RasterizerOpenGL::SyncDepthTestState() {
state.depth.test_enabled = regs.depth_test_enable != 0;
state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE;
if (!state.depth.test_enabled)
if (!state.depth.test_enabled) {
return;
}
state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func);
}
void RasterizerOpenGL::SyncStencilTestState() {
const auto& regs = system.GPU().Maxwell3D().regs;
state.stencil.test_enabled = regs.stencil_enable != 0;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.stencil_test) {
return;
}
const auto& regs = maxwell3d.regs;
state.stencil.test_enabled = regs.stencil_enable != 0;
if (!regs.stencil_enable) {
return;
}
@@ -981,10 +1152,17 @@ void RasterizerOpenGL::SyncStencilTestState() {
state.stencil.back.action_depth_fail = GL_KEEP;
state.stencil.back.action_depth_pass = GL_KEEP;
}
state.MarkDirtyStencilState();
maxwell3d.dirty.stencil_test = false;
}
void RasterizerOpenGL::SyncColorMask() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.color_mask) {
return;
}
const auto& regs = maxwell3d.regs;
const std::size_t count =
regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1;
for (std::size_t i = 0; i < count; i++) {
@@ -995,6 +1173,9 @@ void RasterizerOpenGL::SyncColorMask() {
dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE;
dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE;
}
state.MarkDirtyColorMask();
maxwell3d.dirty.color_mask = false;
}
void RasterizerOpenGL::SyncMultiSampleState() {
@@ -1009,7 +1190,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() {
}
void RasterizerOpenGL::SyncBlendState() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.blend_state) {
return;
}
const auto& regs = maxwell3d.regs;
state.blend_color.red = regs.blend_color.r;
state.blend_color.green = regs.blend_color.g;
@@ -1032,6 +1217,8 @@ void RasterizerOpenGL::SyncBlendState() {
for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
state.blend[i].enabled = false;
}
maxwell3d.dirty.blend_state = false;
state.MarkDirtyBlendState();
return;
}
@@ -1048,6 +1235,9 @@ void RasterizerOpenGL::SyncBlendState() {
blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);
blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);
}
state.MarkDirtyBlendState();
maxwell3d.dirty.blend_state = false;
}
void RasterizerOpenGL::SyncLogicOpState() {
@@ -1099,13 +1289,21 @@ void RasterizerOpenGL::SyncPointState() {
}
void RasterizerOpenGL::SyncPolygonOffset() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.polygon_offset) {
return;
}
const auto& regs = maxwell3d.regs;
state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0;
state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0;
state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0;
state.polygon_offset.units = regs.polygon_offset_units;
state.polygon_offset.factor = regs.polygon_offset_factor;
state.polygon_offset.clamp = regs.polygon_offset_clamp;
state.MarkDirtyPolygonOffset();
maxwell3d.dirty.polygon_offset = false;
}
void RasterizerOpenGL::SyncAlphaTest() {

View File

@@ -58,10 +58,12 @@ public:
void DrawArrays() override;
void Clear() override;
void DispatchCompute(GPUVAddr code_addr) override;
void FlushAll() override;
void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
void FlushCommands() override;
void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
@@ -108,17 +110,30 @@ private:
OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,
bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
bool using_depth_fb, bool using_stencil_fb);
/// Configures the current constbuffers to use for the draw command.
void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader);
/// Configures the current constbuffers to use for the kernel invocation.
void SetupComputeConstBuffers(const Shader& kernel);
/// Configures a constant buffer.
void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
const GLShader::ConstBufferEntry& entry);
/// Configures the current global memory entries to use for the draw command.
void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader);
void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader);
/// Configures the current global memory entries to use for the kernel invocation.
void SetupComputeGlobalMemory(const Shader& kernel);
/// Configures a constant buffer.
void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
std::size_t size);
/// Configures the current textures to use for the draw command. Returns shaders texture buffer
/// usage.
@@ -216,6 +231,7 @@ private:
GLuint SetupVertexFormat();
void SetupVertexBuffer(GLuint vao);
void SetupVertexInstances(GLuint vao);
GLintptr SetupIndexBuffer();
@@ -226,6 +242,8 @@ private:
enum class AccelDraw { Disabled, Arrays, Indexed };
AccelDraw accelerate_draw = AccelDraw::Disabled;
OGLFramebuffer clear_framebuffer;
using CachedPageMap = boost::icl::interval_map<u64, int>;
CachedPageMap cached_pages;
};

View File

@@ -23,13 +23,13 @@ namespace OpenGL {
using VideoCommon::Shader::ProgramCode;
// One UBO is always reserved for emulation values
constexpr u32 RESERVED_UBOS = 1;
// One UBO is always reserved for emulation values on staged shaders
constexpr u32 STAGE_RESERVED_UBOS = 1;
struct UnspecializedShader {
std::string code;
GLShader::ShaderEntries entries;
Maxwell::ShaderProgram program_type;
ProgramType program_type;
};
namespace {
@@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g
}
/// Gets the shader type from a Maxwell program type
constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) {
constexpr GLenum GetShaderType(ProgramType program_type) {
switch (program_type) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
case ProgramType::VertexA:
case ProgramType::VertexB:
return GL_VERTEX_SHADER;
case Maxwell::ShaderProgram::Geometry:
case ProgramType::Geometry:
return GL_GEOMETRY_SHADER;
case Maxwell::ShaderProgram::Fragment:
case ProgramType::Fragment:
return GL_FRAGMENT_SHADER;
case ProgramType::Compute:
return GL_COMPUTE_SHADER;
default:
return GL_NONE;
}
@@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen
}
}
ProgramType GetProgramType(Maxwell::ShaderProgram program) {
switch (program) {
case Maxwell::ShaderProgram::VertexA:
return ProgramType::VertexA;
case Maxwell::ShaderProgram::VertexB:
return ProgramType::VertexB;
case Maxwell::ShaderProgram::TesselationControl:
return ProgramType::TessellationControl;
case Maxwell::ShaderProgram::TesselationEval:
return ProgramType::TessellationEval;
case Maxwell::ShaderProgram::Geometry:
return ProgramType::Geometry;
case Maxwell::ShaderProgram::Fragment:
return ProgramType::Fragment;
}
UNREACHABLE();
return {};
}
/// Calculates the size of a program stream
std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
constexpr std::size_t start_offset = 10;
@@ -128,13 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
}
/// Hashes one (or two) program streams
u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code,
const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
if (size_a == 0) {
size_a = CalculateProgramSize(code);
}
u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
if (program_type != Maxwell::ShaderProgram::VertexA) {
if (program_type != ProgramType::VertexA) {
return unique_identifier;
}
// VertexA programs include two programs
@@ -152,12 +173,12 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
}
/// Creates an unspecialized program from code streams
GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type,
ProgramCode program_code, ProgramCode program_code_b) {
GLShader::ShaderSetup setup(program_code);
setup.program.size_a = CalculateProgramSize(program_code);
setup.program.size_b = 0;
if (program_type == Maxwell::ShaderProgram::VertexA) {
if (program_type == ProgramType::VertexA) {
// VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
// Conventional HW does not support this, so we combine VertexA and VertexB into one
// stage here.
@@ -168,22 +189,23 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr
program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);
switch (program_type) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
case ProgramType::VertexA:
case ProgramType::VertexB:
return GLShader::GenerateVertexShader(device, setup);
case Maxwell::ShaderProgram::Geometry:
case ProgramType::Geometry:
return GLShader::GenerateGeometryShader(device, setup);
case Maxwell::ShaderProgram::Fragment:
case ProgramType::Fragment:
return GLShader::GenerateFragmentShader(device, setup);
case ProgramType::Compute:
return GLShader::GenerateComputeShader(device, setup);
default:
LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
UNREACHABLE();
UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type));
return {};
}
}
CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries,
Maxwell::ShaderProgram program_type, const ProgramVariant& variant,
ProgramType program_type, const ProgramVariant& variant,
bool hint_retrievable = false) {
auto base_bindings{variant.base_bindings};
const auto primitive_mode{variant.primitive_mode};
@@ -194,7 +216,14 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
if (entries.shader_viewport_layer_array) {
source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
}
source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
if (program_type == ProgramType::Compute) {
source += "#extension GL_ARB_compute_variable_group_size : require\n";
}
source += '\n';
if (program_type != ProgramType::Compute) {
source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
}
for (const auto& cbuf : entries.const_buffers) {
source +=
@@ -221,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
}
if (program_type == Maxwell::ShaderProgram::Geometry) {
if (program_type == ProgramType::Geometry) {
const auto [glsl_topology, debug_name, max_vertices] =
GetPrimitiveDescription(primitive_mode);
source += "layout (" + std::string(glsl_topology) + ") in;\n";
source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
}
if (program_type == ProgramType::Compute) {
source += "layout (local_size_variable) in;\n";
}
source += code;
@@ -255,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() {
} // Anonymous namespace
CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
GLShader::ProgramResult result)
: RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
unique_identifier{params.unique_identifier}, program_type{program_type},
@@ -268,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
ProgramCode&& program_code_b) {
const auto code_size{CalculateProgramSize(program_code)};
const auto code_size_b{CalculateProgramSize(program_code_b)};
auto result{CreateProgram(params.device, program_type, program_code, program_code_b)};
auto result{
CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)};
if (result.first.empty()) {
// TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
return {};
}
params.disk_cache.SaveRaw(ShaderDiskCacheRaw(
params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)),
static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code),
std::move(program_code_b)));
params.unique_identifier, GetProgramType(program_type),
static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)),
std::move(program_code), std::move(program_code_b)));
return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
return std::shared_ptr<CachedShader>(
new CachedShader(params, GetProgramType(program_type), std::move(result)));
}
Shader CachedShader::CreateStageFromCache(const ShaderParameters& params,
Maxwell::ShaderProgram program_type,
GLShader::ProgramResult result) {
return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
return std::shared_ptr<CachedShader>(
new CachedShader(params, GetProgramType(program_type), std::move(result)));
}
Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) {
auto result{CreateProgram(params.device, ProgramType::Compute, code, {})};
const auto code_size{CalculateProgramSize(code)};
params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute,
static_cast<u32>(code_size / sizeof(u64)), 0,
std::move(code), {}));
return std::shared_ptr<CachedShader>(
new CachedShader(params, ProgramType::Compute, std::move(result)));
}
Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params,
GLShader::ProgramResult result) {
return std::shared_ptr<CachedShader>(
new CachedShader(params, ProgramType::Compute, std::move(result)));
}
std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
GLuint handle{};
if (program_type == Maxwell::ShaderProgram::Geometry) {
if (program_type == ProgramType::Geometry) {
handle = GetGeometryShader(variant);
} else {
const auto [entry, is_cache_miss] = programs.try_emplace(variant);
@@ -308,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar
handle = program->handle;
}
auto base_bindings{variant.base_bindings};
base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS;
auto base_bindings = variant.base_bindings;
base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
if (program_type != ProgramType::Compute) {
base_bindings.cbuf += STAGE_RESERVED_UBOS;
}
base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
base_bindings.sampler += static_cast<u32>(entries.samplers.size());
@@ -572,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
}
Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
if (!system.GPU().Maxwell3D().dirty_flags.shaders) {
if (!system.GPU().Maxwell3D().dirty.shaders) {
return last_shaders[static_cast<std::size_t>(program)];
}
@@ -589,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
// No shader found - create a new one
ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
ProgramCode program_code_b;
if (program == Maxwell::ShaderProgram::VertexA) {
const bool is_program_a{program == Maxwell::ShaderProgram::VertexA};
if (is_program_a) {
const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
program_code_b = GetShaderCode(memory_manager, program_addr_b,
memory_manager.GetPointer(program_addr_b));
}
const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
const auto unique_identifier =
GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b);
const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
host_ptr, unique_identifier};
@@ -612,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
return last_shaders[static_cast<std::size_t>(program)] = shader;
}
Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
auto& memory_manager{system.GPU().MemoryManager()};
const auto host_ptr{memory_manager.GetPointer(code_addr)};
auto kernel = TryGet(host_ptr);
if (kernel) {
return kernel;
}
// No kernel found - create a new one
auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})};
const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
host_ptr, unique_identifier};
const auto found = precompiled_shaders.find(unique_identifier);
if (found == precompiled_shaders.end()) {
kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
} else {
kernel = CachedShader::CreateKernelFromCache(params, found->second);
}
Register(kernel);
return kernel;
}
} // namespace OpenGL

View File

@@ -61,6 +61,11 @@ public:
Maxwell::ShaderProgram program_type,
GLShader::ProgramResult result);
static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code);
static Shader CreateKernelFromCache(const ShaderParameters& params,
GLShader::ProgramResult result);
VAddr GetCpuAddr() const override {
return cpu_addr;
}
@@ -78,7 +83,7 @@ public:
std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);
private:
explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
explicit CachedShader(const ShaderParameters& params, ProgramType program_type,
GLShader::ProgramResult result);
// Geometry programs. These are needed because GLSL needs an input topology but it's not
@@ -104,7 +109,7 @@ private:
u8* host_ptr{};
VAddr cpu_addr{};
u64 unique_identifier{};
Maxwell::ShaderProgram program_type{};
ProgramType program_type{};
ShaderDiskCacheOpenGL& disk_cache;
const PrecompiledPrograms& precompiled_programs;
@@ -132,6 +137,9 @@ public:
/// Gets the current specified shader stage program
Shader GetStageProgram(Maxwell::ShaderProgram program);
/// Gets a compute kernel in the passed address
Shader GetComputeKernel(GPUVAddr code_addr);
protected:
// We do not have to flush this cache as things in it are never modified by us.
void FlushObjectInner(const Shader& object) override {}

View File

@@ -37,7 +37,6 @@ using namespace std::string_literals;
using namespace VideoCommon::Shader;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
using Operation = const OperationNode&;
enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
@@ -162,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}
constexpr bool IsVertexShader(ProgramType stage) {
return stage == ProgramType::VertexA || stage == ProgramType::VertexB;
}
class GLSLDecompiler final {
public:
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage,
std::string suffix)
: device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
@@ -248,25 +251,21 @@ public:
}
entries.clip_distances = ir.GetClipDistances();
entries.shader_viewport_layer_array =
stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex());
IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex());
entries.shader_length = ir.GetLength();
return entries;
}
private:
using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation);
using OperationDecompilersArray =
std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
void DeclareVertex() {
if (stage != ShaderStage::Vertex)
if (!IsVertexShader(stage))
return;
DeclareVertexRedeclarations();
}
void DeclareGeometry() {
if (stage != ShaderStage::Geometry) {
if (stage != ProgramType::Geometry) {
return;
}
@@ -297,14 +296,14 @@ private:
break;
}
}
if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) {
if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) {
if (ir.UsesLayer()) {
code.AddLine("int gl_Layer;");
}
if (ir.UsesViewportIndex()) {
code.AddLine("int gl_ViewportIndex;");
}
} else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex &&
} else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) &&
!device.HasVertexViewportLayer()) {
LOG_ERROR(
Render_OpenGL,
@@ -341,11 +340,16 @@ private:
}
void DeclareLocalMemory() {
if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) {
const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
code.AddNewLine();
// TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at
// specialization time.
const u64 local_memory_size =
stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize();
if (local_memory_size == 0) {
return;
}
const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
code.AddNewLine();
}
void DeclareInternalFlags() {
@@ -399,12 +403,12 @@ private:
const u32 location{GetGenericAttributeIndex(index)};
std::string name{GetInputAttribute(index)};
if (stage == ShaderStage::Geometry) {
if (stage == ProgramType::Geometry) {
name = "gs_" + name + "[]";
}
std::string suffix;
if (stage == ShaderStage::Fragment) {
if (stage == ProgramType::Fragment) {
const auto input_mode{header.ps.GetAttributeUse(location)};
if (skip_unused && input_mode == AttributeUse::Unused) {
return;
@@ -416,7 +420,7 @@ private:
}
void DeclareOutputAttributes() {
if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) {
if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) {
for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) {
DeclareOutputAttribute(ToGenericAttribute(i));
}
@@ -538,7 +542,7 @@ private:
constexpr u32 element_stride{4};
const u32 address{generic_base + index * generic_stride + element * element_stride};
const bool declared{stage != ShaderStage::Fragment ||
const bool declared{stage != ProgramType::Fragment ||
header.ps.GetAttributeUse(index) != AttributeUse::Unused};
const std::string value{declared ? ReadAttribute(attribute, element) : "0"};
code.AddLine("case 0x{:x}: return {};", address, value);
@@ -642,7 +646,7 @@ private:
}
if (const auto abuf = std::get_if<AbufNode>(&*node)) {
UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry,
"Physical attributes in geometry shaders are not implemented");
if (abuf->IsPhysicalBuffer()) {
return fmt::format("readPhysicalAttribute(ftou({}))",
@@ -697,6 +701,9 @@ private:
}
if (const auto lmem = std::get_if<LmemNode>(&*node)) {
if (stage == ProgramType::Compute) {
LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
}
return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
}
@@ -726,7 +733,7 @@ private:
std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
const auto GeometryPass = [&](std::string_view name) {
if (stage == ShaderStage::Geometry && buffer) {
if (stage == ProgramType::Geometry && buffer) {
// TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
// set an 0x80000000 index for those and the shader fails to build. Find out why
// this happens and what's its intent.
@@ -738,10 +745,10 @@ private:
switch (attribute) {
case Attribute::Index::Position:
switch (stage) {
case ShaderStage::Geometry:
case ProgramType::Geometry:
return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
GetSwizzle(element));
case ShaderStage::Fragment:
case ProgramType::Fragment:
return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
default:
UNREACHABLE();
@@ -762,7 +769,7 @@ private:
// TODO(Subv): Find out what the values are for the first two elements when inside a
// vertex shader, and what's the value of the fourth element when inside a Tess Eval
// shader.
ASSERT(stage == ShaderStage::Vertex);
ASSERT(IsVertexShader(stage));
switch (element) {
case 2:
// Config pack's first value is instance_id.
@@ -774,7 +781,7 @@ private:
return "0";
case Attribute::Index::FrontFacing:
// TODO(Subv): Find out what the values are for the other elements.
ASSERT(stage == ShaderStage::Fragment);
ASSERT(stage == ProgramType::Fragment);
switch (element) {
case 3:
return "itof(gl_FrontFacing ? -1 : 0)";
@@ -796,7 +803,7 @@ private:
return value;
}
// There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
const std::string precise = stage != ProgramType::Fragment ? "precise " : "";
const std::string temporary = code.GenerateTemporary();
code.AddLine("{}float {} = {};", precise, temporary, value);
@@ -831,12 +838,12 @@ private:
UNIMPLEMENTED();
return {};
case 1:
if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
return {};
}
return std::make_pair("gl_Layer", true);
case 2:
if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
return {};
}
return std::make_pair("gl_ViewportIndex", true);
@@ -1073,6 +1080,9 @@ private:
target = result->first;
is_integer = result->second;
} else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
if (stage == ProgramType::Compute) {
LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
}
target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
const std::string real = Visit(gmem->GetRealAddress());
@@ -1400,14 +1410,10 @@ private:
return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
}
std::string LogicalAll2(Operation operation) {
std::string LogicalAnd2(Operation operation) {
return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);
}
std::string LogicalAny2(Operation operation) {
return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
}
template <bool with_nan>
std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
@@ -1630,7 +1636,7 @@ private:
}
std::string Exit(Operation operation) {
if (stage != ShaderStage::Fragment) {
if (stage != ProgramType::Fragment) {
code.AddLine("return;");
return {};
}
@@ -1681,7 +1687,7 @@ private:
}
std::string EmitVertex(Operation operation) {
ASSERT_MSG(stage == ShaderStage::Geometry,
ASSERT_MSG(stage == ProgramType::Geometry,
"EmitVertex is expected to be used in a geometry shader.");
// If a geometry shader is attached, it will always flip (it's the last stage before
@@ -1692,7 +1698,7 @@ private:
}
std::string EndPrimitive(Operation operation) {
ASSERT_MSG(stage == ShaderStage::Geometry,
ASSERT_MSG(stage == ProgramType::Geometry,
"EndPrimitive is expected to be used in a geometry shader.");
code.AddLine("EndPrimitive();");
@@ -1714,7 +1720,7 @@ private:
return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
}
static constexpr OperationDecompilersArray operation_decompilers = {
static constexpr std::array operation_decompilers = {
&GLSLDecompiler::Assign,
&GLSLDecompiler::Select,
@@ -1798,8 +1804,7 @@ private:
&GLSLDecompiler::LogicalXor,
&GLSLDecompiler::LogicalNegate,
&GLSLDecompiler::LogicalPick2,
&GLSLDecompiler::LogicalAll2,
&GLSLDecompiler::LogicalAny2,
&GLSLDecompiler::LogicalAnd2,
&GLSLDecompiler::LogicalLessThan<Type::Float>,
&GLSLDecompiler::LogicalEqual<Type::Float>,
@@ -1863,6 +1868,7 @@ private:
&GLSLDecompiler::WorkGroupId<1>,
&GLSLDecompiler::WorkGroupId<2>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
std::string GetRegister(u32 index) const {
return GetDeclarationWithSuffix(index, "gpr");
@@ -1927,7 +1933,7 @@ private:
}
u32 GetNumPhysicalInputAttributes() const {
return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
}
u32 GetNumPhysicalAttributes() const {
@@ -1940,7 +1946,7 @@ private:
const Device& device;
const ShaderIR& ir;
const ShaderStage stage;
const ProgramType stage;
const std::string suffix;
const Header header;
@@ -1971,7 +1977,7 @@ std::string GetCommonDeclarations() {
MAX_CONSTBUFFER_ELEMENTS);
}
ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
const std::string& suffix) {
GLSLDecompiler decompiler(device, ir, stage, suffix);
decompiler.Decompile();

View File

@@ -12,14 +12,26 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/shader/shader_ir.h"
namespace OpenGL {
class Device;
}
namespace VideoCommon::Shader {
class ShaderIR;
}
namespace OpenGL {
class Device;
enum class ProgramType : u32 {
VertexA = 0,
VertexB = 1,
TessellationControl = 2,
TessellationEval = 3,
Geometry = 4,
Fragment = 5,
Compute = 6
};
} // namespace OpenGL
namespace OpenGL::GLShader {
struct ShaderEntries;
@@ -85,6 +97,6 @@ struct ShaderEntries {
std::string GetCommonDeclarations();
ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
Maxwell::ShaderStage stage, const std::string& suffix);
ProgramType stage, const std::string& suffix);
} // namespace OpenGL::GLShader

View File

@@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {
} // namespace
ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
u32 program_code_size, u32 program_code_size_b,
ProgramCode program_code, ProgramCode program_code_b)
: unique_identifier{unique_identifier}, program_type{program_type},

View File

@@ -18,7 +18,6 @@
#include "common/assert.h"
#include "common/common_types.h"
#include "core/file_sys/vfs_vector.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"
namespace Core {
@@ -34,14 +33,11 @@ namespace OpenGL {
struct ShaderDiskCacheUsage;
struct ShaderDiskCacheDump;
using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
using ProgramCode = std::vector<u64>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
using TextureBufferUsage = std::bitset<64>;
/// Allocated bindings used by an OpenGL shader program.
/// Allocated bindings used by an OpenGL shader program
struct BaseBindings {
u32 cbuf{};
u32 gmem{};
@@ -126,7 +122,7 @@ namespace OpenGL {
/// Describes a shader how it's used by the guest GPU
class ShaderDiskCacheRaw {
public:
explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
u32 program_code_size, u32 program_code_size_b,
ProgramCode program_code, ProgramCode program_code_b);
ShaderDiskCacheRaw();
@@ -141,30 +137,13 @@ public:
}
bool HasProgramA() const {
return program_type == Maxwell::ShaderProgram::VertexA;
return program_type == ProgramType::VertexA;
}
Maxwell::ShaderProgram GetProgramType() const {
ProgramType GetProgramType() const {
return program_type;
}
Maxwell::ShaderStage GetProgramStage() const {
switch (program_type) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
return Maxwell::ShaderStage::Vertex;
case Maxwell::ShaderProgram::TesselationControl:
return Maxwell::ShaderStage::TesselationControl;
case Maxwell::ShaderProgram::TesselationEval:
return Maxwell::ShaderStage::TesselationEval;
case Maxwell::ShaderProgram::Geometry:
return Maxwell::ShaderStage::Geometry;
case Maxwell::ShaderProgram::Fragment:
return Maxwell::ShaderStage::Fragment;
}
UNREACHABLE();
}
const ProgramCode& GetProgramCode() const {
return program_code;
}
@@ -175,7 +154,7 @@ public:
private:
u64 unique_identifier{};
Maxwell::ShaderProgram program_type{};
ProgramType program_type{};
u32 program_code_size{};
u32 program_code_size_b{};

View File

@@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D;
using VideoCommon::Shader::ProgramCode;
using VideoCommon::Shader::ShaderIR;
static constexpr u32 PROGRAM_OFFSET{10};
static constexpr u32 PROGRAM_OFFSET = 10;
static constexpr u32 COMPUTE_OFFSET = 0;
ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
@@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
};
)";
const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
ProgramResult program =
Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB;
ProgramResult program = Decompile(device, program_ir, stage, "vertex");
out += program.first;
if (setup.IsDualProgram()) {
const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
ProgramResult program_b =
Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b");
out += program_b.first;
}
@@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
};
)";
const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
ProgramResult program =
Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry");
out += program.first;
out += R"(
@@ -116,9 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
)";
const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
ProgramResult program =
Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment");
out += program.first;
out += R"(
@@ -130,4 +127,22 @@ void main() {
return {std::move(out), std::move(program.second)};
}
ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) {
const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
std::string out = "// Shader Unique Id: CS" + id + "\n\n";
out += GetCommonDeclarations();
const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a);
ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute");
out += program.first;
out += R"(
void main() {
execute_compute();
}
)";
return {std::move(out), std::move(program.second)};
}
} // namespace OpenGL::GLShader

View File

@@ -54,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se
/// Generates the GLSL fragment shader program source code for the given FS program
ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);
/// Generates the GLSL compute shader program source code for the given CS program
ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup);
} // namespace OpenGL::GLShader

View File

@@ -10,21 +10,25 @@
namespace OpenGL::GLShader {
GLuint LoadShader(const char* source, GLenum type) {
const char* debug_type;
namespace {
const char* GetStageDebugName(GLenum type) {
switch (type) {
case GL_VERTEX_SHADER:
debug_type = "vertex";
break;
return "vertex";
case GL_GEOMETRY_SHADER:
debug_type = "geometry";
break;
return "geometry";
case GL_FRAGMENT_SHADER:
debug_type = "fragment";
break;
default:
UNREACHABLE();
return "fragment";
case GL_COMPUTE_SHADER:
return "compute";
}
UNIMPLEMENTED();
return "unknown";
}
} // Anonymous namespace
GLuint LoadShader(const char* source, GLenum type) {
const char* debug_type = GetStageDebugName(type);
const GLuint shader_id = glCreateShader(type);
glShaderSource(shader_id, 1, &source, nullptr);
LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);

View File

@@ -165,6 +165,25 @@ OpenGLState::OpenGLState() {
alpha_test.ref = 0.0f;
}
void OpenGLState::SetDefaultViewports() {
for (auto& item : viewports) {
item.x = 0;
item.y = 0;
item.width = 0;
item.height = 0;
item.depth_range_near = 0.0f;
item.depth_range_far = 1.0f;
item.scissor.enabled = false;
item.scissor.x = 0;
item.scissor.y = 0;
item.scissor.width = 0;
item.scissor.height = 0;
}
depth_clamp.far_plane = false;
depth_clamp.near_plane = false;
}
void OpenGLState::ApplyDefaultState() {
glEnable(GL_BLEND);
glDisable(GL_FRAMEBUFFER_SRGB);
@@ -526,7 +545,7 @@ void OpenGLState::ApplySamplers() const {
}
}
void OpenGLState::Apply() const {
void OpenGLState::Apply() {
MICROPROFILE_SCOPE(OpenGL_State);
ApplyFramebufferState();
ApplyVertexArrayState();
@@ -536,19 +555,31 @@ void OpenGLState::Apply() const {
ApplyPointSize();
ApplyFragmentColorClamp();
ApplyMultisample();
if (dirty.color_mask) {
ApplyColorMask();
dirty.color_mask = false;
}
ApplyDepthClamp();
ApplyColorMask();
ApplyViewport();
ApplyStencilTest();
if (dirty.stencil_state) {
ApplyStencilTest();
dirty.stencil_state = false;
}
ApplySRgb();
ApplyCulling();
ApplyDepth();
ApplyPrimitiveRestart();
ApplyBlending();
if (dirty.blend_state) {
ApplyBlending();
dirty.blend_state = false;
}
ApplyLogicOp();
ApplyTextures();
ApplySamplers();
ApplyPolygonOffset();
if (dirty.polygon_offset) {
ApplyPolygonOffset();
dirty.polygon_offset = false;
}
ApplyAlphaTest();
}

View File

@@ -195,8 +195,9 @@ public:
s_rgb_used = false;
}
void SetDefaultViewports();
/// Apply this state as the current OpenGL state
void Apply() const;
void Apply();
void ApplyFramebufferState() const;
void ApplyVertexArrayState() const;
@@ -237,11 +238,41 @@ public:
/// Viewport does not affects glClearBuffer so emulate viewport using scissor test
void EmulateViewportWithScissor();
void MarkDirtyBlendState() {
dirty.blend_state = true;
}
void MarkDirtyStencilState() {
dirty.stencil_state = true;
}
void MarkDirtyPolygonOffset() {
dirty.polygon_offset = true;
}
void MarkDirtyColorMask() {
dirty.color_mask = true;
}
void AllDirty() {
dirty.blend_state = true;
dirty.stencil_state = true;
dirty.polygon_offset = true;
dirty.color_mask = true;
}
private:
static OpenGLState cur_state;
// Workaround for sRGB problems caused by QT not supporting srgb output
static bool s_rgb_used;
struct {
bool blend_state;
bool stencil_state;
bool viewport_state;
bool polygon_offset;
bool color_mask;
} dirty{};
};
} // namespace OpenGL

View File

@@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]};
ASSERT(component_type == format.component_type);
return format;
}
@@ -485,11 +484,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
const auto& dst_params{dst_view->GetSurfaceParams()};
OpenGLState prev_state{OpenGLState::GetCurState()};
SCOPE_EXIT({ prev_state.Apply(); });
SCOPE_EXIT({
prev_state.AllDirty();
prev_state.Apply();
});
OpenGLState state;
state.draw.read_framebuffer = src_framebuffer.handle;
state.draw.draw_framebuffer = dst_framebuffer.handle;
state.AllDirty();
state.Apply();
u32 buffers{};

View File

@@ -108,6 +108,7 @@ void RendererOpenGL::SwapBuffers(
// Maintain the rasterizer's state as a priority
OpenGLState prev_state = OpenGLState::GetCurState();
state.AllDirty();
state.Apply();
if (framebuffer) {
@@ -140,6 +141,7 @@ void RendererOpenGL::SwapBuffers(
system.GetPerfStats().BeginSystemFrame();
// Restore the rasterizer state
prev_state.AllDirty();
prev_state.Apply();
}
@@ -206,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() {
// Link shaders and get variable locations
shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);
state.draw.shader_program = shader.handle;
state.AllDirty();
state.Apply();
uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture");
@@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
// Workaround brigthness problems in SMO by enabling sRGB in the final output
// if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987
state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed();
state.AllDirty();
state.Apply();
glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
// Restore default state
state.framebuffer_srgb.enabled = false;
state.texture_units[0].texture = 0;
state.AllDirty();
state.Apply();
// Clear sRGB state for the next frame
OpenGLState::ClearsRGBUsed();
@@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() {
GLuint old_read_fb = state.draw.read_framebuffer;
GLuint old_draw_fb = state.draw.draw_framebuffer;
state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle;
state.AllDirty();
state.Apply();
Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
@@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() {
screenshot_framebuffer.Release();
state.draw.read_framebuffer = old_read_fb;
state.draw.draw_framebuffer = old_draw_fb;
state.AllDirty();
state.Apply();
glDeleteRenderbuffers(1, &renderbuffer);

View File

@@ -205,10 +205,6 @@ public:
}
private:
using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation);
using OperationDecompilersArray =
std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);
void AllocateBindings() {
@@ -804,12 +800,7 @@ private:
return {};
}
Id LogicalAll2(Operation operation) {
UNIMPLEMENTED();
return {};
}
Id LogicalAny2(Operation operation) {
Id LogicalAnd2(Operation operation) {
UNIMPLEMENTED();
return {};
}
@@ -1206,7 +1197,7 @@ private:
return {};
}
static constexpr OperationDecompilersArray operation_decompilers = {
static constexpr std::array operation_decompilers = {
&SPIRVDecompiler::Assign,
&SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float,
@@ -1291,8 +1282,7 @@ private:
&SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,
&SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,
&SPIRVDecompiler::LogicalPick2,
&SPIRVDecompiler::LogicalAll2,
&SPIRVDecompiler::LogicalAny2,
&SPIRVDecompiler::LogicalAnd2,
&SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,
&SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>,
@@ -1357,6 +1347,7 @@ private:
&SPIRVDecompiler::WorkGroupId<1>,
&SPIRVDecompiler::WorkGroupId<2>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
const VKDevice& device;
const ShaderIR& ir;

View File

@@ -15,7 +15,7 @@
#include "video_core/shader/shader_ir.h"
namespace VideoCommon::Shader {
namespace {
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
@@ -29,8 +29,7 @@ struct Query {
struct BlockStack {
BlockStack() = default;
BlockStack(const BlockStack& b) = default;
BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
std::stack<u32> ssy_stack{};
std::stack<u32> pbk_stack{};
};
@@ -58,7 +57,7 @@ struct BlockInfo {
struct CFGRebuildState {
explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
const u32 start)
: program_code{program_code}, program_size{program_size}, start{start} {}
: start{start}, program_code{program_code}, program_size{program_size} {}
u32 start{};
std::vector<BlockInfo> block_info{};
@@ -85,7 +84,7 @@ std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address)
return {BlockCollision::Inside, index};
}
}
return {BlockCollision::None, -1};
return {BlockCollision::None, 0xFFFFFFFF};
}
struct ParseInfo {
@@ -365,27 +364,29 @@ bool TryQuery(CFGRebuildState& state) {
const auto gather_end = labels.upper_bound(block.end);
while (gather_start != gather_end) {
cc.push(gather_start->second);
gather_start++;
++gather_start;
}
};
if (state.queries.empty()) {
return false;
}
Query& q = state.queries.front();
const u32 block_index = state.registered[q.address];
BlockInfo& block = state.block_info[block_index];
// If the block is visted, check if the stacks match, else gather the ssy/pbk
// If the block is visited, check if the stacks match, else gather the ssy/pbk
// labels into the current stack and look if the branch at the end of the block
// consumes a label. Schedule new queries accordingly
if (block.visited) {
BlockStack& stack = state.stacks[q.address];
const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) &&
(stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack);
const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) &&
(stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack);
state.queries.pop_front();
return all_okay;
}
block.visited = true;
state.stacks[q.address] = BlockStack{q};
state.stacks.insert_or_assign(q.address, BlockStack{q});
Query q2(q);
state.queries.pop_front();
gather_labels(q2.ssy_stack, state.ssy_labels, block);
@@ -394,6 +395,7 @@ bool TryQuery(CFGRebuildState& state) {
q2.address = block.end + 1;
state.queries.push_back(q2);
}
Query conditional_query{q2};
if (block.branch.is_sync) {
if (block.branch.address == unassigned_branch) {
@@ -408,13 +410,15 @@ bool TryQuery(CFGRebuildState& state) {
conditional_query.pbk_stack.pop();
}
conditional_query.address = block.branch.address;
state.queries.push_back(conditional_query);
state.queries.push_back(std::move(conditional_query));
return true;
}
} // Anonymous namespace
std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
u32 start_address) {
std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
std::size_t program_size, u32 start_address) {
CFGRebuildState state{program_code, program_size, start_address};
// Inspect Code and generate blocks
state.labels.clear();
state.labels.emplace(start_address);
@@ -424,10 +428,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
return {};
}
}
// Decompile Stacks
Query start_query{};
start_query.address = state.start;
state.queries.push_back(start_query);
state.queries.push_back(Query{state.start, {}, {}});
bool decompiled = true;
while (!state.queries.empty()) {
if (!TryQuery(state)) {
@@ -435,14 +438,15 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
break;
}
}
// Sort and organize results
std::sort(state.block_info.begin(), state.block_info.end(),
[](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; });
[](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; });
ShaderCharacteristics result_out{};
result_out.decompilable = decompiled;
result_out.start = start_address;
result_out.end = start_address;
for (auto& block : state.block_info) {
for (const auto& block : state.block_info) {
ShaderBlock new_block{};
new_block.start = block.start;
new_block.end = block.end;
@@ -457,8 +461,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
}
if (result_out.decompilable) {
result_out.labels = std::move(state.labels);
return {result_out};
return {std::move(result_out)};
}
// If it's not decompilable, merge the unlabelled blocks together
auto back = result_out.blocks.begin();
auto next = std::next(back);
@@ -469,8 +474,8 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
continue;
}
back = next;
next++;
++next;
}
return {result_out};
return {std::move(result_out)};
}
} // namespace VideoCommon::Shader

View File

@@ -4,7 +4,6 @@
#pragma once
#include <cstring>
#include <list>
#include <optional>
#include <unordered_set>
@@ -26,27 +25,44 @@ struct Condition {
bool IsUnconditional() const {
return predicate == Pred::UnusedIndex && cc == ConditionCode::T;
}
bool operator==(const Condition& other) const {
return std::tie(predicate, cc) == std::tie(other.predicate, other.cc);
}
bool operator!=(const Condition& other) const {
return !operator==(other);
}
};
struct ShaderBlock {
u32 start{};
u32 end{};
bool ignore_branch{};
struct Branch {
Condition cond{};
bool kills{};
s32 address{};
bool operator==(const Branch& b) const {
return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address);
}
} branch{};
bool operator!=(const Branch& b) const {
return !operator==(b);
}
};
u32 start{};
u32 end{};
bool ignore_branch{};
Branch branch{};
bool operator==(const ShaderBlock& sb) const {
return std::tie(start, end, ignore_branch, branch) ==
std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch);
}
bool operator!=(const ShaderBlock& sb) const {
return !operator==(sb);
}
};
struct ShaderCharacteristics {
@@ -57,7 +73,7 @@ struct ShaderCharacteristics {
std::unordered_set<u32> labels{};
};
std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
u32 start_address);
std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
std::size_t program_size, u32 start_address);
} // namespace VideoCommon::Shader

View File

@@ -47,14 +47,14 @@ void ShaderIR::Decode() {
if (shader_info.decompilable) {
disable_flow_stack = true;
const auto insert_block = [this](NodeBlock& nodes, u32 label) {
if (label == exit_branch) {
if (label == static_cast<u32>(exit_branch)) {
return;
}
basic_blocks.insert({label, nodes});
};
const auto& blocks = shader_info.blocks;
NodeBlock current_block;
u32 current_label = exit_branch;
u32 current_label = static_cast<u32>(exit_branch);
for (auto& block : blocks) {
if (shader_info.labels.count(block.start) != 0) {
insert_block(current_block, current_label);

View File

@@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
case OpCode::Id::FMUL_R:
case OpCode::Id::FMUL_IMM: {
// FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented",
instr.fmul.tab5cb8_2.Value());
UNIMPLEMENTED_IF_MSG(
instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented",
instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default
if (instr.fmul.tab5cb8_2 != 0) {
LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
instr.fmul.tab5cb8_2.Value());
}
if (instr.fmul.tab5c68_0 != 1) {
LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented",
instr.fmul.tab5c68_0.Value());
}
op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);

View File

@@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
}
} else {
UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) {
LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
}
}
Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);

View File

@@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
const auto opcode = OpCode::Decode(instr);
UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented");
UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented",
instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO
UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented",
instr.ffma.tab5980_1.Value());
if (instr.ffma.tab5980_0 != 1) {
LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
}
if (instr.ffma.tab5980_1 != 0) {
LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
}
const Node op_a = GetRegister(instr.gpr8);

View File

@@ -18,43 +18,56 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
DEBUG_ASSERT(instr.hsetp2.ftz == 0);
Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
Node op_b = [&]() {
switch (opcode->get().GetId()) {
case OpCode::Id::HSETP2_R:
return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
instr.hsetp2.negate_b);
default:
UNREACHABLE();
return Immediate(0);
}
}();
op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
// We can't use the constant predicate as destination.
ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0);
Tegra::Shader::PredCondition cond{};
bool h_and{};
Node op_b{};
switch (opcode->get().GetId()) {
case OpCode::Id::HSETP2_C:
cond = instr.hsetp2.cbuf_and_imm.cond;
h_and = instr.hsetp2.cbuf_and_imm.h_and;
op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset),
instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
break;
case OpCode::Id::HSETP2_IMM:
cond = instr.hsetp2.cbuf_and_imm.cond;
h_and = instr.hsetp2.cbuf_and_imm.h_and;
op_b = UnpackHalfImmediate(instr, true);
break;
case OpCode::Id::HSETP2_R:
cond = instr.hsetp2.reg.cond;
h_and = instr.hsetp2.reg.h_and;
op_b =
UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b,
instr.hsetp2.reg.negate_b),
instr.hsetp2.reg.type_b);
break;
default:
UNREACHABLE();
op_b = Immediate(0);
}
const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
const OperationCode pair_combiner =
instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);
const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
const Node first_pred = Operation(pair_combiner, comparison);
const auto Write = [&](u64 dest, Node src) {
SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39));
};
// Set the primary predicate to the result of Predicate OP SecondPredicate
const Node value = Operation(combiner, first_pred, second_pred);
SetPredicate(bb, instr.hsetp2.pred3, value);
if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
// Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled
const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred);
SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred));
const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
const u64 first = instr.hsetp2.pred0;
const u64 second = instr.hsetp2.pred3;
if (h_and) {
const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
Write(first, joined);
Write(second, Operation(OperationCode::LogicalNegate, joined));
} else {
Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u)));
Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));
}
return pc;

View File

@@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
const auto opcode = OpCode::Decode(instr);
if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) {
UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None);
DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None);
} else {
UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None);
DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None);
}
constexpr auto identity = HalfType::H0_H1;

View File

@@ -101,8 +101,7 @@ enum class OperationCode {
LogicalXor, /// (bool a, bool b) -> bool
LogicalNegate, /// (bool a) -> bool
LogicalPick2, /// (bool2 pair, uint index) -> bool
LogicalAll2, /// (bool2 a) -> bool
LogicalAny2, /// (bool2 a) -> bool
LogicalAnd2, /// (bool2 a) -> bool
LogicalFLessThan, /// (float a, float b) -> bool
LogicalFEqual, /// (float a, float b) -> bool

View File

@@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
return TrackCbuf(source, code, new_cursor);
}
if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) {
// Cbuf found in operand.
return found;
}

View File

@@ -24,9 +24,8 @@ StagingCache::StagingCache() = default;
StagingCache::~StagingCache() = default;
SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params)
: params{params}, mipmap_sizes(params.num_levels),
mipmap_offsets(params.num_levels), gpu_addr{gpu_addr}, host_memory_size{
params.GetHostSizeInBytes()} {
: params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr},
mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) {
std::size_t offset = 0;
for (u32 level = 0; level < params.num_levels; ++level) {
const std::size_t mipmap_size{params.GetGuestMipmapSize(level)};

View File

@@ -116,10 +116,10 @@ public:
std::lock_guard lock{mutex};
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty_flags.zeta_buffer) {
if (!maxwell3d.dirty.depth_buffer) {
return depth_buffer.view;
}
maxwell3d.dirty_flags.zeta_buffer = false;
maxwell3d.dirty.depth_buffer = false;
const auto& regs{maxwell3d.regs};
const auto gpu_addr{regs.zeta.Address()};
@@ -145,10 +145,10 @@ public:
std::lock_guard lock{mutex};
ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty_flags.color_buffer[index]) {
if (!maxwell3d.dirty.render_target[index]) {
return render_targets[index].view;
}
maxwell3d.dirty_flags.color_buffer.reset(index);
maxwell3d.dirty.render_target[index] = false;
const auto& regs{maxwell3d.regs};
if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 ||
@@ -274,10 +274,11 @@ protected:
auto& maxwell3d = system.GPU().Maxwell3D();
const u32 index = surface->GetRenderTarget();
if (index == DEPTH_RT) {
maxwell3d.dirty_flags.zeta_buffer = true;
maxwell3d.dirty.depth_buffer = true;
} else {
maxwell3d.dirty_flags.color_buffer.set(index, true);
maxwell3d.dirty.render_target[index] = true;
}
maxwell3d.dirty.render_settings = true;
}
void Register(TSurface surface) {

View File

@@ -1843,13 +1843,14 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det
"data, or other bugs.");
switch (result) {
case Core::System::ResultStatus::ErrorSystemFiles: {
QString message = tr("yuzu was unable to locate a Switch system archive");
if (!details.empty()) {
message.append(tr(": %1. ").arg(QString::fromStdString(details)));
QString message;
if (details.empty()) {
message =
tr("yuzu was unable to locate a Switch system archive. %1").arg(common_message);
} else {
message.append(tr(". "));
message = tr("yuzu was unable to locate a Switch system archive: %1. %2")
.arg(QString::fromStdString(details), common_message);
}
message.append(common_message);
answer = QMessageBox::question(this, tr("System Archive Not Found"), message,
QMessageBox::Yes | QMessageBox::No, QMessageBox::No);
@@ -1858,8 +1859,8 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det
}
case Core::System::ResultStatus::ErrorSharedFont: {
QString message = tr("yuzu was unable to locate the Switch shared fonts. ");
message.append(common_message);
const QString message =
tr("yuzu was unable to locate the Switch shared fonts. %1").arg(common_message);
answer = QMessageBox::question(this, tr("Shared Fonts Not Found"), message,
QMessageBox::Yes | QMessageBox::No, QMessageBox::No);
status_message = tr("Shared Font Missing");