Compare commits

...

65 Commits

Author SHA1 Message Date
ReinUsesLisp
ee21e4ecd3 glsl: Squash constant buffers into a single SSBO when we hit the limit
Avoids compilation errors at the cost of shader build times and runtime
performance when a game hits the limit of uniform buffers we can use.
2020-05-31 21:33:49 -03:00
bunnei
e68ee43a1a Merge pull request #3930 from ReinUsesLisp/animal-borders
vk_rasterizer: Implement constant attributes
2020-05-31 18:40:17 -04:00
bunnei
104b334e40 Update CMakeLists.txt 2020-05-31 18:35:36 -04:00
bunnei
0ac8848eae Update CMakeLists.txt 2020-05-31 17:46:25 -04:00
bunnei
edbf3144d2 Merge pull request #3958 from FernandoS27/gl-debug
OpenGL: Enable Debug Context and Synchronous debugging when graphics debugging is enabled
2020-05-31 17:04:27 -04:00
bunnei
f7debcaa04 Merge pull request #3999 from ReinUsesLisp/opt-tex-cache
texture_cache: Optimize GetSurfacesInRegion
2020-05-31 17:02:29 -04:00
Rodrigo Locatti
a280822c82 Merge pull request #4025 from Morph1984/intel-proprietary-compute
gl_device: Enable compute shaders for Intel proprietary drivers
2020-05-31 16:45:21 -03:00
Morph
bb8ef38152 gl_device: Enable compute shaders for Intel proprietary drivers
Previously we were disabling compute shaders on Intel's proprietary driver due to broken compute. This has been fixed in the latest Intel drivers. Re-enable compute for Intel proprietary drivers and remove the check for broken compute.
2020-05-31 03:21:07 -04:00
bunnei
058ec22787 Merge pull request #3982 from ReinUsesLisp/membar-cts
shader/other: Implement MEMBAR.CTS
2020-05-30 11:51:42 -04:00
Fernando Sahmkow
9d9ffe0f94 Merge pull request #4017 from ogniK5377/xbyak
Add xbyak external
2020-05-29 21:38:34 -04:00
David Marcec
d0bdd26c26 Add xbyak external 2020-05-30 10:55:27 +10:00
bunnei
87b272699f Merge pull request #4007 from ReinUsesLisp/reduce-logs
maxwell_3d: Reduce severity of logs that can be spammed
2020-05-29 17:29:17 -04:00
bunnei
1bb3122c1f Merge pull request #3991 from ReinUsesLisp/depth-sampling
texture_cache: Implement depth stencil texture swizzles
2020-05-28 23:33:38 -04:00
bunnei
5242b21524 Merge pull request #4002 from lat9nq/fix-nix-mod-directories
patch_manager: Add support for case-sensitivity on Linux
2020-05-28 22:36:39 -04:00
ReinUsesLisp
9b06e823ee maxwell_3d: Reduce severity of logs that can be spammed
These logs were killing performance on some games when they were
spammed. Reduce them to Debug severity.
2020-05-28 18:23:25 -03:00
lat9nq
f57cbd9f24 Make copying directory string more concise 2020-05-28 13:33:50 -04:00
lat9nq
326403518d Address requested changes 2020-05-28 13:30:22 -04:00
bunnei
099ac9c2a8 Merge pull request #3993 from ReinUsesLisp/fix-zla
gl_shader_manager: Unbind GLSL program when binding a host pipeline
2020-05-28 12:15:22 -04:00
lat9nq
136c563f76 *nix systems can read any-case patch directories
Changes many patch_manager functions to use a case-less variant of
GetSubdirectory. Fixes patches not showing up on *nix systems when
patch directories are named with odd cases, i.e. `exeFS'.
2020-05-27 23:12:56 -04:00
bunnei
640f0d1cec Merge pull request #3954 from Morph1984/log-memory-amount
main: Log host system memory parameters
2020-05-27 22:58:51 -04:00
ReinUsesLisp
b8b6f94ba9 texture_cache: Use unordered_map::find instead of operator[] on hot code 2020-05-27 17:59:04 -03:00
bunnei
630fc12d4e Merge pull request #3961 from Morph1984/bgra8_srgb
maxwell_to_vk: Add format B8G8R8A8_SRGB and add Attachable capability for B8G8R8A8_UNORM
2020-05-27 16:44:22 -04:00
ReinUsesLisp
d2b2557542 texture_cache: Use small vector for surface vectors
This avoids most heap allocations when collecting surfaces into a
vector.
2020-05-27 17:31:14 -03:00
Morph
b2af304918 Fix macOS code and change "Swapfile" to "Swap" 2020-05-27 11:21:59 -04:00
ReinUsesLisp
32e6727dae shader/other: Implement MEMBAR.CTS
This silences an assertion we were hitting and uses workgroup memory
barriers when the game requests it.
2020-05-27 00:19:45 -03:00
ReinUsesLisp
b2c4521a91 texture_cache: Fix layered null surfaces
Null texture cubes were not considered arrays, causing issues on Vulkan
and OpenGL when creating views.
2020-05-26 17:50:08 -03:00
ReinUsesLisp
b17fe82973 gl_texture_cache: Implement small texture view cache for swizzles
This fixes cases where the texture swizzle was applied twice on the same
draw to a texture bound to two different slots.
2020-05-26 17:50:08 -03:00
ReinUsesLisp
8bba84a401 texture_cache: Implement depth stencil texture swizzles
Stop ignoring image swizzles on depth and stencil images.

This doesn't fix a known issue on Xenoblade Chronicles 2 where an OpenGL
texture changes swizzles twice before being used. A proper fix would be
having a small texture view cache for this like we do on Vulkan.
2020-05-26 17:44:50 -03:00
bunnei
508242c267 Merge pull request #3981 from ReinUsesLisp/bar
shader/other: Implement BAR.SYNC 0x0
2020-05-26 14:40:13 -04:00
bunnei
623d9c47a2 Merge pull request #3980 from ReinUsesLisp/red-op
shader/memory: Implement non-addition operations in RED
2020-05-26 12:50:41 -04:00
ReinUsesLisp
c13e2f1b75 gl_shader_manager: Unbind GLSL program when binding a host pipeline
Fixes regression in Link's Awakening caused by 420cc13248
2020-05-26 04:20:39 -03:00
bunnei
86345c126a Merge pull request #3978 from ReinUsesLisp/write-rz
shader_decompiler: Visit source nodes even when they assign to RZ
2020-05-25 21:31:33 -04:00
bunnei
1adabdac7f Merge pull request #3905 from FernandoS27/vulkan-fix
Correct a series of crashes and intructions on Async GPU and Vulkan Pipeline
2020-05-24 15:23:38 -04:00
bunnei
325e7eed3c Merge pull request #3964 from ReinUsesLisp/arb-integration
renderer_opengl: Add assembly program code paths
2020-05-24 00:34:12 -04:00
bunnei
487dd05170 Merge pull request #3979 from ReinUsesLisp/thread-group
shader/other: Implement thread comparisons (NV_shader_thread_group)
2020-05-24 00:33:06 -04:00
bunnei
6a5cf1473e Merge pull request #3975 from ReinUsesLisp/fast-bufcache
buffer_cache: Replace boost::icl::interval_map with boost::intrusive::set
2020-05-24 00:32:44 -04:00
Tobias
d0a9caa08f yuzu/discord_impl: Update the applicationID (#3977) 2020-05-22 18:26:26 +02:00
bunnei
1306608b2a Merge pull request #3970 from VolcaEM/patch-1
nifm: correct assert string in CreateTemporaryNetworkProfile
2020-05-21 23:46:42 -04:00
ReinUsesLisp
5d0986a53b shader/other: Implement BAR.SYNC 0x0
Trivially implement this particular case of BAR. Unless games use OpenCL
or CUDA barriers, we shouldn't hit any other case here.
2020-05-21 23:20:43 -03:00
ReinUsesLisp
103809a0ca shader/memory: Implement non-addition operations in RED
Trivially implement these instructions. They are used in Astral Chain.
2020-05-21 23:19:46 -03:00
ReinUsesLisp
e2b67a868b shader/other: Implement thread comparisons (NV_shader_thread_group)
Hardware S2R special registers match gl_Thread*MaskNV. We can trivially
implement these using Nvidia's extension on OpenGL or naively stubbing
them with the ARB instructions to match. This might cause issues if the
host device warp size doesn't match Nvidia's. That said, this is
unlikely on proper shaders.

Refer to the attached url for more documentation about these flags.
https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_thread_group.txt
2020-05-21 23:18:37 -03:00
ReinUsesLisp
ed4e324991 shader_decompiler: Visit source nodes even when they assign to RZ
Some operations like atomicMin were ignored because they returned were
being stored to RZ. This operations have a side effect and it was being
ignored.
2020-05-21 23:16:03 -03:00
ReinUsesLisp
434856c636 vk_shader_decompiler: Don't assert for void returns
Atomic instructions can be used without returning anything and this is
valid code. Remove the assert.
2020-05-21 23:16:03 -03:00
ReinUsesLisp
ebaace294f buffer_cache: Remove unused boost headers 2020-05-21 16:44:00 -03:00
ReinUsesLisp
a2dcc642c1 map_interval: Add interval allocator and drop hack
Drop the std::list hack to allocate memory indefinitely.

Instead use a custom allocator that keeps references valid until
destruction. This allocates fixed chunks of memory and puts pointers in
a free list. When an allocation is no longer used put it back to the
free list, this doesn't heap allocate because std::vector doesn't change
the capacity. If the free list is empty, allocate a new chunk.
2020-05-21 16:44:00 -03:00
ReinUsesLisp
19d4f28001 buffer_cache: Use boost::container::small_vector for maps in range
Most overlaps in the buffer cache only contain one mapped address.
We can avoid close to all heap allocations once the buffer cache is
warmed up by using a small_vector with a stack size of one.
2020-05-21 16:44:00 -03:00
ReinUsesLisp
891236124c buffer_cache: Use boost::intrusive::set for caching
Instead of using boost::icl::interval_map for caching, use
boost::intrusive::set. interval_map is intended as a container where the
keys can overlap with one another; we don't need this for caching
buffers and a std::set-like data structure that allows us to search with
lower_bound is enough.
2020-05-21 16:44:00 -03:00
ReinUsesLisp
3b0baf746e buffer_cache: Remove shared pointers
Removing shared pointers is a first step to be able to use intrusive
objects and keep allocations close to one another in memory.
2020-05-21 16:02:54 -03:00
ReinUsesLisp
599274e3f0 buffer_cache: Minor style changes
Minor style changes. Mostly done so I avoid editing it while doing other
changes.
2020-05-21 16:02:20 -03:00
VolcaEM
cb75ccc1f7 clang-format 2020-05-21 10:43:55 +02:00
VolcaEM
235805edf3 nifm: correct assert in CreateTemporaryNetworkProfile
This has been wrong since 0432af5ad1
I haven't found a game that called this function (and I haven't tried this on a real Switch), and because of this I haven't been able to check if the number in assert OR the string in the assert is wrong, but one of the two is wrong:
NetworkProfileData is 0x18E, while SfNetworkProfileData is 0x17C, according to Switchbrew
Switchbrew doesn't officially say that NetworkProfileData's size is 0x18E but it's possible to calculate its size since Switchbrew provides the size and the offset of all the components of NetworkProfileData (which isn't currently implemented in yuzu, alongside SfNetworkProfileData)
NetworkProfileData documentation: https://switchbrew.org/wiki/Network_Interface_services#NetworkProfileData
SfNetworkProfileData documentation: https://switchbrew.org/wiki/Network_Interface_services#SfNetworkProfileData
Since I trust ogniK's work on reversing NIFM, I'd assume this was just a typo in the string
2020-05-21 10:23:53 +02:00
bunnei
ae61e47cba Merge pull request #3946 from ogniK5377/sysverdat-10-0-2
file_sys: Update SystemVersion archive to version 10.0.2
2020-05-20 19:21:27 -04:00
ReinUsesLisp
420cc13248 renderer_opengl: Add assembly program code paths
Add code required to use OpenGL assembly programs based on
NV_gpu_program5. Decompilation for ARB programs is intended to be added
in a follow up commit. This does **not** include ARB decompilation and
it's not in an usable state.

The intention behind assembly programs is to reduce shader stutter
significantly on drivers supporting NV_gpu_program5 (and other required
extensions). Currently only Nvidia's proprietary driver supports these
extensions.

Add a UI option hidden for now to avoid people enabling this option
accidentally.

This code path has some limitations that OpenGL compatibility doesn't
have:
- NV_shader_storage_buffer_object is limited to 16 entries for a single
OpenGL context state (I don't know if this is an intended limitation, an
specification issue or I am missing something). Currently causes issues
on The Legend of Zelda: Link's Awakening.
- NV_parameter_buffer_object can't bind buffers using an offset
different to zero. The used workaround is to copy to a temporary buffer
(this doesn't happen often so it's not an issue).

On the other hand, it has the following advantages:
- Shaders build a lot faster.
- We have control over how floating point rounding is done over
individual instructions (SPIR-V on Vulkan can't do this).
- Operations on shared memory can be unsigned and signed.
- Transform feedbacks are dynamic state (not yet implemented).
- Parameter buffers (uniform buffers) are per stage, matching NVN and
hardware's behavior.
- The API to bind and create assembly programs makes sense, unlike
ARB_separate_shader_objects.
2020-05-19 18:00:04 -03:00
ReinUsesLisp
47a7c4f4fe yuzu: Add frontend settings for assembly shaders
Add settings for assembly shaders. Currently hidden to avoid users from
accidentally enabled them.
2020-05-19 17:53:17 -03:00
bunnei
cf4ee279c6 Merge pull request #3926 from ogniK5377/keyboard-states
hid: Clear keyboard states & fix logic issue
2020-05-19 15:55:14 -04:00
Morph
d0fc12684a maxwell_to_vk: Add format B8G8R8A8_SRGB
Add format B8G8R8A8_SRGB and add Attachable capability for B8G8R8A8_UNORM
Used by Bravely Default II
2020-05-18 13:02:09 -04:00
Fernando Sahmkow
4cff5dd194 OpenGL: Enable Debug Context and Synchronous debugging when graphics debugging is enabled.
This commit aims to help easing debugging of driver crashes without
having to modify existing code.
2020-05-17 21:45:09 -04:00
Morph
9a36d8600c main: Log host system memory parameters
Logs both physical memory and swapfile sizes, this is useful for support.
2020-05-17 14:45:12 -04:00
David Marcec
17455b7222 file_sys: Update SystemVersion archive to version 10.0.2 2020-05-16 13:56:06 +10:00
ReinUsesLisp
91dddca26e vk_rasterizer: Implement constant attributes
Constant attributes (in OpenGL known disabled attributes) are not
supported on Vulkan, even with extensions. To emulate this behavior we
return zero on reads from disabled vertex attributes in shader code.
This has no caching cost because attribute formats are not dynamic state
on Vulkan and we have to store it in the pipeline cache anyway.

- Fixes Animal Crossing: New Horizons terrain borders
2020-05-13 04:36:47 -03:00
ReinUsesLisp
cf6a40fc12 vk_rasterizer: Remove buffer check in attribute selection
This was a left over from OpenGL when disabled buffers where not properly
emulated. We no longer have to assert this as it is checked in vertex
buffer initialization.
2020-05-13 04:36:47 -03:00
David Marcec
a79f060ea2 hid: Clear keyboard states & fix logic issue
Previously we never cleared the states of the entries and the key would stay held down, also looping over the key bytes for each key lead to setting every bit for the key state instead of the key we wanted
2020-05-12 12:40:50 +10:00
Fernando Sahmkow
1887afaf9e RasterizerCache: Correct documentation. 2020-05-09 21:03:39 -04:00
Fernando Sahmkow
8d15f8b28e VkPipelineCache: Use a null shader on invalid address. 2020-05-09 20:51:34 -04:00
Fernando Sahmkow
0a4be73b9b VideoCore: Use SyncGuestMemory mechanism for Shader/Pipeline Cache invalidation. 2020-05-09 19:25:29 -04:00
64 changed files with 1577 additions and 540 deletions

3
.gitmodules vendored
View File

@@ -28,3 +28,6 @@
[submodule "libzip"]
path = externals/libzip/libzip
url = https://github.com/nih-at/libzip.git
[submodule "xbyak"]
path = externals/xbyak
url = https://github.com/herumi/xbyak.git

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.11)
cmake_minimum_required(VERSION 3.15)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
@@ -13,7 +13,7 @@ project(yuzu)
option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)
option(ENABLE_QT "Enable the Qt frontend" ON)
CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)
option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)

View File

@@ -75,3 +75,11 @@ if (ENABLE_WEB_SERVICE)
target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
endif()
if (NOT TARGET xbyak)
if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
add_library(xbyak INTERFACE)
target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
endif()
endif()

1
externals/xbyak vendored Submodule

Submodule externals/xbyak added at 82b70e6659

View File

@@ -123,6 +123,8 @@ add_library(common STATIC
lz4_compression.cpp
lz4_compression.h
math_util.h
memory_detect.cpp
memory_detect.h
memory_hook.cpp
memory_hook.h
microprofile.cpp
@@ -169,10 +171,12 @@ if(ARCHITECTURE_x86_64)
PRIVATE
x64/cpu_detect.cpp
x64/cpu_detect.h
x64/xbyak_abi.h
x64/xbyak_util.h
)
endif()
create_target_directory_groups(common)
target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)

View File

@@ -0,0 +1,60 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#ifdef _WIN32
// clang-format off
#include <windows.h>
#include <sysinfoapi.h>
// clang-format on
#else
#include <sys/types.h>
#ifdef __APPLE__
#include <sys/sysctl.h>
#else
#include <sys/sysinfo.h>
#endif
#endif
#include "common/memory_detect.h"
namespace Common {
// Detects the RAM and Swapfile sizes
static MemoryInfo Detect() {
MemoryInfo mem_info{};
#ifdef _WIN32
MEMORYSTATUSEX memorystatus;
memorystatus.dwLength = sizeof(memorystatus);
GlobalMemoryStatusEx(&memorystatus);
mem_info.TotalPhysicalMemory = memorystatus.ullTotalPhys;
mem_info.TotalSwapMemory = memorystatus.ullTotalPageFile - mem_info.TotalPhysicalMemory;
#elif defined(__APPLE__)
u64 ramsize;
struct xsw_usage vmusage;
std::size_t sizeof_ramsize = sizeof(ramsize);
std::size_t sizeof_vmusage = sizeof(vmusage);
// hw and vm are defined in sysctl.h
// https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
// sysctlbyname(const char *, void *, size_t *, void *, size_t);
sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
mem_info.TotalPhysicalMemory = ramsize;
mem_info.TotalSwapMemory = vmusage.xsu_total;
#else
struct sysinfo meminfo;
sysinfo(&meminfo);
mem_info.TotalPhysicalMemory = meminfo.totalram;
mem_info.TotalSwapMemory = meminfo.totalswap;
#endif
return mem_info;
}
const MemoryInfo& GetMemInfo() {
static MemoryInfo mem_info = Detect();
return mem_info;
}
} // namespace Common

View File

@@ -0,0 +1,22 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace Common {
struct MemoryInfo {
u64 TotalPhysicalMemory{};
u64 TotalSwapMemory{};
};
/**
* Gets the memory info of the host system
* @return Reference to a MemoryInfo struct with the physical and swap memory sizes in bytes
*/
const MemoryInfo& GetMemInfo();
} // namespace Common

266
src/common/x64/xbyak_abi.h Normal file
View File

@@ -0,0 +1,266 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <bitset>
#include <initializer_list>
#include <xbyak.h>
#include "common/assert.h"
namespace Common::X64 {
inline int RegToIndex(const Xbyak::Reg& reg) {
using Kind = Xbyak::Reg::Kind;
ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
"RegSet only support GPRs and XMM registers.");
ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15.");
return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
}
inline Xbyak::Reg64 IndexToReg64(int reg_index) {
ASSERT(reg_index < 16);
return Xbyak::Reg64(reg_index);
}
inline Xbyak::Xmm IndexToXmm(int reg_index) {
ASSERT(reg_index >= 16 && reg_index < 32);
return Xbyak::Xmm(reg_index - 16);
}
inline Xbyak::Reg IndexToReg(int reg_index) {
if (reg_index < 16) {
return IndexToReg64(reg_index);
} else {
return IndexToXmm(reg_index);
}
}
inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
std::bitset<32> bits;
for (const Xbyak::Reg& reg : regs) {
bits[RegToIndex(reg)] = true;
}
return bits;
}
const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
#ifdef _WIN32
// Microsoft x64 ABI
const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rcx,
Xbyak::util::rdx,
Xbyak::util::r8,
Xbyak::util::r9,
Xbyak::util::r10,
Xbyak::util::r11,
// XMMs
Xbyak::util::xmm0,
Xbyak::util::xmm1,
Xbyak::util::xmm2,
Xbyak::util::xmm3,
Xbyak::util::xmm4,
Xbyak::util::xmm5,
});
const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rbx,
Xbyak::util::rsi,
Xbyak::util::rdi,
Xbyak::util::rbp,
Xbyak::util::r12,
Xbyak::util::r13,
Xbyak::util::r14,
Xbyak::util::r15,
// XMMs
Xbyak::util::xmm6,
Xbyak::util::xmm7,
Xbyak::util::xmm8,
Xbyak::util::xmm9,
Xbyak::util::xmm10,
Xbyak::util::xmm11,
Xbyak::util::xmm12,
Xbyak::util::xmm13,
Xbyak::util::xmm14,
Xbyak::util::xmm15,
});
constexpr size_t ABI_SHADOW_SPACE = 0x20;
#else
// System V x86-64 ABI
const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rcx,
Xbyak::util::rdx,
Xbyak::util::rdi,
Xbyak::util::rsi,
Xbyak::util::r8,
Xbyak::util::r9,
Xbyak::util::r10,
Xbyak::util::r11,
// XMMs
Xbyak::util::xmm0,
Xbyak::util::xmm1,
Xbyak::util::xmm2,
Xbyak::util::xmm3,
Xbyak::util::xmm4,
Xbyak::util::xmm5,
Xbyak::util::xmm6,
Xbyak::util::xmm7,
Xbyak::util::xmm8,
Xbyak::util::xmm9,
Xbyak::util::xmm10,
Xbyak::util::xmm11,
Xbyak::util::xmm12,
Xbyak::util::xmm13,
Xbyak::util::xmm14,
Xbyak::util::xmm15,
});
const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rbx,
Xbyak::util::rbp,
Xbyak::util::r12,
Xbyak::util::r13,
Xbyak::util::r14,
Xbyak::util::r15,
});
constexpr size_t ABI_SHADOW_SPACE = 0;
#endif
inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
size_t needed_frame_size, s32* out_subtraction,
s32* out_xmm_offset) {
const auto count = (regs & ABI_ALL_GPRS).count();
rsp_alignment -= count * 8;
size_t subtraction = 0;
const auto xmm_count = (regs & ABI_ALL_XMMS).count();
if (xmm_count) {
// If we have any XMMs to save, we must align the stack here.
subtraction = rsp_alignment & 0xF;
}
subtraction += 0x10 * xmm_count;
size_t xmm_base_subtraction = subtraction;
subtraction += needed_frame_size;
subtraction += ABI_SHADOW_SPACE;
// Final alignment.
rsp_alignment -= subtraction;
subtraction += rsp_alignment & 0xF;
*out_subtraction = (s32)subtraction;
*out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
}
inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
size_t rsp_alignment, size_t needed_frame_size = 0) {
s32 subtraction, xmm_offset;
ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
for (std::size_t i = 0; i < regs.size(); ++i) {
if (regs[i] && ABI_ALL_GPRS[i]) {
code.push(IndexToReg64(static_cast<int>(i)));
}
}
if (subtraction != 0) {
code.sub(code.rsp, subtraction);
}
for (int i = 0; i < regs.count(); i++) {
if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
code.push(IndexToReg64(i));
}
}
for (std::size_t i = 0; i < regs.size(); ++i) {
if (regs[i] && ABI_ALL_XMMS[i]) {
code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
xmm_offset += 0x10;
}
}
return ABI_SHADOW_SPACE;
}
inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
size_t rsp_alignment, size_t needed_frame_size = 0) {
s32 subtraction, xmm_offset;
ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
for (std::size_t i = 0; i < regs.size(); ++i) {
if (regs[i] && ABI_ALL_XMMS[i]) {
code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
xmm_offset += 0x10;
}
}
if (subtraction != 0) {
code.add(code.rsp, subtraction);
}
// GPRs need to be popped in reverse order
for (int i = 15; i >= 0; i--) {
if (regs[i]) {
code.pop(IndexToReg64(i));
}
}
}
inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
size_t rsp_alignment,
size_t needed_frame_size = 0) {
s32 subtraction, xmm_offset;
ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
for (std::size_t i = 0; i < regs.size(); ++i) {
if (regs[i] && ABI_ALL_GPRS[i]) {
code.push(IndexToReg64(static_cast<int>(i)));
}
}
if (subtraction != 0) {
code.sub(code.rsp, subtraction);
}
return ABI_SHADOW_SPACE;
}
inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
size_t rsp_alignment, size_t needed_frame_size = 0) {
s32 subtraction, xmm_offset;
ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
if (subtraction != 0) {
code.add(code.rsp, subtraction);
}
// GPRs need to be popped in reverse order
for (int i = 15; i >= 0; i--) {
if (regs[i]) {
code.pop(IndexToReg64(i));
}
}
}
} // namespace Common::X64

View File

@@ -0,0 +1,47 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <type_traits>
#include <xbyak.h>
#include "common/x64/xbyak_abi.h"
namespace Common::X64 {
// Constants for use with cmpps/cmpss
enum {
CMP_EQ = 0,
CMP_LT = 1,
CMP_LE = 2,
CMP_UNORD = 3,
CMP_NEQ = 4,
CMP_NLT = 5,
CMP_NLE = 6,
CMP_ORD = 7,
};
constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
const u64 distance = target - (ref + 5);
return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
}
inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
}
template <typename T>
inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
size_t addr = reinterpret_cast<size_t>(f);
if (IsWithin2G(code, addr)) {
code.call(f);
} else {
// ABI_RETURN is a safe temp register to use before a call
code.mov(ABI_RETURN, addr);
code.call(ABI_RETURN);
}
}
} // namespace Common::X64

View File

@@ -10,6 +10,7 @@
#include "common/file_util.h"
#include "common/hex_util.h"
#include "common/logging/log.h"
#include "common/string_util.h"
#include "core/core.h"
#include "core/file_sys/content_archive.h"
#include "core/file_sys/control_metadata.h"
@@ -48,6 +49,23 @@ std::string FormatTitleVersion(u32 version, TitleVersionFormat format) {
return fmt::format("v{}.{}.{}", bytes[3], bytes[2], bytes[1]);
}
std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
std::string_view name) {
#ifdef _WIN32
return dir->GetSubdirectory(name);
#else
const auto subdirs = dir->GetSubdirectories();
for (const auto& subdir : subdirs) {
std::string dir_name = Common::ToLower(subdir->GetName());
if (dir_name == name) {
return subdir;
}
}
return nullptr;
#endif
}
PatchManager::PatchManager(u64 title_id) : title_id(title_id) {}
PatchManager::~PatchManager() = default;
@@ -104,7 +122,7 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
if (std::find(disabled.begin(), disabled.end(), subdir->GetName()) != disabled.end())
continue;
auto exefs_dir = subdir->GetSubdirectory("exefs");
auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
if (exefs_dir != nullptr)
layers.push_back(std::move(exefs_dir));
}
@@ -130,7 +148,7 @@ std::vector<VirtualFile> PatchManager::CollectPatches(const std::vector<VirtualD
if (std::find(disabled.cbegin(), disabled.cend(), subdir->GetName()) != disabled.cend())
continue;
auto exefs_dir = subdir->GetSubdirectory("exefs");
auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
if (exefs_dir != nullptr) {
for (const auto& file : exefs_dir->GetFiles()) {
if (file->GetExtension() == "ips") {
@@ -295,7 +313,7 @@ std::vector<Core::Memory::CheatEntry> PatchManager::CreateCheatList(
continue;
}
auto cheats_dir = subdir->GetSubdirectory("cheats");
auto cheats_dir = FindSubdirectoryCaseless(subdir, "cheats");
if (cheats_dir != nullptr) {
auto res = ReadCheatFileFromFolder(system, title_id, build_id_, cheats_dir, true);
if (res.has_value()) {
@@ -340,11 +358,11 @@ static void ApplyLayeredFS(VirtualFile& romfs, u64 title_id, ContentRecordType t
continue;
}
auto romfs_dir = subdir->GetSubdirectory("romfs");
auto romfs_dir = FindSubdirectoryCaseless(subdir, "romfs");
if (romfs_dir != nullptr)
layers.push_back(std::move(romfs_dir));
auto ext_dir = subdir->GetSubdirectory("romfs_ext");
auto ext_dir = FindSubdirectoryCaseless(subdir, "romfs_ext");
if (ext_dir != nullptr)
layers_ext.push_back(std::move(ext_dir));
}
@@ -470,7 +488,7 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
for (const auto& mod : mod_dir->GetSubdirectories()) {
std::string types;
const auto exefs_dir = mod->GetSubdirectory("exefs");
const auto exefs_dir = FindSubdirectoryCaseless(mod, "exefs");
if (IsDirValidAndNonEmpty(exefs_dir)) {
bool ips = false;
bool ipswitch = false;
@@ -494,9 +512,9 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
if (layeredfs)
AppendCommaIfNotEmpty(types, "LayeredExeFS");
}
if (IsDirValidAndNonEmpty(mod->GetSubdirectory("romfs")))
if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "romfs")))
AppendCommaIfNotEmpty(types, "LayeredFS");
if (IsDirValidAndNonEmpty(mod->GetSubdirectory("cheats")))
if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "cheats")))
AppendCommaIfNotEmpty(types, "Cheats");
if (types.empty())

View File

@@ -29,6 +29,11 @@ enum class TitleVersionFormat : u8 {
std::string FormatTitleVersion(u32 version,
TitleVersionFormat format = TitleVersionFormat::ThreeElements);
// Returns a directory with name matching name case-insensitive. Returns nullptr if directory
// doesn't have a directory with name.
std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
std::string_view name);
// A centralized class to manage patches to games.
class PatchManager {
public:

View File

@@ -12,17 +12,17 @@ namespace SystemVersionData {
// This section should reflect the best system version to describe yuzu's HLE api.
// TODO(DarkLordZach): Update when HLE gets better.
constexpr u8 VERSION_MAJOR = 5;
constexpr u8 VERSION_MINOR = 1;
constexpr u8 VERSION_MICRO = 0;
constexpr u8 VERSION_MAJOR = 10;
constexpr u8 VERSION_MINOR = 0;
constexpr u8 VERSION_MICRO = 2;
constexpr u8 REVISION_MAJOR = 3;
constexpr u8 REVISION_MAJOR = 1;
constexpr u8 REVISION_MINOR = 0;
constexpr char PLATFORM_STRING[] = "NX";
constexpr char VERSION_HASH[] = "23f9df53e25709d756e0c76effcb2473bd3447dd";
constexpr char DISPLAY_VERSION[] = "5.1.0";
constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 5.1.0-3.0";
constexpr char VERSION_HASH[] = "f90143fa8bbc061d4f68c35f95f04f8080c0ecdc";
constexpr char DISPLAY_VERSION[] = "10.0.2";
constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 10.0.2-1.0";
} // namespace SystemVersionData

View File

@@ -38,10 +38,11 @@ void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing,
cur_entry.sampling_number = last_entry.sampling_number + 1;
cur_entry.sampling_number2 = cur_entry.sampling_number;
cur_entry.key.fill(0);
cur_entry.modifier = 0;
for (std::size_t i = 0; i < keyboard_keys.size(); ++i) {
for (std::size_t k = 0; k < KEYS_PER_BYTE; ++k) {
cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << k);
}
cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << (i % KEYS_PER_BYTE));
}
for (std::size_t i = 0; i < keyboard_mods.size(); ++i) {

View File

@@ -177,7 +177,8 @@ private:
void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_NIFM, "called");
ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size");
ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
"SfNetworkProfileData is not the correct size");
u128 uuid{};
auto buffer = ctx.ReadBuffer();
std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));

View File

@@ -112,6 +112,7 @@ void LogSettings() {
LogSetting("Renderer_UseAsynchronousGpuEmulation",
Settings::values.use_asynchronous_gpu_emulation);
LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
LogSetting("Audio_OutputEngine", Settings::values.sink_id);
LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);

View File

@@ -446,6 +446,7 @@ struct Values {
GPUAccuracy gpu_accuracy;
bool use_asynchronous_gpu_emulation;
bool use_vsync;
bool use_assembly_shaders;
bool force_30fps_mode;
bool use_fast_gpu_time;

View File

@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
Settings::values.use_asynchronous_gpu_emulation);
AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
}

View File

@@ -1,6 +1,7 @@
add_library(video_core STATIC
buffer_cache/buffer_block.h
buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp
buffer_cache/map_interval.h
dirty_flags.cpp
dirty_flags.h
@@ -228,7 +229,7 @@ endif()
create_target_directory_groups(video_core)
target_link_libraries(video_core PUBLIC common core)
target_link_libraries(video_core PRIVATE glad)
target_link_libraries(video_core PRIVATE glad xbyak)
if (ENABLE_VULKAN)
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)

View File

@@ -12,11 +12,12 @@
#include <utility>
#include <vector>
#include <boost/icl/interval_map.hpp>
#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_set.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/intrusive/set.hpp>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "core/core.h"
@@ -29,10 +30,12 @@
namespace VideoCommon {
using MapInterval = std::shared_ptr<MapIntervalBase>;
template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
class BufferCache {
using IntervalSet = boost::icl::interval_set<VAddr>;
using IntervalType = typename IntervalSet::interval_type;
using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
public:
using BufferInfo = std::pair<BufferType, u64>;
@@ -40,14 +43,12 @@ public:
bool is_written = false, bool use_fast_cbuf = false) {
std::lock_guard lock{mutex};
const std::optional<VAddr> cpu_addr_opt =
system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
const auto& memory_manager = system.GPU().MemoryManager();
const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
if (!cpu_addr_opt) {
return {GetEmptyBuffer(size), 0};
}
VAddr cpu_addr = *cpu_addr_opt;
const VAddr cpu_addr = *cpu_addr_opt;
// Cache management is a big overhead, so only cache entries with a given size.
// TODO: Figure out which size is the best for given games.
@@ -77,16 +78,19 @@ public:
}
}
auto block = GetBlock(cpu_addr, size);
auto map = MapAddress(block, gpu_addr, cpu_addr, size);
OwnerBuffer block = GetBlock(cpu_addr, size);
MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
if (!map) {
return {GetEmptyBuffer(size), 0};
}
if (is_written) {
map->MarkAsModified(true, GetModifiedTicks());
if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
MarkForAsyncFlush(map);
}
if (!map->IsWritten()) {
map->MarkAsWritten(true);
MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
if (!map->is_written) {
map->is_written = true;
MarkRegionAsWritten(map->start, map->end - 1);
}
}
@@ -132,12 +136,11 @@ public:
void FlushRegion(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
std::vector<MapInterval> objects = GetMapsInRange(addr, size);
std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
return a->GetModificationTick() < b->GetModificationTick();
});
for (auto& object : objects) {
if (object->IsModified() && object->IsRegistered()) {
VectorMapInterval objects = GetMapsInRange(addr, size);
std::sort(objects.begin(), objects.end(),
[](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
for (MapInterval* object : objects) {
if (object->is_modified && object->is_registered) {
mutex.unlock();
FlushMap(object);
mutex.lock();
@@ -148,9 +151,9 @@ public:
bool MustFlushRegion(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
const std::vector<MapInterval> objects = GetMapsInRange(addr, size);
return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) {
return map->IsModified() && map->IsRegistered();
const VectorMapInterval objects = GetMapsInRange(addr, size);
return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
return map->is_modified && map->is_registered;
});
}
@@ -158,9 +161,8 @@ public:
void InvalidateRegion(VAddr addr, u64 size) {
std::lock_guard lock{mutex};
std::vector<MapInterval> objects = GetMapsInRange(addr, size);
for (auto& object : objects) {
if (object->IsRegistered()) {
for (auto& object : GetMapsInRange(addr, size)) {
if (object->is_registered) {
Unregister(object);
}
}
@@ -169,10 +171,10 @@ public:
void OnCPUWrite(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
for (const auto& object : GetMapsInRange(addr, size)) {
if (object->IsMemoryMarked() && object->IsRegistered()) {
for (MapInterval* object : GetMapsInRange(addr, size)) {
if (object->is_memory_marked && object->is_registered) {
UnmarkMemory(object);
object->SetSyncPending(true);
object->is_sync_pending = true;
marked_for_unregister.emplace_back(object);
}
}
@@ -181,9 +183,9 @@ public:
void SyncGuestHost() {
std::lock_guard lock{mutex};
for (const auto& object : marked_for_unregister) {
if (object->IsRegistered()) {
object->SetSyncPending(false);
for (auto& object : marked_for_unregister) {
if (object->is_registered) {
object->is_sync_pending = false;
Unregister(object);
}
}
@@ -192,9 +194,9 @@ public:
void CommitAsyncFlushes() {
if (uncommitted_flushes) {
auto commit_list = std::make_shared<std::list<MapInterval>>();
for (auto& map : *uncommitted_flushes) {
if (map->IsRegistered() && map->IsModified()) {
auto commit_list = std::make_shared<std::list<MapInterval*>>();
for (MapInterval* map : *uncommitted_flushes) {
if (map->is_registered && map->is_modified) {
// TODO(Blinkhawk): Implement backend asynchronous flushing
// AsyncFlushMap(map)
commit_list->push_back(map);
@@ -228,8 +230,8 @@ public:
committed_flushes.pop_front();
return;
}
for (MapInterval& map : *flush_list) {
if (map->IsRegistered()) {
for (MapInterval* map : *flush_list) {
if (map->is_registered) {
// TODO(Blinkhawk): Replace this for reading the asynchronous flush
FlushMap(map);
}
@@ -265,61 +267,60 @@ protected:
}
/// Register an object into the cache
void Register(const MapInterval& new_map, bool inherit_written = false) {
const VAddr cpu_addr = new_map->GetStart();
MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
const VAddr cpu_addr = new_map.start;
if (!cpu_addr) {
LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
new_map->GetGpuAddress());
return;
new_map.gpu_addr);
return nullptr;
}
const std::size_t size = new_map->GetEnd() - new_map->GetStart();
new_map->MarkAsRegistered(true);
const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
mapped_addresses.insert({interval, new_map});
const std::size_t size = new_map.end - new_map.start;
new_map.is_registered = true;
rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
new_map->SetMemoryMarked(true);
new_map.is_memory_marked = true;
if (inherit_written) {
MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
new_map->MarkAsWritten(true);
MarkRegionAsWritten(new_map.start, new_map.end - 1);
new_map.is_written = true;
}
MapInterval* const storage = mapped_addresses_allocator.Allocate();
*storage = new_map;
mapped_addresses.insert(*storage);
return storage;
}
void UnmarkMemory(const MapInterval& map) {
if (!map->IsMemoryMarked()) {
void UnmarkMemory(MapInterval* map) {
if (!map->is_memory_marked) {
return;
}
const std::size_t size = map->GetEnd() - map->GetStart();
rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1);
map->SetMemoryMarked(false);
const std::size_t size = map->end - map->start;
rasterizer.UpdatePagesCachedCount(map->start, size, -1);
map->is_memory_marked = false;
}
/// Unregisters an object from the cache
void Unregister(const MapInterval& map) {
void Unregister(MapInterval* map) {
UnmarkMemory(map);
map->MarkAsRegistered(false);
if (map->IsSyncPending()) {
map->is_registered = false;
if (map->is_sync_pending) {
map->is_sync_pending = false;
marked_for_unregister.remove(map);
map->SetSyncPending(false);
}
if (map->IsWritten()) {
UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
if (map->is_written) {
UnmarkRegionAsWritten(map->start, map->end - 1);
}
const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
mapped_addresses.erase(delete_interval);
const auto it = mapped_addresses.find(*map);
ASSERT(it != mapped_addresses.end());
mapped_addresses.erase(it);
mapped_addresses_allocator.Release(map);
}
private:
MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) {
return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
}
MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
const std::size_t size) {
std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
std::size_t size) {
const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
if (overlaps.empty()) {
auto& memory_manager = system.GPU().MemoryManager();
const VAddr cpu_addr_end = cpu_addr + size;
MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
if (memory_manager.IsGranularRange(gpu_addr, size)) {
u8* host_ptr = memory_manager.GetPointer(gpu_addr);
UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
@@ -328,13 +329,12 @@ private:
memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
}
Register(new_map);
return new_map;
return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
}
const VAddr cpu_addr_end = cpu_addr + size;
if (overlaps.size() == 1) {
MapInterval& current_map = overlaps[0];
MapInterval* const current_map = overlaps[0];
if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
return current_map;
}
@@ -344,35 +344,39 @@ private:
bool write_inheritance = false;
bool modified_inheritance = false;
// Calculate new buffer parameters
for (auto& overlap : overlaps) {
new_start = std::min(overlap->GetStart(), new_start);
new_end = std::max(overlap->GetEnd(), new_end);
write_inheritance |= overlap->IsWritten();
modified_inheritance |= overlap->IsModified();
for (MapInterval* overlap : overlaps) {
new_start = std::min(overlap->start, new_start);
new_end = std::max(overlap->end, new_end);
write_inheritance |= overlap->is_written;
modified_inheritance |= overlap->is_modified;
}
GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
for (auto& overlap : overlaps) {
Unregister(overlap);
}
UpdateBlock(block, new_start, new_end, overlaps);
MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
const MapInterval new_map{new_start, new_end, new_gpu_addr};
MapInterval* const map = Register(new_map, write_inheritance);
if (!map) {
return nullptr;
}
if (modified_inheritance) {
new_map->MarkAsModified(true, GetModifiedTicks());
map->MarkAsModified(true, GetModifiedTicks());
if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
MarkForAsyncFlush(new_map);
MarkForAsyncFlush(map);
}
}
Register(new_map, write_inheritance);
return new_map;
return map;
}
void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
std::vector<MapInterval>& overlaps) {
const VectorMapInterval& overlaps) {
const IntervalType base_interval{start, end};
IntervalSet interval_set{};
interval_set.add(base_interval);
for (auto& overlap : overlaps) {
const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
const IntervalType subtract{overlap->start, overlap->end};
interval_set.subtract(subtract);
}
for (auto& interval : interval_set) {
@@ -386,18 +390,24 @@ private:
}
}
std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) {
VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
VectorMapInterval result;
if (size == 0) {
return {};
return result;
}
std::vector<MapInterval> objects{};
const IntervalType interval{addr, addr + size};
for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
objects.push_back(pair.second);
const VAddr addr_end = addr + size;
auto it = mapped_addresses.lower_bound(addr);
if (it != mapped_addresses.begin()) {
--it;
}
return objects;
while (it != mapped_addresses.end() && it->start < addr_end) {
if (it->Overlaps(addr, addr_end)) {
result.push_back(&*it);
}
++it;
}
return result;
}
/// Returns a ticks counter used for tracking when cached objects were last modified
@@ -405,12 +415,12 @@ private:
return ++modified_ticks;
}
void FlushMap(MapInterval map) {
std::size_t size = map->GetEnd() - map->GetStart();
OwnerBuffer block = blocks[map->GetStart() >> block_page_bits];
void FlushMap(MapInterval* map) {
const std::size_t size = map->end - map->start;
OwnerBuffer block = blocks[map->start >> block_page_bits];
staging_buffer.resize(size);
DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data());
system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size);
DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
map->MarkAsModified(false, 0);
}
@@ -515,7 +525,7 @@ private:
} else {
written_pages[page_start] = 1;
}
page_start++;
++page_start;
}
}
@@ -531,7 +541,7 @@ private:
written_pages.erase(it);
}
}
page_start++;
++page_start;
}
}
@@ -542,14 +552,14 @@ private:
if (written_pages.count(page_start) > 0) {
return true;
}
page_start++;
++page_start;
}
return false;
}
void MarkForAsyncFlush(MapInterval& map) {
void MarkForAsyncFlush(MapInterval* map) {
if (!uncommitted_flushes) {
uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>();
uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
}
uncommitted_flushes->insert(map);
}
@@ -566,10 +576,9 @@ private:
u64 buffer_offset = 0;
u64 buffer_offset_base = 0;
using IntervalSet = boost::icl::interval_set<VAddr>;
using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>;
using IntervalType = typename IntervalCache::interval_type;
IntervalCache mapped_addresses;
MapIntervalAllocator mapped_addresses_allocator;
boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
mapped_addresses;
static constexpr u64 write_page_bit = 11;
std::unordered_map<u64, u32> written_pages;
@@ -583,10 +592,10 @@ private:
u64 modified_ticks = 0;
std::vector<u8> staging_buffer;
std::list<MapInterval> marked_for_unregister;
std::list<MapInterval*> marked_for_unregister;
std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{};
std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes;
std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
std::recursive_mutex mutex;
};

View File

@@ -0,0 +1,33 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <array>
#include <cstddef>
#include <memory>
#include "video_core/buffer_cache/map_interval.h"
namespace VideoCommon {
MapIntervalAllocator::MapIntervalAllocator() {
FillFreeList(first_chunk);
}
MapIntervalAllocator::~MapIntervalAllocator() = default;
void MapIntervalAllocator::AllocateNewChunk() {
*new_chunk = std::make_unique<Chunk>();
FillFreeList(**new_chunk);
new_chunk = &(*new_chunk)->next;
}
void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
const std::size_t old_size = free_list.size();
free_list.resize(old_size + chunk.data.size());
std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
[](MapInterval& interval) { return &interval; });
}
} // namespace VideoCommon

View File

@@ -4,104 +4,89 @@
#pragma once
#include <array>
#include <cstddef>
#include <memory>
#include <vector>
#include <boost/intrusive/set_hook.hpp>
#include "common/common_types.h"
#include "video_core/gpu.h"
namespace VideoCommon {
class MapIntervalBase {
public:
MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
: start{start}, end{end}, gpu_addr{gpu_addr} {}
struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
MapInterval() = default;
void SetCpuAddress(VAddr new_cpu_addr) {
cpu_addr = new_cpu_addr;
/*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
: start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
return start <= other_start && other_end <= end;
}
VAddr GetCpuAddress() const {
return cpu_addr;
bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
return start < other_end && other_start < end;
}
GPUVAddr GetGpuAddress() const {
return gpu_addr;
}
bool IsInside(const VAddr other_start, const VAddr other_end) const {
return (start <= other_start && other_end <= end);
}
bool operator==(const MapIntervalBase& rhs) const {
return std::tie(start, end) == std::tie(rhs.start, rhs.end);
}
bool operator!=(const MapIntervalBase& rhs) const {
return !operator==(rhs);
}
void MarkAsRegistered(const bool registered) {
is_registered = registered;
}
bool IsRegistered() const {
return is_registered;
}
void SetMemoryMarked(bool is_memory_marked_) {
is_memory_marked = is_memory_marked_;
}
bool IsMemoryMarked() const {
return is_memory_marked;
}
void SetSyncPending(bool is_sync_pending_) {
is_sync_pending = is_sync_pending_;
}
bool IsSyncPending() const {
return is_sync_pending;
}
VAddr GetStart() const {
return start;
}
VAddr GetEnd() const {
return end;
}
void MarkAsModified(const bool is_modified_, const u64 tick) {
void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
is_modified = is_modified_;
ticks = tick;
ticks = ticks_;
}
bool IsModified() const {
return is_modified;
boost::intrusive::set_member_hook<> member_hook_;
VAddr start = 0;
VAddr end = 0;
GPUVAddr gpu_addr = 0;
u64 ticks = 0;
bool is_written = false;
bool is_modified = false;
bool is_registered = false;
bool is_memory_marked = false;
bool is_sync_pending = false;
};
struct MapIntervalCompare {
constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
return lhs.start < rhs.start;
}
};
class MapIntervalAllocator {
public:
MapIntervalAllocator();
~MapIntervalAllocator();
MapInterval* Allocate() {
if (free_list.empty()) {
AllocateNewChunk();
}
MapInterval* const interval = free_list.back();
free_list.pop_back();
return interval;
}
u64 GetModificationTick() const {
return ticks;
}
void MarkAsWritten(const bool is_written_) {
is_written = is_written_;
}
bool IsWritten() const {
return is_written;
void Release(MapInterval* interval) {
free_list.push_back(interval);
}
private:
VAddr start;
VAddr end;
GPUVAddr gpu_addr;
VAddr cpu_addr{};
bool is_written{};
bool is_modified{};
bool is_registered{};
bool is_memory_marked{};
bool is_sync_pending{};
u64 ticks{};
struct Chunk {
std::unique_ptr<Chunk> next;
std::array<MapInterval, 0x8000> data;
};
void AllocateNewChunk();
void FillFreeList(Chunk& chunk);
std::vector<MapInterval*> free_list;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
Chunk first_chunk;
};
} // namespace VideoCommon

View File

@@ -457,8 +457,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
void Maxwell3D::ProcessQueryGet() {
// TODO(Subv): Support the other query units.
ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
"Units other than CROP are unimplemented");
if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
}
switch (regs.query.query_get.operation) {
case Regs::QueryOperation::Release:
@@ -534,8 +535,8 @@ void Maxwell3D::ProcessCounterReset() {
rasterizer.ResetCounter(QueryType::SamplesPassed);
break;
default:
LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
static_cast<int>(regs.counter_reset));
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
static_cast<int>(regs.counter_reset));
break;
}
}
@@ -592,8 +593,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
system.GPU().GetTicks());
return {};
default:
UNIMPLEMENTED_MSG("Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
static_cast<u32>(regs.query.query_get.select.Value()));
return 1;
}
}

View File

@@ -56,9 +56,27 @@ public:
last_modified_ticks = cache.GetModifiedTicks();
}
void SetMemoryMarked(bool is_memory_marked_) {
is_memory_marked = is_memory_marked_;
}
bool IsMemoryMarked() const {
return is_memory_marked;
}
void SetSyncPending(bool is_sync_pending_) {
is_sync_pending = is_sync_pending_;
}
bool IsSyncPending() const {
return is_sync_pending;
}
private:
bool is_registered{}; ///< Whether the object is currently registered with the cache
bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory.
bool is_sync_pending{}; ///< Whether the object is pending deletion.
u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space
};
@@ -94,6 +112,30 @@ public:
}
}
void OnCPUWrite(VAddr addr, std::size_t size) {
std::lock_guard lock{mutex};
for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
if (object->IsRegistered()) {
UnmarkMemory(object);
object->SetSyncPending(true);
marked_for_unregister.emplace_back(object);
}
}
}
void SyncGuestHost() {
std::lock_guard lock{mutex};
for (const auto& object : marked_for_unregister) {
if (object->IsRegistered()) {
object->SetSyncPending(false);
Unregister(object);
}
}
marked_for_unregister.clear();
}
/// Invalidates everything in the cache
void InvalidateAll() {
std::lock_guard lock{mutex};
@@ -120,19 +162,32 @@ protected:
interval_cache.add({GetInterval(object), ObjectSet{object}});
map_cache.insert({object->GetCpuAddr(), object});
rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
object->SetMemoryMarked(true);
}
/// Unregisters an object from the cache
virtual void Unregister(const T& object) {
std::lock_guard lock{mutex};
UnmarkMemory(object);
object->SetIsRegistered(false);
rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
if (object->IsSyncPending()) {
marked_for_unregister.remove(object);
object->SetSyncPending(false);
}
const VAddr addr = object->GetCpuAddr();
interval_cache.subtract({GetInterval(object), ObjectSet{object}});
map_cache.erase(addr);
}
void UnmarkMemory(const T& object) {
if (!object->IsMemoryMarked()) {
return;
}
rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
object->SetMemoryMarked(false);
}
/// Returns a ticks counter used for tracking when cached objects were last modified
u64 GetModifiedTicks() {
std::lock_guard lock{mutex};
@@ -194,4 +249,5 @@ private:
IntervalCache interval_cache; ///< Cache of objects
u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
VideoCore::RasterizerInterface& rasterizer;
std::list<T> marked_for_unregister;
};

View File

@@ -8,6 +8,7 @@
#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"

View File

@@ -6,6 +6,7 @@
#include <array>
#include <cstddef>
#include <cstring>
#include <limits>
#include <optional>
#include <vector>
@@ -13,6 +14,7 @@
#include "common/logging/log.h"
#include "common/scope_exit.h"
#include "core/settings.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
constexpr u32 NumStages = 5;
constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
constexpr std::array LimitUBOs = {
GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
constexpr std::array LimitSSBOs = {
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
constexpr std::array LimitSamplers = {
GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS};
constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
GL_MAX_TEXTURE_IMAGE_UNITS,
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
constexpr std::array LimitImages = {
GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
template <typename T>
T GetInteger(GLenum pname) {
@@ -84,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
return std::exchange(base, base + amount);
}
std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
std::array<u32, Tegra::Engines::MaxShaderTypes> max;
std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
[](GLenum pname) { return GetInteger<u32>(pname); });
return max;
}
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
@@ -158,15 +170,14 @@ bool IsASTCSupported() {
} // Anonymous namespace
Device::Device() : base_bindings{BuildBaseBindings()} {
Device::Device()
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
const bool is_intel = vendor == "Intel";
const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -181,16 +192,23 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
has_broken_compute = is_intel_proprietary;
has_fast_buffer_sub_data = is_nvidia;
use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
GLAD_GL_NV_compute_program5;
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
}
}
Device::Device(std::nullptr_t) {
uniform_buffer_alignment = 0;
max_uniform_buffers.fill(std::numeric_limits<u32>::max());
uniform_buffer_alignment = 4;
shader_storage_alignment = 4;
max_vertex_attributes = 16;
max_varyings = 15;
has_warp_intrinsics = true;
@@ -198,9 +216,6 @@ Device::Device(std::nullptr_t) {
has_vertex_viewport_layer = true;
has_image_load_formatted = true;
has_variable_aoffi = true;
has_component_indexing_bug = false;
has_broken_compute = false;
has_precise_bug = false;
}
bool Device::TestVariableAoffi() {

View File

@@ -24,6 +24,10 @@ public:
explicit Device();
explicit Device(std::nullptr_t);
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
}
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
return base_bindings[stage_index];
}
@@ -80,19 +84,20 @@ public:
return has_precise_bug;
}
bool HasBrokenCompute() const {
return has_broken_compute;
}
bool HasFastBufferSubData() const {
return has_fast_buffer_sub_data;
}
bool UseAssemblyShaders() const {
return use_assembly_shaders;
}
private:
static bool TestVariableAoffi();
static bool TestPreciseBug();
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
std::size_t uniform_buffer_alignment{};
std::size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
@@ -105,8 +110,8 @@ private:
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
bool has_broken_compute{};
bool has_fast_buffer_sub_data{};
bool use_assembly_shaders{};
};
} // namespace OpenGL

View File

@@ -4,6 +4,7 @@
#include "common/assert.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_fence_manager.h"
namespace OpenGL {

View File

@@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
namespace {
constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
constexpr std::size_t NumSupportedVertexAttributes = 16;
template <typename Engine, typename Entry>
@@ -94,17 +100,33 @@ void oglEnable(GLenum cap, bool state) {
} // Anonymous namespace
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info, GLShader::ProgramManager& program_manager,
StateTracker& state_tracker)
: RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker},
const Device& device, ScreenInfo& info,
ProgramManager& program_manager, StateTracker& state_tracker)
: RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
state_tracker},
shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
CheckExtensions();
unified_uniform_buffer.Create();
glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
if (device.UseAssemblyShaders()) {
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
for (const GLuint cbuf : staging_cbufs) {
glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
nullptr, 0);
}
}
}
RasterizerOpenGL::~RasterizerOpenGL() {}
RasterizerOpenGL::~RasterizerOpenGL() {
if (device.UseAssemblyShaders()) {
glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
}
}
void RasterizerOpenGL::CheckExtensions() {
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -230,6 +252,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
MICROPROFILE_SCOPE(OpenGL_Shader);
auto& gpu = system.GPU().Maxwell3D();
std::size_t num_ssbos = 0;
u32 clip_distances = 0;
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -261,6 +284,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
Shader shader{shader_cache.GetStageProgram(program)};
if (device.UseAssemblyShaders()) {
// Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
// all stages share the same bindings.
const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
num_ssbos += num_stage_ssbos;
}
// Stage indices are 0 - 5
const std::size_t stage = index == 0 ? 0 : index - 1;
SetupDrawConstBuffers(stage, shader);
@@ -526,6 +557,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
SyncFramebufferSRGB();
buffer_cache.Acquire();
current_cbuf = 0;
std::size_t buffer_size = CalculateVertexArraysSize();
@@ -535,9 +567,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
// Uniform space for the 5 shader stages
buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
(sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
Maxwell::MaxShaderStage;
buffer_size =
Common::AlignUp<std::size_t>(buffer_size, 4) +
(sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
// Add space for at least 18 constant buffers
buffer_size += Maxwell::MaxConstBuffers *
@@ -558,12 +590,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
// Setup emulation uniform buffer.
GLShader::MaxwellUniformData ubo;
ubo.SetFromRegs(gpu);
const auto [buffer, offset] =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
if (!device.UseAssemblyShaders()) {
MaxwellUniformData ubo;
ubo.SetFromRegs(gpu);
const auto [buffer, offset] =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
}
// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
@@ -630,16 +664,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
if (device.HasBrokenCompute()) {
return;
}
buffer_cache.Acquire();
current_cbuf = 0;
auto kernel = shader_cache.GetComputeKernel(code_addr);
SetupComputeTextures(kernel);
SetupComputeImages(kernel);
program_manager.BindComputeShader(kernel->GetHandle());
const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -652,6 +682,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Unmap();
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
program_manager.BindCompute(kernel->GetHandle());
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands;
}
@@ -701,15 +732,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
return;
}
texture_cache.OnCPUWrite(addr, size);
shader_cache.InvalidateRegion(addr, size);
shader_cache.OnCPUWrite(addr, size);
buffer_cache.OnCPUWrite(addr, size);
query_cache.InvalidateRegion(addr, size);
}
void RasterizerOpenGL::SyncGuestHost() {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
texture_cache.SyncGuestHost();
buffer_cache.SyncGuestHost();
shader_cache.SyncGuestHost();
}
void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -812,39 +843,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
}
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
static constexpr std::array PARAMETER_LUT = {
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
const auto& shader_stage = stages[stage_index];
const auto& entries = shader->GetEntries();
const bool use_unified = entries.use_unified_uniforms;
const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
for (const auto& entry : shader->GetEntries().const_buffers) {
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
SetupConstBuffer(binding++, buffer, entry);
const auto base_bindings = device.GetBaseBindings(stage_index);
u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
for (const auto& entry : entries.const_buffers) {
const u32 index = entry.GetIndex();
const auto& buffer = shader_stage.const_buffers[index];
SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
base_unified_offset + index * Maxwell::MaxConstBufferSize);
++binding;
}
if (use_unified) {
const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
entries.global_memory_entries.size());
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
const auto& entries = kernel->GetEntries();
const bool use_unified = entries.use_unified_uniforms;
u32 binding = 0;
for (const auto& entry : kernel->GetEntries().const_buffers) {
for (const auto& entry : entries.const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
SetupConstBuffer(binding++, buffer, entry);
SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
++binding;
}
if (use_unified) {
const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry) {
void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry, bool use_unified,
std::size_t unified_offset) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
sizeof(float));
if (device.UseAssemblyShaders()) {
glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
} else {
glBindBufferRange(GL_UNIFORM_BUFFER, binding,
buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
}
return;
}
@@ -852,10 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
// UBO alignment requirements.
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
const auto alignment = device.GetUniformBufferAlignment();
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
device.HasFastBufferSubData());
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
const bool fast_upload = !use_unified && device.HasFastBufferSubData();
const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
const GPUVAddr gpu_addr = buffer.address;
auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
if (device.UseAssemblyShaders()) {
UNIMPLEMENTED_IF(use_unified);
if (offset != 0) {
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
cbuf = staging_cbuf;
offset = 0;
}
glBindBufferRangeNV(stage, binding, cbuf, offset, size);
return;
}
if (use_unified) {
glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
} else {
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
}
}
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
@@ -863,7 +946,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
u32 binding =
device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
for (const auto& entry : shader->GetEntries().global_memory_entries) {
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
@@ -929,16 +1013,12 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
glBindTextureUnit(binding, 0);
return;
}
glBindTextureUnit(binding, view->GetTexture());
if (view->GetSurfaceParams().IsBuffer()) {
return;
const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
texture.tic.z_source, texture.tic.w_source);
glBindTextureUnit(binding, handle);
if (!view->GetSurfaceParams().IsBuffer()) {
glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
// Apply swizzle to textures that are not buffers.
view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
texture.tic.w_source);
glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
}
void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
@@ -967,14 +1047,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
return;
}
if (!tic.IsBuffer()) {
view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
}
if (entry.is_written) {
view->MarkAsModified(texture_cache.Tick());
}
glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
view->GetFormat());
const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
}
void RasterizerOpenGL::SyncViewport() {

View File

@@ -56,8 +56,8 @@ struct DrawParameters;
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public:
explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info, GLShader::ProgramManager& program_manager,
StateTracker& state_tracker);
const Device& device, ScreenInfo& info,
ProgramManager& program_manager, StateTracker& state_tracker);
~RasterizerOpenGL() override;
void Draw(bool is_indexed, bool is_instanced) override;
@@ -106,8 +106,9 @@ private:
void SetupComputeConstBuffers(const Shader& kernel);
/// Configures a constant buffer.
void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry);
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry, bool use_unified,
std::size_t unified_offset);
/// Configures the current global memory entries to use for the draw command.
void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -224,7 +225,7 @@ private:
void SetupShaders(GLenum primitive_mode);
const Device device;
const Device& device;
TextureCacheOpenGL texture_cache;
ShaderCacheOpenGL shader_cache;
@@ -236,7 +237,7 @@ private:
Core::System& system;
ScreenInfo& screen_info;
GLShader::ProgramManager& program_manager;
ProgramManager& program_manager;
StateTracker& state_tracker;
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@@ -248,6 +249,13 @@ private:
std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
enabled_transform_feedback_buffers;
static constexpr std::size_t NUM_CONSTANT_BUFFERS =
Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
std::size_t current_cbuf = 0;
OGLBuffer unified_uniform_buffer;
/// Number of commands queued to the OpenGL driver. Reseted on flush.
std::size_t num_queued_commands = 0;

View File

@@ -125,6 +125,15 @@ void OGLProgram::Release() {
handle = 0;
}
void OGLAssemblyProgram::Release() {
if (handle == 0) {
return;
}
MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
glDeleteProgramsARB(1, &handle);
handle = 0;
}
void OGLPipeline::Create() {
if (handle != 0)
return;

View File

@@ -167,6 +167,22 @@ public:
GLuint handle = 0;
};
class OGLAssemblyProgram : private NonCopyable {
public:
OGLAssemblyProgram() = default;
OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
~OGLAssemblyProgram() {
Release();
}
/// Deletes the internal OpenGL resource
void Release();
GLuint handle = 0;
};
class OGLPipeline : private NonCopyable {
public:
OGLPipeline() = default;

View File

@@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
return {};
}
constexpr GLenum AssemblyEnum(ShaderType shader_type) {
switch (shader_type) {
case ShaderType::Vertex:
return GL_VERTEX_PROGRAM_NV;
case ShaderType::TesselationControl:
return GL_TESS_CONTROL_PROGRAM_NV;
case ShaderType::TesselationEval:
return GL_TESS_EVALUATION_PROGRAM_NV;
case ShaderType::Geometry:
return GL_GEOMETRY_PROGRAM_NV;
case ShaderType::Fragment:
return GL_FRAGMENT_PROGRAM_NV;
case ShaderType::Compute:
return GL_COMPUTE_PROGRAM_NV;
}
return {};
}
std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
}
@@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
return registry;
}
std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
u64 unique_identifier, const ShaderIR& ir,
const Registry& registry, bool hint_retrievable = false) {
ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
const ShaderIR& ir, const Registry& registry,
bool hint_retrievable = false) {
const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
LOG_INFO(Render_OpenGL, "{}", shader_id);
const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
OGLShader shader;
shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
auto program = std::make_shared<ProgramHandle>();
if (device.UseAssemblyShaders()) {
const std::string arb = "Not implemented";
GLuint& arb_prog = program->assembly_program.handle;
// Commented out functions signal OpenGL errors but are compatible with apitrace.
// Use them only to capture and replay on apitrace.
#if 0
glGenProgramsNV(1, &arb_prog);
glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
reinterpret_cast<const GLubyte*>(arb.data()));
#else
glGenProgramsARB(1, &arb_prog);
glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
static_cast<GLsizei>(arb.size()), arb.data());
#endif
const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
if (err && *err) {
LOG_CRITICAL(Render_OpenGL, "{}", err);
LOG_INFO(Render_OpenGL, "\n{}", arb);
}
} else {
const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
OGLShader shader;
shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
program->source_program.Create(true, hint_retrievable, shader.handle);
}
auto program = std::make_shared<OGLProgram>();
program->Create(true, hint_retrievable, shader.handle);
return program;
}
@@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {
CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
std::shared_ptr<VideoCommon::Shader::Registry> registry,
ShaderEntries entries, std::shared_ptr<OGLProgram> program)
ShaderEntries entries, ProgramSharedPtr program_)
: RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
size_in_bytes{size_in_bytes}, program{std::move(program)} {}
size_in_bytes{size_in_bytes}, program{std::move(program_)} {
// Assign either the assembly program or source program. We can't have both.
handle = program->assembly_program.handle;
if (handle == 0) {
handle = program->source_program.handle;
}
ASSERT(handle != 0);
}
CachedShader::~CachedShader() = default;
GLuint CachedShader::GetHandle() const {
DEBUG_ASSERT(registry->IsConsistent());
return program->handle;
return handle;
}
Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
@@ -191,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
return std::shared_ptr<CachedShader>(new CachedShader(
params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
return std::shared_ptr<CachedShader>(
new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
MakeEntries(params.device, ir, shader_type), std::move(program)));
}
Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
@@ -215,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
return std::shared_ptr<CachedShader>(new CachedShader(
params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
return std::shared_ptr<CachedShader>(
new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
}
Shader CachedShader::CreateFromCache(const ShaderParameters& params,
@@ -239,7 +291,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
return;
}
const std::vector gl_cache = disk_cache.LoadPrecompiled();
std::vector<ShaderDiskCachePrecompiled> gl_cache;
if (!device.UseAssemblyShaders()) {
// Only load precompiled cache when we are not using assembly shaders
gl_cache = disk_cache.LoadPrecompiled();
}
const auto supported_formats = GetSupportedFormats();
// Track if precompiled cache was altered during loading to know if we have to
@@ -278,7 +334,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
auto registry = MakeRegistry(entry);
const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
std::shared_ptr<OGLProgram> program;
ProgramSharedPtr program;
if (precompiled_entry) {
// If the shader is precompiled, attempt to load it with
program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -294,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
PrecompiledShader shader;
shader.program = std::move(program);
shader.registry = std::move(registry);
shader.entries = MakeEntries(ir);
shader.entries = MakeEntries(device, ir, entry.type);
std::scoped_lock lock{mutex};
if (callback) {
@@ -332,6 +388,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
return;
}
if (device.UseAssemblyShaders()) {
// Don't store precompiled binaries for assembly shaders.
return;
}
// TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
// before precompiling them
@@ -339,7 +400,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
const u64 id = (*transferable)[i].unique_identifier;
const auto it = find_precompiled(id);
if (it == gl_cache.end()) {
const GLuint program = runtime_cache.at(id).program->handle;
const GLuint program = runtime_cache.at(id).program->source_program.handle;
disk_cache.SavePrecompiled(id, program);
precompiled_cache_altered = true;
}
@@ -350,7 +411,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
}
}
std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
const std::unordered_set<GLenum>& supported_formats) {
if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -358,15 +419,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
return {};
}
auto program = std::make_shared<OGLProgram>();
program->handle = glCreateProgram();
glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
glProgramBinary(program->handle, precompiled_entry.binary_format,
precompiled_entry.binary.data(),
auto program = std::make_shared<ProgramHandle>();
GLuint& handle = program->source_program.handle;
handle = glCreateProgram();
glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
static_cast<GLsizei>(precompiled_entry.binary.size()));
GLint link_status;
glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
if (link_status == GL_FALSE) {
LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
return {};

View File

@@ -43,8 +43,14 @@ struct UnspecializedShader;
using Shader = std::shared_ptr<CachedShader>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
struct ProgramHandle {
OGLProgram source_program;
OGLAssemblyProgram assembly_program;
};
using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
struct PrecompiledShader {
std::shared_ptr<OGLProgram> program;
ProgramSharedPtr program;
std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries;
};
@@ -87,12 +93,13 @@ public:
private:
explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
std::shared_ptr<VideoCommon::Shader::Registry> registry,
ShaderEntries entries, std::shared_ptr<OGLProgram> program);
ShaderEntries entries, ProgramSharedPtr program);
std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries;
std::size_t size_in_bytes = 0;
std::shared_ptr<OGLProgram> program;
ProgramSharedPtr program;
GLuint handle = 0;
};
class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -115,7 +122,7 @@ protected:
void FlushObjectInner(const Shader& object) override {}
private:
std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
ProgramSharedPtr GeneratePrecompiledProgram(
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
const std::unordered_set<GLenum>& supported_formats);

View File

@@ -61,8 +61,8 @@ struct TextureDerivates {};
using TextureArgument = std::pair<Type, Node>;
using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
#define ftou floatBitsToUint
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}
bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
// We waste one UBO for emulation
const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
return num_ubos > num_available_ubos;
}
struct GenericVaryingDescription {
std::string name;
u8 first_element = 0;
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
public:
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
ShaderType stage, std::string_view identifier, std::string_view suffix)
: device{device}, ir{ir}, registry{registry}, stage{stage},
identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
: device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
UseUnifiedUniforms(device, ir, stage)} {
if (stage != ShaderType::Compute) {
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
}
@@ -834,12 +842,24 @@ private:
}
void DeclareConstantBuffers() {
if (use_unified_uniforms) {
const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
static_cast<u32>(ir.GetGlobalMemory().size());
code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
binding);
code.AddLine(" uint cbufs[];");
code.AddLine("}};");
code.AddNewLine();
return;
}
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
for (const auto& buffers : ir.GetConstantBuffers()) {
const auto index = buffers.first;
for (const auto [index, info] : ir.GetConstantBuffers()) {
const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
GetConstBufferBlock(index));
code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size);
code.AddLine("}};");
code.AddNewLine();
}
@@ -1038,42 +1058,51 @@ private:
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
const Node offset = cbuf->GetOffset();
const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
// Direct access
const u32 offset_imm = immediate->GetValue();
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
offset_imm / (4 * 4), (offset_imm / 4) % 4),
if (use_unified_uniforms) {
return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
Type::Uint};
} else {
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
offset_imm / (4 * 4), (offset_imm / 4) % 4),
Type::Uint};
}
}
// Indirect access
if (use_unified_uniforms) {
return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
Visit(offset).AsUint()),
Type::Uint};
}
if (std::holds_alternative<OperationNode>(*offset)) {
// Indirect access
const std::string final_offset = code.GenerateTemporary();
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
const std::string final_offset = code.GenerateTemporary();
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
if (!device.HasComponentIndexingBug()) {
return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
final_offset, final_offset),
Type::Uint};
}
// AMD's proprietary GLSL compiler emits ill code for variable component access.
// To bypass this driver bug generate 4 ifs, one per each component.
const std::string pack = code.GenerateTemporary();
code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
final_offset);
const std::string result = code.GenerateTemporary();
code.AddLine("uint {};", result);
for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
pack, GetSwizzle(swizzle));
}
return {result, Type::Uint};
if (!device.HasComponentIndexingBug()) {
return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
final_offset, final_offset),
Type::Uint};
}
UNREACHABLE_MSG("Unmanaged offset node type");
// AMD's proprietary GLSL compiler emits ill code for variable component access.
// To bypass this driver bug generate 4 ifs, one per each component.
const std::string pack = code.GenerateTemporary();
code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
final_offset);
const std::string result = code.GenerateTemporary();
code.AddLine("uint {};", result);
for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
GetSwizzle(swizzle));
}
return {result, Type::Uint};
}
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1538,7 +1567,9 @@ private:
Expression target;
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
// Writing to Register::ZeroIndex is a no op
// Writing to Register::ZeroIndex is a no op but we still have to visit the source
// as it might have side effects.
code.AddLine("{};", Visit(src).GetCode());
return {};
}
target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -2309,6 +2340,18 @@ private:
return {"gl_SubGroupInvocationARB", Type::Uint};
}
template <const std::string_view& comparison>
Expression ThreadMask(Operation) {
if (device.HasWarpIntrinsics()) {
return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
}
if (device.HasShaderBallot()) {
return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
}
LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
return {"0U", Type::Uint};
}
Expression ShuffleIndexed(Operation operation) {
std::string value = VisitOperand(operation, 0).AsFloat();
@@ -2321,7 +2364,21 @@ private:
return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
}
Expression MemoryBarrierGL(Operation) {
Expression Barrier(Operation) {
if (!ir.IsDecompiled()) {
LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
return {};
}
code.AddLine("barrier();");
return {};
}
Expression MemoryBarrierGroup(Operation) {
code.AddLine("groupMemoryBarrier();");
return {};
}
Expression MemoryBarrierGlobal(Operation) {
code.AddLine("memoryBarrier();");
return {};
}
@@ -2337,6 +2394,12 @@ private:
static constexpr std::string_view NotEqual = "!=";
static constexpr std::string_view GreaterEqual = ">=";
static constexpr std::string_view Eq = "Eq";
static constexpr std::string_view Ge = "Ge";
static constexpr std::string_view Gt = "Gt";
static constexpr std::string_view Le = "Le";
static constexpr std::string_view Lt = "Lt";
static constexpr std::string_view Add = "Add";
static constexpr std::string_view Min = "Min";
static constexpr std::string_view Max = "Max";
@@ -2554,9 +2617,16 @@ private:
&GLSLDecompiler::VoteEqual,
&GLSLDecompiler::ThreadId,
&GLSLDecompiler::ThreadMask<Func::Eq>,
&GLSLDecompiler::ThreadMask<Func::Ge>,
&GLSLDecompiler::ThreadMask<Func::Gt>,
&GLSLDecompiler::ThreadMask<Func::Le>,
&GLSLDecompiler::ThreadMask<Func::Lt>,
&GLSLDecompiler::ShuffleIndexed,
&GLSLDecompiler::MemoryBarrierGL,
&GLSLDecompiler::Barrier,
&GLSLDecompiler::MemoryBarrierGroup,
&GLSLDecompiler::MemoryBarrierGlobal,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2669,6 +2739,7 @@ private:
const std::string_view identifier;
const std::string_view suffix;
const Header header;
const bool use_unified_uniforms;
std::unordered_map<u8, VaryingTFB> transform_feedback;
ShaderWriter code;
@@ -2864,7 +2935,7 @@ void GLSLDecompiler::DecompileAST() {
} // Anonymous namespace
ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
ShaderEntries entries;
for (const auto& cbuf : ir.GetConstantBuffers()) {
entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2885,6 +2956,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
}
entries.shader_length = ir.GetLength();
entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
return entries;
}

View File

@@ -53,11 +53,13 @@ struct ShaderEntries {
std::vector<GlobalMemoryEntry> global_memory_entries;
std::vector<SamplerEntry> samplers;
std::vector<ImageEntry> images;
u32 clip_distances{};
std::size_t shader_length{};
u32 clip_distances{};
bool use_unified_uniforms{};
};
ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
Tegra::Engines::ShaderType stage);
std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
const VideoCommon::Shader::Registry& registry,

View File

@@ -6,47 +6,111 @@
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
namespace OpenGL::GLShader {
namespace OpenGL {
ProgramManager::ProgramManager() = default;
ProgramManager::ProgramManager(const Device& device) {
use_assembly_programs = device.UseAssemblyShaders();
if (use_assembly_programs) {
glEnable(GL_COMPUTE_PROGRAM_NV);
} else {
graphics_pipeline.Create();
glBindProgramPipeline(graphics_pipeline.handle);
}
}
ProgramManager::~ProgramManager() = default;
void ProgramManager::Create() {
graphics_pipeline.Create();
glBindProgramPipeline(graphics_pipeline.handle);
void ProgramManager::BindCompute(GLuint program) {
if (use_assembly_programs) {
glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
} else {
is_graphics_bound = false;
glUseProgram(program);
}
}
void ProgramManager::BindGraphicsPipeline() {
if (use_assembly_programs) {
UpdateAssemblyPrograms();
} else {
UpdateSourcePrograms();
}
}
void ProgramManager::BindHostPipeline(GLuint pipeline) {
if (use_assembly_programs) {
if (geometry_enabled) {
geometry_enabled = false;
old_state.geometry = 0;
glDisable(GL_GEOMETRY_PROGRAM_NV);
}
} else {
if (!is_graphics_bound) {
glUseProgram(0);
}
}
glBindProgramPipeline(pipeline);
}
void ProgramManager::RestoreGuestPipeline() {
if (use_assembly_programs) {
glBindProgramPipeline(0);
} else {
glBindProgramPipeline(graphics_pipeline.handle);
}
}
void ProgramManager::UpdateAssemblyPrograms() {
const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
if (current == old) {
return;
}
if (current == 0) {
if (enabled) {
enabled = false;
glDisable(stage);
}
return;
}
if (!enabled) {
enabled = true;
glEnable(stage);
}
glBindProgramARB(stage, current);
};
update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
old_state.geometry);
update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
old_state.fragment);
old_state = current_state;
}
void ProgramManager::UpdateSourcePrograms() {
if (!is_graphics_bound) {
is_graphics_bound = true;
glUseProgram(0);
}
// Avoid updating the pipeline when values have no changed
if (old_state == current_state) {
return;
}
// Workaround for AMD bug
static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
GL_FRAGMENT_SHADER_BIT};
const GLuint handle = graphics_pipeline.handle;
glUseProgramStages(handle, all_used_stages, 0);
glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
if (current == old) {
return;
}
glUseProgramStages(handle, stage, current);
};
update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
old_state = current_state;
}
void ProgramManager::BindComputeShader(GLuint program) {
is_graphics_bound = false;
glUseProgram(program);
}
void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
const auto& regs = maxwell.regs;
@@ -54,4 +118,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
}
} // namespace OpenGL::GLShader
} // namespace OpenGL

View File

@@ -11,7 +11,9 @@
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
namespace OpenGL::GLShader {
namespace OpenGL {
class Device;
/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
class ProgramManager {
public:
explicit ProgramManager();
explicit ProgramManager(const Device& device);
~ProgramManager();
void Create();
/// Binds a compute program
void BindCompute(GLuint program);
/// Updates the graphics pipeline and binds it.
/// Updates bound programs.
void BindGraphicsPipeline();
/// Binds a compute shader.
void BindComputeShader(GLuint program);
/// Binds an OpenGL pipeline object unsynchronized with the guest state.
void BindHostPipeline(GLuint pipeline);
/// Rewinds BindHostPipeline state changes.
void RestoreGuestPipeline();
void UseVertexShader(GLuint program) {
current_state.vertex_shader = program;
current_state.vertex = program;
}
void UseGeometryShader(GLuint program) {
current_state.geometry_shader = program;
current_state.geometry = program;
}
void UseFragmentShader(GLuint program) {
current_state.fragment_shader = program;
current_state.fragment = program;
}
private:
struct PipelineState {
bool operator==(const PipelineState& rhs) const noexcept {
return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
geometry_shader == rhs.geometry_shader;
}
bool operator!=(const PipelineState& rhs) const noexcept {
return !operator==(rhs);
}
GLuint vertex_shader = 0;
GLuint fragment_shader = 0;
GLuint geometry_shader = 0;
GLuint vertex = 0;
GLuint geometry = 0;
GLuint fragment = 0;
};
/// Update NV_gpu_program5 programs.
void UpdateAssemblyPrograms();
/// Update GLSL programs.
void UpdateSourcePrograms();
OGLPipeline graphics_pipeline;
OGLPipeline compute_pipeline;
PipelineState current_state;
PipelineState old_state;
bool use_assembly_programs = false;
bool is_graphics_bound = true;
bool vertex_enabled = false;
bool geometry_enabled = false;
bool fragment_enabled = false;
};
} // namespace OpenGL::GLShader
} // namespace OpenGL

View File

@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
namespace {
struct FormatTuple {
GLint internal_format;
GLenum internal_format;
GLenum format = GL_NONE;
GLenum type = GL_NONE;
};
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
return texture;
}
constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
SwizzleSource w_source) {
return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
(static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
}
} // Anonymous namespace
CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
@@ -381,7 +387,7 @@ void CachedSurface::DecorateSurfaceName() {
}
void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
}
View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -397,14 +403,13 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
}
CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
const bool is_proxy)
: VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
target = GetTextureTarget(params.target);
format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
bool is_proxy)
: VideoCommon::ViewBase(params), surface{surface},
format{GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format},
target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
if (!is_proxy) {
texture_view = CreateTextureView();
main_view = CreateTextureView();
}
swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
}
CachedSurfaceView::~CachedSurfaceView() = default;
@@ -447,27 +452,49 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
}
}
void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
SwizzleSource z_source, SwizzleSource w_source) {
u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
if (new_swizzle == swizzle)
return;
swizzle = new_swizzle;
const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
const GLuint handle = GetTexture();
const PixelFormat format = surface.GetSurfaceParams().pixel_format;
switch (format) {
if (GetSurfaceParams().IsBuffer()) {
return GetTexture();
}
const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
if (current_swizzle == new_swizzle) {
return current_view;
}
current_swizzle = new_swizzle;
const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
OGLTextureView& view = entry->second;
if (!is_cache_miss) {
current_view = view.handle;
return view.handle;
}
view = CreateTextureView();
current_view = view.handle;
std::array swizzle{x_source, y_source, z_source, w_source};
switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
case PixelFormat::Z24S8:
case PixelFormat::Z32FS8:
case PixelFormat::S8Z24:
glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
GetComponent(format, x_source == SwizzleSource::R));
break;
default:
glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
// Make sure we sample the first component
std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
return value == SwizzleSource::G ? SwizzleSource::R : value;
});
[[fallthrough]];
default: {
const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
break;
}
}
return view.handle;
}
OGLTextureView CachedSurfaceView::CreateTextureView() const {

View File

@@ -83,7 +83,7 @@ public:
/// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
void Attach(GLenum attachment, GLenum target) const;
void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
Tegra::Texture::SwizzleSource y_source,
Tegra::Texture::SwizzleSource z_source,
Tegra::Texture::SwizzleSource w_source);
@@ -98,7 +98,7 @@ public:
if (is_proxy) {
return surface.GetTexture();
}
return texture_view.handle;
return main_view.handle;
}
GLenum GetFormat() const {
@@ -110,23 +110,19 @@ public:
}
private:
u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
Tegra::Texture::SwizzleSource y_source,
Tegra::Texture::SwizzleSource z_source,
Tegra::Texture::SwizzleSource w_source) const {
return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
(static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
}
OGLTextureView CreateTextureView() const;
CachedSurface& surface;
GLenum target{};
GLenum format{};
const GLenum format;
const GLenum target;
const bool is_proxy;
OGLTextureView texture_view;
u32 swizzle{};
bool is_proxy{};
std::unordered_map<u32, OGLTextureView> view_cache;
OGLTextureView main_view;
// Use an invalid default so it always fails the comparison test
u32 current_swizzle = 0xffffffff;
GLuint current_view = 0;
};
class TextureCacheOpenGL final : public TextureCacheBase {

View File

@@ -316,7 +316,7 @@ public:
RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
Core::Frontend::GraphicsContext& context)
: RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
has_debug_tool{HasDebugTool()} {}
program_manager{device}, has_debug_tool{HasDebugTool()} {}
RendererOpenGL::~RendererOpenGL() = default;
@@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
vertex_program.Create(true, false, vertex_shader.handle);
fragment_program.Create(true, false, fragment_shader.handle);
// Create program pipeline
program_manager.Create();
pipeline.Create();
glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
// Generate VBO handle for drawing
vertex_buffer.Create();
@@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
if (rasterizer) {
return;
}
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
program_manager, state_tracker);
}
@@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
state_tracker.NotifyClipControl();
state_tracker.NotifyAlphaTest();
program_manager.UseVertexShader(vertex_program.handle);
program_manager.UseGeometryShader(0);
program_manager.UseFragmentShader(fragment_program.handle);
program_manager.BindGraphicsPipeline();
program_manager.BindHostPipeline(pipeline.handle);
glEnable(GL_CULL_FACE);
if (screen_info.display_srgb) {
@@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
glClear(GL_COLOR_BUFFER_BIT);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
program_manager.RestoreGuestPipeline();
}
bool RendererOpenGL::TryPresent(int timeout_ms) {
@@ -753,6 +753,9 @@ void RendererOpenGL::RenderScreenshot() {
bool RendererOpenGL::Init() {
if (GLAD_GL_KHR_debug) {
glEnable(GL_DEBUG_OUTPUT);
if (Settings::values.renderer_debug) {
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
}
glDebugMessageCallback(DebugHandler, nullptr);
}

View File

@@ -9,6 +9,7 @@
#include "common/common_types.h"
#include "common/math_util.h"
#include "video_core/renderer_base.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
@@ -95,6 +96,7 @@ private:
Core::Frontend::EmuWindow& emu_window;
Core::System& system;
Core::Frontend::GraphicsContext& context;
const Device device;
StateTracker state_tracker{system};
@@ -102,13 +104,14 @@ private:
OGLBuffer vertex_buffer;
OGLProgram vertex_program;
OGLProgram fragment_program;
OGLPipeline pipeline;
OGLFramebuffer screenshot_framebuffer;
/// Display information for Switch screen
ScreenInfo screen_info;
/// Global dummy shader pipeline
GLShader::ProgramManager program_manager;
ProgramManager program_manager;
/// OpenGL framebuffer data
std::vector<u8> gl_framebuffer_data;

View File

@@ -142,7 +142,7 @@ struct FormatTuple {
{VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UF16
{VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SF16
{VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4
{VK_FORMAT_B8G8R8A8_UNORM}, // BGRA8
{VK_FORMAT_B8G8R8A8_UNORM, Attachable}, // BGRA8
{VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F
{VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // RG32F
{VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32F
@@ -168,7 +168,7 @@ struct FormatTuple {
{VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8
{VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5
{VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4
{VK_FORMAT_UNDEFINED}, // BGRA8_SRGB
{VK_FORMAT_B8G8R8A8_SRGB, Attachable}, // BGRA8_SRGB
{VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // DXT1_SRGB
{VK_FORMAT_BC2_SRGB_BLOCK}, // DXT23_SRGB
{VK_FORMAT_BC3_SRGB_BLOCK}, // DXT45_SRGB

View File

@@ -7,6 +7,7 @@
#include <memory>
#include "core/core.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_device.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"

View File

@@ -104,6 +104,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
VK_FORMAT_R16_SFLOAT,
VK_FORMAT_R16G16B16A16_SFLOAT,
VK_FORMAT_B8G8R8A8_UNORM,
VK_FORMAT_B8G8R8A8_SRGB,
VK_FORMAT_R4G4B4A4_UNORM_PACK16,
VK_FORMAT_D32_SFLOAT,
VK_FORMAT_D16_UNORM,

View File

@@ -7,6 +7,7 @@
#include <memory>
#include "video_core/fence_manager.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/wrapper.h"
namespace Core {

View File

@@ -312,7 +312,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
ASSERT(point_size != 0.0f);
}
for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
const auto& attribute = fixed_state.vertex_input.attributes[i];
specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
specialization.attribute_types[i] = attribute.Type();
}
specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
@@ -329,8 +331,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
const auto shader = TryGet(*cpu_addr);
const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
ASSERT(shader);
const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5

View File

@@ -532,14 +532,14 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
return;
}
texture_cache.OnCPUWrite(addr, size);
pipeline_cache.InvalidateRegion(addr, size);
pipeline_cache.OnCPUWrite(addr, size);
buffer_cache.OnCPUWrite(addr, size);
query_cache.InvalidateRegion(addr, size);
}
void RasterizerVulkan::SyncGuestHost() {
texture_cache.SyncGuestHost();
buffer_cache.SyncGuestHost();
pipeline_cache.SyncGuestHost();
}
void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -877,14 +877,10 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
const auto& attrib = regs.vertex_attrib_format[index];
if (!attrib.IsValid()) {
if (attrib.IsConstant()) {
vertex_input.SetAttribute(index, false, 0, 0, {}, {});
continue;
}
[[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
ASSERT(buffer.IsEnabled());
vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
attrib.size.Value());
}

View File

@@ -515,6 +515,16 @@ private:
void DeclareCommon() {
thread_id =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
thread_masks[0] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
thread_masks[1] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
thread_masks[2] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
thread_masks[3] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
thread_masks[4] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
}
void DeclareVertex() {
@@ -731,8 +741,10 @@ private:
if (!IsGenericAttribute(index)) {
continue;
}
const u32 location = GetGenericAttributeLocation(index);
if (!IsAttributeEnabled(location)) {
continue;
}
const auto type_descriptor = GetAttributeType(location);
Id type;
if (IsInputAttributeArray()) {
@@ -976,6 +988,10 @@ private:
return stage == ShaderType::TesselationControl;
}
bool IsAttributeEnabled(u32 location) const {
return stage != ShaderType::Vertex || specialization.enabled_attributes[location];
}
u32 GetNumInputVertices() const {
switch (stage) {
case ShaderType::Geometry:
@@ -1071,8 +1087,7 @@ private:
void VisitBasicBlock(const NodeBlock& bb) {
for (const auto& node : bb) {
[[maybe_unused]] const Type type = Visit(node).type;
ASSERT(type == Type::Void);
Visit(node);
}
}
@@ -1192,16 +1207,20 @@ private:
UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
return {v_float_zero, Type::Float};
default:
if (IsGenericAttribute(attribute)) {
const u32 location = GetGenericAttributeLocation(attribute);
const auto type_descriptor = GetAttributeType(location);
const Type type = type_descriptor.type;
const Id attribute_id = input_attributes.at(attribute);
const std::vector elements = {element};
const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
return {OpLoad(GetTypeDefinition(type), pointer), type};
if (!IsGenericAttribute(attribute)) {
break;
}
break;
const u32 location = GetGenericAttributeLocation(attribute);
if (!IsAttributeEnabled(location)) {
// Disabled attributes (also known as constant attributes) always return zero.
return {v_float_zero, Type::Float};
}
const auto type_descriptor = GetAttributeType(location);
const Type type = type_descriptor.type;
const Id attribute_id = input_attributes.at(attribute);
const std::vector elements = {element};
const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
return {OpLoad(GetTypeDefinition(type), pointer), type};
}
UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
return {v_float_zero, Type::Float};
@@ -1362,7 +1381,9 @@ private:
Expression target{};
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
// Writing to Register::ZeroIndex is a no op
// Writing to Register::ZeroIndex is a no op but we still have to visit its source
// because it might have side effects.
Visit(src);
return {};
}
target = {registers.at(gpr->GetIndex()), Type::Float};
@@ -2175,14 +2196,37 @@ private:
return {OpLoad(t_uint, thread_id), Type::Uint};
}
template <std::size_t index>
Expression ThreadMask(Operation) {
// TODO(Rodrigo): Handle devices with different warp sizes
const Id mask = thread_masks[index];
return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
}
Expression ShuffleIndexed(Operation operation) {
const Id value = AsFloat(Visit(operation[0]));
const Id index = AsUint(Visit(operation[1]));
return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float};
}
Expression MemoryBarrierGL(Operation) {
const auto scope = spv::Scope::Device;
Expression Barrier(Operation) {
if (!ir.IsDecompiled()) {
LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled");
return {};
}
const auto scope = spv::Scope::Workgroup;
const auto memory = spv::Scope::Workgroup;
const auto semantics =
spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
Constant(t_uint, static_cast<u32>(memory)),
Constant(t_uint, static_cast<u32>(semantics)));
return {};
}
template <spv::Scope scope>
Expression MemoryBarrier(Operation) {
const auto semantics =
spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
spv::MemorySemanticsMask::WorkgroupMemory |
@@ -2639,9 +2683,16 @@ private:
&SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,
&SPIRVDecompiler::ThreadId,
&SPIRVDecompiler::ThreadMask<0>, // Eq
&SPIRVDecompiler::ThreadMask<1>, // Ge
&SPIRVDecompiler::ThreadMask<2>, // Gt
&SPIRVDecompiler::ThreadMask<3>, // Le
&SPIRVDecompiler::ThreadMask<4>, // Lt
&SPIRVDecompiler::ShuffleIndexed,
&SPIRVDecompiler::MemoryBarrierGL,
&SPIRVDecompiler::Barrier,
&SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
&SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2763,6 +2814,7 @@ private:
Id workgroup_id{};
Id local_invocation_id{};
Id thread_id{};
std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt
VertexIndices in_indices;
VertexIndices out_indices;

View File

@@ -88,7 +88,8 @@ struct Specialization final {
u32 shared_memory_size{};
// Graphics specific
std::optional<float> point_size{};
std::optional<float> point_size;
std::bitset<Maxwell::NumVertexAttributes> enabled_attributes;
std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
bool ndc_minus_one_to_one{};
};

View File

@@ -354,26 +354,23 @@ CachedSurfaceView::~CachedSurfaceView() = default;
VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
SwizzleSource z_source, SwizzleSource w_source) {
const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
if (last_image_view && last_swizzle == swizzle) {
const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
if (last_image_view && last_swizzle == new_swizzle) {
return last_image_view;
}
last_swizzle = swizzle;
last_swizzle = new_swizzle;
const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle);
const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
auto& image_view = entry->second;
if (!is_cache_miss) {
return last_image_view = *image_view;
}
auto swizzle_x = MaxwellToVK::SwizzleSource(x_source);
auto swizzle_y = MaxwellToVK::SwizzleSource(y_source);
auto swizzle_z = MaxwellToVK::SwizzleSource(z_source);
auto swizzle_w = MaxwellToVK::SwizzleSource(w_source);
std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source),
MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)};
if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) {
// A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here.
std::swap(swizzle_x, swizzle_z);
std::swap(swizzle[0], swizzle[2]);
}
// Games can sample depth or stencil values on textures. This is decided by the swizzle value on
@@ -395,11 +392,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
UNIMPLEMENTED();
}
// Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity
swizzle_x = VK_COMPONENT_SWIZZLE_R;
swizzle_y = VK_COMPONENT_SWIZZLE_G;
swizzle_z = VK_COMPONENT_SWIZZLE_B;
swizzle_w = VK_COMPONENT_SWIZZLE_A;
// Make sure we sample the first component
std::transform(
swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) {
return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component;
});
}
VkImageViewCreateInfo ci;
@@ -409,7 +406,7 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
ci.image = surface.GetImageHandle();
ci.viewType = image_view_type;
ci.format = surface.GetImage().GetFormat();
ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
ci.components = {swizzle[0], swizzle[1], swizzle[2], swizzle[3]};
ci.subresourceRange.aspectMask = aspect;
ci.subresourceRange.baseMipLevel = base_level;
ci.subresourceRange.levelCount = num_levels;

View File

@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
case OpCode::Id::RED: {
UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
const auto [real_address, base_address, descriptor] =
TrackGlobalMemory(bb, instr, true, true);
if (!real_address || !base_address) {
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
Node value = GetRegister(instr.gpr0);
bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
break;
}
case OpCode::Id::ATOM: {

View File

@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
return Operation(OperationCode::WorkGroupIdY);
case SystemVariable::CtaIdZ:
return Operation(OperationCode::WorkGroupIdZ);
case SystemVariable::EqMask:
case SystemVariable::LtMask:
case SystemVariable::LeMask:
case SystemVariable::GtMask:
case SystemVariable::GeMask:
uses_warps = true;
switch (instr.sys20) {
case SystemVariable::EqMask:
return Operation(OperationCode::ThreadEqMask);
case SystemVariable::LtMask:
return Operation(OperationCode::ThreadLtMask);
case SystemVariable::LeMask:
return Operation(OperationCode::ThreadLeMask);
case SystemVariable::GtMask:
return Operation(OperationCode::ThreadGtMask);
case SystemVariable::GeMask:
return Operation(OperationCode::ThreadGeMask);
default:
UNREACHABLE();
return Immediate(0u);
}
default:
UNIMPLEMENTED_MSG("Unhandled system move: {}",
static_cast<u32>(instr.sys20.Value()));
@@ -272,10 +293,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
break;
}
case OpCode::Id::BAR: {
UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
bb.push_back(Operation(OperationCode::Barrier));
break;
}
case OpCode::Id::MEMBAR: {
UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
bb.push_back(Operation(OperationCode::MemoryBarrierGL));
const OperationCode type = [instr] {
switch (instr.membar.type) {
case Tegra::Shader::MembarType::CTA:
return OperationCode::MemoryBarrierGroup;
case Tegra::Shader::MembarType::GL:
return OperationCode::MemoryBarrierGlobal;
default:
UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
return OperationCode::MemoryBarrierGlobal;
}
}();
bb.push_back(Operation(type));
break;
}
case OpCode::Id::DEPBAR: {

View File

@@ -226,9 +226,16 @@ enum class OperationCode {
VoteEqual, /// (bool) -> bool
ThreadId, /// () -> uint
ThreadEqMask, /// () -> uint
ThreadGeMask, /// () -> uint
ThreadGtMask, /// () -> uint
ThreadLeMask, /// () -> uint
ThreadLtMask, /// () -> uint
ShuffleIndexed, /// (uint value, uint index) -> uint
MemoryBarrierGL, /// () -> void
Barrier, /// () -> void
MemoryBarrierGroup, /// () -> void
MemoryBarrierGlobal, /// () -> void
Amount,
};

View File

@@ -14,6 +14,7 @@
#include <unordered_map>
#include <vector>
#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_map.hpp>
#include <boost/range/iterator_range.hpp>
@@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
template <typename TSurface, typename TView>
class TextureCache {
using VectorSurface = boost::container::small_vector<TSurface, 1>;
public:
void InvalidateRegion(VAddr addr, std::size_t size) {
@@ -308,18 +310,20 @@ public:
dst_surface.first->MarkAsModified(true, Tick());
}
TSurface TryFindFramebufferSurface(VAddr addr) {
TSurface TryFindFramebufferSurface(VAddr addr) const {
if (!addr) {
return nullptr;
}
const VAddr page = addr >> registry_page_bits;
std::vector<TSurface>& list = registry[page];
for (auto& surface : list) {
if (surface->GetCpuAddr() == addr) {
return surface;
}
const auto it = registry.find(page);
if (it == registry.end()) {
return nullptr;
}
return nullptr;
const auto& list = it->second;
const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
return surface->GetCpuAddr() == addr;
});
return found != list.end() ? *found : nullptr;
}
u64 Tick() {
@@ -498,7 +502,7 @@ private:
* @param untopological Indicates to the recycler that the texture has no way
* to match the overlaps due to topological reasons.
**/
RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params,
RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
if (Settings::IsGPULevelExtreme()) {
return RecycleStrategy::Flush;
@@ -538,9 +542,8 @@ private:
* @param untopological Indicates to the recycler that the texture has no way to match the
* overlaps due to topological reasons.
**/
std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
const SurfaceParams& params, const GPUVAddr gpu_addr,
const bool preserve_contents,
std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
const GPUVAddr gpu_addr, const bool preserve_contents,
const MatchTopologyResult untopological) {
const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
for (auto& surface : overlaps) {
@@ -650,7 +653,7 @@ private:
* @param params The parameters on the new surface.
* @param gpu_addr The starting address of the new surface.
**/
std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps,
std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
const SurfaceParams& params,
const GPUVAddr gpu_addr) {
if (params.target == SurfaceTarget::Texture3D) {
@@ -708,7 +711,7 @@ private:
* @param preserve_contents Indicates that the new surface should be loaded from memory or
* left blank.
*/
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
const SurfaceParams& params,
const GPUVAddr gpu_addr,
const VAddr cpu_addr,
@@ -810,7 +813,7 @@ private:
TSurface& current_surface = iter->second;
const auto topological_result = current_surface->MatchesTopology(params);
if (topological_result != MatchTopologyResult::FullMatch) {
std::vector<TSurface> overlaps{current_surface};
VectorSurface overlaps{current_surface};
return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
topological_result);
}
@@ -991,7 +994,9 @@ private:
params.target = target;
params.is_tiled = false;
params.srgb_conversion = false;
params.is_layered = false;
params.is_layered =
target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray ||
target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray;
params.block_width = 0;
params.block_height = 0;
params.block_depth = 0;
@@ -1124,23 +1129,25 @@ private:
}
}
std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
if (size == 0) {
return {};
}
const VAddr cpu_addr_end = cpu_addr + size;
VAddr start = cpu_addr >> registry_page_bits;
const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
std::vector<TSurface> surfaces;
while (start <= end) {
std::vector<TSurface>& list = registry[start];
for (auto& surface : list) {
if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) {
surface->MarkAsPicked(true);
surfaces.push_back(surface);
}
VectorSurface surfaces;
for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
const auto it = registry.find(start);
if (it == registry.end()) {
continue;
}
for (auto& surface : it->second) {
if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
continue;
}
surface->MarkAsPicked(true);
surfaces.push_back(surface);
}
start++;
}
for (auto& surface : surfaces) {
surface->MarkAsPicked(false);

View File

@@ -106,6 +106,9 @@ public:
format.setVersion(4, 3);
format.setProfile(QSurfaceFormat::CompatibilityProfile);
format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
if (Settings::values.renderer_debug) {
format.setOption(QSurfaceFormat::FormatOption::DebugContext);
}
// TODO: expose a setting for buffer value (ie default/single/double/triple)
format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
format.setSwapInterval(0);

View File

@@ -643,6 +643,8 @@ void Config::ReadRendererValues() {
Settings::values.use_asynchronous_gpu_emulation =
ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
Settings::values.use_assembly_shaders =
ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
Settings::values.use_fast_gpu_time =
ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
Settings::values.force_30fps_mode =
@@ -1090,6 +1092,8 @@ void Config::SaveRendererValues() {
WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
Settings::values.use_asynchronous_gpu_emulation, false);
WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
false);
WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);

View File

@@ -12,6 +12,9 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
ui->setupUi(this);
// TODO: Remove this after assembly shaders are fully integrated
ui->use_assembly_shaders->setVisible(false);
SetConfiguration();
}
@@ -22,6 +25,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() {
ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
ui->use_vsync->setEnabled(runtime_lock);
ui->use_vsync->setChecked(Settings::values.use_vsync);
ui->use_assembly_shaders->setEnabled(runtime_lock);
ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
ui->force_30fps_mode->setEnabled(runtime_lock);
ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
@@ -33,6 +38,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() {
auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
Settings::values.gpu_accuracy = gpu_accuracy;
Settings::values.use_vsync = ui->use_vsync->isChecked();
Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();

View File

@@ -62,6 +62,16 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="use_assembly_shaders">
<property name="toolTip">
<string>Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental.</string>
</property>
<property name="text">
<string>Use assembly shaders (experimental, Nvidia OpenGL only)</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="force_30fps_mode">
<property name="text">

View File

@@ -18,7 +18,7 @@ DiscordImpl::DiscordImpl() {
// The number is the client ID for yuzu, it's used for images and the
// application name
Discord_Initialize("471872241299226636", &handlers, 1, nullptr);
Discord_Initialize("712465656758665259", &handlers, 1, nullptr);
}
DiscordImpl::~DiscordImpl() {

View File

@@ -65,6 +65,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
#include "common/logging/backend.h"
#include "common/logging/filter.h"
#include "common/logging/log.h"
#include "common/memory_detect.h"
#include "common/microprofile.h"
#include "common/scm_rev.h"
#include "common/scope_exit.h"
@@ -219,6 +220,10 @@ GMainWindow::GMainWindow()
LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string);
#endif
LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString());
LOG_INFO(Frontend, "Host RAM: {:.2f} GB",
Common::GetMemInfo().TotalPhysicalMemory / 1024.0f / 1024 / 1024);
LOG_INFO(Frontend, "Host Swap: {:.2f} GB",
Common::GetMemInfo().TotalSwapMemory / 1024.0f / 1024 / 1024);
UpdateWindowTitle();
show();

View File

@@ -397,6 +397,8 @@ void Config::ReadValues() {
sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
Settings::values.use_vsync =
static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
Settings::values.use_assembly_shaders =
sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
Settings::values.use_fast_gpu_time =
sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);

View File

@@ -134,6 +134,10 @@ max_anisotropy =
# 0 (default): Off, 1: On
use_vsync =
# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required.
# 0 (default): Off, 1: On
use_assembly_shaders =
# Turns on the frame limiter, which will limit frames output to the target game speed
# 0: Off, 1: On (default)
use_frame_limit =

View File

@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
if (Settings::values.renderer_debug) {
SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
}
SDL_GL_SetSwapInterval(0);
std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,