Compare commits
115 Commits
__refs_pul
...
__refs_pul
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6508cdd003 | ||
|
|
3626254f48 | ||
|
|
98d2461529 | ||
|
|
2293e8a11a | ||
|
|
03fd5aa384 | ||
|
|
e78d681a6c | ||
|
|
354fbe701e | ||
|
|
98671b4cfe | ||
|
|
22369df357 | ||
|
|
624def4f38 | ||
|
|
34d4abc4f9 | ||
|
|
c0d2e3212f | ||
|
|
eca3d16e54 | ||
|
|
1b64fae912 | ||
|
|
3d99b449d3 | ||
|
|
c647999c61 | ||
|
|
411f5527d4 | ||
|
|
aaa4822fcb | ||
|
|
623b93a2b3 | ||
|
|
23d3e4a3c4 | ||
|
|
597d8b4bd4 | ||
|
|
9a0c1456e3 | ||
|
|
c5de3c1059 | ||
|
|
3a20e74f40 | ||
|
|
866c1165af | ||
|
|
4a6b9a1a71 | ||
|
|
3a59e724c9 | ||
|
|
4511502ca6 | ||
|
|
bb6d93630f | ||
|
|
74f2e5f1a4 | ||
|
|
70188d69b0 | ||
|
|
3a6714ab7f | ||
|
|
6c0b1a9ee2 | ||
|
|
8c84a7e7ec | ||
|
|
4d10d3113f | ||
|
|
0ee310ebdc | ||
|
|
ee21e4ecd3 | ||
|
|
e68ee43a1a | ||
|
|
104b334e40 | ||
|
|
0ac8848eae | ||
|
|
edbf3144d2 | ||
|
|
f7debcaa04 | ||
|
|
a280822c82 | ||
|
|
bb8ef38152 | ||
|
|
058ec22787 | ||
|
|
f2d1aa97ad | ||
|
|
1ee1a5d3d6 | ||
|
|
8118ea160b | ||
|
|
b032ebdfee | ||
|
|
9d9ffe0f94 | ||
|
|
d0bdd26c26 | ||
|
|
e454f7e7a7 | ||
|
|
dd70e097cc | ||
|
|
87b272699f | ||
|
|
5616be12be | ||
|
|
5b37cecd76 | ||
|
|
1bb3122c1f | ||
|
|
5242b21524 | ||
|
|
9b06e823ee | ||
|
|
fc153f6bcd | ||
|
|
f57cbd9f24 | ||
|
|
326403518d | ||
|
|
099ac9c2a8 | ||
|
|
136c563f76 | ||
|
|
640f0d1cec | ||
|
|
3b2dee88e6 | ||
|
|
b8b6f94ba9 | ||
|
|
630fc12d4e | ||
|
|
d2b2557542 | ||
|
|
f3f056c3b6 | ||
|
|
31eb658fea | ||
|
|
b2af304918 | ||
|
|
32e6727dae | ||
|
|
b2c4521a91 | ||
|
|
b17fe82973 | ||
|
|
8bba84a401 | ||
|
|
606a62d4c7 | ||
|
|
efe7b7483b | ||
|
|
508242c267 | ||
|
|
623d9c47a2 | ||
|
|
c13e2f1b75 | ||
|
|
86345c126a | ||
|
|
1adabdac7f | ||
|
|
325e7eed3c | ||
|
|
487dd05170 | ||
|
|
6a5cf1473e | ||
|
|
d0a9caa08f | ||
|
|
1306608b2a | ||
|
|
5d0986a53b | ||
|
|
103809a0ca | ||
|
|
e2b67a868b | ||
|
|
ed4e324991 | ||
|
|
434856c636 | ||
|
|
ebaace294f | ||
|
|
a2dcc642c1 | ||
|
|
19d4f28001 | ||
|
|
891236124c | ||
|
|
3b0baf746e | ||
|
|
599274e3f0 | ||
|
|
cb75ccc1f7 | ||
|
|
235805edf3 | ||
|
|
ae61e47cba | ||
|
|
420cc13248 | ||
|
|
47a7c4f4fe | ||
|
|
cf4ee279c6 | ||
|
|
d0fc12684a | ||
|
|
4cff5dd194 | ||
|
|
9a36d8600c | ||
|
|
17455b7222 | ||
|
|
91dddca26e | ||
|
|
cf6a40fc12 | ||
|
|
a79f060ea2 | ||
|
|
1887afaf9e | ||
|
|
8d15f8b28e | ||
|
|
0a4be73b9b |
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -28,3 +28,6 @@
|
||||
[submodule "libzip"]
|
||||
path = externals/libzip/libzip
|
||||
url = https://github.com/nih-at/libzip.git
|
||||
[submodule "xbyak"]
|
||||
path = externals/xbyak
|
||||
url = https://github.com/herumi/xbyak.git
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.11)
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
|
||||
@@ -13,7 +13,7 @@ project(yuzu)
|
||||
option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)
|
||||
|
||||
option(ENABLE_QT "Enable the Qt frontend" ON)
|
||||
CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
|
||||
CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)
|
||||
|
||||
option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)
|
||||
|
||||
|
||||
8
externals/CMakeLists.txt
vendored
8
externals/CMakeLists.txt
vendored
@@ -75,3 +75,11 @@ if (ENABLE_WEB_SERVICE)
|
||||
target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
|
||||
target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
|
||||
endif()
|
||||
|
||||
if (NOT TARGET xbyak)
|
||||
if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
|
||||
add_library(xbyak INTERFACE)
|
||||
target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
|
||||
target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
2
externals/sirit
vendored
2
externals/sirit
vendored
Submodule externals/sirit updated: 414fc4dbd2...eefca56afd
1
externals/xbyak
vendored
Submodule
1
externals/xbyak
vendored
Submodule
Submodule externals/xbyak added at 82b70e6659
@@ -123,6 +123,8 @@ add_library(common STATIC
|
||||
lz4_compression.cpp
|
||||
lz4_compression.h
|
||||
math_util.h
|
||||
memory_detect.cpp
|
||||
memory_detect.h
|
||||
memory_hook.cpp
|
||||
memory_hook.h
|
||||
microprofile.cpp
|
||||
@@ -169,10 +171,12 @@ if(ARCHITECTURE_x86_64)
|
||||
PRIVATE
|
||||
x64/cpu_detect.cpp
|
||||
x64/cpu_detect.h
|
||||
x64/xbyak_abi.h
|
||||
x64/xbyak_util.h
|
||||
)
|
||||
endif()
|
||||
|
||||
create_target_directory_groups(common)
|
||||
|
||||
target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
|
||||
target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
|
||||
target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
|
||||
|
||||
60
src/common/memory_detect.cpp
Normal file
60
src/common/memory_detect.cpp
Normal file
@@ -0,0 +1,60 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#ifdef _WIN32
|
||||
// clang-format off
|
||||
#include <windows.h>
|
||||
#include <sysinfoapi.h>
|
||||
// clang-format on
|
||||
#else
|
||||
#include <sys/types.h>
|
||||
#ifdef __APPLE__
|
||||
#include <sys/sysctl.h>
|
||||
#else
|
||||
#include <sys/sysinfo.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "common/memory_detect.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
// Detects the RAM and Swapfile sizes
|
||||
static MemoryInfo Detect() {
|
||||
MemoryInfo mem_info{};
|
||||
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX memorystatus;
|
||||
memorystatus.dwLength = sizeof(memorystatus);
|
||||
GlobalMemoryStatusEx(&memorystatus);
|
||||
mem_info.TotalPhysicalMemory = memorystatus.ullTotalPhys;
|
||||
mem_info.TotalSwapMemory = memorystatus.ullTotalPageFile - mem_info.TotalPhysicalMemory;
|
||||
#elif defined(__APPLE__)
|
||||
u64 ramsize;
|
||||
struct xsw_usage vmusage;
|
||||
std::size_t sizeof_ramsize = sizeof(ramsize);
|
||||
std::size_t sizeof_vmusage = sizeof(vmusage);
|
||||
// hw and vm are defined in sysctl.h
|
||||
// https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
|
||||
// sysctlbyname(const char *, void *, size_t *, void *, size_t);
|
||||
sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, NULL, 0);
|
||||
sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, NULL, 0);
|
||||
mem_info.TotalPhysicalMemory = ramsize;
|
||||
mem_info.TotalSwapMemory = vmusage.xsu_total;
|
||||
#else
|
||||
struct sysinfo meminfo;
|
||||
sysinfo(&meminfo);
|
||||
mem_info.TotalPhysicalMemory = meminfo.totalram;
|
||||
mem_info.TotalSwapMemory = meminfo.totalswap;
|
||||
#endif
|
||||
|
||||
return mem_info;
|
||||
}
|
||||
|
||||
// Returns the host memory information. Detect() runs once, on the first call, and the
// stored result is handed out by reference on every call.
const MemoryInfo& GetMemInfo() {
    static const MemoryInfo cached_info{Detect()};
    return cached_info;
}
|
||||
|
||||
} // namespace Common
|
||||
22
src/common/memory_detect.h
Normal file
22
src/common/memory_detect.h
Normal file
@@ -0,0 +1,22 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
struct MemoryInfo {
|
||||
u64 TotalPhysicalMemory{};
|
||||
u64 TotalSwapMemory{};
|
||||
};
|
||||
|
||||
/**
|
||||
* Gets the memory info of the host system
|
||||
* @return Reference to a MemoryInfo struct with the physical and swap memory sizes in bytes
|
||||
*/
|
||||
const MemoryInfo& GetMemInfo();
|
||||
|
||||
} // namespace Common
|
||||
266
src/common/x64/xbyak_abi.h
Normal file
266
src/common/x64/xbyak_abi.h
Normal file
@@ -0,0 +1,266 @@
|
||||
// Copyright 2016 Citra Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <bitset>
|
||||
#include <initializer_list>
|
||||
#include <xbyak.h>
|
||||
#include "common/assert.h"
|
||||
|
||||
namespace Common::X64 {
|
||||
|
||||
/**
 * Maps a register to its bit position in the 32-entry register sets used by this header.
 *
 * General-purpose registers occupy positions 0-15 and all other accepted registers (XMM)
 * occupy positions 16-31.
 *
 * @param reg Register to map; asserted to be a GPR or XMM register with index below 16.
 * @return Bit position in the range [0, 32).
 */
inline int RegToIndex(const Xbyak::Reg& reg) {
    using Kind = Xbyak::Reg::Kind;
    ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
               "RegSet only supports GPRs and XMM registers.");
    // The index limit applies to both register files, not just XMM.
    ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports register indices 0-15.");
    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
}
|
||||
|
||||
/**
 * Converts a register-set bit position back into a 64-bit general-purpose register.
 *
 * @param reg_index Bit position; asserted to lie in [0, 16). The lower bound is checked as
 *                  well so a negative index cannot slip through.
 * @return The Reg64 with that index.
 */
inline Xbyak::Reg64 IndexToReg64(int reg_index) {
    ASSERT(reg_index >= 0 && reg_index < 16);
    return Xbyak::Reg64(reg_index);
}
|
||||
|
||||
/**
 * Converts a register-set bit position back into an XMM register.
 *
 * @param reg_index Bit position; asserted to lie in [16, 32).
 * @return The Xmm register numbered reg_index - 16.
 */
inline Xbyak::Xmm IndexToXmm(int reg_index) {
    ASSERT(reg_index >= 16 && reg_index < 32);
    const int xmm_number = reg_index - 16;
    return Xbyak::Xmm(xmm_number);
}
|
||||
|
||||
/**
 * Converts a register-set bit position into the matching register.
 *
 * Positions below 16 are handled by IndexToReg64; everything else goes to IndexToXmm.
 *
 * @param reg_index Bit position in the register set.
 * @return The register, returned as the common Xbyak::Reg base type.
 */
inline Xbyak::Reg IndexToReg(int reg_index) {
    if (reg_index >= 16) {
        return IndexToXmm(reg_index);
    }
    return IndexToReg64(reg_index);
}
|
||||
|
||||
/**
 * Builds a register set from a list of registers.
 *
 * @param regs Registers to include; each one is mapped through RegToIndex.
 * @return Bitset with the bit of every listed register set.
 */
inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
    std::bitset<32> reg_set;
    for (const Xbyak::Reg& entry : regs) {
        const int bit = RegToIndex(entry);
        reg_set[bit] = true;
    }
    return reg_set;
}
|
||||
|
||||
// Masks selecting the two halves of a register set: bits 0-15 hold the general-purpose
// registers and bits 16-31 hold the XMM registers (matching RegToIndex).
// Declared inline constexpr so the header provides a single, constant-initialized definition.
inline constexpr std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
inline constexpr std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
// Microsoft x64 ABI
|
||||
const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
|
||||
const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
|
||||
const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
|
||||
const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
|
||||
const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
|
||||
|
||||
const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rcx,
|
||||
Xbyak::util::rdx,
|
||||
Xbyak::util::r8,
|
||||
Xbyak::util::r9,
|
||||
Xbyak::util::r10,
|
||||
Xbyak::util::r11,
|
||||
// XMMs
|
||||
Xbyak::util::xmm0,
|
||||
Xbyak::util::xmm1,
|
||||
Xbyak::util::xmm2,
|
||||
Xbyak::util::xmm3,
|
||||
Xbyak::util::xmm4,
|
||||
Xbyak::util::xmm5,
|
||||
});
|
||||
|
||||
const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rbx,
|
||||
Xbyak::util::rsi,
|
||||
Xbyak::util::rdi,
|
||||
Xbyak::util::rbp,
|
||||
Xbyak::util::r12,
|
||||
Xbyak::util::r13,
|
||||
Xbyak::util::r14,
|
||||
Xbyak::util::r15,
|
||||
// XMMs
|
||||
Xbyak::util::xmm6,
|
||||
Xbyak::util::xmm7,
|
||||
Xbyak::util::xmm8,
|
||||
Xbyak::util::xmm9,
|
||||
Xbyak::util::xmm10,
|
||||
Xbyak::util::xmm11,
|
||||
Xbyak::util::xmm12,
|
||||
Xbyak::util::xmm13,
|
||||
Xbyak::util::xmm14,
|
||||
Xbyak::util::xmm15,
|
||||
});
|
||||
|
||||
constexpr size_t ABI_SHADOW_SPACE = 0x20;
|
||||
|
||||
#else
|
||||
|
||||
// System V x86-64 ABI
|
||||
const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
|
||||
const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
|
||||
const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
|
||||
const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
|
||||
const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
|
||||
|
||||
const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rcx,
|
||||
Xbyak::util::rdx,
|
||||
Xbyak::util::rdi,
|
||||
Xbyak::util::rsi,
|
||||
Xbyak::util::r8,
|
||||
Xbyak::util::r9,
|
||||
Xbyak::util::r10,
|
||||
Xbyak::util::r11,
|
||||
// XMMs
|
||||
Xbyak::util::xmm0,
|
||||
Xbyak::util::xmm1,
|
||||
Xbyak::util::xmm2,
|
||||
Xbyak::util::xmm3,
|
||||
Xbyak::util::xmm4,
|
||||
Xbyak::util::xmm5,
|
||||
Xbyak::util::xmm6,
|
||||
Xbyak::util::xmm7,
|
||||
Xbyak::util::xmm8,
|
||||
Xbyak::util::xmm9,
|
||||
Xbyak::util::xmm10,
|
||||
Xbyak::util::xmm11,
|
||||
Xbyak::util::xmm12,
|
||||
Xbyak::util::xmm13,
|
||||
Xbyak::util::xmm14,
|
||||
Xbyak::util::xmm15,
|
||||
});
|
||||
|
||||
const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
|
||||
// GPRs
|
||||
Xbyak::util::rbx,
|
||||
Xbyak::util::rbp,
|
||||
Xbyak::util::r12,
|
||||
Xbyak::util::r13,
|
||||
Xbyak::util::r14,
|
||||
Xbyak::util::r15,
|
||||
});
|
||||
|
||||
constexpr size_t ABI_SHADOW_SPACE = 0;
|
||||
|
||||
#endif
|
||||
|
||||
inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
|
||||
size_t needed_frame_size, s32* out_subtraction,
|
||||
s32* out_xmm_offset) {
|
||||
const auto count = (regs & ABI_ALL_GPRS).count();
|
||||
rsp_alignment -= count * 8;
|
||||
size_t subtraction = 0;
|
||||
const auto xmm_count = (regs & ABI_ALL_XMMS).count();
|
||||
if (xmm_count) {
|
||||
// If we have any XMMs to save, we must align the stack here.
|
||||
subtraction = rsp_alignment & 0xF;
|
||||
}
|
||||
subtraction += 0x10 * xmm_count;
|
||||
size_t xmm_base_subtraction = subtraction;
|
||||
subtraction += needed_frame_size;
|
||||
subtraction += ABI_SHADOW_SPACE;
|
||||
// Final alignment.
|
||||
rsp_alignment -= subtraction;
|
||||
subtraction += rsp_alignment & 0xF;
|
||||
|
||||
*out_subtraction = (s32)subtraction;
|
||||
*out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
|
||||
}
|
||||
|
||||
/**
 * Emits code that saves the requested registers and reserves the stack frame.
 *
 * The emitted sequence is: push every requested GPR in ascending index order, lower rsp by
 * the computed frame size, then store every requested XMM register into the frame with
 * movaps. Each GPR is pushed exactly once so that ABI_PopRegistersAndAdjustStack, which pops
 * each GPR once, restores the stack to its original position.
 *
 * @param code              Code generator to emit into.
 * @param regs              Registers to save.
 * @param rsp_alignment     Forwarded to ABI_CalculateFrameSize.
 * @param needed_frame_size Additional bytes to reserve in the frame.
 * @return ABI_SHADOW_SPACE.
 */
inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
    s32 subtraction = 0;
    s32 xmm_offset = 0;
    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);

    for (std::size_t i = 0; i < regs.size(); ++i) {
        if (regs[i] && ABI_ALL_GPRS[i]) {
            code.push(IndexToReg64(static_cast<int>(i)));
        }
    }

    if (subtraction != 0) {
        code.sub(code.rsp, subtraction);
    }

    // XMM slots start at xmm_offset above the adjusted rsp and are 16 bytes apart.
    for (std::size_t i = 0; i < regs.size(); ++i) {
        if (regs[i] && ABI_ALL_XMMS[i]) {
            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
            xmm_offset += 0x10;
        }
    }

    return ABI_SHADOW_SPACE;
}
|
||||
|
||||
/**
 * Emits code that restores the requested registers and releases the stack frame.
 *
 * The emitted sequence is: reload every requested XMM register from the frame with movaps,
 * raise rsp by the computed frame size, then pop the requested GPRs in descending index
 * order.
 *
 * @param code              Code generator to emit into.
 * @param regs              Registers to restore.
 * @param rsp_alignment     Forwarded to ABI_CalculateFrameSize.
 * @param needed_frame_size Additional bytes that were reserved in the frame.
 */
inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
    s32 frame_bytes;
    s32 xmm_slot;
    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &frame_bytes, &xmm_slot);

    const std::bitset<32> saved_xmms = regs & ABI_ALL_XMMS;
    for (std::size_t bit = 0; bit < saved_xmms.size(); ++bit) {
        if (!saved_xmms[bit]) {
            continue;
        }
        code.movaps(IndexToXmm(static_cast<int>(bit)), code.xword[code.rsp + xmm_slot]);
        xmm_slot += 0x10;
    }

    if (frame_bytes != 0) {
        code.add(code.rsp, frame_bytes);
    }

    // GPRs need to be popped in reverse order
    for (int gpr = 15; gpr >= 0; --gpr) {
        if (!regs[gpr]) {
            continue;
        }
        code.pop(IndexToReg64(gpr));
    }
}
|
||||
|
||||
/**
 * Emits code that pushes the requested GPRs and reserves the stack frame, without storing
 * any XMM registers.
 *
 * The frame size still comes from ABI_CalculateFrameSize for the full register set; only
 * the XMM stores are omitted.
 *
 * @param code              Code generator to emit into.
 * @param regs              Register set; only its GPR bits cause pushes.
 * @param rsp_alignment     Forwarded to ABI_CalculateFrameSize.
 * @param needed_frame_size Additional bytes to reserve in the frame.
 * @return ABI_SHADOW_SPACE.
 */
inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                                 size_t rsp_alignment,
                                                 size_t needed_frame_size = 0) {
    s32 frame_bytes;
    s32 xmm_slot;
    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &frame_bytes, &xmm_slot);

    const std::bitset<32> saved_gprs = regs & ABI_ALL_GPRS;
    for (std::size_t bit = 0; bit < saved_gprs.size(); ++bit) {
        if (!saved_gprs[bit]) {
            continue;
        }
        code.push(IndexToReg64(static_cast<int>(bit)));
    }

    if (frame_bytes != 0) {
        code.sub(code.rsp, frame_bytes);
    }

    return ABI_SHADOW_SPACE;
}
|
||||
|
||||
/**
 * Emits code that releases the stack frame and pops the requested GPRs, without reloading
 * any XMM registers.
 *
 * @param code              Code generator to emit into.
 * @param regs              Register set; bits 0-15 select the GPRs to pop.
 * @param rsp_alignment     Forwarded to ABI_CalculateFrameSize.
 * @param needed_frame_size Additional bytes that were reserved in the frame.
 */
inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
    s32 frame_bytes;
    s32 xmm_slot;
    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &frame_bytes, &xmm_slot);

    if (frame_bytes != 0) {
        code.add(code.rsp, frame_bytes);
    }

    // GPRs need to be popped in reverse order
    for (int gpr = 15; gpr >= 0; --gpr) {
        if (!regs[gpr]) {
            continue;
        }
        code.pop(IndexToReg64(gpr));
    }
}
|
||||
|
||||
} // namespace Common::X64
|
||||
47
src/common/x64/xbyak_util.h
Normal file
47
src/common/x64/xbyak_util.h
Normal file
@@ -0,0 +1,47 @@
|
||||
// Copyright 2016 Citra Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <type_traits>
|
||||
#include <xbyak.h>
|
||||
#include "common/x64/xbyak_abi.h"
|
||||
|
||||
namespace Common::X64 {
|
||||
|
||||
// Comparison selectors for use with cmpps/cmpss. The numeric values are significant, so the
// enumerators must stay in this order, counting up from zero.
enum {
    CMP_EQ = 0, // 0
    CMP_LT,     // 1
    CMP_LE,     // 2
    CMP_UNORD,  // 3
    CMP_NEQ,    // 4
    CMP_NLT,    // 5
    CMP_NLE,    // 6
    CMP_ORD,    // 7
};
|
||||
|
||||
constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
|
||||
const u64 distance = target - (ref + 5);
|
||||
return !(distance >= 0x8000'0000ULL && distance <= ~0x8000'0000ULL);
|
||||
}
|
||||
|
||||
/**
 * Checks whether target is within 32-bit displacement range of the generator's current
 * emit position.
 *
 * @param code   Code generator whose current position (getCurr) is used as the reference.
 * @param target Destination address.
 * @return Result of the address-based IsWithin2G overload.
 */
inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
    const auto emit_position = reinterpret_cast<uintptr_t>(code.getCurr());
    return IsWithin2G(emit_position, target);
}
|
||||
|
||||
template <typename T>
|
||||
inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
|
||||
static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
|
||||
size_t addr = reinterpret_cast<size_t>(f);
|
||||
if (IsWithin2G(code, addr)) {
|
||||
code.call(f);
|
||||
} else {
|
||||
// ABI_RETURN is a safe temp register to use before a call
|
||||
code.mov(ABI_RETURN, addr);
|
||||
code.call(ABI_RETURN);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Common::X64
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "common/file_util.h"
|
||||
#include "common/hex_util.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "common/string_util.h"
|
||||
#include "core/core.h"
|
||||
#include "core/file_sys/content_archive.h"
|
||||
#include "core/file_sys/control_metadata.h"
|
||||
@@ -48,6 +49,23 @@ std::string FormatTitleVersion(u32 version, TitleVersionFormat format) {
|
||||
return fmt::format("v{}.{}.{}", bytes[3], bytes[2], bytes[1]);
|
||||
}
|
||||
|
||||
std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
|
||||
std::string_view name) {
|
||||
#ifdef _WIN32
|
||||
return dir->GetSubdirectory(name);
|
||||
#else
|
||||
const auto subdirs = dir->GetSubdirectories();
|
||||
for (const auto& subdir : subdirs) {
|
||||
std::string dir_name = Common::ToLower(subdir->GetName());
|
||||
if (dir_name == name) {
|
||||
return subdir;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
PatchManager::PatchManager(u64 title_id) : title_id(title_id) {}
|
||||
|
||||
PatchManager::~PatchManager() = default;
|
||||
@@ -104,7 +122,7 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
|
||||
if (std::find(disabled.begin(), disabled.end(), subdir->GetName()) != disabled.end())
|
||||
continue;
|
||||
|
||||
auto exefs_dir = subdir->GetSubdirectory("exefs");
|
||||
auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
|
||||
if (exefs_dir != nullptr)
|
||||
layers.push_back(std::move(exefs_dir));
|
||||
}
|
||||
@@ -130,7 +148,7 @@ std::vector<VirtualFile> PatchManager::CollectPatches(const std::vector<VirtualD
|
||||
if (std::find(disabled.cbegin(), disabled.cend(), subdir->GetName()) != disabled.cend())
|
||||
continue;
|
||||
|
||||
auto exefs_dir = subdir->GetSubdirectory("exefs");
|
||||
auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
|
||||
if (exefs_dir != nullptr) {
|
||||
for (const auto& file : exefs_dir->GetFiles()) {
|
||||
if (file->GetExtension() == "ips") {
|
||||
@@ -295,7 +313,7 @@ std::vector<Core::Memory::CheatEntry> PatchManager::CreateCheatList(
|
||||
continue;
|
||||
}
|
||||
|
||||
auto cheats_dir = subdir->GetSubdirectory("cheats");
|
||||
auto cheats_dir = FindSubdirectoryCaseless(subdir, "cheats");
|
||||
if (cheats_dir != nullptr) {
|
||||
auto res = ReadCheatFileFromFolder(system, title_id, build_id_, cheats_dir, true);
|
||||
if (res.has_value()) {
|
||||
@@ -340,11 +358,11 @@ static void ApplyLayeredFS(VirtualFile& romfs, u64 title_id, ContentRecordType t
|
||||
continue;
|
||||
}
|
||||
|
||||
auto romfs_dir = subdir->GetSubdirectory("romfs");
|
||||
auto romfs_dir = FindSubdirectoryCaseless(subdir, "romfs");
|
||||
if (romfs_dir != nullptr)
|
||||
layers.push_back(std::move(romfs_dir));
|
||||
|
||||
auto ext_dir = subdir->GetSubdirectory("romfs_ext");
|
||||
auto ext_dir = FindSubdirectoryCaseless(subdir, "romfs_ext");
|
||||
if (ext_dir != nullptr)
|
||||
layers_ext.push_back(std::move(ext_dir));
|
||||
}
|
||||
@@ -470,7 +488,7 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
|
||||
for (const auto& mod : mod_dir->GetSubdirectories()) {
|
||||
std::string types;
|
||||
|
||||
const auto exefs_dir = mod->GetSubdirectory("exefs");
|
||||
const auto exefs_dir = FindSubdirectoryCaseless(mod, "exefs");
|
||||
if (IsDirValidAndNonEmpty(exefs_dir)) {
|
||||
bool ips = false;
|
||||
bool ipswitch = false;
|
||||
@@ -494,9 +512,9 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
|
||||
if (layeredfs)
|
||||
AppendCommaIfNotEmpty(types, "LayeredExeFS");
|
||||
}
|
||||
if (IsDirValidAndNonEmpty(mod->GetSubdirectory("romfs")))
|
||||
if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "romfs")))
|
||||
AppendCommaIfNotEmpty(types, "LayeredFS");
|
||||
if (IsDirValidAndNonEmpty(mod->GetSubdirectory("cheats")))
|
||||
if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "cheats")))
|
||||
AppendCommaIfNotEmpty(types, "Cheats");
|
||||
|
||||
if (types.empty())
|
||||
|
||||
@@ -29,6 +29,11 @@ enum class TitleVersionFormat : u8 {
|
||||
std::string FormatTitleVersion(u32 version,
|
||||
TitleVersionFormat format = TitleVersionFormat::ThreeElements);
|
||||
|
||||
// Returns the subdirectory of dir whose name matches name, ignoring case. Returns nullptr if
// dir has no such subdirectory.
|
||||
std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
|
||||
std::string_view name);
|
||||
|
||||
// A centralized class to manage patches to games.
|
||||
class PatchManager {
|
||||
public:
|
||||
|
||||
@@ -12,17 +12,17 @@ namespace SystemVersionData {
|
||||
// This section should reflect the best system version to describe yuzu's HLE api.
|
||||
// TODO(DarkLordZach): Update when HLE gets better.
|
||||
|
||||
constexpr u8 VERSION_MAJOR = 5;
|
||||
constexpr u8 VERSION_MINOR = 1;
|
||||
constexpr u8 VERSION_MICRO = 0;
|
||||
constexpr u8 VERSION_MAJOR = 10;
|
||||
constexpr u8 VERSION_MINOR = 0;
|
||||
constexpr u8 VERSION_MICRO = 2;
|
||||
|
||||
constexpr u8 REVISION_MAJOR = 3;
|
||||
constexpr u8 REVISION_MAJOR = 1;
|
||||
constexpr u8 REVISION_MINOR = 0;
|
||||
|
||||
constexpr char PLATFORM_STRING[] = "NX";
|
||||
constexpr char VERSION_HASH[] = "23f9df53e25709d756e0c76effcb2473bd3447dd";
|
||||
constexpr char DISPLAY_VERSION[] = "5.1.0";
|
||||
constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 5.1.0-3.0";
|
||||
constexpr char VERSION_HASH[] = "f90143fa8bbc061d4f68c35f95f04f8080c0ecdc";
|
||||
constexpr char DISPLAY_VERSION[] = "10.0.2";
|
||||
constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 10.0.2-1.0";
|
||||
|
||||
} // namespace SystemVersionData
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {
|
||||
|
||||
ResultCode ReadableEvent::Reset() {
|
||||
if (!is_signaled) {
|
||||
LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
|
||||
LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
|
||||
GetObjectId(), GetTypeName(), GetName());
|
||||
return ERR_INVALID_STATE;
|
||||
}
|
||||
|
||||
@@ -38,10 +38,11 @@ void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing,
|
||||
cur_entry.sampling_number = last_entry.sampling_number + 1;
|
||||
cur_entry.sampling_number2 = cur_entry.sampling_number;
|
||||
|
||||
cur_entry.key.fill(0);
|
||||
cur_entry.modifier = 0;
|
||||
|
||||
for (std::size_t i = 0; i < keyboard_keys.size(); ++i) {
|
||||
for (std::size_t k = 0; k < KEYS_PER_BYTE; ++k) {
|
||||
cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << k);
|
||||
}
|
||||
cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << (i % KEYS_PER_BYTE));
|
||||
}
|
||||
|
||||
for (std::size_t i = 0; i < keyboard_mods.size(); ++i) {
|
||||
|
||||
@@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
|
||||
{40, nullptr, "AcquireXpadIdEventHandle"},
|
||||
{41, nullptr, "ReleaseXpadIdEventHandle"},
|
||||
{51, &Hid::ActivateXpad, "ActivateXpad"},
|
||||
{55, nullptr, "GetXpadIds"},
|
||||
{55, &Hid::GetXpadIDs, "GetXpadIds"},
|
||||
{56, nullptr, "ActivateJoyXpad"},
|
||||
{58, nullptr, "GetJoyXpadLifoHandle"},
|
||||
{59, nullptr, "GetJoyXpadIds"},
|
||||
@@ -319,6 +319,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
|
||||
rb.Push(RESULT_SUCCESS);
|
||||
}
|
||||
|
||||
// Stubbed handler registered for command 55 ("GetXpadIds"). Reads the applet resource user
// id from the request, logs it, and replies with RESULT_SUCCESS followed by a single 0
// (presumably the number of IDs returned -- TODO confirm).
void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser request{ctx};
    const auto applet_resource_user_id = request.Pop<u64>();

    LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);

    IPC::ResponseBuilder response{ctx, 3};
    response.Push(RESULT_SUCCESS);
    response.Push(0);
}
|
||||
|
||||
void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
|
||||
IPC::RequestParser rp{ctx};
|
||||
const auto applet_resource_user_id{rp.Pop<u64>()};
|
||||
|
||||
@@ -86,6 +86,7 @@ public:
|
||||
private:
|
||||
void CreateAppletResource(Kernel::HLERequestContext& ctx);
|
||||
void ActivateXpad(Kernel::HLERequestContext& ctx);
|
||||
void GetXpadIDs(Kernel::HLERequestContext& ctx);
|
||||
void ActivateDebugPad(Kernel::HLERequestContext& ctx);
|
||||
void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
|
||||
void ActivateMouse(Kernel::HLERequestContext& ctx);
|
||||
|
||||
@@ -177,7 +177,8 @@ private:
|
||||
void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
|
||||
LOG_DEBUG(Service_NIFM, "called");
|
||||
|
||||
ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size");
|
||||
ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
|
||||
"SfNetworkProfileData is not the correct size");
|
||||
u128 uuid{};
|
||||
auto buffer = ctx.ReadBuffer();
|
||||
std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));
|
||||
|
||||
@@ -112,6 +112,7 @@ void LogSettings() {
|
||||
LogSetting("Renderer_UseAsynchronousGpuEmulation",
|
||||
Settings::values.use_asynchronous_gpu_emulation);
|
||||
LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
|
||||
LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
|
||||
LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
|
||||
LogSetting("Audio_OutputEngine", Settings::values.sink_id);
|
||||
LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
|
||||
|
||||
@@ -446,6 +446,7 @@ struct Values {
|
||||
GPUAccuracy gpu_accuracy;
|
||||
bool use_asynchronous_gpu_emulation;
|
||||
bool use_vsync;
|
||||
bool use_assembly_shaders;
|
||||
bool force_30fps_mode;
|
||||
bool use_fast_gpu_time;
|
||||
|
||||
@@ -473,6 +474,7 @@ struct Values {
|
||||
bool reporting_services;
|
||||
bool quest_flag;
|
||||
bool disable_cpu_opt;
|
||||
bool disable_macro_jit;
|
||||
|
||||
// BCAT
|
||||
std::string bcat_backend;
|
||||
|
||||
@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
|
||||
AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
|
||||
Settings::values.use_asynchronous_gpu_emulation);
|
||||
AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
|
||||
AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
|
||||
AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
add_library(video_core STATIC
|
||||
buffer_cache/buffer_block.h
|
||||
buffer_cache/buffer_cache.h
|
||||
buffer_cache/map_interval.cpp
|
||||
buffer_cache/map_interval.h
|
||||
dirty_flags.cpp
|
||||
dirty_flags.h
|
||||
@@ -24,6 +25,12 @@ add_library(video_core STATIC
|
||||
engines/shader_bytecode.h
|
||||
engines/shader_header.h
|
||||
engines/shader_type.h
|
||||
macro/macro.cpp
|
||||
macro/macro.h
|
||||
macro/macro_interpreter.cpp
|
||||
macro/macro_interpreter.h
|
||||
macro/macro_jit_x64.cpp
|
||||
macro/macro_jit_x64.h
|
||||
fence_manager.h
|
||||
gpu.cpp
|
||||
gpu.h
|
||||
@@ -35,8 +42,6 @@ add_library(video_core STATIC
|
||||
gpu_thread.h
|
||||
guest_driver.cpp
|
||||
guest_driver.h
|
||||
macro_interpreter.cpp
|
||||
macro_interpreter.h
|
||||
memory_manager.cpp
|
||||
memory_manager.h
|
||||
morton.cpp
|
||||
@@ -228,7 +233,7 @@ endif()
|
||||
create_target_directory_groups(video_core)
|
||||
|
||||
target_link_libraries(video_core PUBLIC common core)
|
||||
target_link_libraries(video_core PRIVATE glad)
|
||||
target_link_libraries(video_core PRIVATE glad xbyak)
|
||||
|
||||
if (ENABLE_VULKAN)
|
||||
target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
|
||||
|
||||
@@ -15,48 +15,47 @@ namespace VideoCommon {
|
||||
|
||||
class BufferBlock {
|
||||
public:
|
||||
bool Overlaps(const VAddr start, const VAddr end) const {
|
||||
bool Overlaps(VAddr start, VAddr end) const {
|
||||
return (cpu_addr < end) && (cpu_addr_end > start);
|
||||
}
|
||||
|
||||
bool IsInside(const VAddr other_start, const VAddr other_end) const {
|
||||
bool IsInside(VAddr other_start, VAddr other_end) const {
|
||||
return cpu_addr <= other_start && other_end <= cpu_addr_end;
|
||||
}
|
||||
|
||||
std::size_t GetOffset(const VAddr in_addr) {
|
||||
std::size_t Offset(VAddr in_addr) const {
|
||||
return static_cast<std::size_t>(in_addr - cpu_addr);
|
||||
}
|
||||
|
||||
VAddr GetCpuAddr() const {
|
||||
VAddr CpuAddr() const {
|
||||
return cpu_addr;
|
||||
}
|
||||
|
||||
VAddr GetCpuAddrEnd() const {
|
||||
VAddr CpuAddrEnd() const {
|
||||
return cpu_addr_end;
|
||||
}
|
||||
|
||||
void SetCpuAddr(const VAddr new_addr) {
|
||||
void SetCpuAddr(VAddr new_addr) {
|
||||
cpu_addr = new_addr;
|
||||
cpu_addr_end = new_addr + size;
|
||||
}
|
||||
|
||||
std::size_t GetSize() const {
|
||||
std::size_t Size() const {
|
||||
return size;
|
||||
}
|
||||
|
||||
u64 Epoch() const {
|
||||
return epoch;
|
||||
}
|
||||
|
||||
void SetEpoch(u64 new_epoch) {
|
||||
epoch = new_epoch;
|
||||
}
|
||||
|
||||
u64 GetEpoch() {
|
||||
return epoch;
|
||||
}
|
||||
|
||||
protected:
|
||||
explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
|
||||
SetCpuAddr(cpu_addr);
|
||||
explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
|
||||
SetCpuAddr(cpu_addr_);
|
||||
}
|
||||
~BufferBlock() = default;
|
||||
|
||||
private:
|
||||
VAddr cpu_addr{};
|
||||
|
||||
@@ -12,11 +12,12 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/icl/interval_map.hpp>
|
||||
#include <boost/container/small_vector.hpp>
|
||||
#include <boost/icl/interval_set.hpp>
|
||||
#include <boost/range/iterator_range.hpp>
|
||||
#include <boost/intrusive/set.hpp>
|
||||
|
||||
#include "common/alignment.h"
|
||||
#include "common/assert.h"
|
||||
#include "common/common_types.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "core/core.h"
|
||||
@@ -29,10 +30,16 @@
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
using MapInterval = std::shared_ptr<MapIntervalBase>;
|
||||
|
||||
template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
|
||||
template <typename Buffer, typename BufferType, typename StreamBuffer>
|
||||
class BufferCache {
|
||||
using IntervalSet = boost::icl::interval_set<VAddr>;
|
||||
using IntervalType = typename IntervalSet::interval_type;
|
||||
using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
|
||||
|
||||
static constexpr u64 WRITE_PAGE_BIT = 11;
|
||||
static constexpr u64 BLOCK_PAGE_BITS = 21;
|
||||
static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
|
||||
|
||||
public:
|
||||
using BufferInfo = std::pair<BufferType, u64>;
|
||||
|
||||
@@ -40,14 +47,12 @@ public:
|
||||
bool is_written = false, bool use_fast_cbuf = false) {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
const std::optional<VAddr> cpu_addr_opt =
|
||||
system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
|
||||
|
||||
const auto& memory_manager = system.GPU().MemoryManager();
|
||||
const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
|
||||
if (!cpu_addr_opt) {
|
||||
return {GetEmptyBuffer(size), 0};
|
||||
}
|
||||
|
||||
VAddr cpu_addr = *cpu_addr_opt;
|
||||
const VAddr cpu_addr = *cpu_addr_opt;
|
||||
|
||||
// Cache management is a big overhead, so only cache entries with a given size.
|
||||
// TODO: Figure out which size is the best for given games.
|
||||
@@ -55,49 +60,58 @@ public:
|
||||
if (use_fast_cbuf || size < max_stream_size) {
|
||||
if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
|
||||
auto& memory_manager = system.GPU().MemoryManager();
|
||||
const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
|
||||
if (use_fast_cbuf) {
|
||||
if (memory_manager.IsGranularRange(gpu_addr, size)) {
|
||||
const auto host_ptr = memory_manager.GetPointer(gpu_addr);
|
||||
return ConstBufferUpload(host_ptr, size);
|
||||
u8* dest;
|
||||
if (is_granular) {
|
||||
dest = memory_manager.GetPointer(gpu_addr);
|
||||
} else {
|
||||
staging_buffer.resize(size);
|
||||
memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
|
||||
return ConstBufferUpload(staging_buffer.data(), size);
|
||||
dest = staging_buffer.data();
|
||||
memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
|
||||
}
|
||||
return ConstBufferUpload(dest, size);
|
||||
}
|
||||
if (is_granular) {
|
||||
u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
|
||||
return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
|
||||
std::memcpy(dest, host_ptr, size);
|
||||
});
|
||||
} else {
|
||||
if (memory_manager.IsGranularRange(gpu_addr, size)) {
|
||||
const auto host_ptr = memory_manager.GetPointer(gpu_addr);
|
||||
return StreamBufferUpload(host_ptr, size, alignment);
|
||||
} else {
|
||||
staging_buffer.resize(size);
|
||||
memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
|
||||
return StreamBufferUpload(staging_buffer.data(), size, alignment);
|
||||
}
|
||||
return StreamBufferUpload(
|
||||
size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
|
||||
memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto block = GetBlock(cpu_addr, size);
|
||||
auto map = MapAddress(block, gpu_addr, cpu_addr, size);
|
||||
Buffer* const block = GetBlock(cpu_addr, size);
|
||||
MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
|
||||
if (!map) {
|
||||
return {GetEmptyBuffer(size), 0};
|
||||
}
|
||||
if (is_written) {
|
||||
map->MarkAsModified(true, GetModifiedTicks());
|
||||
if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
|
||||
MarkForAsyncFlush(map);
|
||||
}
|
||||
if (!map->IsWritten()) {
|
||||
map->MarkAsWritten(true);
|
||||
MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
|
||||
if (!map->is_written) {
|
||||
map->is_written = true;
|
||||
MarkRegionAsWritten(map->start, map->end - 1);
|
||||
}
|
||||
}
|
||||
|
||||
return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
|
||||
return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};
|
||||
}
|
||||
|
||||
/// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
|
||||
BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
|
||||
std::size_t alignment = 4) {
|
||||
std::lock_guard lock{mutex};
|
||||
return StreamBufferUpload(raw_pointer, size, alignment);
|
||||
return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
|
||||
std::memcpy(dest, raw_pointer, size);
|
||||
});
|
||||
}
|
||||
|
||||
void Map(std::size_t max_size) {
|
||||
@@ -115,16 +129,18 @@ public:
|
||||
return std::exchange(invalidated, false);
|
||||
}
|
||||
|
||||
/// Function called at the end of each frame, intended for deferred operations
|
||||
void TickFrame() {
|
||||
++epoch;
|
||||
|
||||
while (!pending_destruction.empty()) {
|
||||
// Delay at least 4 frames before destruction.
|
||||
// This is due to triple buffering happening on some drivers.
|
||||
static constexpr u64 epochs_to_destroy = 5;
|
||||
if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
|
||||
if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
|
||||
break;
|
||||
}
|
||||
pending_destruction.pop_front();
|
||||
pending_destruction.pop();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,12 +148,11 @@ public:
|
||||
void FlushRegion(VAddr addr, std::size_t size) {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
std::vector<MapInterval> objects = GetMapsInRange(addr, size);
|
||||
std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
|
||||
return a->GetModificationTick() < b->GetModificationTick();
|
||||
});
|
||||
for (auto& object : objects) {
|
||||
if (object->IsModified() && object->IsRegistered()) {
|
||||
VectorMapInterval objects = GetMapsInRange(addr, size);
|
||||
std::sort(objects.begin(), objects.end(),
|
||||
[](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
|
||||
for (MapInterval* object : objects) {
|
||||
if (object->is_modified && object->is_registered) {
|
||||
mutex.unlock();
|
||||
FlushMap(object);
|
||||
mutex.lock();
|
||||
@@ -148,9 +163,9 @@ public:
|
||||
bool MustFlushRegion(VAddr addr, std::size_t size) {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
const std::vector<MapInterval> objects = GetMapsInRange(addr, size);
|
||||
return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) {
|
||||
return map->IsModified() && map->IsRegistered();
|
||||
const VectorMapInterval objects = GetMapsInRange(addr, size);
|
||||
return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
|
||||
return map->is_modified && map->is_registered;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -158,9 +173,8 @@ public:
|
||||
void InvalidateRegion(VAddr addr, u64 size) {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
std::vector<MapInterval> objects = GetMapsInRange(addr, size);
|
||||
for (auto& object : objects) {
|
||||
if (object->IsRegistered()) {
|
||||
for (auto& object : GetMapsInRange(addr, size)) {
|
||||
if (object->is_registered) {
|
||||
Unregister(object);
|
||||
}
|
||||
}
|
||||
@@ -169,10 +183,10 @@ public:
|
||||
void OnCPUWrite(VAddr addr, std::size_t size) {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
for (const auto& object : GetMapsInRange(addr, size)) {
|
||||
if (object->IsMemoryMarked() && object->IsRegistered()) {
|
||||
for (MapInterval* object : GetMapsInRange(addr, size)) {
|
||||
if (object->is_memory_marked && object->is_registered) {
|
||||
UnmarkMemory(object);
|
||||
object->SetSyncPending(true);
|
||||
object->is_sync_pending = true;
|
||||
marked_for_unregister.emplace_back(object);
|
||||
}
|
||||
}
|
||||
@@ -181,9 +195,9 @@ public:
|
||||
void SyncGuestHost() {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
for (const auto& object : marked_for_unregister) {
|
||||
if (object->IsRegistered()) {
|
||||
object->SetSyncPending(false);
|
||||
for (auto& object : marked_for_unregister) {
|
||||
if (object->is_registered) {
|
||||
object->is_sync_pending = false;
|
||||
Unregister(object);
|
||||
}
|
||||
}
|
||||
@@ -192,9 +206,9 @@ public:
|
||||
|
||||
void CommitAsyncFlushes() {
|
||||
if (uncommitted_flushes) {
|
||||
auto commit_list = std::make_shared<std::list<MapInterval>>();
|
||||
for (auto& map : *uncommitted_flushes) {
|
||||
if (map->IsRegistered() && map->IsModified()) {
|
||||
auto commit_list = std::make_shared<std::list<MapInterval*>>();
|
||||
for (MapInterval* map : *uncommitted_flushes) {
|
||||
if (map->is_registered && map->is_modified) {
|
||||
// TODO(Blinkhawk): Implement backend asynchronous flushing
|
||||
// AsyncFlushMap(map)
|
||||
commit_list->push_back(map);
|
||||
@@ -228,8 +242,8 @@ public:
|
||||
committed_flushes.pop_front();
|
||||
return;
|
||||
}
|
||||
for (MapInterval& map : *flush_list) {
|
||||
if (map->IsRegistered()) {
|
||||
for (MapInterval* map : *flush_list) {
|
||||
if (map->is_registered) {
|
||||
// TODO(Blinkhawk): Replace this for reading the asynchronous flush
|
||||
FlushMap(map);
|
||||
}
|
||||
@@ -241,23 +255,21 @@ public:
|
||||
|
||||
protected:
|
||||
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
|
||||
std::unique_ptr<StreamBuffer> stream_buffer)
|
||||
: rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
|
||||
stream_buffer_handle{this->stream_buffer->GetHandle()} {}
|
||||
std::unique_ptr<StreamBuffer> stream_buffer_)
|
||||
: rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)},
|
||||
stream_buffer_handle{stream_buffer->Handle()} {}
|
||||
|
||||
~BufferCache() = default;
|
||||
|
||||
virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
|
||||
virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
|
||||
|
||||
virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
|
||||
|
||||
virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
|
||||
virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
|
||||
const u8* data) = 0;
|
||||
|
||||
virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
|
||||
virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
|
||||
u8* data) = 0;
|
||||
|
||||
virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
|
||||
virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
|
||||
std::size_t dst_offset, std::size_t size) = 0;
|
||||
|
||||
virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
|
||||
@@ -265,76 +277,74 @@ protected:
|
||||
}
|
||||
|
||||
/// Register an object into the cache
|
||||
void Register(const MapInterval& new_map, bool inherit_written = false) {
|
||||
const VAddr cpu_addr = new_map->GetStart();
|
||||
MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
|
||||
const VAddr cpu_addr = new_map.start;
|
||||
if (!cpu_addr) {
|
||||
LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
|
||||
new_map->GetGpuAddress());
|
||||
return;
|
||||
new_map.gpu_addr);
|
||||
return nullptr;
|
||||
}
|
||||
const std::size_t size = new_map->GetEnd() - new_map->GetStart();
|
||||
new_map->MarkAsRegistered(true);
|
||||
const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
|
||||
mapped_addresses.insert({interval, new_map});
|
||||
const std::size_t size = new_map.end - new_map.start;
|
||||
new_map.is_registered = true;
|
||||
rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
|
||||
new_map->SetMemoryMarked(true);
|
||||
new_map.is_memory_marked = true;
|
||||
if (inherit_written) {
|
||||
MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
|
||||
new_map->MarkAsWritten(true);
|
||||
MarkRegionAsWritten(new_map.start, new_map.end - 1);
|
||||
new_map.is_written = true;
|
||||
}
|
||||
MapInterval* const storage = mapped_addresses_allocator.Allocate();
|
||||
*storage = new_map;
|
||||
mapped_addresses.insert(*storage);
|
||||
return storage;
|
||||
}
|
||||
|
||||
void UnmarkMemory(const MapInterval& map) {
|
||||
if (!map->IsMemoryMarked()) {
|
||||
void UnmarkMemory(MapInterval* map) {
|
||||
if (!map->is_memory_marked) {
|
||||
return;
|
||||
}
|
||||
const std::size_t size = map->GetEnd() - map->GetStart();
|
||||
rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1);
|
||||
map->SetMemoryMarked(false);
|
||||
const std::size_t size = map->end - map->start;
|
||||
rasterizer.UpdatePagesCachedCount(map->start, size, -1);
|
||||
map->is_memory_marked = false;
|
||||
}
|
||||
|
||||
/// Unregisters an object from the cache
|
||||
void Unregister(const MapInterval& map) {
|
||||
void Unregister(MapInterval* map) {
|
||||
UnmarkMemory(map);
|
||||
map->MarkAsRegistered(false);
|
||||
if (map->IsSyncPending()) {
|
||||
map->is_registered = false;
|
||||
if (map->is_sync_pending) {
|
||||
map->is_sync_pending = false;
|
||||
marked_for_unregister.remove(map);
|
||||
map->SetSyncPending(false);
|
||||
}
|
||||
if (map->IsWritten()) {
|
||||
UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
|
||||
if (map->is_written) {
|
||||
UnmarkRegionAsWritten(map->start, map->end - 1);
|
||||
}
|
||||
const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
|
||||
mapped_addresses.erase(delete_interval);
|
||||
const auto it = mapped_addresses.find(*map);
|
||||
ASSERT(it != mapped_addresses.end());
|
||||
mapped_addresses.erase(it);
|
||||
mapped_addresses_allocator.Release(map);
|
||||
}
|
||||
|
||||
private:
|
||||
MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) {
|
||||
return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
|
||||
}
|
||||
|
||||
MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
|
||||
const std::size_t size) {
|
||||
std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
|
||||
MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
|
||||
std::size_t size) {
|
||||
const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
|
||||
if (overlaps.empty()) {
|
||||
auto& memory_manager = system.GPU().MemoryManager();
|
||||
const VAddr cpu_addr_end = cpu_addr + size;
|
||||
MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
|
||||
if (memory_manager.IsGranularRange(gpu_addr, size)) {
|
||||
u8* host_ptr = memory_manager.GetPointer(gpu_addr);
|
||||
UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
|
||||
UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);
|
||||
} else {
|
||||
staging_buffer.resize(size);
|
||||
memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
|
||||
UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
|
||||
UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());
|
||||
}
|
||||
Register(new_map);
|
||||
return new_map;
|
||||
return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
|
||||
}
|
||||
|
||||
const VAddr cpu_addr_end = cpu_addr + size;
|
||||
if (overlaps.size() == 1) {
|
||||
MapInterval& current_map = overlaps[0];
|
||||
MapInterval* const current_map = overlaps[0];
|
||||
if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
|
||||
return current_map;
|
||||
}
|
||||
@@ -344,60 +354,70 @@ private:
|
||||
bool write_inheritance = false;
|
||||
bool modified_inheritance = false;
|
||||
// Calculate new buffer parameters
|
||||
for (auto& overlap : overlaps) {
|
||||
new_start = std::min(overlap->GetStart(), new_start);
|
||||
new_end = std::max(overlap->GetEnd(), new_end);
|
||||
write_inheritance |= overlap->IsWritten();
|
||||
modified_inheritance |= overlap->IsModified();
|
||||
for (MapInterval* overlap : overlaps) {
|
||||
new_start = std::min(overlap->start, new_start);
|
||||
new_end = std::max(overlap->end, new_end);
|
||||
write_inheritance |= overlap->is_written;
|
||||
modified_inheritance |= overlap->is_modified;
|
||||
}
|
||||
GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
|
||||
for (auto& overlap : overlaps) {
|
||||
Unregister(overlap);
|
||||
}
|
||||
UpdateBlock(block, new_start, new_end, overlaps);
|
||||
MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
|
||||
|
||||
const MapInterval new_map{new_start, new_end, new_gpu_addr};
|
||||
MapInterval* const map = Register(new_map, write_inheritance);
|
||||
if (!map) {
|
||||
return nullptr;
|
||||
}
|
||||
if (modified_inheritance) {
|
||||
new_map->MarkAsModified(true, GetModifiedTicks());
|
||||
map->MarkAsModified(true, GetModifiedTicks());
|
||||
if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
|
||||
MarkForAsyncFlush(new_map);
|
||||
MarkForAsyncFlush(map);
|
||||
}
|
||||
}
|
||||
Register(new_map, write_inheritance);
|
||||
return new_map;
|
||||
return map;
|
||||
}
|
||||
|
||||
void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
|
||||
std::vector<MapInterval>& overlaps) {
|
||||
void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
|
||||
const VectorMapInterval& overlaps) {
|
||||
const IntervalType base_interval{start, end};
|
||||
IntervalSet interval_set{};
|
||||
interval_set.add(base_interval);
|
||||
for (auto& overlap : overlaps) {
|
||||
const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
|
||||
const IntervalType subtract{overlap->start, overlap->end};
|
||||
interval_set.subtract(subtract);
|
||||
}
|
||||
for (auto& interval : interval_set) {
|
||||
std::size_t size = interval.upper() - interval.lower();
|
||||
if (size > 0) {
|
||||
staging_buffer.resize(size);
|
||||
system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
|
||||
UploadBlockData(block, block->GetOffset(interval.lower()), size,
|
||||
staging_buffer.data());
|
||||
const std::size_t size = interval.upper() - interval.lower();
|
||||
if (size == 0) {
|
||||
continue;
|
||||
}
|
||||
staging_buffer.resize(size);
|
||||
system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
|
||||
UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) {
|
||||
VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
|
||||
VectorMapInterval result;
|
||||
if (size == 0) {
|
||||
return {};
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<MapInterval> objects{};
|
||||
const IntervalType interval{addr, addr + size};
|
||||
for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
|
||||
objects.push_back(pair.second);
|
||||
const VAddr addr_end = addr + size;
|
||||
auto it = mapped_addresses.lower_bound(addr);
|
||||
if (it != mapped_addresses.begin()) {
|
||||
--it;
|
||||
}
|
||||
|
||||
return objects;
|
||||
while (it != mapped_addresses.end() && it->start < addr_end) {
|
||||
if (it->Overlaps(addr, addr_end)) {
|
||||
result.push_back(&*it);
|
||||
}
|
||||
++it;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Returns a ticks counter used for tracking when cached objects were last modified
|
||||
@@ -405,20 +425,24 @@ private:
|
||||
return ++modified_ticks;
|
||||
}
|
||||
|
||||
void FlushMap(MapInterval map) {
|
||||
std::size_t size = map->GetEnd() - map->GetStart();
|
||||
OwnerBuffer block = blocks[map->GetStart() >> block_page_bits];
|
||||
void FlushMap(MapInterval* map) {
|
||||
const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
|
||||
ASSERT_OR_EXECUTE(it != blocks.end(), return;);
|
||||
|
||||
std::shared_ptr<Buffer> block = it->second;
|
||||
|
||||
const std::size_t size = map->end - map->start;
|
||||
staging_buffer.resize(size);
|
||||
DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data());
|
||||
system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size);
|
||||
DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());
|
||||
system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
|
||||
map->MarkAsModified(false, 0);
|
||||
}
|
||||
|
||||
BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
|
||||
std::size_t alignment) {
|
||||
template <typename Callable>
|
||||
BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
|
||||
AlignBuffer(alignment);
|
||||
const std::size_t uploaded_offset = buffer_offset;
|
||||
std::memcpy(buffer_ptr, raw_pointer, size);
|
||||
callable(buffer_ptr);
|
||||
|
||||
buffer_ptr += size;
|
||||
buffer_offset += size;
|
||||
@@ -432,97 +456,89 @@ private:
|
||||
buffer_offset = offset_aligned;
|
||||
}
|
||||
|
||||
OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
|
||||
const std::size_t old_size = buffer->GetSize();
|
||||
const std::size_t new_size = old_size + block_page_size;
|
||||
const VAddr cpu_addr = buffer->GetCpuAddr();
|
||||
OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
|
||||
CopyBlock(buffer, new_buffer, 0, 0, old_size);
|
||||
buffer->SetEpoch(epoch);
|
||||
pending_destruction.push_back(buffer);
|
||||
std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
|
||||
const std::size_t old_size = buffer->Size();
|
||||
const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
|
||||
const VAddr cpu_addr = buffer->CpuAddr();
|
||||
std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
|
||||
CopyBlock(*buffer, *new_buffer, 0, 0, old_size);
|
||||
QueueDestruction(std::move(buffer));
|
||||
|
||||
const VAddr cpu_addr_end = cpu_addr + new_size - 1;
|
||||
u64 page_start = cpu_addr >> block_page_bits;
|
||||
const u64 page_end = cpu_addr_end >> block_page_bits;
|
||||
while (page_start <= page_end) {
|
||||
blocks[page_start] = new_buffer;
|
||||
++page_start;
|
||||
const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
|
||||
for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
|
||||
blocks.insert_or_assign(page_start, new_buffer);
|
||||
}
|
||||
|
||||
return new_buffer;
|
||||
}
|
||||
|
||||
OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
|
||||
const std::size_t size_1 = first->GetSize();
|
||||
const std::size_t size_2 = second->GetSize();
|
||||
const VAddr first_addr = first->GetCpuAddr();
|
||||
const VAddr second_addr = second->GetCpuAddr();
|
||||
std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
|
||||
std::shared_ptr<Buffer> second) {
|
||||
const std::size_t size_1 = first->Size();
|
||||
const std::size_t size_2 = second->Size();
|
||||
const VAddr first_addr = first->CpuAddr();
|
||||
const VAddr second_addr = second->CpuAddr();
|
||||
const VAddr new_addr = std::min(first_addr, second_addr);
|
||||
const std::size_t new_size = size_1 + size_2;
|
||||
OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
|
||||
CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
|
||||
CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
|
||||
first->SetEpoch(epoch);
|
||||
second->SetEpoch(epoch);
|
||||
pending_destruction.push_back(first);
|
||||
pending_destruction.push_back(second);
|
||||
|
||||
std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
|
||||
CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1);
|
||||
CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2);
|
||||
QueueDestruction(std::move(first));
|
||||
QueueDestruction(std::move(second));
|
||||
|
||||
const VAddr cpu_addr_end = new_addr + new_size - 1;
|
||||
u64 page_start = new_addr >> block_page_bits;
|
||||
const u64 page_end = cpu_addr_end >> block_page_bits;
|
||||
while (page_start <= page_end) {
|
||||
blocks[page_start] = new_buffer;
|
||||
++page_start;
|
||||
const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
|
||||
for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
|
||||
blocks.insert_or_assign(page_start, new_buffer);
|
||||
}
|
||||
return new_buffer;
|
||||
}
|
||||
|
||||
OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
|
||||
OwnerBuffer found;
|
||||
Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
|
||||
std::shared_ptr<Buffer> found;
|
||||
|
||||
const VAddr cpu_addr_end = cpu_addr + size - 1;
|
||||
u64 page_start = cpu_addr >> block_page_bits;
|
||||
const u64 page_end = cpu_addr_end >> block_page_bits;
|
||||
while (page_start <= page_end) {
|
||||
const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
|
||||
for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
|
||||
auto it = blocks.find(page_start);
|
||||
if (it == blocks.end()) {
|
||||
if (found) {
|
||||
found = EnlargeBlock(found);
|
||||
} else {
|
||||
const VAddr start_addr = (page_start << block_page_bits);
|
||||
found = CreateBlock(start_addr, block_page_size);
|
||||
blocks[page_start] = found;
|
||||
}
|
||||
} else {
|
||||
if (found) {
|
||||
if (found == it->second) {
|
||||
++page_start;
|
||||
continue;
|
||||
}
|
||||
found = MergeBlocks(found, it->second);
|
||||
} else {
|
||||
found = it->second;
|
||||
continue;
|
||||
}
|
||||
const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
|
||||
found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
|
||||
blocks.insert_or_assign(page_start, found);
|
||||
continue;
|
||||
}
|
||||
if (!found) {
|
||||
found = it->second;
|
||||
continue;
|
||||
}
|
||||
if (found != it->second) {
|
||||
found = MergeBlocks(std::move(found), it->second);
|
||||
}
|
||||
++page_start;
|
||||
}
|
||||
return found;
|
||||
return found.get();
|
||||
}
|
||||
|
||||
void MarkRegionAsWritten(const VAddr start, const VAddr end) {
|
||||
u64 page_start = start >> write_page_bit;
|
||||
const u64 page_end = end >> write_page_bit;
|
||||
while (page_start <= page_end) {
|
||||
void MarkRegionAsWritten(VAddr start, VAddr end) {
|
||||
const u64 page_end = end >> WRITE_PAGE_BIT;
|
||||
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
|
||||
auto it = written_pages.find(page_start);
|
||||
if (it != written_pages.end()) {
|
||||
it->second = it->second + 1;
|
||||
} else {
|
||||
written_pages[page_start] = 1;
|
||||
written_pages.insert_or_assign(page_start, 1);
|
||||
}
|
||||
page_start++;
|
||||
}
|
||||
}
|
||||
|
||||
void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
|
||||
u64 page_start = start >> write_page_bit;
|
||||
const u64 page_end = end >> write_page_bit;
|
||||
while (page_start <= page_end) {
|
||||
void UnmarkRegionAsWritten(VAddr start, VAddr end) {
|
||||
const u64 page_end = end >> WRITE_PAGE_BIT;
|
||||
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
|
||||
auto it = written_pages.find(page_start);
|
||||
if (it != written_pages.end()) {
|
||||
if (it->second > 1) {
|
||||
@@ -531,25 +547,27 @@ private:
|
||||
written_pages.erase(it);
|
||||
}
|
||||
}
|
||||
page_start++;
|
||||
}
|
||||
}
|
||||
|
||||
bool IsRegionWritten(const VAddr start, const VAddr end) const {
|
||||
u64 page_start = start >> write_page_bit;
|
||||
const u64 page_end = end >> write_page_bit;
|
||||
while (page_start <= page_end) {
|
||||
bool IsRegionWritten(VAddr start, VAddr end) const {
|
||||
const u64 page_end = end >> WRITE_PAGE_BIT;
|
||||
for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
|
||||
if (written_pages.count(page_start) > 0) {
|
||||
return true;
|
||||
}
|
||||
page_start++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void MarkForAsyncFlush(MapInterval& map) {
|
||||
/// Stamps buffer with the cache's current epoch and hands it over to
/// pending_destruction. TickFrame later pops entries from that queue once
/// their recorded epoch is old enough.
void QueueDestruction(std::shared_ptr<Buffer> buffer) {
    // Record the epoch first: the pointer is moved-from after the push below.
    buffer->SetEpoch(epoch);
    pending_destruction.push(std::move(buffer));
}
|
||||
|
||||
void MarkForAsyncFlush(MapInterval* map) {
|
||||
if (!uncommitted_flushes) {
|
||||
uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>();
|
||||
uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
|
||||
}
|
||||
uncommitted_flushes->insert(map);
|
||||
}
|
||||
@@ -558,7 +576,7 @@ private:
|
||||
Core::System& system;
|
||||
|
||||
std::unique_ptr<StreamBuffer> stream_buffer;
|
||||
BufferType stream_buffer_handle{};
|
||||
BufferType stream_buffer_handle;
|
||||
|
||||
bool invalidated = false;
|
||||
|
||||
@@ -566,27 +584,23 @@ private:
|
||||
u64 buffer_offset = 0;
|
||||
u64 buffer_offset_base = 0;
|
||||
|
||||
using IntervalSet = boost::icl::interval_set<VAddr>;
|
||||
using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>;
|
||||
using IntervalType = typename IntervalCache::interval_type;
|
||||
IntervalCache mapped_addresses;
|
||||
MapIntervalAllocator mapped_addresses_allocator;
|
||||
boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
|
||||
mapped_addresses;
|
||||
|
||||
static constexpr u64 write_page_bit = 11;
|
||||
std::unordered_map<u64, u32> written_pages;
|
||||
std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
|
||||
|
||||
static constexpr u64 block_page_bits = 21;
|
||||
static constexpr u64 block_page_size = 1ULL << block_page_bits;
|
||||
std::unordered_map<u64, OwnerBuffer> blocks;
|
||||
|
||||
std::list<OwnerBuffer> pending_destruction;
|
||||
std::queue<std::shared_ptr<Buffer>> pending_destruction;
|
||||
u64 epoch = 0;
|
||||
u64 modified_ticks = 0;
|
||||
|
||||
std::vector<u8> staging_buffer;
|
||||
std::list<MapInterval> marked_for_unregister;
|
||||
|
||||
std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{};
|
||||
std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes;
|
||||
std::list<MapInterval*> marked_for_unregister;
|
||||
|
||||
std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
|
||||
std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
|
||||
|
||||
std::recursive_mutex mutex;
|
||||
};
|
||||
|
||||
33
src/video_core/buffer_cache/map_interval.cpp
Normal file
33
src/video_core/buffer_cache/map_interval.cpp
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
|
||||
#include "video_core/buffer_cache/map_interval.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
/// Seeds the free list with the entries of first_chunk.
MapIntervalAllocator::MapIntervalAllocator() {
    FillFreeList(first_chunk);
}
|
||||
|
||||
// Defaulted destructor, defined out of line in this source file.
MapIntervalAllocator::~MapIntervalAllocator() = default;
|
||||
|
||||
/// Creates a fresh Chunk in the slot that new_chunk points at, adds the chunk's
/// entries to the free list, and then advances new_chunk to the `next` member of
/// the chunk that was just created.
void MapIntervalAllocator::AllocateNewChunk() {
    auto& slot = *new_chunk;
    slot = std::make_unique<Chunk>();
    FillFreeList(*slot);
    new_chunk = &slot->next;
}
|
||||
|
||||
/// Appends one pointer per element of chunk.data to free_list.
/// The elements are visited from the back of chunk.data to the front, so the
/// pointer to the chunk's first element ends up last in free_list.
// NOTE(review): presumably free_list is consumed from the back so that entries
// are handed out in ascending order - confirm against Allocate().
void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
    const std::size_t first_new_slot = free_list.size();
    free_list.resize(first_new_slot + chunk.data.size());

    auto out = free_list.begin() + first_new_slot;
    for (auto it = chunk.data.rbegin(); it != chunk.data.rend(); ++it) {
        *out = &*it;
        ++out;
    }
}
|
||||
|
||||
} // namespace VideoCommon
|
||||
@@ -4,104 +4,89 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/intrusive/set_hook.hpp>
|
||||
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/gpu.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
class MapIntervalBase {
|
||||
public:
|
||||
MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
|
||||
: start{start}, end{end}, gpu_addr{gpu_addr} {}
|
||||
struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
|
||||
MapInterval() = default;
|
||||
|
||||
void SetCpuAddress(VAddr new_cpu_addr) {
|
||||
cpu_addr = new_cpu_addr;
|
||||
/*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
|
||||
|
||||
explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
|
||||
: start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
|
||||
|
||||
bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
|
||||
return start <= other_start && other_end <= end;
|
||||
}
|
||||
|
||||
VAddr GetCpuAddress() const {
|
||||
return cpu_addr;
|
||||
bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
|
||||
return start < other_end && other_start < end;
|
||||
}
|
||||
|
||||
GPUVAddr GetGpuAddress() const {
|
||||
return gpu_addr;
|
||||
}
|
||||
|
||||
bool IsInside(const VAddr other_start, const VAddr other_end) const {
|
||||
return (start <= other_start && other_end <= end);
|
||||
}
|
||||
|
||||
bool operator==(const MapIntervalBase& rhs) const {
|
||||
return std::tie(start, end) == std::tie(rhs.start, rhs.end);
|
||||
}
|
||||
|
||||
bool operator!=(const MapIntervalBase& rhs) const {
|
||||
return !operator==(rhs);
|
||||
}
|
||||
|
||||
void MarkAsRegistered(const bool registered) {
|
||||
is_registered = registered;
|
||||
}
|
||||
|
||||
bool IsRegistered() const {
|
||||
return is_registered;
|
||||
}
|
||||
|
||||
void SetMemoryMarked(bool is_memory_marked_) {
|
||||
is_memory_marked = is_memory_marked_;
|
||||
}
|
||||
|
||||
bool IsMemoryMarked() const {
|
||||
return is_memory_marked;
|
||||
}
|
||||
|
||||
void SetSyncPending(bool is_sync_pending_) {
|
||||
is_sync_pending = is_sync_pending_;
|
||||
}
|
||||
|
||||
bool IsSyncPending() const {
|
||||
return is_sync_pending;
|
||||
}
|
||||
|
||||
VAddr GetStart() const {
|
||||
return start;
|
||||
}
|
||||
|
||||
VAddr GetEnd() const {
|
||||
return end;
|
||||
}
|
||||
|
||||
void MarkAsModified(const bool is_modified_, const u64 tick) {
|
||||
void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
|
||||
is_modified = is_modified_;
|
||||
ticks = tick;
|
||||
ticks = ticks_;
|
||||
}
|
||||
|
||||
bool IsModified() const {
|
||||
return is_modified;
|
||||
boost::intrusive::set_member_hook<> member_hook_;
|
||||
VAddr start = 0;
|
||||
VAddr end = 0;
|
||||
GPUVAddr gpu_addr = 0;
|
||||
u64 ticks = 0;
|
||||
bool is_written = false;
|
||||
bool is_modified = false;
|
||||
bool is_registered = false;
|
||||
bool is_memory_marked = false;
|
||||
bool is_sync_pending = false;
|
||||
};
|
||||
|
||||
// Strict ordering of map intervals by their start address only; `end` and the other fields do not
// take part in the comparison.
struct MapIntervalCompare {
|
||||
// Returns true when lhs starts at a lower address than rhs.
constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
|
||||
return lhs.start < rhs.start;
|
||||
}
|
||||
};
|
||||
|
||||
class MapIntervalAllocator {
|
||||
public:
|
||||
MapIntervalAllocator();
|
||||
~MapIntervalAllocator();
|
||||
|
||||
MapInterval* Allocate() {
|
||||
if (free_list.empty()) {
|
||||
AllocateNewChunk();
|
||||
}
|
||||
MapInterval* const interval = free_list.back();
|
||||
free_list.pop_back();
|
||||
return interval;
|
||||
}
|
||||
|
||||
u64 GetModificationTick() const {
|
||||
return ticks;
|
||||
}
|
||||
|
||||
void MarkAsWritten(const bool is_written_) {
|
||||
is_written = is_written_;
|
||||
}
|
||||
|
||||
bool IsWritten() const {
|
||||
return is_written;
|
||||
void Release(MapInterval* interval) {
|
||||
free_list.push_back(interval);
|
||||
}
|
||||
|
||||
private:
|
||||
VAddr start;
|
||||
VAddr end;
|
||||
GPUVAddr gpu_addr;
|
||||
VAddr cpu_addr{};
|
||||
bool is_written{};
|
||||
bool is_modified{};
|
||||
bool is_registered{};
|
||||
bool is_memory_marked{};
|
||||
bool is_sync_pending{};
|
||||
u64 ticks{};
|
||||
struct Chunk {
|
||||
std::unique_ptr<Chunk> next;
|
||||
std::array<MapInterval, 0x8000> data;
|
||||
};
|
||||
|
||||
void AllocateNewChunk();
|
||||
|
||||
void FillFreeList(Chunk& chunk);
|
||||
|
||||
std::vector<MapInterval*> free_list;
|
||||
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
|
||||
|
||||
Chunk first_chunk;
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
||||
|
||||
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
|
||||
Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
|
||||
MemoryManager& memory_manager)
|
||||
: system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
|
||||
macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
|
||||
macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
|
||||
dirty.flags.flip();
|
||||
|
||||
InitializeRegisterDefaults();
|
||||
}
|
||||
|
||||
@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
|
||||
regs.rasterize_enable = 1;
|
||||
regs.rt_separate_frag_data = 1;
|
||||
regs.framebuffer_srgb = 1;
|
||||
regs.line_width_aliased = 1.0f;
|
||||
regs.line_width_smooth = 1.0f;
|
||||
regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
|
||||
regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
|
||||
regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
|
||||
|
||||
shadow_state = regs;
|
||||
|
||||
@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
|
||||
mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
|
||||
}
|
||||
|
||||
void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
|
||||
void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
|
||||
// Reset the current macro.
|
||||
executing_macro = 0;
|
||||
|
||||
@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
|
||||
((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
|
||||
|
||||
// Execute the current macro.
|
||||
macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
|
||||
macro_engine->Execute(macro_positions[entry], parameters);
|
||||
if (mme_draw.current_mode != MMEDrawMode::Undefined) {
|
||||
FlushMMEInlineDraw();
|
||||
}
|
||||
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
|
||||
|
||||
// Call the macro when there are no more parameters in the command buffer
|
||||
if (is_last_call) {
|
||||
CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
|
||||
CallMacroMethod(executing_macro, macro_params);
|
||||
macro_params.clear();
|
||||
}
|
||||
return;
|
||||
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
|
||||
break;
|
||||
}
|
||||
case MAXWELL3D_REG_INDEX(macros.data): {
|
||||
ProcessMacroUpload(arg);
|
||||
macro_engine->AddCode(regs.macros.upload_address, arg);
|
||||
break;
|
||||
}
|
||||
case MAXWELL3D_REG_INDEX(macros.bind): {
|
||||
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
|
||||
|
||||
// Call the macro when there are no more parameters in the command buffer
|
||||
if (amount == methods_pending) {
|
||||
CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
|
||||
CallMacroMethod(executing_macro, macro_params);
|
||||
macro_params.clear();
|
||||
}
|
||||
return;
|
||||
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
|
||||
}
|
||||
|
||||
void Maxwell3D::ProcessMacroUpload(u32 data) {
|
||||
ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
|
||||
"upload_address exceeded macro_memory size!");
|
||||
macro_memory[regs.macros.upload_address++] = data;
|
||||
macro_engine->AddCode(regs.macros.upload_address++, data);
|
||||
}
|
||||
|
||||
void Maxwell3D::ProcessMacroBind(u32 data) {
|
||||
@@ -457,8 +458,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
|
||||
|
||||
void Maxwell3D::ProcessQueryGet() {
|
||||
// TODO(Subv): Support the other query units.
|
||||
ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
|
||||
"Units other than CROP are unimplemented");
|
||||
if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
|
||||
LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
|
||||
}
|
||||
|
||||
switch (regs.query.query_get.operation) {
|
||||
case Regs::QueryOperation::Release:
|
||||
@@ -534,8 +536,8 @@ void Maxwell3D::ProcessCounterReset() {
|
||||
rasterizer.ResetCounter(QueryType::SamplesPassed);
|
||||
break;
|
||||
default:
|
||||
LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
|
||||
static_cast<int>(regs.counter_reset));
|
||||
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
|
||||
static_cast<int>(regs.counter_reset));
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -592,8 +594,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
|
||||
system.GPU().GetTicks());
|
||||
return {};
|
||||
default:
|
||||
UNIMPLEMENTED_MSG("Unimplemented query select type {}",
|
||||
static_cast<u32>(regs.query.query_get.select.Value()));
|
||||
LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
|
||||
static_cast<u32>(regs.query.query_get.select.Value()));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#include "video_core/engines/engine_upload.h"
|
||||
#include "video_core/engines/shader_type.h"
|
||||
#include "video_core/gpu.h"
|
||||
#include "video_core/macro_interpreter.h"
|
||||
#include "video_core/macro/macro.h"
|
||||
#include "video_core/textures/texture.h"
|
||||
|
||||
namespace Core {
|
||||
@@ -1411,15 +1411,6 @@ public:
|
||||
|
||||
const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
|
||||
|
||||
/// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
|
||||
/// we've seen used.
|
||||
using MacroMemory = std::array<u32, 0x40000>;
|
||||
|
||||
/// Gets a reference to macro memory.
|
||||
const MacroMemory& GetMacroMemory() const {
|
||||
return macro_memory;
|
||||
}
|
||||
|
||||
bool ShouldExecute() const {
|
||||
return execute_on;
|
||||
}
|
||||
@@ -1468,16 +1459,13 @@ private:
|
||||
|
||||
std::array<bool, Regs::NUM_REGS> mme_inline{};
|
||||
|
||||
/// Memory for macro code
|
||||
MacroMemory macro_memory;
|
||||
|
||||
/// Macro method that is currently being executed / being fed parameters.
|
||||
u32 executing_macro = 0;
|
||||
/// Parameters that have been submitted to the macro call so far.
|
||||
std::vector<u32> macro_params;
|
||||
|
||||
/// Interpreter for the macro codes uploaded to the GPU.
|
||||
MacroInterpreter macro_interpreter;
|
||||
std::unique_ptr<MacroEngine> macro_engine;
|
||||
|
||||
static constexpr u32 null_cb_data = 0xFFFFFFFF;
|
||||
struct {
|
||||
@@ -1506,7 +1494,7 @@ private:
|
||||
* @param num_parameters Number of arguments
|
||||
* @param parameters Arguments to the method call
|
||||
*/
|
||||
void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
|
||||
void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
|
||||
|
||||
/// Handles writes to the macro uploading register.
|
||||
void ProcessMacroUpload(u32 data);
|
||||
|
||||
45
src/video_core/macro/macro.cpp
Normal file
45
src/video_core/macro/macro.cpp
Normal file
@@ -0,0 +1,45 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "core/settings.h"
|
||||
#include "video_core/macro/macro.h"
|
||||
#include "video_core/macro/macro_interpreter.h"
|
||||
#include "video_core/macro/macro_jit_x64.h"
|
||||
|
||||
namespace Tegra {
|
||||
|
||||
// Appends one 32-bit word of macro code to the buffer kept for `method`. The buffer is created
// empty on the first call for a given `method` (unordered_map::operator[]).
void MacroEngine::AddCode(u32 method, u32 data) {
|
||||
uploaded_macro_code[method].push_back(data);
|
||||
}
|
||||
|
||||
/**
 * Runs the macro registered under `method`, compiling it on first use.
 *
 * @param method     Key the macro code was stored under with AddCode.
 * @param parameters Parameters forwarded unchanged to the compiled macro.
 *
 * If no code was uploaded for `method`, the call reports the error through UNREACHABLE_MSG and
 * returns without executing anything.
 */
void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
    // Fast path: the macro has already been compiled.
    const auto cached = macro_cache.find(method);
    if (cached != macro_cache.end()) {
        cached->second->Execute(parameters, method);
        return;
    }

    // Macro not compiled, check if it's uploaded and if so, compile it
    const auto uploaded = uploaded_macro_code.find(method);
    if (uploaded == uploaded_macro_code.end()) {
        UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
        return;
    }

    // Compile() runs before emplace() touches the cache, so the cache never holds a null entry,
    // and the returned iterator saves hashing `method` again for the call below.
    const auto inserted = macro_cache.emplace(method, Compile(uploaded->second)).first;
    inserted->second->Execute(parameters, method);
}
|
||||
|
||||
/// Creates the macro engine for `maxwell3d`: the x86-64 JIT when the build targets x86-64 and the
/// JIT has not been disabled in the settings, the interpreter in every other case.
std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
#ifdef ARCHITECTURE_x86_64
    if (!Settings::values.disable_macro_jit) {
        return std::make_unique<MacroJITx64>(maxwell3d);
    }
#endif
    return std::make_unique<MacroInterpreter>(maxwell3d);
}
|
||||
|
||||
} // namespace Tegra
|
||||
128
src/video_core/macro/macro.h
Normal file
128
src/video_core/macro/macro.h
Normal file
@@ -0,0 +1,128 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "common/bit_field.h"
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace Tegra {
|
||||
namespace Engines {
|
||||
class Maxwell3D;
|
||||
}
|
||||
namespace Macro {
|
||||
constexpr std::size_t NUM_MACRO_REGISTERS = 8;
|
||||
enum class Operation : u32 {
|
||||
ALU = 0,
|
||||
AddImmediate = 1,
|
||||
ExtractInsert = 2,
|
||||
ExtractShiftLeftImmediate = 3,
|
||||
ExtractShiftLeftRegister = 4,
|
||||
Read = 5,
|
||||
Unused = 6, // This operation doesn't seem to be a valid encoding.
|
||||
Branch = 7,
|
||||
};
|
||||
|
||||
enum class ALUOperation : u32 {
|
||||
Add = 0,
|
||||
AddWithCarry = 1,
|
||||
Subtract = 2,
|
||||
SubtractWithBorrow = 3,
|
||||
// Operations 4-7 don't seem to be valid encodings.
|
||||
Xor = 8,
|
||||
Or = 9,
|
||||
And = 10,
|
||||
AndNot = 11,
|
||||
Nand = 12
|
||||
};
|
||||
|
||||
enum class ResultOperation : u32 {
|
||||
IgnoreAndFetch = 0,
|
||||
Move = 1,
|
||||
MoveAndSetMethod = 2,
|
||||
FetchAndSend = 3,
|
||||
MoveAndSend = 4,
|
||||
FetchAndSetMethod = 5,
|
||||
MoveAndSetMethodFetchAndSend = 6,
|
||||
MoveAndSetMethodSend = 7
|
||||
};
|
||||
|
||||
enum class BranchCondition : u32 {
|
||||
Zero = 0,
|
||||
NotZero = 1,
|
||||
};
|
||||
|
||||
union Opcode {
|
||||
u32 raw;
|
||||
BitField<0, 3, Operation> operation;
|
||||
BitField<4, 3, ResultOperation> result_operation;
|
||||
BitField<4, 1, BranchCondition> branch_condition;
|
||||
// If set on a branch, then the branch doesn't have a delay slot.
|
||||
BitField<5, 1, u32> branch_annul;
|
||||
BitField<7, 1, u32> is_exit;
|
||||
BitField<8, 3, u32> dst;
|
||||
BitField<11, 3, u32> src_a;
|
||||
BitField<14, 3, u32> src_b;
|
||||
// The signed immediate overlaps the second source operand and the alu operation.
|
||||
BitField<14, 18, s32> immediate;
|
||||
|
||||
BitField<17, 5, ALUOperation> alu_operation;
|
||||
|
||||
// Bitfield instructions data
|
||||
BitField<17, 5, u32> bf_src_bit;
|
||||
BitField<22, 5, u32> bf_size;
|
||||
BitField<27, 5, u32> bf_dst_bit;
|
||||
|
||||
u32 GetBitfieldMask() const {
|
||||
return (1 << bf_size) - 1;
|
||||
}
|
||||
|
||||
s32 GetBranchTarget() const {
|
||||
return static_cast<s32>(immediate * sizeof(u32));
|
||||
}
|
||||
};
|
||||
|
||||
union MethodAddress {
|
||||
u32 raw;
|
||||
BitField<0, 12, u32> address;
|
||||
BitField<12, 6, u32> increment;
|
||||
};
|
||||
|
||||
} // namespace Macro
|
||||
|
||||
/// Interface of a macro that has been prepared for execution. Instances are produced by
/// MacroEngine::Compile and kept in the engine's cache.
class CachedMacro {
|
||||
public:
|
||||
virtual ~CachedMacro() = default;
|
||||
/**
|
||||
* Executes the macro code with the specified input parameters.
|
||||
* @param parameters The parameters of the macro
|
||||
* @param method The key MacroEngine::Execute looked this macro up with
|
||||
*/
|
||||
virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
|
||||
};
|
||||
|
||||
/// Base class of the macro execution backends. It stores uploaded macro code, compiles a macro
/// through the backend-specific Compile() the first time it is executed and caches the result.
class MacroEngine {
|
||||
public:
|
||||
virtual ~MacroEngine() = default;
|
||||
|
||||
// Stores one word of uploaded macro code under `method` so that the macro can be compiled
// when it is executed.
void AddCode(u32 method, u32 data);
|
||||
|
||||
// Compiles the macro stored under `method` if it's not in the cache yet, then executes the
// compiled macro with `parameters`.
void Execute(u32 method, const std::vector<u32>& parameters);
|
||||
|
||||
protected:
|
||||
// Turns the uploaded code words of one macro into an executable CachedMacro.
virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
|
||||
|
||||
private:
|
||||
// Compiled macros, keyed by the `method` value they were executed with.
std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
|
||||
// Code words collected by AddCode, keyed by the `method` value they were uploaded under.
std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
|
||||
};
|
||||
|
||||
std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
|
||||
|
||||
} // namespace Tegra
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright 2018 yuzu Emulator Project
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
@@ -6,109 +6,46 @@
|
||||
#include "common/logging/log.h"
|
||||
#include "common/microprofile.h"
|
||||
#include "video_core/engines/maxwell_3d.h"
|
||||
#include "video_core/macro_interpreter.h"
|
||||
#include "video_core/macro/macro_interpreter.h"
|
||||
|
||||
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
|
||||
|
||||
namespace Tegra {
|
||||
namespace {
|
||||
enum class Operation : u32 {
|
||||
ALU = 0,
|
||||
AddImmediate = 1,
|
||||
ExtractInsert = 2,
|
||||
ExtractShiftLeftImmediate = 3,
|
||||
ExtractShiftLeftRegister = 4,
|
||||
Read = 5,
|
||||
Unused = 6, // This operation doesn't seem to be a valid encoding.
|
||||
Branch = 7,
|
||||
};
|
||||
} // Anonymous namespace
|
||||
|
||||
enum class MacroInterpreter::ALUOperation : u32 {
|
||||
Add = 0,
|
||||
AddWithCarry = 1,
|
||||
Subtract = 2,
|
||||
SubtractWithBorrow = 3,
|
||||
// Operations 4-7 don't seem to be valid encodings.
|
||||
Xor = 8,
|
||||
Or = 9,
|
||||
And = 10,
|
||||
AndNot = 11,
|
||||
Nand = 12
|
||||
};
|
||||
|
||||
enum class MacroInterpreter::ResultOperation : u32 {
|
||||
IgnoreAndFetch = 0,
|
||||
Move = 1,
|
||||
MoveAndSetMethod = 2,
|
||||
FetchAndSend = 3,
|
||||
MoveAndSend = 4,
|
||||
FetchAndSetMethod = 5,
|
||||
MoveAndSetMethodFetchAndSend = 6,
|
||||
MoveAndSetMethodSend = 7
|
||||
};
|
||||
|
||||
enum class MacroInterpreter::BranchCondition : u32 {
|
||||
Zero = 0,
|
||||
NotZero = 1,
|
||||
};
|
||||
|
||||
union MacroInterpreter::Opcode {
|
||||
u32 raw;
|
||||
BitField<0, 3, Operation> operation;
|
||||
BitField<4, 3, ResultOperation> result_operation;
|
||||
BitField<4, 1, BranchCondition> branch_condition;
|
||||
// If set on a branch, then the branch doesn't have a delay slot.
|
||||
BitField<5, 1, u32> branch_annul;
|
||||
BitField<7, 1, u32> is_exit;
|
||||
BitField<8, 3, u32> dst;
|
||||
BitField<11, 3, u32> src_a;
|
||||
BitField<14, 3, u32> src_b;
|
||||
// The signed immediate overlaps the second source operand and the alu operation.
|
||||
BitField<14, 18, s32> immediate;
|
||||
|
||||
BitField<17, 5, ALUOperation> alu_operation;
|
||||
|
||||
// Bitfield instructions data
|
||||
BitField<17, 5, u32> bf_src_bit;
|
||||
BitField<22, 5, u32> bf_size;
|
||||
BitField<27, 5, u32> bf_dst_bit;
|
||||
|
||||
u32 GetBitfieldMask() const {
|
||||
return (1 << bf_size) - 1;
|
||||
}
|
||||
|
||||
s32 GetBranchTarget() const {
|
||||
return static_cast<s32>(immediate * sizeof(u32));
|
||||
}
|
||||
};
|
||||
|
||||
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
|
||||
|
||||
void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
|
||||
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
|
||||
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
|
||||
}
|
||||
|
||||
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
|
||||
const std::vector<u32>& code)
|
||||
: maxwell3d(maxwell3d), code(code) {}
|
||||
|
||||
void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
|
||||
MICROPROFILE_SCOPE(MacroInterp);
|
||||
Reset();
|
||||
|
||||
registers[1] = parameters[0];
|
||||
num_parameters = parameters.size();
|
||||
|
||||
if (num_parameters > parameters_capacity) {
|
||||
parameters_capacity = num_parameters;
|
||||
this->parameters = std::make_unique<u32[]>(num_parameters);
|
||||
}
|
||||
std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
|
||||
std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
|
||||
this->num_parameters = num_parameters;
|
||||
|
||||
// Execute the code until we hit an exit condition.
|
||||
bool keep_executing = true;
|
||||
while (keep_executing) {
|
||||
keep_executing = Step(offset, false);
|
||||
keep_executing = Step(false);
|
||||
}
|
||||
|
||||
// Assert the the macro used all the input parameters
|
||||
ASSERT(next_parameter_index == num_parameters);
|
||||
}
|
||||
|
||||
void MacroInterpreter::Reset() {
|
||||
void MacroInterpreterImpl::Reset() {
|
||||
registers = {};
|
||||
pc = 0;
|
||||
delayed_pc = {};
|
||||
@@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {
|
||||
carry_flag = false;
|
||||
}
|
||||
|
||||
bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
bool MacroInterpreterImpl::Step(bool is_delay_slot) {
|
||||
u32 base_address = pc;
|
||||
|
||||
Opcode opcode = GetOpcode(offset);
|
||||
Macro::Opcode opcode = GetOpcode();
|
||||
pc += 4;
|
||||
|
||||
// Update the program counter if we were delayed
|
||||
@@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
}
|
||||
|
||||
switch (opcode.operation) {
|
||||
case Operation::ALU: {
|
||||
case Macro::Operation::ALU: {
|
||||
u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
|
||||
GetRegister(opcode.src_b));
|
||||
ProcessResult(opcode.result_operation, opcode.dst, result);
|
||||
break;
|
||||
}
|
||||
case Operation::AddImmediate: {
|
||||
case Macro::Operation::AddImmediate: {
|
||||
ProcessResult(opcode.result_operation, opcode.dst,
|
||||
GetRegister(opcode.src_a) + opcode.immediate);
|
||||
break;
|
||||
}
|
||||
case Operation::ExtractInsert: {
|
||||
case Macro::Operation::ExtractInsert: {
|
||||
u32 dst = GetRegister(opcode.src_a);
|
||||
u32 src = GetRegister(opcode.src_b);
|
||||
|
||||
@@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
ProcessResult(opcode.result_operation, opcode.dst, dst);
|
||||
break;
|
||||
}
|
||||
case Operation::ExtractShiftLeftImmediate: {
|
||||
case Macro::Operation::ExtractShiftLeftImmediate: {
|
||||
u32 dst = GetRegister(opcode.src_a);
|
||||
u32 src = GetRegister(opcode.src_b);
|
||||
|
||||
@@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
ProcessResult(opcode.result_operation, opcode.dst, result);
|
||||
break;
|
||||
}
|
||||
case Operation::ExtractShiftLeftRegister: {
|
||||
case Macro::Operation::ExtractShiftLeftRegister: {
|
||||
u32 dst = GetRegister(opcode.src_a);
|
||||
u32 src = GetRegister(opcode.src_b);
|
||||
|
||||
@@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
ProcessResult(opcode.result_operation, opcode.dst, result);
|
||||
break;
|
||||
}
|
||||
case Operation::Read: {
|
||||
case Macro::Operation::Read: {
|
||||
u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
|
||||
ProcessResult(opcode.result_operation, opcode.dst, result);
|
||||
break;
|
||||
}
|
||||
case Operation::Branch: {
|
||||
case Macro::Operation::Branch: {
|
||||
ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
|
||||
u32 value = GetRegister(opcode.src_a);
|
||||
bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
|
||||
@@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
|
||||
delayed_pc = base_address + opcode.GetBranchTarget();
|
||||
// Execute one more instruction due to the delay slot.
|
||||
return Step(offset, true);
|
||||
return Step(true);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
|
||||
// cause an exit if it's executed inside a delay slot.
|
||||
if (opcode.is_exit && !is_delay_slot) {
|
||||
// Exit has a delay slot, execute the next instruction
|
||||
Step(offset, true);
|
||||
Step(true);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
|
||||
const auto& macro_memory{maxwell3d.GetMacroMemory()};
|
||||
ASSERT((pc % sizeof(u32)) == 0);
|
||||
ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
|
||||
return {macro_memory[offset + pc / sizeof(u32)]};
|
||||
}
|
||||
|
||||
u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
|
||||
u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
|
||||
switch (operation) {
|
||||
case ALUOperation::Add: {
|
||||
case Macro::ALUOperation::Add: {
|
||||
const u64 result{static_cast<u64>(src_a) + src_b};
|
||||
carry_flag = result > 0xffffffff;
|
||||
return static_cast<u32>(result);
|
||||
}
|
||||
case ALUOperation::AddWithCarry: {
|
||||
case Macro::ALUOperation::AddWithCarry: {
|
||||
const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
|
||||
carry_flag = result > 0xffffffff;
|
||||
return static_cast<u32>(result);
|
||||
}
|
||||
case ALUOperation::Subtract: {
|
||||
case Macro::ALUOperation::Subtract: {
|
||||
const u64 result{static_cast<u64>(src_a) - src_b};
|
||||
carry_flag = result < 0x100000000;
|
||||
return static_cast<u32>(result);
|
||||
}
|
||||
case ALUOperation::SubtractWithBorrow: {
|
||||
case Macro::ALUOperation::SubtractWithBorrow: {
|
||||
const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
|
||||
carry_flag = result < 0x100000000;
|
||||
return static_cast<u32>(result);
|
||||
}
|
||||
case ALUOperation::Xor:
|
||||
case Macro::ALUOperation::Xor:
|
||||
return src_a ^ src_b;
|
||||
case ALUOperation::Or:
|
||||
case Macro::ALUOperation::Or:
|
||||
return src_a | src_b;
|
||||
case ALUOperation::And:
|
||||
case Macro::ALUOperation::And:
|
||||
return src_a & src_b;
|
||||
case ALUOperation::AndNot:
|
||||
case Macro::ALUOperation::AndNot:
|
||||
return src_a & ~src_b;
|
||||
case ALUOperation::Nand:
|
||||
case Macro::ALUOperation::Nand:
|
||||
return ~(src_a & src_b);
|
||||
|
||||
default:
|
||||
@@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
|
||||
}
|
||||
}
|
||||
|
||||
void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
|
||||
void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
|
||||
switch (operation) {
|
||||
case ResultOperation::IgnoreAndFetch:
|
||||
case Macro::ResultOperation::IgnoreAndFetch:
|
||||
// Fetch parameter and ignore result.
|
||||
SetRegister(reg, FetchParameter());
|
||||
break;
|
||||
case ResultOperation::Move:
|
||||
case Macro::ResultOperation::Move:
|
||||
// Move result.
|
||||
SetRegister(reg, result);
|
||||
break;
|
||||
case ResultOperation::MoveAndSetMethod:
|
||||
case Macro::ResultOperation::MoveAndSetMethod:
|
||||
// Move result and use as Method Address.
|
||||
SetRegister(reg, result);
|
||||
SetMethodAddress(result);
|
||||
break;
|
||||
case ResultOperation::FetchAndSend:
|
||||
case Macro::ResultOperation::FetchAndSend:
|
||||
// Fetch parameter and send result.
|
||||
SetRegister(reg, FetchParameter());
|
||||
Send(result);
|
||||
break;
|
||||
case ResultOperation::MoveAndSend:
|
||||
case Macro::ResultOperation::MoveAndSend:
|
||||
// Move and send result.
|
||||
SetRegister(reg, result);
|
||||
Send(result);
|
||||
break;
|
||||
case ResultOperation::FetchAndSetMethod:
|
||||
case Macro::ResultOperation::FetchAndSetMethod:
|
||||
// Fetch parameter and use result as Method Address.
|
||||
SetRegister(reg, FetchParameter());
|
||||
SetMethodAddress(result);
|
||||
break;
|
||||
case ResultOperation::MoveAndSetMethodFetchAndSend:
|
||||
case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
|
||||
// Move result and use as Method Address, then fetch and send parameter.
|
||||
SetRegister(reg, result);
|
||||
SetMethodAddress(result);
|
||||
Send(FetchParameter());
|
||||
break;
|
||||
case ResultOperation::MoveAndSetMethodSend:
|
||||
case Macro::ResultOperation::MoveAndSetMethodSend:
|
||||
// Move result and use as Method Address, then send bits 12:17 of result.
|
||||
SetRegister(reg, result);
|
||||
SetMethodAddress(result);
|
||||
@@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
|
||||
}
|
||||
}
|
||||
|
||||
u32 MacroInterpreter::FetchParameter() {
|
||||
ASSERT(next_parameter_index < num_parameters);
|
||||
return parameters[next_parameter_index++];
|
||||
bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
|
||||
switch (cond) {
|
||||
case Macro::BranchCondition::Zero:
|
||||
return value == 0;
|
||||
case Macro::BranchCondition::NotZero:
|
||||
return value != 0;
|
||||
}
|
||||
UNREACHABLE();
|
||||
return true;
|
||||
}
|
||||
|
||||
u32 MacroInterpreter::GetRegister(u32 register_id) const {
|
||||
Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
|
||||
ASSERT((pc % sizeof(u32)) == 0);
|
||||
ASSERT(pc < code.size() * sizeof(u32));
|
||||
return {code[pc / sizeof(u32)]};
|
||||
}
|
||||
|
||||
u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
|
||||
return registers.at(register_id);
|
||||
}
|
||||
|
||||
void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
|
||||
void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
|
||||
// Register 0 is hardwired as the zero register.
|
||||
// Ensure no writes to it actually occur.
|
||||
if (register_id == 0) {
|
||||
@@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
|
||||
registers.at(register_id) = value;
|
||||
}
|
||||
|
||||
void MacroInterpreter::SetMethodAddress(u32 address) {
|
||||
void MacroInterpreterImpl::SetMethodAddress(u32 address) {
|
||||
method_address.raw = address;
|
||||
}
|
||||
|
||||
void MacroInterpreter::Send(u32 value) {
|
||||
void MacroInterpreterImpl::Send(u32 value) {
|
||||
maxwell3d.CallMethodFromMME(method_address.address, value);
|
||||
// Increment the method address by the method increment.
|
||||
method_address.address.Assign(method_address.address.Value() +
|
||||
method_address.increment.Value());
|
||||
}
|
||||
|
||||
u32 MacroInterpreter::Read(u32 method) const {
|
||||
u32 MacroInterpreterImpl::Read(u32 method) const {
|
||||
return maxwell3d.GetRegisterValue(method);
|
||||
}
|
||||
|
||||
bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
|
||||
switch (cond) {
|
||||
case BranchCondition::Zero:
|
||||
return value == 0;
|
||||
case BranchCondition::NotZero:
|
||||
return value != 0;
|
||||
}
|
||||
UNREACHABLE();
|
||||
return true;
|
||||
u32 MacroInterpreterImpl::FetchParameter() {
|
||||
ASSERT(next_parameter_index < num_parameters);
|
||||
return parameters[next_parameter_index++];
|
||||
}
|
||||
|
||||
} // namespace Tegra
|
||||
@@ -1,44 +1,37 @@
|
||||
// Copyright 2018 yuzu Emulator Project
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <optional>
|
||||
|
||||
#include <vector>
|
||||
#include "common/bit_field.h"
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/macro/macro.h"
|
||||
|
||||
namespace Tegra {
|
||||
namespace Engines {
|
||||
class Maxwell3D;
|
||||
}
|
||||
|
||||
class MacroInterpreter final {
|
||||
class MacroInterpreter final : public MacroEngine {
|
||||
public:
|
||||
explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
|
||||
|
||||
/**
|
||||
* Executes the macro code with the specified input parameters.
|
||||
* @param offset Offset to start execution at.
|
||||
* @param parameters The parameters of the macro.
|
||||
*/
|
||||
void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
|
||||
protected:
|
||||
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
|
||||
|
||||
private:
|
||||
enum class ALUOperation : u32;
|
||||
enum class BranchCondition : u32;
|
||||
enum class ResultOperation : u32;
|
||||
Engines::Maxwell3D& maxwell3d;
|
||||
};
|
||||
|
||||
union Opcode;
|
||||
|
||||
union MethodAddress {
|
||||
u32 raw;
|
||||
BitField<0, 12, u32> address;
|
||||
BitField<12, 6, u32> increment;
|
||||
};
|
||||
class MacroInterpreterImpl : public CachedMacro {
|
||||
public:
|
||||
MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
|
||||
void Execute(const std::vector<u32>& parameters, u32 method) override;
|
||||
|
||||
private:
|
||||
/// Resets the execution engine state, zeroing registers, etc.
|
||||
void Reset();
|
||||
|
||||
@@ -49,20 +42,20 @@ private:
|
||||
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
|
||||
* previous instruction.
|
||||
*/
|
||||
bool Step(u32 offset, bool is_delay_slot);
|
||||
bool Step(bool is_delay_slot);
|
||||
|
||||
/// Calculates the result of an ALU operation. src_a OP src_b;
|
||||
u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
|
||||
u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
|
||||
|
||||
/// Performs the result operation on the input result and stores it in the specified register
|
||||
/// (if necessary).
|
||||
void ProcessResult(ResultOperation operation, u32 reg, u32 result);
|
||||
void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
|
||||
|
||||
/// Evaluates the branch condition and returns whether the branch should be taken or not.
|
||||
bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
|
||||
bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
|
||||
|
||||
/// Reads an opcode at the current program counter location.
|
||||
Opcode GetOpcode(u32 offset) const;
|
||||
Macro::Opcode GetOpcode() const;
|
||||
|
||||
/// Returns the specified register's value. Register 0 is hardcoded to always return 0.
|
||||
u32 GetRegister(u32 register_id) const;
|
||||
@@ -89,13 +82,11 @@ private:
|
||||
/// Program counter to execute at after the delay slot is executed.
|
||||
std::optional<u32> delayed_pc;
|
||||
|
||||
static constexpr std::size_t NumMacroRegisters = 8;
|
||||
|
||||
/// General purpose macro registers.
|
||||
std::array<u32, NumMacroRegisters> registers = {};
|
||||
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
|
||||
|
||||
/// Method address to use for the next Send instruction.
|
||||
MethodAddress method_address = {};
|
||||
Macro::MethodAddress method_address = {};
|
||||
|
||||
/// Input parameters of the current macro.
|
||||
std::unique_ptr<u32[]> parameters;
|
||||
@@ -105,5 +96,7 @@ private:
|
||||
u32 next_parameter_index = 0;
|
||||
|
||||
bool carry_flag = false;
|
||||
const std::vector<u32>& code;
|
||||
};
|
||||
|
||||
} // namespace Tegra
|
||||
640
src/video_core/macro/macro_jit_x64.cpp
Normal file
640
src/video_core/macro/macro_jit_x64.cpp
Normal file
@@ -0,0 +1,640 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "common/microprofile.h"
|
||||
#include "common/x64/xbyak_util.h"
|
||||
#include "video_core/engines/maxwell_3d.h"
|
||||
#include "video_core/macro/macro_interpreter.h"
|
||||
#include "video_core/macro/macro_jit_x64.h"
|
||||
|
||||
MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
|
||||
MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
|
||||
|
||||
namespace Tegra {
|
||||
static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
|
||||
static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
|
||||
static const Xbyak::Reg64 STATE = Xbyak::util::r11;
|
||||
static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
|
||||
static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
|
||||
static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
|
||||
static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
|
||||
static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
|
||||
static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
|
||||
|
||||
static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
|
||||
PARAMETERS,
|
||||
REGISTERS,
|
||||
STATE,
|
||||
NEXT_PARAMETER,
|
||||
RESULT,
|
||||
METHOD_ADDRESS,
|
||||
BRANCH_HOLDER,
|
||||
});
|
||||
|
||||
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
|
||||
|
||||
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
|
||||
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
|
||||
}
|
||||
|
||||
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
|
||||
: Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
|
||||
Compile();
|
||||
}
|
||||
|
||||
MacroJITx64Impl::~MacroJITx64Impl() = default;
|
||||
|
||||
void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
|
||||
MICROPROFILE_SCOPE(MacroJitExecute);
|
||||
ASSERT_OR_EXECUTE(program != nullptr, { return; });
|
||||
JITState state{};
|
||||
state.maxwell3d = &maxwell3d;
|
||||
state.registers = {};
|
||||
state.parameters = parameters.data();
|
||||
program(&state);
|
||||
}
|
||||
|
||||
/// Emits host code for a macro ALU instruction on reg[src_a] (loaded into RESULT_64) and
/// reg[src_b] (loaded into ebx), followed by the instruction's result operation.
///
/// When optimizer.zero_reg_skip is set, operands naming register 0 are not loaded and
/// operations that would involve such an operand are not emitted.
///
/// NOTE(review): AddWithCarry/SubtractWithBorrow use src_a/src_b unconditionally, even when
/// the zero-register skip left one of them default-constructed - confirm this is intended.
/// NOTE(review): when src_a is register 0 and the skip is active, nothing here writes RESULT
/// before Compile_ProcessResult consumes it - confirm.
void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
    const bool a_is_zero = opcode.src_a == 0;
    const bool b_is_zero = opcode.src_b == 0;
    const bool skip_zero = optimizer.zero_reg_skip;

    // An operand is loaded unless it is register 0 and zero-skipping is enabled.
    const bool load_a = !skip_zero || !a_is_zero;
    const bool load_b = !skip_zero || !b_is_zero;
    // Plain two-operand instructions are emitted only when both operands were loaded.
    const bool emit_binary = load_a && load_b;

    Xbyak::Reg64 src_a;
    Xbyak::Reg32 src_b;
    if (load_a) {
        src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
    }
    if (load_b) {
        src_b = Compile_GetRegister(opcode.src_b, ebx);
    }

    switch (opcode.alu_operation) {
    case Macro::ALUOperation::Add:
        if (emit_binary) {
            add(src_a, src_b);
        }
        // The carry is stored whenever some instruction in the program consumes it.
        if (!optimizer.can_skip_carry) {
            setc(byte[STATE + offsetof(JITState, carry_flag)]);
        }
        break;
    case Macro::ALUOperation::AddWithCarry:
        // Load the saved carry into CF, add with carry, then save the new carry.
        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
        adc(src_a, src_b);
        setc(byte[STATE + offsetof(JITState, carry_flag)]);
        break;
    case Macro::ALUOperation::Subtract:
        if (emit_binary) {
            sub(src_a, src_b);
            // Unlike Add, the borrow is only stored when the subtraction was emitted.
            if (!optimizer.can_skip_carry) {
                setc(byte[STATE + offsetof(JITState, carry_flag)]);
            }
        }
        break;
    case Macro::ALUOperation::SubtractWithBorrow:
        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
        sbb(src_a, src_b);
        setc(byte[STATE + offsetof(JITState, carry_flag)]);
        break;
    case Macro::ALUOperation::Xor:
        if (emit_binary) {
            xor_(src_a, src_b);
        }
        break;
    case Macro::ALUOperation::Or:
        if (emit_binary) {
            or_(src_a, src_b);
        }
        break;
    case Macro::ALUOperation::And:
        if (emit_binary) {
            and_(src_a, src_b);
        }
        break;
    case Macro::ALUOperation::AndNot:
        // Emitted whenever src_a was loaded, regardless of src_b.
        if (load_a) {
            not_(src_b);
            and_(src_a, src_b);
        }
        break;
    case Macro::ALUOperation::Nand:
        // Emitted whenever src_a was loaded, regardless of src_b.
        if (load_a) {
            and_(src_a, src_b);
            not_(src_a);
        }
        break;
    default:
        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
                          static_cast<std::size_t>(opcode.alu_operation.Value()));
        break;
    }

    Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
|
||||
|
||||
/// Emits host code for AddImmediate: RESULT = reg[src_a] + immediate, followed by the
/// instruction's result operation.
///
/// Fix: an immediate of exactly 2 previously matched none of the branches (`> 2`, `== 1`,
/// `< 0`), so nothing was added. Every immediate greater than 1 now takes the `add` path.
///
/// NOTE(review): the "redundant move" shortcut below returns before Compile_ProcessResult,
/// so the write to reg[dst] is skipped as well as the method move - confirm this is intended.
void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
    if (optimizer.skip_dummy_addimmediate) {
        // Games tend to use this as an exit instruction placeholder. It's to encode an
        // instruction without doing anything. In our case we can just not emit anything.
        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
            return;
        }
    }
    // Check for redundant moves
    if (optimizer.optimize_for_method_move &&
        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
        if (next_opcode.has_value()) {
            const auto next = *next_opcode;
            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
                return;
            }
        }
    }
    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
        // 0 + immediate: materialise the constant directly.
        if (opcode.immediate == 0) {
            xor_(RESULT, RESULT);
        } else {
            mov(RESULT, opcode.immediate);
        }
    } else {
        auto result = Compile_GetRegister(opcode.src_a, RESULT);
        if (opcode.immediate > 1) {
            add(result, opcode.immediate);
        } else if (opcode.immediate == 1) {
            inc(result);
        } else if (opcode.immediate < 0) {
            sub(result, opcode.immediate * -1);
        }
        // immediate == 0: the loaded register value is already the result.
    }
    Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
|
||||
|
||||
/// Emits host code that takes a bit field from reg[src_b] (loaded into eax), clears the
/// destination field in reg[src_a] (loaded into RESULT) and ORs the extracted field into it.
///
/// NOTE(review): shift amounts of 31 and a field size of 31 are special-cased (zeroing or
/// skipping the mask) rather than shifted/masked - verify against the interpreter semantics.
void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
    auto src = Compile_GetRegister(opcode.src_b, eax);

    // Move the source field down to bit 0.
    if (opcode.bf_src_bit == 31) {
        xor_(src, src);
    } else if (opcode.bf_src_bit != 0) {
        shr(src, opcode.bf_src_bit);
    }

    // Don't bother masking the whole register since we're using a 32 bit register
    if (opcode.bf_size == 0) {
        xor_(src, src);
    } else if (opcode.bf_size != 31) {
        and_(src, opcode.GetBitfieldMask());
    }

    // Move the field up to its destination position.
    if (opcode.bf_dst_bit == 31) {
        xor_(src, src);
    } else if (opcode.bf_dst_bit != 0) {
        shl(src, opcode.bf_dst_bit);
    }

    // Clear the destination field; the AND is skipped when the mask would keep every bit.
    const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
    const bool keeps_all_bits = mask == 0xffffffff;
    if (!keeps_all_bits) {
        and_(dst, mask);
    }
    or_(dst, src);

    Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
|
||||
|
||||
/// Emits host code for ExtractShiftLeftImmediate:
/// RESULT = ((reg[src_b] >> reg[src_a]) & bitfield_mask) << bf_dst_bit,
/// followed by the instruction's result operation.
///
/// Fix: the variable shift count was loaded into eax and passed as `al`. x86 variable shifts
/// only take their count from CL (Xbyak rejects any other 8-bit register), so the count is
/// now loaded into ecx. ecx is already used as a scratch register elsewhere in this file.
///
/// NOTE(review): bf_dst_bit == 31 zeroes the result and bf_size == 31 skips the mask instead
/// of shifting/masking - verify against the interpreter semantics.
void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
    // The shift count must live in cl.
    Compile_GetRegister(opcode.src_a, ecx);
    auto src = Compile_GetRegister(opcode.src_b, RESULT);

    shr(src, cl);
    if (opcode.bf_size != 0 && opcode.bf_size != 31) {
        and_(src, opcode.GetBitfieldMask());
    } else if (opcode.bf_size == 0) {
        xor_(src, src);
    }

    if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
        shl(src, opcode.bf_dst_bit);
    } else if (opcode.bf_dst_bit == 31) {
        xor_(src, src);
    }
    Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
|
||||
|
||||
/// Emits host code for ExtractShiftLeftRegister:
/// RESULT = ((reg[src_b] >> bf_src_bit) & bitfield_mask) << reg[src_a],
/// followed by the instruction's result operation.
///
/// Fix: the variable shift count was loaded into eax and passed as `al`. x86 variable shifts
/// only take their count from CL (Xbyak rejects any other 8-bit register), so the count is
/// now loaded into ecx. ecx is already used as a scratch register elsewhere in this file.
void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
    // The shift count must live in cl.
    Compile_GetRegister(opcode.src_a, ecx);
    auto src = Compile_GetRegister(opcode.src_b, RESULT);

    if (opcode.bf_src_bit != 0) {
        shr(src, opcode.bf_src_bit);
    }

    if (opcode.bf_size != 31) {
        and_(src, opcode.GetBitfieldMask());
    }
    shl(src, cl);
    Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
|
||||
|
||||
/// Host-side helper called from generated code: reads a Maxwell3D register value.
static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
    const u32 value = maxwell3d->GetRegisterValue(method);
    return value;
}
|
||||
|
||||
/// Host-side helper called from generated code: issues a method call using the address field
/// of `method_address`. The increment is applied by the generated code, not here.
static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
    const auto& target = method_address.address;
    maxwell3d->CallMethodFromMME(target, value);
}
|
||||
|
||||
/// Emits host code for Read: computes method = reg[src_a] + immediate in RESULT, calls the
/// host Read() helper with it, stores the returned value in RESULT and then emits the
/// instruction's result operation.
///
/// Fix: an immediate of exactly 2 previously matched none of the branches (`> 2`, `== 1`,
/// `< 0`), so the method index was off by two. Every immediate greater than 1 now takes the
/// `add` path.
void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
        // 0 + immediate: materialise the constant directly.
        if (opcode.immediate == 0) {
            xor_(RESULT, RESULT);
        } else {
            mov(RESULT, opcode.immediate);
        }
    } else {
        auto result = Compile_GetRegister(opcode.src_a, RESULT);
        if (opcode.immediate > 1) {
            add(result, opcode.immediate);
        } else if (opcode.immediate == 1) {
            inc(result);
        } else if (opcode.immediate < 0) {
            sub(result, opcode.immediate * -1);
        }
        // immediate == 0: the loaded register value is already the method index.
    }

    // Preserve the persistent caller-saved registers across the host call.
    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
    // JITState::maxwell3d is at offset 0, so [STATE] is the Maxwell3D pointer.
    mov(Common::X64::ABI_PARAM1, qword[STATE]);
    mov(Common::X64::ABI_PARAM2, RESULT);
    Common::X64::CallFarFunction(*this, &Read);
    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
    mov(RESULT, Common::X64::ABI_RETURN.cvt32());
    Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
|
||||
|
||||
/// Emits a call to the host Send() helper with the current method address and `value`, then
/// emits code that advances the address field of METHOD_ADDRESS by its increment field.
/// Clobbers eax and ecx in the generated code.
void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
    const auto preserved = PersistentCallerSavedRegs();

    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, preserved, 0);
    // JITState::maxwell3d is at offset 0, so [STATE] is the Maxwell3D pointer.
    mov(Common::X64::ABI_PARAM1, qword[STATE]);
    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
    mov(Common::X64::ABI_PARAM3, value);
    Common::X64::CallFarFunction(*this, &Send);
    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, preserved, 0);

    Xbyak::Label increment_done{};
    // Bits 12..17 hold the increment; when they are all zero the address does not change.
    test(METHOD_ADDRESS, 0x3f000);
    je(increment_done);

    // ecx = increment, METHOD_ADDRESS = address, eax = address + increment
    mov(ecx, METHOD_ADDRESS);
    and_(METHOD_ADDRESS, 0xfff);
    shr(ecx, 12);
    and_(ecx, 0x3f);
    lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
    // Re-attach the increment field above the new address.
    sal(ecx, 12);
    or_(eax, ecx);

    mov(METHOD_ADDRESS, eax);

    L(increment_done);
}
|
||||
|
||||
/// Emits host code for a conditional Branch on reg[src_a] being zero / non-zero.
///
/// Without delay slots in the program (optimizer.has_delayed_pc == false) this is a direct
/// conditional jump to the target label. Otherwise an annulled branch jumps immediately, and
/// a non-annulled branch stores a continuation in BRANCH_HOLDER and proceeds to the delay
/// slot; the check emitted after that instruction then jumps to the continuation.
///
/// NOTE(review): `jump_address` indexes `labels` without a range check - confirm branch
/// targets are always inside the program.
void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
    const s32 jump_address =
        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));

    Xbyak::Label end;
    auto value = Compile_GetRegister(opcode.src_a, eax);
    test(value, value);

    if (!optimizer.has_delayed_pc) {
        // No delay slots anywhere: branch straight to the target.
        switch (opcode.branch_condition) {
        case Macro::BranchCondition::Zero:
            je(labels[jump_address], T_NEAR);
            break;
        case Macro::BranchCondition::NotZero:
            jne(labels[jump_address], T_NEAR);
            break;
        }
    } else {
        // Skip the branch handling entirely when the condition does not hold.
        switch (opcode.branch_condition) {
        case Macro::BranchCondition::Zero:
            jne(end, T_NEAR);
            break;
        case Macro::BranchCondition::NotZero:
            je(end, T_NEAR);
            break;
        }

        if (opcode.branch_annul) {
            // Annulled: no delay slot, clear any pending branch and jump now.
            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
            jmp(labels[jump_address], T_NEAR);
        } else {
            Xbyak::Label handle_post_exit{};
            Xbyak::Label skip{};
            jmp(skip, T_NEAR);

            // Continuation executed once the delay slot instruction has run.
            L(handle_post_exit);
            if (opcode.is_exit) {
                // Execute 1 instruction
                mov(BRANCH_HOLDER, end_of_code);
            } else {
                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
            }
            // Jump to next instruction to skip delay slot check
            jmp(labels[jump_address], T_NEAR);

            L(skip);
            mov(BRANCH_HOLDER, handle_post_exit);
            jmp(delay_skip[pc], T_NEAR);
        }
    }

    L(end);
}
|
||||
|
||||
void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
|
||||
optimizer.can_skip_carry = true;
|
||||
optimizer.has_delayed_pc = false;
|
||||
for (auto raw_op : code) {
|
||||
Macro::Opcode op{};
|
||||
op.raw = raw_op;
|
||||
|
||||
if (op.operation == Macro::Operation::ALU) {
|
||||
// Scan for any ALU operations which actually use the carry flag, if they don't exist in
|
||||
// our current code we can skip emitting the carry flag handling operations
|
||||
if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
|
||||
op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
|
||||
optimizer.can_skip_carry = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (op.operation == Macro::Operation::Branch) {
|
||||
if (!op.branch_annul) {
|
||||
optimizer.has_delayed_pc = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void MacroJITx64Impl::Compile() {
|
||||
MICROPROFILE_SCOPE(MacroJitCompile);
|
||||
bool keep_executing = true;
|
||||
labels.fill(Xbyak::Label());
|
||||
|
||||
Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
|
||||
// JIT state
|
||||
mov(STATE, Common::X64::ABI_PARAM1);
|
||||
mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
|
||||
static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
|
||||
mov(REGISTERS, Common::X64::ABI_PARAM1);
|
||||
add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
|
||||
xor_(RESULT, RESULT);
|
||||
xor_(METHOD_ADDRESS, METHOD_ADDRESS);
|
||||
xor_(NEXT_PARAMETER, NEXT_PARAMETER);
|
||||
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
|
||||
|
||||
mov(dword[REGISTERS + 4], Compile_FetchParameter());
|
||||
|
||||
// Track get register for zero registers and mark it as no-op
|
||||
optimizer.zero_reg_skip = true;
|
||||
|
||||
// AddImmediate tends to be used as a NOP instruction, if we detect this we can
|
||||
// completely skip the entire code path and no emit anything
|
||||
optimizer.skip_dummy_addimmediate = true;
|
||||
|
||||
// SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
|
||||
// one if our register isn't "dirty"
|
||||
optimizer.optimize_for_method_move = true;
|
||||
|
||||
// Check to see if we can skip emitting certain instructions
|
||||
Optimizer_ScanFlags();
|
||||
|
||||
const u32 op_count = static_cast<u32>(code.size());
|
||||
for (u32 i = 0; i < op_count; i++) {
|
||||
if (i < op_count - 1) {
|
||||
pc = i + 1;
|
||||
next_opcode = GetOpCode();
|
||||
} else {
|
||||
next_opcode = {};
|
||||
}
|
||||
pc = i;
|
||||
Compile_NextInstruction();
|
||||
}
|
||||
|
||||
L(end_of_code);
|
||||
|
||||
Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
|
||||
ret();
|
||||
ready();
|
||||
program = getCode<ProgramType>();
|
||||
}
|
||||
|
||||
bool MacroJITx64Impl::Compile_NextInstruction() {
|
||||
const auto opcode = GetOpCode();
|
||||
if (labels[pc].getAddress()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
L(labels[pc]);
|
||||
|
||||
switch (opcode.operation) {
|
||||
case Macro::Operation::ALU:
|
||||
Compile_ALU(opcode);
|
||||
break;
|
||||
case Macro::Operation::AddImmediate:
|
||||
Compile_AddImmediate(opcode);
|
||||
break;
|
||||
case Macro::Operation::ExtractInsert:
|
||||
Compile_ExtractInsert(opcode);
|
||||
break;
|
||||
case Macro::Operation::ExtractShiftLeftImmediate:
|
||||
Compile_ExtractShiftLeftImmediate(opcode);
|
||||
break;
|
||||
case Macro::Operation::ExtractShiftLeftRegister:
|
||||
Compile_ExtractShiftLeftRegister(opcode);
|
||||
break;
|
||||
case Macro::Operation::Read:
|
||||
Compile_Read(opcode);
|
||||
break;
|
||||
case Macro::Operation::Branch:
|
||||
Compile_Branch(opcode);
|
||||
break;
|
||||
default:
|
||||
UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
|
||||
break;
|
||||
}
|
||||
|
||||
if (optimizer.has_delayed_pc) {
|
||||
if (opcode.is_exit) {
|
||||
mov(rax, end_of_code);
|
||||
test(BRANCH_HOLDER, BRANCH_HOLDER);
|
||||
cmove(BRANCH_HOLDER, rax);
|
||||
// Jump to next instruction to skip delay slot check
|
||||
je(labels[pc + 1], T_NEAR);
|
||||
} else {
|
||||
// TODO(ogniK): Optimize delay slot branching
|
||||
Xbyak::Label no_delay_slot{};
|
||||
test(BRANCH_HOLDER, BRANCH_HOLDER);
|
||||
je(no_delay_slot, T_NEAR);
|
||||
mov(rax, BRANCH_HOLDER);
|
||||
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
|
||||
jmp(rax);
|
||||
L(no_delay_slot);
|
||||
}
|
||||
L(delay_skip[pc]);
|
||||
if (opcode.is_exit) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
test(BRANCH_HOLDER, BRANCH_HOLDER);
|
||||
jne(end_of_code, T_NEAR);
|
||||
if (opcode.is_exit) {
|
||||
inc(BRANCH_HOLDER);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Emits code that loads the next macro parameter into eax and advances NEXT_PARAMETER.
/// @return The host register holding the fetched parameter (eax).
Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
    const Xbyak::Reg32 fetched = eax;
    mov(fetched, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
    inc(NEXT_PARAMETER);
    return fetched;
}
|
||||
|
||||
/// Emits code that loads macro register `index` into the 32-bit host register `dst`.
/// @return `dst`, for convenient chaining.
Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
    if (index != 0) {
        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
        return dst;
    }

    // Register 0 is always zero
    xor_(dst, dst);
    return dst;
}
|
||||
|
||||
/// Emits code that loads macro register `index` into the 64-bit host register `dst`.
/// NOTE(review): the load pairs a 64-bit destination with a `dword` memory operand - confirm
/// which width the assembler actually encodes here.
/// @return `dst`, for convenient chaining.
Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
    if (index != 0) {
        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
        return dst;
    }

    // Register 0 is always zero
    xor_(dst, dst);
    return dst;
}
|
||||
|
||||
/// Emits code that stores 1 in JITState::carry_flag when any of the upper 32 bits of `dst`
/// are set, and 0 otherwise. Destroys `dst` (shifted right by 32) and clobbers ecx.
///
/// Change: removed the `zero` and `end` labels, which were declared but never bound or used.
void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
    xor_(ecx, ecx);
    // After the shift ZF is clear exactly when the upper half was non-zero.
    shr(dst, 32);
    setne(cl);
    mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
}
|
||||
|
||||
/// Emits the code for an instruction's result operation. The instruction result is expected
/// in RESULT; depending on `operation` it is stored to macro register `reg`, copied into the
/// method address, and/or sent, optionally combined with fetching the next parameter.
void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
    // Stores `source` into a macro register; writes to register 0 are dropped.
    const auto store_register = [this](u32 target, Xbyak::Reg32 source) {
        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
        // register.
        if (target != 0) {
            mov(dword[REGISTERS + target * sizeof(u32)], source);
        }
    };
    const auto store_method_address = [this](Xbyak::Reg32 source) {
        mov(METHOD_ADDRESS, source);
    };

    switch (operation) {
    case Macro::ResultOperation::IgnoreAndFetch:
        store_register(reg, Compile_FetchParameter());
        break;
    case Macro::ResultOperation::Move:
        store_register(reg, RESULT);
        break;
    case Macro::ResultOperation::MoveAndSetMethod:
        store_register(reg, RESULT);
        store_method_address(RESULT);
        break;
    case Macro::ResultOperation::FetchAndSend:
        // Fetch parameter and send result.
        store_register(reg, Compile_FetchParameter());
        Compile_Send(RESULT);
        break;
    case Macro::ResultOperation::MoveAndSend:
        // Move and send result.
        store_register(reg, RESULT);
        Compile_Send(RESULT);
        break;
    case Macro::ResultOperation::FetchAndSetMethod:
        // Fetch parameter and use result as Method Address.
        store_register(reg, Compile_FetchParameter());
        store_method_address(RESULT);
        break;
    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
        // Move result and use as Method Address, then fetch and send parameter.
        store_register(reg, RESULT);
        store_method_address(RESULT);
        Compile_Send(Compile_FetchParameter());
        break;
    case Macro::ResultOperation::MoveAndSetMethodSend:
        // Move result and use as Method Address, then send bits 12:17 of result.
        store_register(reg, RESULT);
        store_method_address(RESULT);
        shr(RESULT, 12);
        and_(RESULT, 0b111111);
        Compile_Send(RESULT);
        break;
    default:
        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
        break;
    }
}
|
||||
|
||||
/// Decodes the macro instruction word at the current `pc`. Asserts that `pc` is in range.
Macro::Opcode MacroJITx64Impl::GetOpCode() const {
    ASSERT(pc < code.size());
    Macro::Opcode opcode{};
    opcode.raw = code[pc];
    return opcode;
}
|
||||
|
||||
std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
|
||||
return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
|
||||
}
|
||||
|
||||
} // namespace Tegra
|
||||
100
src/video_core/macro/macro_jit_x64.h
Normal file
100
src/video_core/macro/macro_jit_x64.h
Normal file
@@ -0,0 +1,100 @@
|
||||
// Copyright 2020 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <bitset>
|
||||
#include <xbyak.h>
|
||||
#include "common/bit_field.h"
|
||||
#include "common/common_types.h"
|
||||
#include "common/x64/xbyak_abi.h"
|
||||
#include "video_core/macro/macro.h"
|
||||
|
||||
namespace Tegra {
|
||||
|
||||
namespace Engines {
|
||||
class Maxwell3D;
|
||||
}
|
||||
|
||||
/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
|
||||
constexpr size_t MAX_CODE_SIZE = 0x10000;
|
||||
|
||||
class MacroJITx64 final : public MacroEngine {
|
||||
public:
|
||||
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
|
||||
|
||||
protected:
|
||||
std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
|
||||
|
||||
private:
|
||||
Engines::Maxwell3D& maxwell3d;
|
||||
};
|
||||
|
||||
/// A single macro program compiled to x86-64. Code generation happens in the constructor;
/// Execute() runs the generated program against a fresh JITState.
class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
public:
    MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
    ~MacroJITx64Impl();

    void Execute(const std::vector<u32>& parameters, u32 method) override;

    // Per-operation emitters. Each one emits the operation itself followed by the
    // instruction's result operation (Compile_Branch emits only the branch).
    void Compile_ALU(Macro::Opcode opcode);
    void Compile_AddImmediate(Macro::Opcode opcode);
    void Compile_ExtractInsert(Macro::Opcode opcode);
    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
    void Compile_Read(Macro::Opcode opcode);
    void Compile_Branch(Macro::Opcode opcode);

private:
    /// Scans the program for carry consumers and non-annulled branches.
    void Optimizer_ScanFlags();

    /// Generates the whole program and sets `program`.
    void Compile();
    /// Emits the instruction at `pc` plus its exit / delay-slot bookkeeping.
    bool Compile_NextInstruction();

    /// Emits a fetch of the next parameter; returns the register holding it.
    Xbyak::Reg32 Compile_FetchParameter();
    /// Emits a load of macro register `index` into `dst`; returns `dst`.
    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
    Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
    /// Emits code storing whether the upper 32 bits of `dst` are non-zero as the carry flag.
    void Compile_WriteCarry(Xbyak::Reg64 dst);

    /// Emits the result operation for the instruction result held in RESULT.
    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
    /// Emits a method call with `value` and the method-address increment.
    void Compile_Send(Xbyak::Reg32 value);

    /// Decodes the instruction word at `pc`.
    Macro::Opcode GetOpCode() const;
    /// Persistent JIT registers that are caller-saved under the host ABI.
    std::bitset<32> PersistentCallerSavedRegs() const;

    /// Runtime state handed to the generated program. The generated code addresses the
    /// fields by offset and relies on maxwell3d being first.
    struct JITState {
        Engines::Maxwell3D* maxwell3d{};
        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
        const u32* parameters{};
        u32 carry_flag{};
    };
    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
    using ProgramType = void (*)(JITState*);

    /// Code generation switches; the first two are derived by Optimizer_ScanFlags(), the
    /// rest are enabled in Compile().
    struct OptimizerState {
        bool can_skip_carry{};
        bool has_delayed_pc{};
        bool zero_reg_skip{};
        bool skip_dummy_addimmediate{};
        bool optimize_for_method_move{};
    };
    OptimizerState optimizer{};

    /// Instruction following the one being emitted, if any.
    std::optional<Macro::Opcode> next_opcode{};
    /// Entry point of the generated code; null until Compile() has finished.
    ProgramType program{nullptr};

    /// One label per macro instruction, bound when that instruction is emitted.
    std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
    /// One label per macro instruction, bound after its delay-slot check.
    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
    /// Bound at the epilogue of the generated program.
    Xbyak::Label end_of_code{};

    bool is_delay_slot{};
    /// Index of the macro instruction currently being emitted.
    u32 pc{};
    std::optional<u32> delayed_pc;

    /// Macro program being compiled (not owned).
    const std::vector<u32>& code;
    Engines::Maxwell3D& maxwell3d;
};
|
||||
|
||||
} // namespace Tegra
|
||||
@@ -56,9 +56,27 @@ public:
|
||||
last_modified_ticks = cache.GetModifiedTicks();
|
||||
}
|
||||
|
||||
void SetMemoryMarked(bool is_memory_marked_) {
|
||||
is_memory_marked = is_memory_marked_;
|
||||
}
|
||||
|
||||
bool IsMemoryMarked() const {
|
||||
return is_memory_marked;
|
||||
}
|
||||
|
||||
void SetSyncPending(bool is_sync_pending_) {
|
||||
is_sync_pending = is_sync_pending_;
|
||||
}
|
||||
|
||||
bool IsSyncPending() const {
|
||||
return is_sync_pending;
|
||||
}
|
||||
|
||||
private:
|
||||
bool is_registered{}; ///< Whether the object is currently registered with the cache
|
||||
bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
|
||||
bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory.
|
||||
bool is_sync_pending{}; ///< Whether the object is pending deletion.
|
||||
u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
|
||||
VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space
|
||||
};
|
||||
@@ -94,6 +112,30 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/// Handles a CPU write to [addr, addr + size): every registered object that
/// GetSortedObjectsFromRegion returns for the range has its memory unmarked and is queued
/// in `marked_for_unregister`; the actual unregistration happens in SyncGuestHost().
void OnCPUWrite(VAddr addr, std::size_t size) {
    std::lock_guard lock{mutex};

    for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
        if (!object->IsRegistered()) {
            continue;
        }
        UnmarkMemory(object);
        object->SetSyncPending(true);
        marked_for_unregister.emplace_back(object);
    }
}
|
||||
|
||||
void SyncGuestHost() {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
for (const auto& object : marked_for_unregister) {
|
||||
if (object->IsRegistered()) {
|
||||
object->SetSyncPending(false);
|
||||
Unregister(object);
|
||||
}
|
||||
}
|
||||
marked_for_unregister.clear();
|
||||
}
|
||||
|
||||
/// Invalidates everything in the cache
|
||||
void InvalidateAll() {
|
||||
std::lock_guard lock{mutex};
|
||||
@@ -120,19 +162,32 @@ protected:
|
||||
interval_cache.add({GetInterval(object), ObjectSet{object}});
|
||||
map_cache.insert({object->GetCpuAddr(), object});
|
||||
rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
|
||||
object->SetMemoryMarked(true);
|
||||
}
|
||||
|
||||
/// Unregisters an object from the cache
|
||||
virtual void Unregister(const T& object) {
|
||||
std::lock_guard lock{mutex};
|
||||
|
||||
UnmarkMemory(object);
|
||||
object->SetIsRegistered(false);
|
||||
rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
|
||||
if (object->IsSyncPending()) {
|
||||
marked_for_unregister.remove(object);
|
||||
object->SetSyncPending(false);
|
||||
}
|
||||
const VAddr addr = object->GetCpuAddr();
|
||||
interval_cache.subtract({GetInterval(object), ObjectSet{object}});
|
||||
map_cache.erase(addr);
|
||||
}
|
||||
|
||||
/// Removes the object's memory mark: lowers the rasterizer's cached-page count for the object's
/// address range by one and clears the flag. Does nothing when the object is not marked.
void UnmarkMemory(const T& object) {
    if (object->IsMemoryMarked()) {
        const auto cpu_addr = object->GetCpuAddr();
        const auto size_in_bytes = object->GetSizeInBytes();
        rasterizer.UpdatePagesCachedCount(cpu_addr, size_in_bytes, -1);
        object->SetMemoryMarked(false);
    }
}
|
||||
|
||||
/// Returns a ticks counter used for tracking when cached objects were last modified
|
||||
u64 GetModifiedTicks() {
|
||||
std::lock_guard lock{mutex};
|
||||
@@ -194,4 +249,5 @@ private:
|
||||
IntervalCache interval_cache; ///< Cache of objects
|
||||
u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
|
||||
VideoCore::RasterizerInterface& rasterizer;
|
||||
std::list<T> marked_for_unregister;
|
||||
};
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/microprofile.h"
|
||||
#include "video_core/buffer_cache/buffer_cache.h"
|
||||
#include "video_core/engines/maxwell_3d.h"
|
||||
#include "video_core/rasterizer_interface.h"
|
||||
#include "video_core/renderer_opengl/gl_buffer_cache.h"
|
||||
@@ -21,13 +22,12 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
|
||||
|
||||
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
|
||||
|
||||
CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
|
||||
: VideoCommon::BufferBlock{cpu_addr, size} {
|
||||
Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
|
||||
gl_buffer.Create();
|
||||
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
|
||||
}
|
||||
|
||||
CachedBufferBlock::~CachedBufferBlock() = default;
|
||||
Buffer::~Buffer() = default;
|
||||
|
||||
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
|
||||
const Device& device, std::size_t stream_size)
|
||||
@@ -47,12 +47,8 @@ OGLBufferCache::~OGLBufferCache() {
|
||||
glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
|
||||
}
|
||||
|
||||
Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
|
||||
return std::make_shared<CachedBufferBlock>(cpu_addr, size);
|
||||
}
|
||||
|
||||
GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
|
||||
return buffer->GetHandle();
|
||||
std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
|
||||
return std::make_shared<Buffer>(cpu_addr, size);
|
||||
}
|
||||
|
||||
GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
|
||||
@@ -61,7 +57,7 @@ GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
|
||||
|
||||
void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
|
||||
const u8* data) {
|
||||
glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
|
||||
glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
|
||||
static_cast<GLsizeiptr>(size), data);
|
||||
}
|
||||
|
||||
@@ -69,20 +65,20 @@ void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
|
||||
u8* data) {
|
||||
MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
|
||||
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
|
||||
glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
|
||||
glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
|
||||
static_cast<GLsizeiptr>(size), data);
|
||||
}
|
||||
|
||||
void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
|
||||
std::size_t dst_offset, std::size_t size) {
|
||||
glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
|
||||
glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
|
||||
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
|
||||
}
|
||||
|
||||
/// Copies `size` bytes from `raw_pointer` into the next buffer of `cbufs`, starting at offset 0.
/// @param raw_pointer Source data
/// @param size        Number of bytes to upload
/// @return The buffer that received the data, paired with a zero offset
OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                             std::size_t size) {
    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
    // Take the handle by value and advance the cursor exactly once per upload.
    const GLuint cbuf = cbufs[cbuf_cursor++];
    glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
    return {cbuf, 0};
}
|
||||
|
||||
@@ -24,17 +24,12 @@ class Device;
|
||||
class OGLStreamBuffer;
|
||||
class RasterizerOpenGL;
|
||||
|
||||
class CachedBufferBlock;
|
||||
|
||||
using Buffer = std::shared_ptr<CachedBufferBlock>;
|
||||
using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
|
||||
|
||||
class CachedBufferBlock : public VideoCommon::BufferBlock {
|
||||
class Buffer : public VideoCommon::BufferBlock {
|
||||
public:
|
||||
explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
|
||||
~CachedBufferBlock();
|
||||
explicit Buffer(VAddr cpu_addr, const std::size_t size);
|
||||
~Buffer();
|
||||
|
||||
GLuint GetHandle() const {
|
||||
GLuint Handle() const {
|
||||
return gl_buffer.handle;
|
||||
}
|
||||
|
||||
@@ -42,6 +37,7 @@ private:
|
||||
OGLBuffer gl_buffer;
|
||||
};
|
||||
|
||||
using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
|
||||
class OGLBufferCache final : public GenericBufferCache {
|
||||
public:
|
||||
explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
|
||||
@@ -55,9 +51,7 @@ public:
|
||||
}
|
||||
|
||||
protected:
|
||||
Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
|
||||
|
||||
GLuint ToHandle(const Buffer& buffer) override;
|
||||
std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
|
||||
|
||||
void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
|
||||
const u8* data) override;
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
@@ -13,6 +14,7 @@
|
||||
|
||||
#include "common/logging/log.h"
|
||||
#include "common/scope_exit.h"
|
||||
#include "core/settings.h"
|
||||
#include "video_core/renderer_opengl/gl_device.h"
|
||||
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
||||
|
||||
@@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
|
||||
|
||||
constexpr u32 NumStages = 5;
|
||||
|
||||
constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
|
||||
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
|
||||
GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
|
||||
constexpr std::array LimitUBOs = {
|
||||
GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
|
||||
GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
|
||||
GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS};
|
||||
|
||||
constexpr std::array LimitSSBOs = {
|
||||
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
|
||||
GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
|
||||
GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
|
||||
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
|
||||
GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
|
||||
|
||||
constexpr std::array LimitSamplers = {
|
||||
GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_TEXTURE_IMAGE_UNITS};
|
||||
constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_TEXTURE_IMAGE_UNITS,
|
||||
GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
|
||||
|
||||
constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
|
||||
GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
|
||||
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
|
||||
GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
|
||||
constexpr std::array LimitImages = {
|
||||
GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
|
||||
GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
|
||||
GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS};
|
||||
|
||||
template <typename T>
|
||||
T GetInteger(GLenum pname) {
|
||||
@@ -84,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
|
||||
return std::exchange(base, base + amount);
|
||||
}
|
||||
|
||||
std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
|
||||
std::array<u32, Tegra::Engines::MaxShaderTypes> max;
|
||||
std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
|
||||
[](GLenum pname) { return GetInteger<u32>(pname); });
|
||||
return max;
|
||||
}
|
||||
|
||||
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
|
||||
std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
|
||||
|
||||
@@ -132,6 +144,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
|
||||
}
|
||||
|
||||
bool IsASTCSupported() {
|
||||
static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
|
||||
static constexpr std::array formats = {
|
||||
GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
|
||||
GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
|
||||
@@ -148,25 +161,43 @@ bool IsASTCSupported() {
|
||||
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
|
||||
GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
|
||||
};
|
||||
return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
|
||||
GLint supported;
|
||||
glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
|
||||
&supported);
|
||||
return supported == GL_TRUE;
|
||||
}) == formats.end();
|
||||
static constexpr std::array required_support = {
|
||||
GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
|
||||
GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE,
|
||||
};
|
||||
|
||||
for (const GLenum target : targets) {
|
||||
for (const GLenum format : formats) {
|
||||
for (const GLenum support : required_support) {
|
||||
GLint value;
|
||||
glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
|
||||
if (value != GL_FULL_SUPPORT) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // Anonymous namespace
|
||||
|
||||
Device::Device() : base_bindings{BuildBaseBindings()} {
|
||||
Device::Device()
|
||||
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
|
||||
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
|
||||
const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
|
||||
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
|
||||
const std::vector extensions = GetExtensions();
|
||||
|
||||
const bool is_nvidia = vendor == "NVIDIA Corporation";
|
||||
const bool is_amd = vendor == "ATI Technologies Inc.";
|
||||
const bool is_intel = vendor == "Intel";
|
||||
const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
|
||||
|
||||
bool disable_fast_buffer_sub_data = false;
|
||||
if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
|
||||
LOG_WARNING(
|
||||
Render_OpenGL,
|
||||
"Beta driver 443.24 is known to have issues. There might be performance issues.");
|
||||
disable_fast_buffer_sub_data = true;
|
||||
}
|
||||
|
||||
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
|
||||
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
|
||||
@@ -181,16 +212,24 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
|
||||
has_variable_aoffi = TestVariableAoffi();
|
||||
has_component_indexing_bug = is_amd;
|
||||
has_precise_bug = TestPreciseBug();
|
||||
has_broken_compute = is_intel_proprietary;
|
||||
has_fast_buffer_sub_data = is_nvidia;
|
||||
has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
|
||||
use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
|
||||
GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
|
||||
GLAD_GL_NV_transform_feedback2;
|
||||
|
||||
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
|
||||
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
|
||||
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
|
||||
|
||||
if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
|
||||
LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
|
||||
}
|
||||
}
|
||||
|
||||
Device::Device(std::nullptr_t) {
|
||||
uniform_buffer_alignment = 0;
|
||||
max_uniform_buffers.fill(std::numeric_limits<u32>::max());
|
||||
uniform_buffer_alignment = 4;
|
||||
shader_storage_alignment = 4;
|
||||
max_vertex_attributes = 16;
|
||||
max_varyings = 15;
|
||||
has_warp_intrinsics = true;
|
||||
@@ -198,9 +237,6 @@ Device::Device(std::nullptr_t) {
|
||||
has_vertex_viewport_layer = true;
|
||||
has_image_load_formatted = true;
|
||||
has_variable_aoffi = true;
|
||||
has_component_indexing_bug = false;
|
||||
has_broken_compute = false;
|
||||
has_precise_bug = false;
|
||||
}
|
||||
|
||||
bool Device::TestVariableAoffi() {
|
||||
|
||||
@@ -24,6 +24,10 @@ public:
|
||||
explicit Device();
|
||||
explicit Device(std::nullptr_t);
|
||||
|
||||
/// Returns the max_uniform_buffers entry for the given shader type.
/// @param shader_type Shader type; its numeric value is used directly as the array index
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
|
||||
return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
|
||||
}
|
||||
|
||||
const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
|
||||
return base_bindings[stage_index];
|
||||
}
|
||||
@@ -80,19 +84,20 @@ public:
|
||||
return has_precise_bug;
|
||||
}
|
||||
|
||||
bool HasBrokenCompute() const {
|
||||
return has_broken_compute;
|
||||
}
|
||||
|
||||
bool HasFastBufferSubData() const {
|
||||
return has_fast_buffer_sub_data;
|
||||
}
|
||||
|
||||
bool UseAssemblyShaders() const {
|
||||
return use_assembly_shaders;
|
||||
}
|
||||
|
||||
private:
|
||||
static bool TestVariableAoffi();
|
||||
static bool TestPreciseBug();
|
||||
|
||||
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
|
||||
std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
|
||||
std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
|
||||
std::size_t uniform_buffer_alignment{};
|
||||
std::size_t shader_storage_alignment{};
|
||||
u32 max_vertex_attributes{};
|
||||
@@ -105,8 +110,8 @@ private:
|
||||
bool has_variable_aoffi{};
|
||||
bool has_component_indexing_bug{};
|
||||
bool has_precise_bug{};
|
||||
bool has_broken_compute{};
|
||||
bool has_fast_buffer_sub_data{};
|
||||
bool use_assembly_shaders{};
|
||||
};
|
||||
|
||||
} // namespace OpenGL
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include "common/assert.h"
|
||||
|
||||
#include "video_core/renderer_opengl/gl_buffer_cache.h"
|
||||
#include "video_core/renderer_opengl/gl_fence_manager.h"
|
||||
|
||||
namespace OpenGL {
|
||||
|
||||
@@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
|
||||
constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
|
||||
NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
|
||||
constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
|
||||
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
|
||||
|
||||
constexpr std::size_t NumSupportedVertexAttributes = 16;
|
||||
|
||||
template <typename Engine, typename Entry>
|
||||
@@ -87,6 +93,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
|
||||
return buffer.size;
|
||||
}
|
||||
|
||||
/// Translates hardware transform feedback indices
|
||||
/// @param location Hardware location
|
||||
/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
|
||||
/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
|
||||
std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
|
||||
const u8 index = location / 4;
|
||||
if (index >= 8 && index <= 39) {
|
||||
return {GL_GENERIC_ATTRIB_NV, index - 8};
|
||||
}
|
||||
if (index >= 48 && index <= 55) {
|
||||
return {GL_TEXTURE_COORD_NV, index - 48};
|
||||
}
|
||||
switch (index) {
|
||||
case 7:
|
||||
return {GL_POSITION, 0};
|
||||
case 40:
|
||||
return {GL_PRIMARY_COLOR_NV, 0};
|
||||
case 41:
|
||||
return {GL_SECONDARY_COLOR_NV, 0};
|
||||
case 42:
|
||||
return {GL_BACK_PRIMARY_COLOR_NV, 0};
|
||||
case 43:
|
||||
return {GL_BACK_SECONDARY_COLOR_NV, 0};
|
||||
}
|
||||
UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
|
||||
return {GL_POSITION, 0};
|
||||
}
|
||||
|
||||
/// Enables the capability `cap` when `state` is true and disables it otherwise.
void oglEnable(GLenum cap, bool state) {
    if (state) {
        glEnable(cap);
    } else {
        glDisable(cap);
    }
}
|
||||
@@ -94,17 +128,33 @@ void oglEnable(GLenum cap, bool state) {
|
||||
} // Anonymous namespace
|
||||
|
||||
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
|
||||
ScreenInfo& info, GLShader::ProgramManager& program_manager,
|
||||
StateTracker& state_tracker)
|
||||
: RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker},
|
||||
const Device& device, ScreenInfo& info,
|
||||
ProgramManager& program_manager, StateTracker& state_tracker)
|
||||
: RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
|
||||
state_tracker},
|
||||
shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
|
||||
buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
|
||||
fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
|
||||
screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
|
||||
CheckExtensions();
|
||||
|
||||
unified_uniform_buffer.Create();
|
||||
glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
|
||||
|
||||
if (device.UseAssemblyShaders()) {
|
||||
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
|
||||
for (const GLuint cbuf : staging_cbufs) {
|
||||
glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
|
||||
nullptr, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Releases the staging constant buffers owned by the rasterizer.
RasterizerOpenGL::~RasterizerOpenGL() {
    // The constructor creates staging_cbufs only when assembly shaders are in use, so they are
    // deleted under the same condition.
    if (device.UseAssemblyShaders()) {
        glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
    }
}
|
||||
|
||||
void RasterizerOpenGL::CheckExtensions() {
|
||||
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
|
||||
@@ -230,6 +280,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
|
||||
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
|
||||
MICROPROFILE_SCOPE(OpenGL_Shader);
|
||||
auto& gpu = system.GPU().Maxwell3D();
|
||||
std::size_t num_ssbos = 0;
|
||||
u32 clip_distances = 0;
|
||||
|
||||
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
|
||||
@@ -261,6 +312,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
|
||||
|
||||
Shader shader{shader_cache.GetStageProgram(program)};
|
||||
|
||||
if (device.UseAssemblyShaders()) {
|
||||
// Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
|
||||
// all stages share the same bindings.
|
||||
const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
|
||||
ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
|
||||
num_ssbos += num_stage_ssbos;
|
||||
}
|
||||
|
||||
// Stage indices are 0 - 5
|
||||
const std::size_t stage = index == 0 ? 0 : index - 1;
|
||||
SetupDrawConstBuffers(stage, shader);
|
||||
@@ -526,6 +585,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
|
||||
SyncFramebufferSRGB();
|
||||
|
||||
buffer_cache.Acquire();
|
||||
current_cbuf = 0;
|
||||
|
||||
std::size_t buffer_size = CalculateVertexArraysSize();
|
||||
|
||||
@@ -535,9 +595,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
|
||||
}
|
||||
|
||||
// Uniform space for the 5 shader stages
|
||||
buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
|
||||
(sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
|
||||
Maxwell::MaxShaderStage;
|
||||
buffer_size =
|
||||
Common::AlignUp<std::size_t>(buffer_size, 4) +
|
||||
(sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
|
||||
|
||||
// Add space for at least 18 constant buffers
|
||||
buffer_size += Maxwell::MaxConstBuffers *
|
||||
@@ -558,12 +618,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
|
||||
}
|
||||
|
||||
// Setup emulation uniform buffer.
|
||||
GLShader::MaxwellUniformData ubo;
|
||||
ubo.SetFromRegs(gpu);
|
||||
const auto [buffer, offset] =
|
||||
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
|
||||
static_cast<GLsizeiptr>(sizeof(ubo)));
|
||||
if (!device.UseAssemblyShaders()) {
|
||||
MaxwellUniformData ubo;
|
||||
ubo.SetFromRegs(gpu);
|
||||
const auto [buffer, offset] =
|
||||
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
|
||||
static_cast<GLsizeiptr>(sizeof(ubo)));
|
||||
}
|
||||
|
||||
// Setup shaders and their used resources.
|
||||
texture_cache.GuardSamplers(true);
|
||||
@@ -630,16 +692,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
||||
if (device.HasBrokenCompute()) {
|
||||
return;
|
||||
}
|
||||
|
||||
buffer_cache.Acquire();
|
||||
current_cbuf = 0;
|
||||
|
||||
auto kernel = shader_cache.GetComputeKernel(code_addr);
|
||||
SetupComputeTextures(kernel);
|
||||
SetupComputeImages(kernel);
|
||||
program_manager.BindComputeShader(kernel->GetHandle());
|
||||
|
||||
const std::size_t buffer_size =
|
||||
Tegra::Engines::KeplerCompute::NumConstBuffers *
|
||||
@@ -652,6 +710,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
|
||||
buffer_cache.Unmap();
|
||||
|
||||
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
|
||||
program_manager.BindCompute(kernel->GetHandle());
|
||||
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
|
||||
++num_queued_commands;
|
||||
}
|
||||
@@ -701,15 +760,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
|
||||
return;
|
||||
}
|
||||
texture_cache.OnCPUWrite(addr, size);
|
||||
shader_cache.InvalidateRegion(addr, size);
|
||||
shader_cache.OnCPUWrite(addr, size);
|
||||
buffer_cache.OnCPUWrite(addr, size);
|
||||
query_cache.InvalidateRegion(addr, size);
|
||||
}
|
||||
|
||||
/// Calls SyncGuestHost() on the texture cache, the buffer cache and the shader cache, in that
/// order, inside the OpenGL_CacheManagement profiling scope.
void RasterizerOpenGL::SyncGuestHost() {
|
||||
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
|
||||
texture_cache.SyncGuestHost();
|
||||
buffer_cache.SyncGuestHost();
|
||||
shader_cache.SyncGuestHost();
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
|
||||
@@ -812,39 +871,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
|
||||
static constexpr std::array PARAMETER_LUT = {
|
||||
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
|
||||
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
|
||||
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
|
||||
|
||||
MICROPROFILE_SCOPE(OpenGL_UBO);
|
||||
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
|
||||
const auto& shader_stage = stages[stage_index];
|
||||
const auto& entries = shader->GetEntries();
|
||||
const bool use_unified = entries.use_unified_uniforms;
|
||||
const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
|
||||
|
||||
u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
|
||||
for (const auto& entry : shader->GetEntries().const_buffers) {
|
||||
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
|
||||
SetupConstBuffer(binding++, buffer, entry);
|
||||
const auto base_bindings = device.GetBaseBindings(stage_index);
|
||||
u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
|
||||
for (const auto& entry : entries.const_buffers) {
|
||||
const u32 index = entry.GetIndex();
|
||||
const auto& buffer = shader_stage.const_buffers[index];
|
||||
SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
|
||||
base_unified_offset + index * Maxwell::MaxConstBufferSize);
|
||||
++binding;
|
||||
}
|
||||
if (use_unified) {
|
||||
const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
|
||||
entries.global_memory_entries.size());
|
||||
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
|
||||
base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
|
||||
MICROPROFILE_SCOPE(OpenGL_UBO);
|
||||
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
|
||||
const auto& entries = kernel->GetEntries();
|
||||
const bool use_unified = entries.use_unified_uniforms;
|
||||
|
||||
u32 binding = 0;
|
||||
for (const auto& entry : kernel->GetEntries().const_buffers) {
|
||||
for (const auto& entry : entries.const_buffers) {
|
||||
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
|
||||
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
|
||||
Tegra::Engines::ConstBufferInfo buffer;
|
||||
buffer.address = config.Address();
|
||||
buffer.size = config.size;
|
||||
buffer.enabled = mask[entry.GetIndex()];
|
||||
SetupConstBuffer(binding++, buffer, entry);
|
||||
SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
|
||||
use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
|
||||
++binding;
|
||||
}
|
||||
if (use_unified) {
|
||||
const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
|
||||
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
|
||||
NUM_CONST_BUFFERS_BYTES_PER_STAGE);
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
|
||||
const ConstBufferEntry& entry) {
|
||||
void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
|
||||
const Tegra::Engines::ConstBufferInfo& buffer,
|
||||
const ConstBufferEntry& entry, bool use_unified,
|
||||
std::size_t unified_offset) {
|
||||
if (!buffer.enabled) {
|
||||
// Set values to zero to unbind buffers
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
|
||||
sizeof(float));
|
||||
if (device.UseAssemblyShaders()) {
|
||||
glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
|
||||
} else {
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, binding,
|
||||
buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -852,10 +944,29 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
|
||||
// UBO alignment requirements.
|
||||
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
|
||||
|
||||
const auto alignment = device.GetUniformBufferAlignment();
|
||||
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
|
||||
device.HasFastBufferSubData());
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
|
||||
const bool fast_upload = !use_unified && device.HasFastBufferSubData();
|
||||
|
||||
const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
|
||||
const GPUVAddr gpu_addr = buffer.address;
|
||||
auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
|
||||
|
||||
if (device.UseAssemblyShaders()) {
|
||||
UNIMPLEMENTED_IF(use_unified);
|
||||
if (offset != 0) {
|
||||
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
|
||||
glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
|
||||
cbuf = staging_cbuf;
|
||||
offset = 0;
|
||||
}
|
||||
glBindBufferRangeNV(stage, binding, cbuf, offset, size);
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_unified) {
|
||||
glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
|
||||
} else {
|
||||
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
|
||||
@@ -863,7 +974,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
|
||||
auto& memory_manager{gpu.MemoryManager()};
|
||||
const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
|
||||
|
||||
u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
|
||||
u32 binding =
|
||||
device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
|
||||
for (const auto& entry : shader->GetEntries().global_memory_entries) {
|
||||
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
|
||||
const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
|
||||
@@ -929,16 +1041,12 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
|
||||
glBindTextureUnit(binding, 0);
|
||||
return;
|
||||
}
|
||||
glBindTextureUnit(binding, view->GetTexture());
|
||||
|
||||
if (view->GetSurfaceParams().IsBuffer()) {
|
||||
return;
|
||||
const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
|
||||
texture.tic.z_source, texture.tic.w_source);
|
||||
glBindTextureUnit(binding, handle);
|
||||
if (!view->GetSurfaceParams().IsBuffer()) {
|
||||
glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
|
||||
}
|
||||
// Apply swizzle to textures that are not buffers.
|
||||
view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
|
||||
texture.tic.w_source);
|
||||
|
||||
glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
|
||||
@@ -967,14 +1075,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
|
||||
glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
|
||||
return;
|
||||
}
|
||||
if (!tic.IsBuffer()) {
|
||||
view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
|
||||
}
|
||||
if (entry.is_written) {
|
||||
view->MarkAsModified(texture_cache.Tick());
|
||||
}
|
||||
glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
|
||||
view->GetFormat());
|
||||
const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
|
||||
glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SyncViewport() {
|
||||
@@ -983,6 +1088,26 @@ void RasterizerOpenGL::SyncViewport() {
|
||||
const auto& regs = gpu.regs;
|
||||
|
||||
const bool dirty_viewport = flags[Dirty::Viewports];
|
||||
const bool dirty_clip_control = flags[Dirty::ClipControl];
|
||||
|
||||
if (dirty_clip_control || flags[Dirty::FrontFace]) {
|
||||
flags[Dirty::FrontFace] = false;
|
||||
|
||||
GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
|
||||
if (regs.screen_y_control.triangle_rast_flip != 0 &&
|
||||
regs.viewport_transform[0].scale_y < 0.0f) {
|
||||
switch (mode) {
|
||||
case GL_CW:
|
||||
mode = GL_CCW;
|
||||
break;
|
||||
case GL_CCW:
|
||||
mode = GL_CW;
|
||||
break;
|
||||
}
|
||||
}
|
||||
glFrontFace(mode);
|
||||
}
|
||||
|
||||
if (dirty_viewport || flags[Dirty::ClipControl]) {
|
||||
flags[Dirty::ClipControl] = false;
|
||||
|
||||
@@ -1080,11 +1205,6 @@ void RasterizerOpenGL::SyncCullMode() {
|
||||
glDisable(GL_CULL_FACE);
|
||||
}
|
||||
}
|
||||
|
||||
if (flags[Dirty::FrontFace]) {
|
||||
flags[Dirty::FrontFace] = false;
|
||||
glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SyncPrimitiveRestart() {
|
||||
@@ -1455,12 +1575,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
|
||||
oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::SyncTransformFeedback() {
|
||||
// TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
|
||||
// when this is required.
|
||||
const auto& regs = system.GPU().Maxwell3D().regs;
|
||||
|
||||
static constexpr std::size_t STRIDE = 3;
|
||||
std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
|
||||
std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
|
||||
|
||||
GLint* cursor = attribs.data();
|
||||
GLint* current_stream = streams.data();
|
||||
|
||||
for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
|
||||
const auto& layout = regs.tfb_layouts[feedback];
|
||||
UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
|
||||
if (layout.varying_count == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
*current_stream = static_cast<GLint>(feedback);
|
||||
if (current_stream != streams.data()) {
|
||||
// When stepping one stream, push the expected token
|
||||
cursor[0] = GL_NEXT_BUFFER_NV;
|
||||
cursor[1] = 0;
|
||||
cursor[2] = 0;
|
||||
cursor += STRIDE;
|
||||
}
|
||||
++current_stream;
|
||||
|
||||
const auto& locations = regs.tfb_varying_locs[feedback];
|
||||
std::optional<u8> current_index;
|
||||
for (u32 offset = 0; offset < layout.varying_count; ++offset) {
|
||||
const u8 location = locations[offset];
|
||||
const u8 index = location / 4;
|
||||
|
||||
if (current_index == index) {
|
||||
// Increase number of components of the previous attachment
|
||||
++cursor[-2];
|
||||
continue;
|
||||
}
|
||||
current_index = index;
|
||||
|
||||
std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
|
||||
cursor[1] = 1;
|
||||
cursor += STRIDE;
|
||||
}
|
||||
}
|
||||
|
||||
const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
|
||||
const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
|
||||
glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
|
||||
GL_INTERLEAVED_ATTRIBS);
|
||||
}
|
||||
|
||||
void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
|
||||
const auto& regs = system.GPU().Maxwell3D().regs;
|
||||
if (regs.tfb_enabled == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (device.UseAssemblyShaders()) {
|
||||
SyncTransformFeedback();
|
||||
}
|
||||
|
||||
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
|
||||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
|
||||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
|
||||
@@ -1487,6 +1665,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
|
||||
static_cast<GLsizeiptr>(size));
|
||||
}
|
||||
|
||||
// We may have to call BeginTransformFeedbackNV here since they seem to call different
|
||||
// implementations on Nvidia's driver (the pointer is different) but we are using
|
||||
// ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
|
||||
// extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
|
||||
glBeginTransformFeedback(GL_POINTS);
|
||||
}
|
||||
|
||||
|
||||
@@ -56,8 +56,8 @@ struct DrawParameters;
|
||||
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
|
||||
public:
|
||||
explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
|
||||
ScreenInfo& info, GLShader::ProgramManager& program_manager,
|
||||
StateTracker& state_tracker);
|
||||
const Device& device, ScreenInfo& info,
|
||||
ProgramManager& program_manager, StateTracker& state_tracker);
|
||||
~RasterizerOpenGL() override;
|
||||
|
||||
void Draw(bool is_indexed, bool is_instanced) override;
|
||||
@@ -106,8 +106,9 @@ private:
|
||||
void SetupComputeConstBuffers(const Shader& kernel);
|
||||
|
||||
/// Configures a constant buffer.
|
||||
void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
|
||||
const ConstBufferEntry& entry);
|
||||
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
|
||||
const ConstBufferEntry& entry, bool use_unified,
|
||||
std::size_t unified_offset);
|
||||
|
||||
/// Configures the current global memory entries to use for the draw command.
|
||||
void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
|
||||
@@ -201,6 +202,10 @@ private:
|
||||
/// Syncs the framebuffer sRGB state to match the guest state
|
||||
void SyncFramebufferSRGB();
|
||||
|
||||
/// Syncs transform feedback state to match guest state
|
||||
/// @note Only valid on assembly shaders
|
||||
void SyncTransformFeedback();
|
||||
|
||||
/// Begin a transform feedback
|
||||
void BeginTransformFeedback(GLenum primitive_mode);
|
||||
|
||||
@@ -224,7 +229,7 @@ private:
|
||||
|
||||
void SetupShaders(GLenum primitive_mode);
|
||||
|
||||
const Device device;
|
||||
const Device& device;
|
||||
|
||||
TextureCacheOpenGL texture_cache;
|
||||
ShaderCacheOpenGL shader_cache;
|
||||
@@ -236,7 +241,7 @@ private:
|
||||
|
||||
Core::System& system;
|
||||
ScreenInfo& screen_info;
|
||||
GLShader::ProgramManager& program_manager;
|
||||
ProgramManager& program_manager;
|
||||
StateTracker& state_tracker;
|
||||
|
||||
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
|
||||
@@ -248,6 +253,13 @@ private:
|
||||
std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
|
||||
enabled_transform_feedback_buffers;
|
||||
|
||||
static constexpr std::size_t NUM_CONSTANT_BUFFERS =
|
||||
Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
|
||||
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
|
||||
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
|
||||
std::size_t current_cbuf = 0;
|
||||
OGLBuffer unified_uniform_buffer;
|
||||
|
||||
/// Number of commands queued to the OpenGL driver. Reset on flush.
|
||||
std::size_t num_queued_commands = 0;
|
||||
|
||||
|
||||
@@ -125,6 +125,15 @@ void OGLProgram::Release() {
|
||||
handle = 0;
|
||||
}
|
||||
|
||||
// Deletes the held assembly program, if any, and resets the handle to 0.
void OGLAssemblyProgram::Release() {
    if (handle != 0) {
        MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
        glDeleteProgramsARB(1, &handle);
        handle = 0;
    }
}
|
||||
|
||||
void OGLPipeline::Create() {
|
||||
if (handle != 0)
|
||||
return;
|
||||
|
||||
@@ -167,6 +167,22 @@ public:
|
||||
GLuint handle = 0;
|
||||
};
|
||||
|
||||
class OGLAssemblyProgram : private NonCopyable {
|
||||
public:
|
||||
OGLAssemblyProgram() = default;
|
||||
|
||||
OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
|
||||
|
||||
~OGLAssemblyProgram() {
|
||||
Release();
|
||||
}
|
||||
|
||||
/// Deletes the internal OpenGL resource
|
||||
void Release();
|
||||
|
||||
GLuint handle = 0;
|
||||
};
|
||||
|
||||
class OGLPipeline : private NonCopyable {
|
||||
public:
|
||||
OGLPipeline() = default;
|
||||
|
||||
@@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Maps a shader stage to its NV assembly program target. Unhandled values yield a
/// value-initialized GLenum.
constexpr GLenum AssemblyEnum(ShaderType shader_type) {
    GLenum target{};
    switch (shader_type) {
    case ShaderType::Vertex:
        target = GL_VERTEX_PROGRAM_NV;
        break;
    case ShaderType::TesselationControl:
        target = GL_TESS_CONTROL_PROGRAM_NV;
        break;
    case ShaderType::TesselationEval:
        target = GL_TESS_EVALUATION_PROGRAM_NV;
        break;
    case ShaderType::Geometry:
        target = GL_GEOMETRY_PROGRAM_NV;
        break;
    case ShaderType::Fragment:
        target = GL_FRAGMENT_PROGRAM_NV;
        break;
    case ShaderType::Compute:
        target = GL_COMPUTE_PROGRAM_NV;
        break;
    }
    return target;
}
|
||||
|
||||
/// Builds a shader identifier: the stage name followed by the unique identifier printed as
/// 16 zero-padded uppercase hexadecimal digits.
std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
    const auto stage_name = GetShaderTypeName(shader_type);
    return fmt::format("{}{:016X}", stage_name, unique_identifier);
}
|
||||
@@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
|
||||
return registry;
|
||||
}
|
||||
|
||||
std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
|
||||
u64 unique_identifier, const ShaderIR& ir,
|
||||
const Registry& registry, bool hint_retrievable = false) {
|
||||
ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
|
||||
const ShaderIR& ir, const Registry& registry,
|
||||
bool hint_retrievable = false) {
|
||||
const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
|
||||
LOG_INFO(Render_OpenGL, "{}", shader_id);
|
||||
|
||||
const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
|
||||
OGLShader shader;
|
||||
shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
|
||||
auto program = std::make_shared<ProgramHandle>();
|
||||
|
||||
if (device.UseAssemblyShaders()) {
|
||||
const std::string arb = "Not implemented";
|
||||
|
||||
GLuint& arb_prog = program->assembly_program.handle;
|
||||
|
||||
// Commented out functions signal OpenGL errors but are compatible with apitrace.
|
||||
// Use them only to capture and replay on apitrace.
|
||||
#if 0
|
||||
glGenProgramsNV(1, &arb_prog);
|
||||
glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
|
||||
reinterpret_cast<const GLubyte*>(arb.data()));
|
||||
#else
|
||||
glGenProgramsARB(1, &arb_prog);
|
||||
glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
|
||||
static_cast<GLsizei>(arb.size()), arb.data());
|
||||
#endif
|
||||
const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
|
||||
if (err && *err) {
|
||||
LOG_CRITICAL(Render_OpenGL, "{}", err);
|
||||
LOG_INFO(Render_OpenGL, "\n{}", arb);
|
||||
}
|
||||
} else {
|
||||
const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
|
||||
OGLShader shader;
|
||||
shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
|
||||
|
||||
program->source_program.Create(true, hint_retrievable, shader.handle);
|
||||
}
|
||||
|
||||
auto program = std::make_shared<OGLProgram>();
|
||||
program->Create(true, hint_retrievable, shader.handle);
|
||||
return program;
|
||||
}
|
||||
|
||||
@@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {
|
||||
|
||||
CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
|
||||
std::shared_ptr<VideoCommon::Shader::Registry> registry,
|
||||
ShaderEntries entries, std::shared_ptr<OGLProgram> program)
|
||||
ShaderEntries entries, ProgramSharedPtr program_)
|
||||
: RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
|
||||
size_in_bytes{size_in_bytes}, program{std::move(program)} {}
|
||||
size_in_bytes{size_in_bytes}, program{std::move(program_)} {
|
||||
// Assign either the assembly program or source program. We can't have both.
|
||||
handle = program->assembly_program.handle;
|
||||
if (handle == 0) {
|
||||
handle = program->source_program.handle;
|
||||
}
|
||||
ASSERT(handle != 0);
|
||||
}
|
||||
|
||||
CachedShader::~CachedShader() = default;
|
||||
|
||||
GLuint CachedShader::GetHandle() const {
|
||||
DEBUG_ASSERT(registry->IsConsistent());
|
||||
return program->handle;
|
||||
return handle;
|
||||
}
|
||||
|
||||
Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
|
||||
@@ -191,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
|
||||
entry.bindless_samplers = registry->GetBindlessSamplers();
|
||||
params.disk_cache.SaveEntry(std::move(entry));
|
||||
|
||||
return std::shared_ptr<CachedShader>(new CachedShader(
|
||||
params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
|
||||
return std::shared_ptr<CachedShader>(
|
||||
new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
|
||||
MakeEntries(params.device, ir, shader_type), std::move(program)));
|
||||
}
|
||||
|
||||
Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
|
||||
@@ -215,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
|
||||
entry.bindless_samplers = registry->GetBindlessSamplers();
|
||||
params.disk_cache.SaveEntry(std::move(entry));
|
||||
|
||||
return std::shared_ptr<CachedShader>(new CachedShader(
|
||||
params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
|
||||
return std::shared_ptr<CachedShader>(
|
||||
new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
|
||||
MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
|
||||
}
|
||||
|
||||
Shader CachedShader::CreateFromCache(const ShaderParameters& params,
|
||||
@@ -239,7 +291,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||
return;
|
||||
}
|
||||
|
||||
const std::vector gl_cache = disk_cache.LoadPrecompiled();
|
||||
std::vector<ShaderDiskCachePrecompiled> gl_cache;
|
||||
if (!device.UseAssemblyShaders()) {
|
||||
// Only load precompiled cache when we are not using assembly shaders
|
||||
gl_cache = disk_cache.LoadPrecompiled();
|
||||
}
|
||||
const auto supported_formats = GetSupportedFormats();
|
||||
|
||||
// Track if precompiled cache was altered during loading to know if we have to
|
||||
@@ -278,7 +334,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||
auto registry = MakeRegistry(entry);
|
||||
const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
|
||||
|
||||
std::shared_ptr<OGLProgram> program;
|
||||
ProgramSharedPtr program;
|
||||
if (precompiled_entry) {
|
||||
// If the shader is precompiled, attempt to load it with
|
||||
program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
|
||||
@@ -294,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||
PrecompiledShader shader;
|
||||
shader.program = std::move(program);
|
||||
shader.registry = std::move(registry);
|
||||
shader.entries = MakeEntries(ir);
|
||||
shader.entries = MakeEntries(device, ir, entry.type);
|
||||
|
||||
std::scoped_lock lock{mutex};
|
||||
if (callback) {
|
||||
@@ -332,6 +388,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||
return;
|
||||
}
|
||||
|
||||
if (device.UseAssemblyShaders()) {
|
||||
// Don't store precompiled binaries for assembly shaders.
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
|
||||
// before precompiling them
|
||||
|
||||
@@ -339,7 +400,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||
const u64 id = (*transferable)[i].unique_identifier;
|
||||
const auto it = find_precompiled(id);
|
||||
if (it == gl_cache.end()) {
|
||||
const GLuint program = runtime_cache.at(id).program->handle;
|
||||
const GLuint program = runtime_cache.at(id).program->source_program.handle;
|
||||
disk_cache.SavePrecompiled(id, program);
|
||||
precompiled_cache_altered = true;
|
||||
}
|
||||
@@ -350,7 +411,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
|
||||
ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
|
||||
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
|
||||
const std::unordered_set<GLenum>& supported_formats) {
|
||||
if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
|
||||
@@ -358,15 +419,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
|
||||
return {};
|
||||
}
|
||||
|
||||
auto program = std::make_shared<OGLProgram>();
|
||||
program->handle = glCreateProgram();
|
||||
glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
|
||||
glProgramBinary(program->handle, precompiled_entry.binary_format,
|
||||
precompiled_entry.binary.data(),
|
||||
auto program = std::make_shared<ProgramHandle>();
|
||||
GLuint& handle = program->source_program.handle;
|
||||
handle = glCreateProgram();
|
||||
glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
|
||||
glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
|
||||
static_cast<GLsizei>(precompiled_entry.binary.size()));
|
||||
|
||||
GLint link_status;
|
||||
glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
|
||||
glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
|
||||
if (link_status == GL_FALSE) {
|
||||
LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
|
||||
return {};
|
||||
|
||||
@@ -43,8 +43,14 @@ struct UnspecializedShader;
|
||||
using Shader = std::shared_ptr<CachedShader>;
|
||||
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
|
||||
|
||||
struct ProgramHandle {
|
||||
OGLProgram source_program;
|
||||
OGLAssemblyProgram assembly_program;
|
||||
};
|
||||
using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
|
||||
|
||||
struct PrecompiledShader {
|
||||
std::shared_ptr<OGLProgram> program;
|
||||
ProgramSharedPtr program;
|
||||
std::shared_ptr<VideoCommon::Shader::Registry> registry;
|
||||
ShaderEntries entries;
|
||||
};
|
||||
@@ -87,12 +93,13 @@ public:
|
||||
private:
|
||||
explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
|
||||
std::shared_ptr<VideoCommon::Shader::Registry> registry,
|
||||
ShaderEntries entries, std::shared_ptr<OGLProgram> program);
|
||||
ShaderEntries entries, ProgramSharedPtr program);
|
||||
|
||||
std::shared_ptr<VideoCommon::Shader::Registry> registry;
|
||||
ShaderEntries entries;
|
||||
std::size_t size_in_bytes = 0;
|
||||
std::shared_ptr<OGLProgram> program;
|
||||
ProgramSharedPtr program;
|
||||
GLuint handle = 0;
|
||||
};
|
||||
|
||||
class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
|
||||
@@ -115,7 +122,7 @@ protected:
|
||||
void FlushObjectInner(const Shader& object) override {}
|
||||
|
||||
private:
|
||||
std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
|
||||
ProgramSharedPtr GeneratePrecompiledProgram(
|
||||
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
|
||||
const std::unordered_set<GLenum>& supported_formats);
|
||||
|
||||
|
||||
@@ -61,8 +61,8 @@ struct TextureDerivates {};
|
||||
using TextureArgument = std::pair<Type, Node>;
|
||||
using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
|
||||
|
||||
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
|
||||
static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
|
||||
constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
|
||||
constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
|
||||
|
||||
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
|
||||
#define ftou floatBitsToUint
|
||||
@@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
|
||||
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
|
||||
}
|
||||
|
||||
bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
|
||||
const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
|
||||
// We waste one UBO for emulation
|
||||
const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
|
||||
return num_ubos > num_available_ubos;
|
||||
}
|
||||
|
||||
struct GenericVaryingDescription {
|
||||
std::string name;
|
||||
u8 first_element = 0;
|
||||
@@ -412,8 +419,9 @@ class GLSLDecompiler final {
|
||||
public:
|
||||
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
|
||||
ShaderType stage, std::string_view identifier, std::string_view suffix)
|
||||
: device{device}, ir{ir}, registry{registry}, stage{stage},
|
||||
identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
|
||||
: device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
|
||||
suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
|
||||
UseUnifiedUniforms(device, ir, stage)} {
|
||||
if (stage != ShaderType::Compute) {
|
||||
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
|
||||
}
|
||||
@@ -618,7 +626,9 @@ private:
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
|
||||
|
||||
if (stage != ShaderType::Geometry &&
|
||||
(stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
|
||||
if (ir.UsesLayer()) {
|
||||
code.AddLine("int gl_Layer;");
|
||||
}
|
||||
@@ -647,6 +657,16 @@ private:
|
||||
--code.scope;
|
||||
code.AddLine("}};");
|
||||
code.AddNewLine();
|
||||
|
||||
if (stage == ShaderType::Geometry) {
|
||||
if (ir.UsesLayer()) {
|
||||
code.AddLine("out int gl_Layer;");
|
||||
}
|
||||
if (ir.UsesViewportIndex()) {
|
||||
code.AddLine("out int gl_ViewportIndex;");
|
||||
}
|
||||
}
|
||||
code.AddNewLine();
|
||||
}
|
||||
|
||||
void DeclareRegisters() {
|
||||
@@ -834,12 +854,24 @@ private:
|
||||
}
|
||||
|
||||
void DeclareConstantBuffers() {
|
||||
if (use_unified_uniforms) {
|
||||
const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
|
||||
static_cast<u32>(ir.GetGlobalMemory().size());
|
||||
code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
|
||||
binding);
|
||||
code.AddLine(" uint cbufs[];");
|
||||
code.AddLine("}};");
|
||||
code.AddNewLine();
|
||||
return;
|
||||
}
|
||||
|
||||
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
|
||||
for (const auto& buffers : ir.GetConstantBuffers()) {
|
||||
const auto index = buffers.first;
|
||||
for (const auto [index, info] : ir.GetConstantBuffers()) {
|
||||
const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
|
||||
const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
|
||||
code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
|
||||
GetConstBufferBlock(index));
|
||||
code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
|
||||
code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size);
|
||||
code.AddLine("}};");
|
||||
code.AddNewLine();
|
||||
}
|
||||
@@ -1038,42 +1070,51 @@ private:
|
||||
|
||||
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
|
||||
const Node offset = cbuf->GetOffset();
|
||||
const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
|
||||
|
||||
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
|
||||
// Direct access
|
||||
const u32 offset_imm = immediate->GetValue();
|
||||
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
|
||||
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
|
||||
offset_imm / (4 * 4), (offset_imm / 4) % 4),
|
||||
if (use_unified_uniforms) {
|
||||
return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
|
||||
Type::Uint};
|
||||
} else {
|
||||
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
|
||||
offset_imm / (4 * 4), (offset_imm / 4) % 4),
|
||||
Type::Uint};
|
||||
}
|
||||
}
|
||||
|
||||
// Indirect access
|
||||
if (use_unified_uniforms) {
|
||||
return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
|
||||
Visit(offset).AsUint()),
|
||||
Type::Uint};
|
||||
}
|
||||
|
||||
if (std::holds_alternative<OperationNode>(*offset)) {
|
||||
// Indirect access
|
||||
const std::string final_offset = code.GenerateTemporary();
|
||||
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
|
||||
const std::string final_offset = code.GenerateTemporary();
|
||||
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
|
||||
|
||||
if (!device.HasComponentIndexingBug()) {
|
||||
return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
|
||||
final_offset, final_offset),
|
||||
Type::Uint};
|
||||
}
|
||||
|
||||
// AMD's proprietary GLSL compiler emits ill code for variable component access.
|
||||
// To bypass this driver bug generate 4 ifs, one per each component.
|
||||
const std::string pack = code.GenerateTemporary();
|
||||
code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
|
||||
final_offset);
|
||||
|
||||
const std::string result = code.GenerateTemporary();
|
||||
code.AddLine("uint {};", result);
|
||||
for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
|
||||
code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
|
||||
pack, GetSwizzle(swizzle));
|
||||
}
|
||||
return {result, Type::Uint};
|
||||
if (!device.HasComponentIndexingBug()) {
|
||||
return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
|
||||
final_offset, final_offset),
|
||||
Type::Uint};
|
||||
}
|
||||
|
||||
UNREACHABLE_MSG("Unmanaged offset node type");
|
||||
// AMD's proprietary GLSL compiler emits ill code for variable component access.
|
||||
// To bypass this driver bug generate 4 ifs, one per each component.
|
||||
const std::string pack = code.GenerateTemporary();
|
||||
code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
|
||||
final_offset);
|
||||
|
||||
const std::string result = code.GenerateTemporary();
|
||||
code.AddLine("uint {};", result);
|
||||
for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
|
||||
code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
|
||||
GetSwizzle(swizzle));
|
||||
}
|
||||
return {result, Type::Uint};
|
||||
}
|
||||
|
||||
if (const auto gmem = std::get_if<GmemNode>(&*node)) {
|
||||
@@ -1538,7 +1579,9 @@ private:
|
||||
Expression target;
|
||||
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
|
||||
if (gpr->GetIndex() == Register::ZeroIndex) {
|
||||
// Writing to Register::ZeroIndex is a no op
|
||||
// Writing to Register::ZeroIndex is a no op but we still have to visit the source
|
||||
// as it might have side effects.
|
||||
code.AddLine("{};", Visit(src).GetCode());
|
||||
return {};
|
||||
}
|
||||
target = {GetRegister(gpr->GetIndex()), Type::Float};
|
||||
@@ -2309,6 +2352,18 @@ private:
|
||||
return {"gl_SubGroupInvocationARB", Type::Uint};
|
||||
}
|
||||
|
||||
template <const std::string_view& comparison>
|
||||
Expression ThreadMask(Operation) {
|
||||
if (device.HasWarpIntrinsics()) {
|
||||
return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
|
||||
}
|
||||
if (device.HasShaderBallot()) {
|
||||
return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
|
||||
}
|
||||
LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
|
||||
return {"0U", Type::Uint};
|
||||
}
|
||||
|
||||
Expression ShuffleIndexed(Operation operation) {
|
||||
std::string value = VisitOperand(operation, 0).AsFloat();
|
||||
|
||||
@@ -2321,7 +2376,21 @@ private:
|
||||
return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
|
||||
}
|
||||
|
||||
Expression MemoryBarrierGL(Operation) {
|
||||
/// Emits a barrier() call when the shader IR is decompiled; otherwise only logs an error.
/// Produces an empty expression either way.
Expression Barrier(Operation) {
    if (ir.IsDecompiled()) {
        code.AddLine("barrier();");
    } else {
        LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
    }
    return {};
}
|
||||
|
||||
/// Emits a groupMemoryBarrier() statement and produces an empty expression.
Expression MemoryBarrierGroup(Operation) {
    code.AddLine("groupMemoryBarrier();");
    return Expression{};
}
|
||||
|
||||
/// Emits a memoryBarrier() statement and produces an empty expression.
Expression MemoryBarrierGlobal(Operation) {
    code.AddLine("memoryBarrier();");
    return Expression{};
}
|
||||
@@ -2337,6 +2406,12 @@ private:
|
||||
static constexpr std::string_view NotEqual = "!=";
|
||||
static constexpr std::string_view GreaterEqual = ">=";
|
||||
|
||||
static constexpr std::string_view Eq = "Eq";
|
||||
static constexpr std::string_view Ge = "Ge";
|
||||
static constexpr std::string_view Gt = "Gt";
|
||||
static constexpr std::string_view Le = "Le";
|
||||
static constexpr std::string_view Lt = "Lt";
|
||||
|
||||
static constexpr std::string_view Add = "Add";
|
||||
static constexpr std::string_view Min = "Min";
|
||||
static constexpr std::string_view Max = "Max";
|
||||
@@ -2554,9 +2629,16 @@ private:
|
||||
&GLSLDecompiler::VoteEqual,
|
||||
|
||||
&GLSLDecompiler::ThreadId,
|
||||
&GLSLDecompiler::ThreadMask<Func::Eq>,
|
||||
&GLSLDecompiler::ThreadMask<Func::Ge>,
|
||||
&GLSLDecompiler::ThreadMask<Func::Gt>,
|
||||
&GLSLDecompiler::ThreadMask<Func::Le>,
|
||||
&GLSLDecompiler::ThreadMask<Func::Lt>,
|
||||
&GLSLDecompiler::ShuffleIndexed,
|
||||
|
||||
&GLSLDecompiler::MemoryBarrierGL,
|
||||
&GLSLDecompiler::Barrier,
|
||||
&GLSLDecompiler::MemoryBarrierGroup,
|
||||
&GLSLDecompiler::MemoryBarrierGlobal,
|
||||
};
|
||||
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
|
||||
|
||||
@@ -2669,6 +2751,7 @@ private:
|
||||
const std::string_view identifier;
|
||||
const std::string_view suffix;
|
||||
const Header header;
|
||||
const bool use_unified_uniforms;
|
||||
std::unordered_map<u8, VaryingTFB> transform_feedback;
|
||||
|
||||
ShaderWriter code;
|
||||
@@ -2864,7 +2947,7 @@ void GLSLDecompiler::DecompileAST() {
|
||||
|
||||
} // Anonymous namespace
|
||||
|
||||
ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
|
||||
ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
|
||||
ShaderEntries entries;
|
||||
for (const auto& cbuf : ir.GetConstantBuffers()) {
|
||||
entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
|
||||
@@ -2885,6 +2968,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
|
||||
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
|
||||
}
|
||||
entries.shader_length = ir.GetLength();
|
||||
entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
|
||||
return entries;
|
||||
}
|
||||
|
||||
|
||||
@@ -53,11 +53,13 @@ struct ShaderEntries {
|
||||
std::vector<GlobalMemoryEntry> global_memory_entries;
|
||||
std::vector<SamplerEntry> samplers;
|
||||
std::vector<ImageEntry> images;
|
||||
u32 clip_distances{};
|
||||
std::size_t shader_length{};
|
||||
u32 clip_distances{};
|
||||
bool use_unified_uniforms{};
|
||||
};
|
||||
|
||||
ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
|
||||
ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
|
||||
Tegra::Engines::ShaderType stage);
|
||||
|
||||
std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
|
||||
const VideoCommon::Shader::Registry& registry,
|
||||
|
||||
@@ -6,47 +6,111 @@
|
||||
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/engines/maxwell_3d.h"
|
||||
#include "video_core/renderer_opengl/gl_device.h"
|
||||
#include "video_core/renderer_opengl/gl_shader_manager.h"
|
||||
|
||||
namespace OpenGL::GLShader {
|
||||
namespace OpenGL {
|
||||
|
||||
ProgramManager::ProgramManager() = default;
|
||||
/// Sets up program binding state for the given device.
/// @param device Queried once for whether assembly shaders are in use.
ProgramManager::ProgramManager(const Device& device) {
    use_assembly_programs = device.UseAssemblyShaders();
    if (!use_assembly_programs) {
        // GLSL path: create the pipeline object used for graphics stages and bind it.
        graphics_pipeline.Create();
        glBindProgramPipeline(graphics_pipeline.handle);
        return;
    }
    // Assembly path: only the compute program target is enabled here.
    glEnable(GL_COMPUTE_PROGRAM_NV);
}
|
||||
|
||||
ProgramManager::~ProgramManager() = default;
|
||||
|
||||
void ProgramManager::Create() {
|
||||
graphics_pipeline.Create();
|
||||
glBindProgramPipeline(graphics_pipeline.handle);
|
||||
/// Binds a compute program.
/// @param program Program handle; bound through glBindProgramARB when assembly programs are in
///                use, otherwise through glUseProgram.
void ProgramManager::BindCompute(GLuint program) {
    if (!use_assembly_programs) {
        // Record that a non-graphics program is bound with glUseProgram.
        is_graphics_bound = false;
        glUseProgram(program);
        return;
    }
    glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
}
|
||||
|
||||
void ProgramManager::BindGraphicsPipeline() {
|
||||
if (use_assembly_programs) {
|
||||
UpdateAssemblyPrograms();
|
||||
} else {
|
||||
UpdateSourcePrograms();
|
||||
}
|
||||
}
|
||||
|
||||
/// Binds an OpenGL pipeline object that is not tracked as part of the guest state.
/// @param pipeline Program pipeline handle passed to glBindProgramPipeline.
void ProgramManager::BindHostPipeline(GLuint pipeline) {
    if (!use_assembly_programs) {
        if (!is_graphics_bound) {
            // Unbind the program that was bound with glUseProgram.
            glUseProgram(0);
        }
    } else if (geometry_enabled) {
        // Turn the geometry program target off and clear its cached handle, so a later
        // UpdateAssemblyPrograms sees any non-zero geometry program as a change.
        geometry_enabled = false;
        old_state.geometry = 0;
        glDisable(GL_GEOMETRY_PROGRAM_NV);
    }
    glBindProgramPipeline(pipeline);
}
|
||||
|
||||
void ProgramManager::RestoreGuestPipeline() {
|
||||
if (use_assembly_programs) {
|
||||
glBindProgramPipeline(0);
|
||||
} else {
|
||||
glBindProgramPipeline(graphics_pipeline.handle);
|
||||
}
|
||||
}
|
||||
|
||||
void ProgramManager::UpdateAssemblyPrograms() {
|
||||
const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
|
||||
if (current == old) {
|
||||
return;
|
||||
}
|
||||
if (current == 0) {
|
||||
if (enabled) {
|
||||
enabled = false;
|
||||
glDisable(stage);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (!enabled) {
|
||||
enabled = true;
|
||||
glEnable(stage);
|
||||
}
|
||||
glBindProgramARB(stage, current);
|
||||
};
|
||||
|
||||
update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
|
||||
update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
|
||||
old_state.geometry);
|
||||
update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
|
||||
old_state.fragment);
|
||||
|
||||
old_state = current_state;
|
||||
}
|
||||
|
||||
void ProgramManager::UpdateSourcePrograms() {
|
||||
if (!is_graphics_bound) {
|
||||
is_graphics_bound = true;
|
||||
glUseProgram(0);
|
||||
}
|
||||
|
||||
// Avoid updating the pipeline when values have not changed
|
||||
if (old_state == current_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Workaround for AMD bug
|
||||
static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
|
||||
GL_FRAGMENT_SHADER_BIT};
|
||||
const GLuint handle = graphics_pipeline.handle;
|
||||
glUseProgramStages(handle, all_used_stages, 0);
|
||||
glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
|
||||
glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
|
||||
glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
|
||||
const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
|
||||
if (current == old) {
|
||||
return;
|
||||
}
|
||||
glUseProgramStages(handle, stage, current);
|
||||
};
|
||||
update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
|
||||
update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
|
||||
update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
|
||||
|
||||
old_state = current_state;
|
||||
}
|
||||
|
||||
/// Binds a compute shader with glUseProgram and records that the graphics programs are no longer
/// the ones bound.
/// @param program Program handle passed to glUseProgram.
void ProgramManager::BindComputeShader(GLuint program) {
    glUseProgram(program);
    is_graphics_bound = false;
}
|
||||
|
||||
void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
|
||||
const auto& regs = maxwell.regs;
|
||||
|
||||
@@ -54,4 +118,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
|
||||
y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
|
||||
}
|
||||
|
||||
} // namespace OpenGL::GLShader
|
||||
} // namespace OpenGL
|
||||
|
||||
@@ -11,7 +11,9 @@
|
||||
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
||||
#include "video_core/renderer_opengl/maxwell_to_gl.h"
|
||||
|
||||
namespace OpenGL::GLShader {
|
||||
namespace OpenGL {
|
||||
|
||||
class Device;
|
||||
|
||||
/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
|
||||
/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
|
||||
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
|
||||
|
||||
class ProgramManager {
|
||||
public:
|
||||
explicit ProgramManager();
|
||||
explicit ProgramManager(const Device& device);
|
||||
~ProgramManager();
|
||||
|
||||
void Create();
|
||||
/// Binds a compute program
|
||||
void BindCompute(GLuint program);
|
||||
|
||||
/// Updates the graphics pipeline and binds it.
|
||||
/// Updates bound programs.
|
||||
void BindGraphicsPipeline();
|
||||
|
||||
/// Binds a compute shader.
|
||||
void BindComputeShader(GLuint program);
|
||||
/// Binds an OpenGL pipeline object unsynchronized with the guest state.
|
||||
void BindHostPipeline(GLuint pipeline);
|
||||
|
||||
/// Rewinds BindHostPipeline state changes.
|
||||
void RestoreGuestPipeline();
|
||||
|
||||
void UseVertexShader(GLuint program) {
|
||||
current_state.vertex_shader = program;
|
||||
current_state.vertex = program;
|
||||
}
|
||||
|
||||
void UseGeometryShader(GLuint program) {
|
||||
current_state.geometry_shader = program;
|
||||
current_state.geometry = program;
|
||||
}
|
||||
|
||||
void UseFragmentShader(GLuint program) {
|
||||
current_state.fragment_shader = program;
|
||||
current_state.fragment = program;
|
||||
}
|
||||
|
||||
private:
|
||||
struct PipelineState {
|
||||
bool operator==(const PipelineState& rhs) const noexcept {
|
||||
return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
|
||||
geometry_shader == rhs.geometry_shader;
|
||||
}
|
||||
|
||||
bool operator!=(const PipelineState& rhs) const noexcept {
|
||||
return !operator==(rhs);
|
||||
}
|
||||
|
||||
GLuint vertex_shader = 0;
|
||||
GLuint fragment_shader = 0;
|
||||
GLuint geometry_shader = 0;
|
||||
GLuint vertex = 0;
|
||||
GLuint geometry = 0;
|
||||
GLuint fragment = 0;
|
||||
};
|
||||
|
||||
/// Update NV_gpu_program5 programs.
|
||||
void UpdateAssemblyPrograms();
|
||||
|
||||
/// Update GLSL programs.
|
||||
void UpdateSourcePrograms();
|
||||
|
||||
OGLPipeline graphics_pipeline;
|
||||
OGLPipeline compute_pipeline;
|
||||
|
||||
PipelineState current_state;
|
||||
PipelineState old_state;
|
||||
|
||||
bool use_assembly_programs = false;
|
||||
|
||||
bool is_graphics_bound = true;
|
||||
|
||||
bool vertex_enabled = false;
|
||||
bool geometry_enabled = false;
|
||||
bool fragment_enabled = false;
|
||||
};
|
||||
|
||||
} // namespace OpenGL::GLShader
|
||||
} // namespace OpenGL
|
||||
|
||||
@@ -49,14 +49,6 @@ OGLStreamBuffer::~OGLStreamBuffer() {
|
||||
gl_buffer.Release();
|
||||
}
|
||||
|
||||
GLuint OGLStreamBuffer::GetHandle() const {
|
||||
return gl_buffer.handle;
|
||||
}
|
||||
|
||||
GLsizeiptr OGLStreamBuffer::GetSize() const {
|
||||
return buffer_size;
|
||||
}
|
||||
|
||||
std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
|
||||
ASSERT(size <= buffer_size);
|
||||
ASSERT(alignment <= buffer_size);
|
||||
|
||||
@@ -17,9 +17,6 @@ public:
|
||||
bool use_persistent = true);
|
||||
~OGLStreamBuffer();
|
||||
|
||||
GLuint GetHandle() const;
|
||||
GLsizeiptr GetSize() const;
|
||||
|
||||
/*
|
||||
* Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
|
||||
* and the optional alignment requirement.
|
||||
@@ -32,6 +29,14 @@ public:
|
||||
|
||||
void Unmap(GLsizeiptr size);
|
||||
|
||||
/// Returns the handle held by gl_buffer.
GLuint Handle() const { return gl_buffer.handle; }
|
||||
|
||||
/// Returns the stored buffer_size value.
GLsizeiptr Size() const { return buffer_size; }
|
||||
|
||||
private:
|
||||
OGLBuffer gl_buffer;
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
|
||||
namespace {
|
||||
|
||||
struct FormatTuple {
|
||||
GLint internal_format;
|
||||
GLenum internal_format;
|
||||
GLenum format = GL_NONE;
|
||||
GLenum type = GL_NONE;
|
||||
};
|
||||
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
|
||||
return texture;
|
||||
}
|
||||
|
||||
/// Packs four swizzle sources into a single u32 key.
/// The x source is shifted left by 24 bits, y by 16, z by 8, and w is left unshifted; the four
/// values are then OR-ed together.
constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
                            SwizzleSource w_source) {
    const u32 x_bits = static_cast<u32>(x_source) << 24;
    const u32 y_bits = static_cast<u32>(y_source) << 16;
    const u32 z_bits = static_cast<u32>(z_source) << 8;
    const u32 w_bits = static_cast<u32>(w_source);
    return x_bits | y_bits | z_bits | w_bits;
}
|
||||
|
||||
} // Anonymous namespace
|
||||
|
||||
CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
|
||||
@@ -381,7 +387,7 @@ void CachedSurface::DecorateSurfaceName() {
|
||||
}
|
||||
|
||||
void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
|
||||
LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
|
||||
LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
|
||||
}
|
||||
|
||||
View CachedSurface::CreateView(const ViewParams& view_key) {
|
||||
@@ -397,14 +403,12 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
|
||||
}
|
||||
|
||||
CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
|
||||
const bool is_proxy)
|
||||
: VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
|
||||
target = GetTextureTarget(params.target);
|
||||
format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
|
||||
bool is_proxy)
|
||||
: VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
|
||||
target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
|
||||
if (!is_proxy) {
|
||||
texture_view = CreateTextureView();
|
||||
main_view = CreateTextureView();
|
||||
}
|
||||
swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
|
||||
}
|
||||
|
||||
CachedSurfaceView::~CachedSurfaceView() = default;
|
||||
@@ -447,27 +451,49 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
|
||||
}
|
||||
}
|
||||
|
||||
void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
|
||||
GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
|
||||
SwizzleSource z_source, SwizzleSource w_source) {
|
||||
u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
|
||||
if (new_swizzle == swizzle)
|
||||
return;
|
||||
swizzle = new_swizzle;
|
||||
const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
|
||||
GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
|
||||
const GLuint handle = GetTexture();
|
||||
const PixelFormat format = surface.GetSurfaceParams().pixel_format;
|
||||
switch (format) {
|
||||
if (GetSurfaceParams().IsBuffer()) {
|
||||
return GetTexture();
|
||||
}
|
||||
const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
|
||||
if (current_swizzle == new_swizzle) {
|
||||
return current_view;
|
||||
}
|
||||
current_swizzle = new_swizzle;
|
||||
|
||||
const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
|
||||
OGLTextureView& view = entry->second;
|
||||
if (!is_cache_miss) {
|
||||
current_view = view.handle;
|
||||
return view.handle;
|
||||
}
|
||||
view = CreateTextureView();
|
||||
current_view = view.handle;
|
||||
|
||||
std::array swizzle{x_source, y_source, z_source, w_source};
|
||||
|
||||
switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
|
||||
case PixelFormat::Z24S8:
|
||||
case PixelFormat::Z32FS8:
|
||||
case PixelFormat::S8Z24:
|
||||
glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
|
||||
UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
|
||||
glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
|
||||
GetComponent(format, x_source == SwizzleSource::R));
|
||||
break;
|
||||
default:
|
||||
glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
|
||||
|
||||
// Make sure we sample the first component
|
||||
std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
|
||||
return value == SwizzleSource::G ? SwizzleSource::R : value;
|
||||
});
|
||||
[[fallthrough]];
|
||||
default: {
|
||||
const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
|
||||
GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
|
||||
glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
|
||||
break;
|
||||
}
|
||||
}
|
||||
return view.handle;
|
||||
}
|
||||
|
||||
OGLTextureView CachedSurfaceView::CreateTextureView() const {
|
||||
|
||||
@@ -83,7 +83,7 @@ public:
|
||||
/// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
|
||||
void Attach(GLenum attachment, GLenum target) const;
|
||||
|
||||
void ApplySwizzle(Tegra::Texture::SwizzleSource x_source,
|
||||
GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
|
||||
Tegra::Texture::SwizzleSource y_source,
|
||||
Tegra::Texture::SwizzleSource z_source,
|
||||
Tegra::Texture::SwizzleSource w_source);
|
||||
@@ -98,7 +98,7 @@ public:
|
||||
if (is_proxy) {
|
||||
return surface.GetTexture();
|
||||
}
|
||||
return texture_view.handle;
|
||||
return main_view.handle;
|
||||
}
|
||||
|
||||
GLenum GetFormat() const {
|
||||
@@ -110,23 +110,19 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
|
||||
Tegra::Texture::SwizzleSource y_source,
|
||||
Tegra::Texture::SwizzleSource z_source,
|
||||
Tegra::Texture::SwizzleSource w_source) const {
|
||||
return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
|
||||
(static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
|
||||
}
|
||||
|
||||
OGLTextureView CreateTextureView() const;
|
||||
|
||||
CachedSurface& surface;
|
||||
GLenum target{};
|
||||
GLenum format{};
|
||||
const GLenum format;
|
||||
const GLenum target;
|
||||
const bool is_proxy;
|
||||
|
||||
OGLTextureView texture_view;
|
||||
u32 swizzle{};
|
||||
bool is_proxy{};
|
||||
std::unordered_map<u32, OGLTextureView> view_cache;
|
||||
OGLTextureView main_view;
|
||||
|
||||
// Use an invalid default so it always fails the comparison test
|
||||
u32 current_swizzle = 0xffffffff;
|
||||
GLuint current_view = 0;
|
||||
};
|
||||
|
||||
class TextureCacheOpenGL final : public TextureCacheBase {
|
||||
|
||||
@@ -316,7 +316,7 @@ public:
|
||||
RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
|
||||
Core::Frontend::GraphicsContext& context)
|
||||
: RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
|
||||
has_debug_tool{HasDebugTool()} {}
|
||||
program_manager{device}, has_debug_tool{HasDebugTool()} {}
|
||||
|
||||
RendererOpenGL::~RendererOpenGL() = default;
|
||||
|
||||
@@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
|
||||
vertex_program.Create(true, false, vertex_shader.handle);
|
||||
fragment_program.Create(true, false, fragment_shader.handle);
|
||||
|
||||
// Create program pipeline
|
||||
program_manager.Create();
|
||||
pipeline.Create();
|
||||
glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
|
||||
glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
|
||||
|
||||
// Generate VBO handle for drawing
|
||||
vertex_buffer.Create();
|
||||
@@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
|
||||
if (rasterizer) {
|
||||
return;
|
||||
}
|
||||
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info,
|
||||
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
|
||||
program_manager, state_tracker);
|
||||
}
|
||||
|
||||
@@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
|
||||
state_tracker.NotifyClipControl();
|
||||
state_tracker.NotifyAlphaTest();
|
||||
|
||||
program_manager.UseVertexShader(vertex_program.handle);
|
||||
program_manager.UseGeometryShader(0);
|
||||
program_manager.UseFragmentShader(fragment_program.handle);
|
||||
program_manager.BindGraphicsPipeline();
|
||||
program_manager.BindHostPipeline(pipeline.handle);
|
||||
|
||||
glEnable(GL_CULL_FACE);
|
||||
if (screen_info.display_srgb) {
|
||||
@@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
|
||||
|
||||
glClear(GL_COLOR_BUFFER_BIT);
|
||||
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
|
||||
|
||||
program_manager.RestoreGuestPipeline();
|
||||
}
|
||||
|
||||
bool RendererOpenGL::TryPresent(int timeout_ms) {
|
||||
@@ -751,8 +751,9 @@ void RendererOpenGL::RenderScreenshot() {
|
||||
}
|
||||
|
||||
bool RendererOpenGL::Init() {
|
||||
if (GLAD_GL_KHR_debug) {
|
||||
if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
|
||||
glEnable(GL_DEBUG_OUTPUT);
|
||||
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
|
||||
glDebugMessageCallback(DebugHandler, nullptr);
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "common/common_types.h"
|
||||
#include "common/math_util.h"
|
||||
#include "video_core/renderer_base.h"
|
||||
#include "video_core/renderer_opengl/gl_device.h"
|
||||
#include "video_core/renderer_opengl/gl_resource_manager.h"
|
||||
#include "video_core/renderer_opengl/gl_shader_manager.h"
|
||||
#include "video_core/renderer_opengl/gl_state_tracker.h"
|
||||
@@ -95,6 +96,7 @@ private:
|
||||
Core::Frontend::EmuWindow& emu_window;
|
||||
Core::System& system;
|
||||
Core::Frontend::GraphicsContext& context;
|
||||
const Device device;
|
||||
|
||||
StateTracker state_tracker{system};
|
||||
|
||||
@@ -102,13 +104,14 @@ private:
|
||||
OGLBuffer vertex_buffer;
|
||||
OGLProgram vertex_program;
|
||||
OGLProgram fragment_program;
|
||||
OGLPipeline pipeline;
|
||||
OGLFramebuffer screenshot_framebuffer;
|
||||
|
||||
/// Display information for Switch screen
|
||||
ScreenInfo screen_info;
|
||||
|
||||
/// Global dummy shader pipeline
|
||||
GLShader::ProgramManager program_manager;
|
||||
ProgramManager program_manager;
|
||||
|
||||
/// OpenGL framebuffer data
|
||||
std::vector<u8> gl_framebuffer_data;
|
||||
|
||||
@@ -71,8 +71,7 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept {
|
||||
const u32 topology_index = static_cast<u32>(regs.draw.topology.Value());
|
||||
|
||||
u32 packed_front_face = PackFrontFace(regs.front_face);
|
||||
if (regs.screen_y_control.triangle_rast_flip != 0 &&
|
||||
regs.viewport_transform[0].scale_y > 0.0f) {
|
||||
if (regs.screen_y_control.triangle_rast_flip != 0) {
|
||||
// Flip front face
|
||||
packed_front_face = 1 - packed_front_face;
|
||||
}
|
||||
|
||||
@@ -142,14 +142,14 @@ struct FormatTuple {
|
||||
{VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UF16
|
||||
{VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SF16
|
||||
{VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4
|
||||
{VK_FORMAT_B8G8R8A8_UNORM}, // BGRA8
|
||||
{VK_FORMAT_B8G8R8A8_UNORM, Attachable}, // BGRA8
|
||||
{VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // RGBA32F
|
||||
{VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // RG32F
|
||||
{VK_FORMAT_R32_SFLOAT, Attachable | Storage}, // R32F
|
||||
{VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F
|
||||
{VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U
|
||||
{VK_FORMAT_UNDEFINED}, // R16S
|
||||
{VK_FORMAT_UNDEFINED}, // R16UI
|
||||
{VK_FORMAT_R16_UINT, Attachable | Storage}, // R16UI
|
||||
{VK_FORMAT_UNDEFINED}, // R16I
|
||||
{VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16
|
||||
{VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F
|
||||
@@ -168,7 +168,7 @@ struct FormatTuple {
|
||||
{VK_FORMAT_ASTC_8x8_UNORM_BLOCK}, // ASTC_2D_8X8
|
||||
{VK_FORMAT_UNDEFINED}, // ASTC_2D_8X5
|
||||
{VK_FORMAT_UNDEFINED}, // ASTC_2D_5X4
|
||||
{VK_FORMAT_UNDEFINED}, // BGRA8_SRGB
|
||||
{VK_FORMAT_B8G8R8A8_SRGB, Attachable}, // BGRA8_SRGB
|
||||
{VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, // DXT1_SRGB
|
||||
{VK_FORMAT_BC2_SRGB_BLOCK}, // DXT23_SRGB
|
||||
{VK_FORMAT_BC3_SRGB_BLOCK}, // DXT45_SRGB
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <memory>
|
||||
|
||||
#include "core/core.h"
|
||||
#include "video_core/buffer_cache/buffer_cache.h"
|
||||
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
|
||||
#include "video_core/renderer_vulkan/vk_device.h"
|
||||
#include "video_core/renderer_vulkan/vk_scheduler.h"
|
||||
@@ -36,8 +37,8 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
|
||||
|
||||
} // Anonymous namespace
|
||||
|
||||
CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
|
||||
VAddr cpu_addr, std::size_t size)
|
||||
Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
|
||||
std::size_t size)
|
||||
: VideoCommon::BufferBlock{cpu_addr, size} {
|
||||
VkBufferCreateInfo ci;
|
||||
ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
|
||||
@@ -53,7 +54,7 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
|
||||
buffer.commit = memory_manager.Commit(buffer.handle, false);
|
||||
}
|
||||
|
||||
CachedBufferBlock::~CachedBufferBlock() = default;
|
||||
Buffer::~Buffer() = default;
|
||||
|
||||
VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
|
||||
const VKDevice& device, VKMemoryManager& memory_manager,
|
||||
@@ -66,12 +67,8 @@ VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::S
|
||||
|
||||
VKBufferCache::~VKBufferCache() = default;
|
||||
|
||||
Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
|
||||
return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
|
||||
}
|
||||
|
||||
VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
|
||||
return buffer->GetHandle();
|
||||
std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
|
||||
return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
|
||||
}
|
||||
|
||||
VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
|
||||
@@ -90,7 +87,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
|
||||
std::memcpy(staging.commit->Map(size), data, size);
|
||||
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
|
||||
scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
|
||||
size](vk::CommandBuffer cmdbuf) {
|
||||
cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
|
||||
|
||||
@@ -113,7 +110,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
|
||||
u8* data) {
|
||||
const auto& staging = staging_pool.GetUnusedBuffer(size, true);
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
|
||||
scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
|
||||
size](vk::CommandBuffer cmdbuf) {
|
||||
VkBufferMemoryBarrier barrier;
|
||||
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
|
||||
@@ -140,8 +137,8 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
|
||||
void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
|
||||
std::size_t dst_offset, std::size_t size) {
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
|
||||
dst_offset, size](vk::CommandBuffer cmdbuf) {
|
||||
scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset,
|
||||
size](vk::CommandBuffer cmdbuf) {
|
||||
cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
|
||||
|
||||
std::array<VkBufferMemoryBarrier, 2> barriers;
|
||||
|
||||
@@ -24,13 +24,13 @@ class VKDevice;
|
||||
class VKMemoryManager;
|
||||
class VKScheduler;
|
||||
|
||||
class CachedBufferBlock final : public VideoCommon::BufferBlock {
|
||||
class Buffer final : public VideoCommon::BufferBlock {
|
||||
public:
|
||||
explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
|
||||
VAddr cpu_addr, std::size_t size);
|
||||
~CachedBufferBlock();
|
||||
explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
|
||||
std::size_t size);
|
||||
~Buffer();
|
||||
|
||||
VkBuffer GetHandle() const {
|
||||
VkBuffer Handle() const {
|
||||
return *buffer.handle;
|
||||
}
|
||||
|
||||
@@ -38,8 +38,6 @@ private:
|
||||
VKBuffer buffer;
|
||||
};
|
||||
|
||||
using Buffer = std::shared_ptr<CachedBufferBlock>;
|
||||
|
||||
class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
|
||||
public:
|
||||
explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
|
||||
@@ -50,9 +48,7 @@ public:
|
||||
VkBuffer GetEmptyBuffer(std::size_t size) override;
|
||||
|
||||
protected:
|
||||
VkBuffer ToHandle(const Buffer& buffer) override;
|
||||
|
||||
Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
|
||||
std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
|
||||
|
||||
void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
|
||||
const u8* data) override;
|
||||
|
||||
@@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {
|
||||
};
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size());
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size());
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size());
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());
|
||||
add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());
|
||||
|
||||
VkDescriptorSetLayoutCreateInfo ci;
|
||||
|
||||
@@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},
|
||||
{VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
|
||||
{VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64},
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
|
||||
{VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};
|
||||
|
||||
VkDescriptorPoolCreateInfo ci;
|
||||
|
||||
@@ -73,75 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType
|
||||
|
||||
std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties(
|
||||
vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) {
|
||||
static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32,
|
||||
VK_FORMAT_A8B8G8R8_UINT_PACK32,
|
||||
VK_FORMAT_A8B8G8R8_SNORM_PACK32,
|
||||
VK_FORMAT_A8B8G8R8_SRGB_PACK32,
|
||||
VK_FORMAT_B5G6R5_UNORM_PACK16,
|
||||
VK_FORMAT_A2B10G10R10_UNORM_PACK32,
|
||||
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
|
||||
VK_FORMAT_R32G32B32A32_SFLOAT,
|
||||
VK_FORMAT_R32G32B32A32_UINT,
|
||||
VK_FORMAT_R32G32_SFLOAT,
|
||||
VK_FORMAT_R32G32_UINT,
|
||||
VK_FORMAT_R16G16B16A16_UINT,
|
||||
VK_FORMAT_R16G16B16A16_SNORM,
|
||||
VK_FORMAT_R16G16B16A16_UNORM,
|
||||
VK_FORMAT_R16G16_UNORM,
|
||||
VK_FORMAT_R16G16_SNORM,
|
||||
VK_FORMAT_R16G16_SFLOAT,
|
||||
VK_FORMAT_R16_UNORM,
|
||||
VK_FORMAT_R8G8B8A8_SRGB,
|
||||
VK_FORMAT_R8G8_UNORM,
|
||||
VK_FORMAT_R8G8_SNORM,
|
||||
VK_FORMAT_R8G8_UINT,
|
||||
VK_FORMAT_R8_UNORM,
|
||||
VK_FORMAT_R8_UINT,
|
||||
VK_FORMAT_B10G11R11_UFLOAT_PACK32,
|
||||
VK_FORMAT_R32_SFLOAT,
|
||||
VK_FORMAT_R32_UINT,
|
||||
VK_FORMAT_R32_SINT,
|
||||
VK_FORMAT_R16_SFLOAT,
|
||||
VK_FORMAT_R16G16B16A16_SFLOAT,
|
||||
VK_FORMAT_B8G8R8A8_UNORM,
|
||||
VK_FORMAT_R4G4B4A4_UNORM_PACK16,
|
||||
VK_FORMAT_D32_SFLOAT,
|
||||
VK_FORMAT_D16_UNORM,
|
||||
VK_FORMAT_D16_UNORM_S8_UINT,
|
||||
VK_FORMAT_D24_UNORM_S8_UINT,
|
||||
VK_FORMAT_D32_SFLOAT_S8_UINT,
|
||||
VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
|
||||
VK_FORMAT_BC2_UNORM_BLOCK,
|
||||
VK_FORMAT_BC3_UNORM_BLOCK,
|
||||
VK_FORMAT_BC4_UNORM_BLOCK,
|
||||
VK_FORMAT_BC5_UNORM_BLOCK,
|
||||
VK_FORMAT_BC5_SNORM_BLOCK,
|
||||
VK_FORMAT_BC7_UNORM_BLOCK,
|
||||
VK_FORMAT_BC6H_UFLOAT_BLOCK,
|
||||
VK_FORMAT_BC6H_SFLOAT_BLOCK,
|
||||
VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
|
||||
VK_FORMAT_BC2_SRGB_BLOCK,
|
||||
VK_FORMAT_BC3_SRGB_BLOCK,
|
||||
VK_FORMAT_BC7_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
|
||||
VK_FORMAT_E5B9G9R9_UFLOAT_PACK32};
|
||||
static constexpr std::array formats{
|
||||
VK_FORMAT_A8B8G8R8_UNORM_PACK32,
|
||||
VK_FORMAT_A8B8G8R8_UINT_PACK32,
|
||||
VK_FORMAT_A8B8G8R8_SNORM_PACK32,
|
||||
VK_FORMAT_A8B8G8R8_SRGB_PACK32,
|
||||
VK_FORMAT_B5G6R5_UNORM_PACK16,
|
||||
VK_FORMAT_A2B10G10R10_UNORM_PACK32,
|
||||
VK_FORMAT_A1R5G5B5_UNORM_PACK16,
|
||||
VK_FORMAT_R32G32B32A32_SFLOAT,
|
||||
VK_FORMAT_R32G32B32A32_UINT,
|
||||
VK_FORMAT_R32G32_SFLOAT,
|
||||
VK_FORMAT_R32G32_UINT,
|
||||
VK_FORMAT_R16G16B16A16_UINT,
|
||||
VK_FORMAT_R16G16B16A16_SNORM,
|
||||
VK_FORMAT_R16G16B16A16_UNORM,
|
||||
VK_FORMAT_R16G16_UNORM,
|
||||
VK_FORMAT_R16G16_SNORM,
|
||||
VK_FORMAT_R16G16_SFLOAT,
|
||||
VK_FORMAT_R16_UNORM,
|
||||
VK_FORMAT_R16_UINT,
|
||||
VK_FORMAT_R8G8B8A8_SRGB,
|
||||
VK_FORMAT_R8G8_UNORM,
|
||||
VK_FORMAT_R8G8_SNORM,
|
||||
VK_FORMAT_R8G8_UINT,
|
||||
VK_FORMAT_R8_UNORM,
|
||||
VK_FORMAT_R8_UINT,
|
||||
VK_FORMAT_B10G11R11_UFLOAT_PACK32,
|
||||
VK_FORMAT_R32_SFLOAT,
|
||||
VK_FORMAT_R32_UINT,
|
||||
VK_FORMAT_R32_SINT,
|
||||
VK_FORMAT_R16_SFLOAT,
|
||||
VK_FORMAT_R16G16B16A16_SFLOAT,
|
||||
VK_FORMAT_B8G8R8A8_UNORM,
|
||||
VK_FORMAT_B8G8R8A8_SRGB,
|
||||
VK_FORMAT_R4G4B4A4_UNORM_PACK16,
|
||||
VK_FORMAT_D32_SFLOAT,
|
||||
VK_FORMAT_D16_UNORM,
|
||||
VK_FORMAT_D16_UNORM_S8_UINT,
|
||||
VK_FORMAT_D24_UNORM_S8_UINT,
|
||||
VK_FORMAT_D32_SFLOAT_S8_UINT,
|
||||
VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
|
||||
VK_FORMAT_BC2_UNORM_BLOCK,
|
||||
VK_FORMAT_BC3_UNORM_BLOCK,
|
||||
VK_FORMAT_BC4_UNORM_BLOCK,
|
||||
VK_FORMAT_BC5_UNORM_BLOCK,
|
||||
VK_FORMAT_BC5_SNORM_BLOCK,
|
||||
VK_FORMAT_BC7_UNORM_BLOCK,
|
||||
VK_FORMAT_BC6H_UFLOAT_BLOCK,
|
||||
VK_FORMAT_BC6H_SFLOAT_BLOCK,
|
||||
VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
|
||||
VK_FORMAT_BC2_SRGB_BLOCK,
|
||||
VK_FORMAT_BC3_SRGB_BLOCK,
|
||||
VK_FORMAT_BC7_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
|
||||
VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
|
||||
VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
|
||||
VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
|
||||
};
|
||||
std::unordered_map<VkFormat, VkFormatProperties> format_properties;
|
||||
for (const auto format : formats) {
|
||||
format_properties.emplace(format, physical.GetFormatProperties(format));
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <memory>
|
||||
|
||||
#include "video_core/fence_manager.h"
|
||||
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
|
||||
#include "video_core/renderer_vulkan/wrapper.h"
|
||||
|
||||
namespace Core {
|
||||
|
||||
@@ -45,6 +45,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
|
||||
constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
|
||||
constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
|
||||
constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
|
||||
constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
|
||||
constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
|
||||
|
||||
constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
|
||||
@@ -104,8 +105,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
|
||||
u32 binding = base_binding;
|
||||
AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);
|
||||
AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers);
|
||||
AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers);
|
||||
AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);
|
||||
AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers);
|
||||
AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);
|
||||
AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);
|
||||
return binding;
|
||||
}
|
||||
@@ -312,7 +314,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
|
||||
ASSERT(point_size != 0.0f);
|
||||
}
|
||||
for (std::size_t i = 0; i < Maxwell::NumVertexAttributes; ++i) {
|
||||
specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].Type();
|
||||
const auto& attribute = fixed_state.vertex_input.attributes[i];
|
||||
specialization.enabled_attributes[i] = attribute.enabled.Value() != 0;
|
||||
specialization.attribute_types[i] = attribute.Type();
|
||||
}
|
||||
specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
|
||||
|
||||
@@ -329,8 +333,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
|
||||
|
||||
const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
|
||||
const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
|
||||
ASSERT(cpu_addr);
|
||||
const auto shader = TryGet(*cpu_addr);
|
||||
const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
|
||||
ASSERT(shader);
|
||||
|
||||
const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
|
||||
@@ -376,16 +379,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) {
|
||||
// Nvidia has a bug where updating multiple uniform texels at once causes the driver to
|
||||
// crash.
|
||||
if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER ||
|
||||
descriptor_type == STORAGE_TEXEL_BUFFER) {
|
||||
// Nvidia has a bug where updating multiple texels at once causes the driver to crash.
|
||||
// Note: Fixed in driver Windows 443.24, Linux 440.66.15
|
||||
for (u32 i = 0; i < count; ++i) {
|
||||
VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();
|
||||
entry.dstBinding = binding + i;
|
||||
entry.dstArrayElement = 0;
|
||||
entry.descriptorCount = 1;
|
||||
entry.descriptorType = descriptor_type;
|
||||
entry.offset = offset + i * entry_size;
|
||||
entry.offset = static_cast<std::size_t>(offset + i * entry_size);
|
||||
entry.stride = entry_size;
|
||||
}
|
||||
} else if (count > 0) {
|
||||
@@ -406,8 +410,9 @@ void FillDescriptorUpdateTemplateEntries(
|
||||
std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {
|
||||
AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);
|
||||
AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers);
|
||||
AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers);
|
||||
AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);
|
||||
AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers);
|
||||
AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);
|
||||
AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);
|
||||
}
|
||||
|
||||
|
||||
@@ -468,8 +468,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
|
||||
const auto& entries = pipeline.GetEntries();
|
||||
SetupComputeConstBuffers(entries);
|
||||
SetupComputeGlobalBuffers(entries);
|
||||
SetupComputeTexelBuffers(entries);
|
||||
SetupComputeUniformTexels(entries);
|
||||
SetupComputeTextures(entries);
|
||||
SetupComputeStorageTexels(entries);
|
||||
SetupComputeImages(entries);
|
||||
|
||||
buffer_cache.Unmap();
|
||||
@@ -532,14 +533,14 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
|
||||
return;
|
||||
}
|
||||
texture_cache.OnCPUWrite(addr, size);
|
||||
pipeline_cache.InvalidateRegion(addr, size);
|
||||
pipeline_cache.OnCPUWrite(addr, size);
|
||||
buffer_cache.OnCPUWrite(addr, size);
|
||||
query_cache.InvalidateRegion(addr, size);
|
||||
}
|
||||
|
||||
/// Forwards the guest/host synchronization request to the texture, buffer and pipeline caches,
/// in that order.
void RasterizerVulkan::SyncGuestHost() {
    texture_cache.SyncGuestHost();
    buffer_cache.SyncGuestHost();
    pipeline_cache.SyncGuestHost();
}
|
||||
|
||||
void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
|
||||
@@ -787,8 +788,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
|
||||
const auto& entries = shader->GetEntries();
|
||||
SetupGraphicsConstBuffers(entries, stage);
|
||||
SetupGraphicsGlobalBuffers(entries, stage);
|
||||
SetupGraphicsTexelBuffers(entries, stage);
|
||||
SetupGraphicsUniformTexels(entries, stage);
|
||||
SetupGraphicsTextures(entries, stage);
|
||||
SetupGraphicsStorageTexels(entries, stage);
|
||||
SetupGraphicsImages(entries, stage);
|
||||
}
|
||||
texture_cache.GuardSamplers(false);
|
||||
@@ -838,6 +840,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
|
||||
if (regs.tfb_enabled == 0) {
|
||||
return;
|
||||
}
|
||||
if (!device.IsExtTransformFeedbackSupported()) {
|
||||
LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
|
||||
return;
|
||||
}
|
||||
|
||||
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
|
||||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
|
||||
@@ -866,6 +872,9 @@ void RasterizerVulkan::EndTransformFeedback() {
|
||||
if (regs.tfb_enabled == 0) {
|
||||
return;
|
||||
}
|
||||
if (!device.IsExtTransformFeedbackSupported()) {
|
||||
return;
|
||||
}
|
||||
|
||||
scheduler.Record(
|
||||
[](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
|
||||
@@ -877,14 +886,10 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
|
||||
|
||||
for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) {
|
||||
const auto& attrib = regs.vertex_attrib_format[index];
|
||||
if (!attrib.IsValid()) {
|
||||
if (attrib.IsConstant()) {
|
||||
vertex_input.SetAttribute(index, false, 0, 0, {}, {});
|
||||
continue;
|
||||
}
|
||||
|
||||
[[maybe_unused]] const auto& buffer = regs.vertex_array[attrib.buffer];
|
||||
ASSERT(buffer.IsEnabled());
|
||||
|
||||
vertex_input.SetAttribute(index, true, attrib.buffer, attrib.offset, attrib.type.Value(),
|
||||
attrib.size.Value());
|
||||
}
|
||||
@@ -980,12 +985,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) {
|
||||
void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
|
||||
MICROPROFILE_SCOPE(Vulkan_Textures);
|
||||
const auto& gpu = system.GPU().Maxwell3D();
|
||||
for (const auto& entry : entries.texel_buffers) {
|
||||
for (const auto& entry : entries.uniform_texels) {
|
||||
const auto image = GetTextureInfo(gpu, entry, stage).tic;
|
||||
SetupTexelBuffer(image, entry);
|
||||
SetupUniformTexels(image, entry);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1000,6 +1005,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
|
||||
MICROPROFILE_SCOPE(Vulkan_Textures);
|
||||
const auto& gpu = system.GPU().Maxwell3D();
|
||||
for (const auto& entry : entries.storage_texels) {
|
||||
const auto image = GetTextureInfo(gpu, entry, stage).tic;
|
||||
SetupStorageTexel(image, entry);
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
|
||||
MICROPROFILE_SCOPE(Vulkan_Images);
|
||||
const auto& gpu = system.GPU().Maxwell3D();
|
||||
@@ -1032,12 +1046,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) {
|
||||
void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
|
||||
MICROPROFILE_SCOPE(Vulkan_Textures);
|
||||
const auto& gpu = system.GPU().KeplerCompute();
|
||||
for (const auto& entry : entries.texel_buffers) {
|
||||
for (const auto& entry : entries.uniform_texels) {
|
||||
const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
|
||||
SetupTexelBuffer(image, entry);
|
||||
SetupUniformTexels(image, entry);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1052,6 +1066,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
|
||||
MICROPROFILE_SCOPE(Vulkan_Textures);
|
||||
const auto& gpu = system.GPU().KeplerCompute();
|
||||
for (const auto& entry : entries.storage_texels) {
|
||||
const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
|
||||
SetupStorageTexel(image, entry);
|
||||
}
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
|
||||
MICROPROFILE_SCOPE(Vulkan_Images);
|
||||
const auto& gpu = system.GPU().KeplerCompute();
|
||||
@@ -1101,8 +1124,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
|
||||
update_descriptor_queue.AddBuffer(buffer, offset, size);
|
||||
}
|
||||
|
||||
void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic,
|
||||
const TexelBufferEntry& entry) {
|
||||
void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
|
||||
const UniformTexelEntry& entry) {
|
||||
const auto view = texture_cache.GetTextureSurface(tic, entry);
|
||||
ASSERT(view->IsBufferView());
|
||||
|
||||
@@ -1124,6 +1147,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
|
||||
sampled_views.push_back(ImageView{std::move(view), image_layout});
|
||||
}
|
||||
|
||||
/// Queues the buffer view backing a storage texel in the descriptor update queue.
/// @param tic   Texture image control entry describing the texel buffer.
/// @param entry Shader entry the surface is requested for.
void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
                                         const StorageTexelEntry& entry) {
    const auto surface_view = texture_cache.GetImageSurface(tic, entry);
    // Storage texels are expected to resolve to buffer views, never to regular image views.
    ASSERT(surface_view->IsBufferView());

    update_descriptor_queue.AddTexelBuffer(surface_view->GetBufferView());
}
|
||||
|
||||
void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
|
||||
auto view = texture_cache.GetImageSurface(tic, entry);
|
||||
|
||||
|
||||
@@ -193,12 +193,15 @@ private:
|
||||
/// Setup global buffers in the graphics pipeline.
|
||||
void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
|
||||
|
||||
/// Setup texel buffers in the graphics pipeline.
|
||||
void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
|
||||
/// Setup uniform texels in the graphics pipeline.
|
||||
void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
|
||||
|
||||
/// Setup textures in the graphics pipeline.
|
||||
void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);
|
||||
|
||||
/// Setup storage texels in the graphics pipeline.
|
||||
void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage);
|
||||
|
||||
/// Setup images in the graphics pipeline.
|
||||
void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
|
||||
|
||||
@@ -209,11 +212,14 @@ private:
|
||||
void SetupComputeGlobalBuffers(const ShaderEntries& entries);
|
||||
|
||||
/// Setup texel buffers in the compute pipeline.
|
||||
void SetupComputeTexelBuffers(const ShaderEntries& entries);
|
||||
void SetupComputeUniformTexels(const ShaderEntries& entries);
|
||||
|
||||
/// Setup textures in the compute pipeline.
|
||||
void SetupComputeTextures(const ShaderEntries& entries);
|
||||
|
||||
/// Setup storage texels in the compute pipeline.
|
||||
void SetupComputeStorageTexels(const ShaderEntries& entries);
|
||||
|
||||
/// Setup images in the compute pipeline.
|
||||
void SetupComputeImages(const ShaderEntries& entries);
|
||||
|
||||
@@ -222,10 +228,12 @@ private:
|
||||
|
||||
void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
|
||||
|
||||
void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry);
|
||||
void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);
|
||||
|
||||
void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry);
|
||||
|
||||
void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry);
|
||||
|
||||
void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
|
||||
|
||||
void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
|
||||
|
||||
@@ -400,8 +400,9 @@ private:
|
||||
u32 binding = specialization.base_binding;
|
||||
binding = DeclareConstantBuffers(binding);
|
||||
binding = DeclareGlobalBuffers(binding);
|
||||
binding = DeclareTexelBuffers(binding);
|
||||
binding = DeclareUniformTexels(binding);
|
||||
binding = DeclareSamplers(binding);
|
||||
binding = DeclareStorageTexels(binding);
|
||||
binding = DeclareImages(binding);
|
||||
|
||||
const Id main = OpFunction(t_void, {}, TypeFunction(t_void));
|
||||
@@ -515,6 +516,16 @@ private:
|
||||
void DeclareCommon() {
|
||||
thread_id =
|
||||
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
|
||||
thread_masks[0] =
|
||||
DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
|
||||
thread_masks[1] =
|
||||
DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
|
||||
thread_masks[2] =
|
||||
DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
|
||||
thread_masks[3] =
|
||||
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
|
||||
thread_masks[4] =
|
||||
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
|
||||
}
|
||||
|
||||
void DeclareVertex() {
|
||||
@@ -731,8 +742,10 @@ private:
|
||||
if (!IsGenericAttribute(index)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const u32 location = GetGenericAttributeLocation(index);
|
||||
if (!IsAttributeEnabled(location)) {
|
||||
continue;
|
||||
}
|
||||
const auto type_descriptor = GetAttributeType(location);
|
||||
Id type;
|
||||
if (IsInputAttributeArray()) {
|
||||
@@ -877,7 +890,7 @@ private:
|
||||
return binding;
|
||||
}
|
||||
|
||||
u32 DeclareTexelBuffers(u32 binding) {
|
||||
u32 DeclareUniformTexels(u32 binding) {
|
||||
for (const auto& sampler : ir.GetSamplers()) {
|
||||
if (!sampler.is_buffer) {
|
||||
continue;
|
||||
@@ -898,7 +911,7 @@ private:
|
||||
Decorate(id, spv::Decoration::Binding, binding++);
|
||||
Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
|
||||
|
||||
texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id});
|
||||
uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});
|
||||
}
|
||||
return binding;
|
||||
}
|
||||
@@ -933,31 +946,48 @@ private:
|
||||
return binding;
|
||||
}
|
||||
|
||||
u32 DeclareImages(u32 binding) {
|
||||
u32 DeclareStorageTexels(u32 binding) {
|
||||
for (const auto& image : ir.GetImages()) {
|
||||
const auto [dim, arrayed] = GetImageDim(image);
|
||||
constexpr int depth = 0;
|
||||
constexpr bool ms = false;
|
||||
constexpr int sampled = 2; // This won't be accessed with a sampler
|
||||
constexpr auto format = spv::ImageFormat::Unknown;
|
||||
const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
|
||||
const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
|
||||
const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
|
||||
AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
|
||||
|
||||
Decorate(id, spv::Decoration::Binding, binding++);
|
||||
Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
|
||||
if (image.is_read && !image.is_written) {
|
||||
Decorate(id, spv::Decoration::NonWritable);
|
||||
} else if (image.is_written && !image.is_read) {
|
||||
Decorate(id, spv::Decoration::NonReadable);
|
||||
if (image.type != Tegra::Shader::ImageType::TextureBuffer) {
|
||||
continue;
|
||||
}
|
||||
|
||||
images.emplace(image.index, StorageImage{image_type, id});
|
||||
DeclareImage(image, binding);
|
||||
}
|
||||
return binding;
|
||||
}
|
||||
|
||||
/// Declares every IR image that is not a texture buffer.
/// @param binding First binding index available for images.
/// @return The next free binding index after all images have been declared.
u32 DeclareImages(u32 binding) {
    for (const auto& descriptor : ir.GetImages()) {
        // Texture buffer images are excluded from this pass.
        const bool is_texel_buffer = descriptor.type == Tegra::Shader::ImageType::TextureBuffer;
        if (!is_texel_buffer) {
            DeclareImage(descriptor, binding);
        }
    }
    return binding;
}
|
||||
|
||||
void DeclareImage(const Image& image, u32& binding) {
|
||||
const auto [dim, arrayed] = GetImageDim(image);
|
||||
constexpr int depth = 0;
|
||||
constexpr bool ms = false;
|
||||
constexpr int sampled = 2; // This won't be accessed with a sampler
|
||||
const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown;
|
||||
const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
|
||||
const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
|
||||
const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
|
||||
AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
|
||||
|
||||
Decorate(id, spv::Decoration::Binding, binding++);
|
||||
Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
|
||||
if (image.is_read && !image.is_written) {
|
||||
Decorate(id, spv::Decoration::NonWritable);
|
||||
} else if (image.is_written && !image.is_read) {
|
||||
Decorate(id, spv::Decoration::NonReadable);
|
||||
}
|
||||
|
||||
images.emplace(image.index, StorageImage{image_type, id});
|
||||
}
|
||||
|
||||
bool IsRenderTargetEnabled(u32 rt) const {
|
||||
for (u32 component = 0; component < 4; ++component) {
|
||||
if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
|
||||
@@ -976,6 +1006,10 @@ private:
|
||||
return stage == ShaderType::TesselationControl;
|
||||
}
|
||||
|
||||
/// Tells whether the generic input attribute at `location` is enabled.
/// Only the vertex stage consults the specialization's enabled attribute set; every other stage
/// reports the attribute as enabled.
bool IsAttributeEnabled(u32 location) const {
    if (stage != ShaderType::Vertex) {
        return true;
    }
    return specialization.enabled_attributes[location];
}
|
||||
|
||||
u32 GetNumInputVertices() const {
|
||||
switch (stage) {
|
||||
case ShaderType::Geometry:
|
||||
@@ -1071,8 +1105,7 @@ private:
|
||||
|
||||
void VisitBasicBlock(const NodeBlock& bb) {
|
||||
for (const auto& node : bb) {
|
||||
[[maybe_unused]] const Type type = Visit(node).type;
|
||||
ASSERT(type == Type::Void);
|
||||
Visit(node);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1192,16 +1225,20 @@ private:
|
||||
UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
|
||||
return {v_float_zero, Type::Float};
|
||||
default:
|
||||
if (IsGenericAttribute(attribute)) {
|
||||
const u32 location = GetGenericAttributeLocation(attribute);
|
||||
const auto type_descriptor = GetAttributeType(location);
|
||||
const Type type = type_descriptor.type;
|
||||
const Id attribute_id = input_attributes.at(attribute);
|
||||
const std::vector elements = {element};
|
||||
const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
|
||||
return {OpLoad(GetTypeDefinition(type), pointer), type};
|
||||
if (!IsGenericAttribute(attribute)) {
|
||||
break;
|
||||
}
|
||||
break;
|
||||
const u32 location = GetGenericAttributeLocation(attribute);
|
||||
if (!IsAttributeEnabled(location)) {
|
||||
// Disabled attributes (also known as constant attributes) always return zero.
|
||||
return {v_float_zero, Type::Float};
|
||||
}
|
||||
const auto type_descriptor = GetAttributeType(location);
|
||||
const Type type = type_descriptor.type;
|
||||
const Id attribute_id = input_attributes.at(attribute);
|
||||
const std::vector elements = {element};
|
||||
const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements);
|
||||
return {OpLoad(GetTypeDefinition(type), pointer), type};
|
||||
}
|
||||
UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
|
||||
return {v_float_zero, Type::Float};
|
||||
@@ -1237,7 +1274,7 @@ private:
|
||||
} else {
|
||||
UNREACHABLE_MSG("Unmanaged offset node type");
|
||||
}
|
||||
pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index,
|
||||
pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,
|
||||
buffer_element);
|
||||
}
|
||||
return {OpLoad(t_float, pointer), Type::Float};
|
||||
@@ -1362,7 +1399,9 @@ private:
|
||||
Expression target{};
|
||||
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
|
||||
if (gpr->GetIndex() == Register::ZeroIndex) {
|
||||
// Writing to Register::ZeroIndex is a no op
|
||||
// Writing to Register::ZeroIndex is a no op but we still have to visit its source
|
||||
// because it might have side effects.
|
||||
Visit(src);
|
||||
return {};
|
||||
}
|
||||
target = {registers.at(gpr->GetIndex()), Type::Float};
|
||||
@@ -1590,7 +1629,7 @@ private:
|
||||
|
||||
const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);
|
||||
const Id carry = OpCompositeExtract(t_uint, result, 1);
|
||||
return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool};
|
||||
return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};
|
||||
}
|
||||
|
||||
Expression LogicalAssign(Operation operation) {
|
||||
@@ -1653,7 +1692,7 @@ private:
|
||||
const auto& meta = std::get<MetaTexture>(operation.GetMeta());
|
||||
const u32 index = meta.sampler.index;
|
||||
if (meta.sampler.is_buffer) {
|
||||
const auto& entry = texel_buffers.at(index);
|
||||
const auto& entry = uniform_texels.at(index);
|
||||
return OpLoad(entry.image_type, entry.image);
|
||||
} else {
|
||||
const auto& entry = sampled_images.at(index);
|
||||
@@ -1930,39 +1969,20 @@ private:
|
||||
return {};
|
||||
}
|
||||
|
||||
Expression AtomicImageAdd(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
}
|
||||
template <Id (Module::*func)(Id, Id, Id, Id, Id)>
|
||||
Expression AtomicImage(Operation operation) {
|
||||
const auto& meta{std::get<MetaImage>(operation.GetMeta())};
|
||||
ASSERT(meta.values.size() == 1);
|
||||
|
||||
Expression AtomicImageMin(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
}
|
||||
const Id coordinate = GetCoordinates(operation, Type::Int);
|
||||
const Id image = images.at(meta.image.index).image;
|
||||
const Id sample = v_uint_zero;
|
||||
const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample);
|
||||
|
||||
Expression AtomicImageMax(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
}
|
||||
|
||||
Expression AtomicImageAnd(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
}
|
||||
|
||||
Expression AtomicImageOr(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
}
|
||||
|
||||
Expression AtomicImageXor(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
}
|
||||
|
||||
Expression AtomicImageExchange(Operation operation) {
|
||||
UNIMPLEMENTED();
|
||||
return {};
|
||||
const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
|
||||
const Id semantics = v_uint_zero;
|
||||
const Id value = AsUint(Visit(meta.values[0]));
|
||||
return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
|
||||
}
|
||||
|
||||
template <Id (Module::*func)(Id, Id, Id, Id, Id)>
|
||||
@@ -1977,7 +1997,7 @@ private:
|
||||
return {v_float_zero, Type::Float};
|
||||
}
|
||||
const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
|
||||
const Id semantics = Constant(t_uint, 0);
|
||||
const Id semantics = v_uint_zero;
|
||||
const Id value = AsUint(Visit(operation[1]));
|
||||
|
||||
return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
|
||||
@@ -2175,14 +2195,37 @@ private:
|
||||
return {OpLoad(t_uint, thread_id), Type::Uint};
|
||||
}
|
||||
|
||||
template <std::size_t index>
|
||||
Expression ThreadMask(Operation) {
|
||||
// TODO(Rodrigo): Handle devices with different warp sizes
|
||||
const Id mask = thread_masks[index];
|
||||
return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
|
||||
}
|
||||
|
||||
/// Reads a float value from another invocation of the subgroup.
/// Operand 0 is the value to share, operand 1 is the invocation index to read it from.
Expression ShuffleIndexed(Operation operation) {
    // Operands are visited in order: value first, then the invocation index.
    const Id shared_value = AsFloat(Visit(operation[0]));
    const Id invocation = AsUint(Visit(operation[1]));
    const Id shuffled = OpSubgroupReadInvocationKHR(t_float, shared_value, invocation);
    return {shuffled, Type::Float};
}
|
||||
|
||||
Expression MemoryBarrierGL(Operation) {
|
||||
const auto scope = spv::Scope::Device;
|
||||
Expression Barrier(Operation) {
|
||||
if (!ir.IsDecompiled()) {
|
||||
LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled");
|
||||
return {};
|
||||
}
|
||||
|
||||
const auto scope = spv::Scope::Workgroup;
|
||||
const auto memory = spv::Scope::Workgroup;
|
||||
const auto semantics =
|
||||
spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
|
||||
OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
|
||||
Constant(t_uint, static_cast<u32>(memory)),
|
||||
Constant(t_uint, static_cast<u32>(semantics)));
|
||||
return {};
|
||||
}
|
||||
|
||||
template <spv::Scope scope>
|
||||
Expression MemoryBarrier(Operation) {
|
||||
const auto semantics =
|
||||
spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
|
||||
spv::MemorySemanticsMask::WorkgroupMemory |
|
||||
@@ -2578,11 +2621,11 @@ private:
|
||||
|
||||
&SPIRVDecompiler::ImageLoad,
|
||||
&SPIRVDecompiler::ImageStore,
|
||||
&SPIRVDecompiler::AtomicImageAdd,
|
||||
&SPIRVDecompiler::AtomicImageAnd,
|
||||
&SPIRVDecompiler::AtomicImageOr,
|
||||
&SPIRVDecompiler::AtomicImageXor,
|
||||
&SPIRVDecompiler::AtomicImageExchange,
|
||||
&SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>,
|
||||
&SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>,
|
||||
&SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>,
|
||||
&SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>,
|
||||
&SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,
|
||||
|
||||
&SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
|
||||
&SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
|
||||
@@ -2639,9 +2682,16 @@ private:
|
||||
&SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,
|
||||
|
||||
&SPIRVDecompiler::ThreadId,
|
||||
&SPIRVDecompiler::ThreadMask<0>, // Eq
|
||||
&SPIRVDecompiler::ThreadMask<1>, // Ge
|
||||
&SPIRVDecompiler::ThreadMask<2>, // Gt
|
||||
&SPIRVDecompiler::ThreadMask<3>, // Le
|
||||
&SPIRVDecompiler::ThreadMask<4>, // Lt
|
||||
&SPIRVDecompiler::ShuffleIndexed,
|
||||
|
||||
&SPIRVDecompiler::MemoryBarrierGL,
|
||||
&SPIRVDecompiler::Barrier,
|
||||
&SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
|
||||
&SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
|
||||
};
|
||||
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
|
||||
|
||||
@@ -2717,8 +2767,11 @@ private:
|
||||
Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
|
||||
const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);
|
||||
|
||||
const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint);
|
||||
|
||||
const Id v_float_zero = Constant(t_float, 0.0f);
|
||||
const Id v_float_one = Constant(t_float, 1.0f);
|
||||
const Id v_uint_zero = Constant(t_uint, 0);
|
||||
|
||||
// Nvidia uses these defaults for varyings (e.g. position and generic attributes)
|
||||
const Id v_varying_default =
|
||||
@@ -2743,15 +2796,16 @@ private:
|
||||
std::unordered_map<u8, GenericVaryingDescription> output_attributes;
|
||||
std::map<u32, Id> constant_buffers;
|
||||
std::map<GlobalMemoryBase, Id> global_buffers;
|
||||
std::map<u32, TexelBuffer> texel_buffers;
|
||||
std::map<u32, TexelBuffer> uniform_texels;
|
||||
std::map<u32, SampledImage> sampled_images;
|
||||
std::map<u32, TexelBuffer> storage_texels;
|
||||
std::map<u32, StorageImage> images;
|
||||
|
||||
std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
|
||||
Id instance_index{};
|
||||
Id vertex_index{};
|
||||
Id base_instance{};
|
||||
Id base_vertex{};
|
||||
std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
|
||||
Id frag_depth{};
|
||||
Id frag_coord{};
|
||||
Id front_facing{};
|
||||
@@ -2763,6 +2817,7 @@ private:
|
||||
Id workgroup_id{};
|
||||
Id local_invocation_id{};
|
||||
Id thread_id{};
|
||||
std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt
|
||||
|
||||
VertexIndices in_indices;
|
||||
VertexIndices out_indices;
|
||||
@@ -3006,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
|
||||
}
|
||||
for (const auto& sampler : ir.GetSamplers()) {
|
||||
if (sampler.is_buffer) {
|
||||
entries.texel_buffers.emplace_back(sampler);
|
||||
entries.uniform_texels.emplace_back(sampler);
|
||||
} else {
|
||||
entries.samplers.emplace_back(sampler);
|
||||
}
|
||||
}
|
||||
for (const auto& image : ir.GetImages()) {
|
||||
entries.images.emplace_back(image);
|
||||
if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
|
||||
entries.storage_texels.emplace_back(image);
|
||||
} else {
|
||||
entries.images.emplace_back(image);
|
||||
}
|
||||
}
|
||||
for (const auto& attribute : ir.GetInputAttributes()) {
|
||||
if (IsGenericAttribute(attribute)) {
|
||||
|
||||
@@ -21,8 +21,9 @@ class VKDevice;
|
||||
namespace Vulkan {
|
||||
|
||||
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
|
||||
using TexelBufferEntry = VideoCommon::Shader::Sampler;
|
||||
using UniformTexelEntry = VideoCommon::Shader::Sampler;
|
||||
using SamplerEntry = VideoCommon::Shader::Sampler;
|
||||
using StorageTexelEntry = VideoCommon::Shader::Image;
|
||||
using ImageEntry = VideoCommon::Shader::Image;
|
||||
|
||||
constexpr u32 DESCRIPTOR_SET = 0;
|
||||
@@ -66,13 +67,15 @@ private:
|
||||
struct ShaderEntries {
|
||||
u32 NumBindings() const {
|
||||
return static_cast<u32>(const_buffers.size() + global_buffers.size() +
|
||||
texel_buffers.size() + samplers.size() + images.size());
|
||||
uniform_texels.size() + samplers.size() + storage_texels.size() +
|
||||
images.size());
|
||||
}
|
||||
|
||||
std::vector<ConstBufferEntry> const_buffers;
|
||||
std::vector<GlobalBufferEntry> global_buffers;
|
||||
std::vector<TexelBufferEntry> texel_buffers;
|
||||
std::vector<UniformTexelEntry> uniform_texels;
|
||||
std::vector<SamplerEntry> samplers;
|
||||
std::vector<StorageTexelEntry> storage_texels;
|
||||
std::vector<ImageEntry> images;
|
||||
std::set<u32> attributes;
|
||||
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
|
||||
@@ -88,7 +91,8 @@ struct Specialization final {
|
||||
u32 shared_memory_size{};
|
||||
|
||||
// Graphics specific
|
||||
std::optional<float> point_size{};
|
||||
std::optional<float> point_size;
|
||||
std::bitset<Maxwell::NumVertexAttributes> enabled_attributes;
|
||||
std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
|
||||
bool ndc_minus_one_to_one{};
|
||||
};
|
||||
|
||||
@@ -35,7 +35,7 @@ public:
|
||||
/// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
|
||||
void Unmap(u64 size);
|
||||
|
||||
VkBuffer GetHandle() const {
|
||||
VkBuffer Handle() const {
|
||||
return *buffer;
|
||||
}
|
||||
|
||||
|
||||
@@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,
|
||||
ci.pNext = nullptr;
|
||||
ci.flags = 0;
|
||||
ci.size = static_cast<VkDeviceSize>(host_memory_size);
|
||||
ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
|
||||
VK_BUFFER_USAGE_TRANSFER_DST_BIT;
|
||||
ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
|
||||
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
|
||||
ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
|
||||
ci.queueFamilyIndexCount = 0;
|
||||
ci.pQueueFamilyIndices = nullptr;
|
||||
@@ -354,26 +354,23 @@ CachedSurfaceView::~CachedSurfaceView() = default;
|
||||
|
||||
VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
|
||||
SwizzleSource z_source, SwizzleSource w_source) {
|
||||
const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
|
||||
if (last_image_view && last_swizzle == swizzle) {
|
||||
const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
|
||||
if (last_image_view && last_swizzle == new_swizzle) {
|
||||
return last_image_view;
|
||||
}
|
||||
last_swizzle = swizzle;
|
||||
last_swizzle = new_swizzle;
|
||||
|
||||
const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle);
|
||||
const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
|
||||
auto& image_view = entry->second;
|
||||
if (!is_cache_miss) {
|
||||
return last_image_view = *image_view;
|
||||
}
|
||||
|
||||
auto swizzle_x = MaxwellToVK::SwizzleSource(x_source);
|
||||
auto swizzle_y = MaxwellToVK::SwizzleSource(y_source);
|
||||
auto swizzle_z = MaxwellToVK::SwizzleSource(z_source);
|
||||
auto swizzle_w = MaxwellToVK::SwizzleSource(w_source);
|
||||
|
||||
std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source),
|
||||
MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)};
|
||||
if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) {
|
||||
// A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here.
|
||||
std::swap(swizzle_x, swizzle_z);
|
||||
std::swap(swizzle[0], swizzle[2]);
|
||||
}
|
||||
|
||||
// Games can sample depth or stencil values on textures. This is decided by the swizzle value on
|
||||
@@ -395,11 +392,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
|
||||
UNIMPLEMENTED();
|
||||
}
|
||||
|
||||
// Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity
|
||||
swizzle_x = VK_COMPONENT_SWIZZLE_R;
|
||||
swizzle_y = VK_COMPONENT_SWIZZLE_G;
|
||||
swizzle_z = VK_COMPONENT_SWIZZLE_B;
|
||||
swizzle_w = VK_COMPONENT_SWIZZLE_A;
|
||||
// Make sure we sample the first component
|
||||
std::transform(
|
||||
swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) {
|
||||
return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component;
|
||||
});
|
||||
}
|
||||
|
||||
VkImageViewCreateInfo ci;
|
||||
@@ -409,7 +406,7 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
|
||||
ci.image = surface.GetImageHandle();
|
||||
ci.viewType = image_view_type;
|
||||
ci.format = surface.GetImage().GetFormat();
|
||||
ci.components = {swizzle_x, swizzle_y, swizzle_z, swizzle_w};
|
||||
ci.components = {swizzle[0], swizzle[1], swizzle[2], swizzle[3]};
|
||||
ci.subresourceRange.aspectMask = aspect;
|
||||
ci.subresourceRange.baseMipLevel = base_level;
|
||||
ci.subresourceRange.levelCount = num_levels;
|
||||
|
||||
@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
||||
}
|
||||
case OpCode::Id::RED: {
|
||||
UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
|
||||
UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
|
||||
const auto [real_address, base_address, descriptor] =
|
||||
TrackGlobalMemory(bb, instr, true, true);
|
||||
if (!real_address || !base_address) {
|
||||
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
||||
}
|
||||
Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
|
||||
Node value = GetRegister(instr.gpr0);
|
||||
bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
|
||||
bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::ATOM: {
|
||||
|
||||
@@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
||||
return Operation(OperationCode::YNegate);
|
||||
case SystemVariable::InvocationInfo:
|
||||
LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
|
||||
return Immediate(0U);
|
||||
return Immediate(0x00ff'0000U);
|
||||
case SystemVariable::WscaleFactorXY:
|
||||
UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented");
|
||||
return Immediate(0U);
|
||||
@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
||||
return Operation(OperationCode::WorkGroupIdY);
|
||||
case SystemVariable::CtaIdZ:
|
||||
return Operation(OperationCode::WorkGroupIdZ);
|
||||
case SystemVariable::EqMask:
|
||||
case SystemVariable::LtMask:
|
||||
case SystemVariable::LeMask:
|
||||
case SystemVariable::GtMask:
|
||||
case SystemVariable::GeMask:
|
||||
uses_warps = true;
|
||||
switch (instr.sys20) {
|
||||
case SystemVariable::EqMask:
|
||||
return Operation(OperationCode::ThreadEqMask);
|
||||
case SystemVariable::LtMask:
|
||||
return Operation(OperationCode::ThreadLtMask);
|
||||
case SystemVariable::LeMask:
|
||||
return Operation(OperationCode::ThreadLeMask);
|
||||
case SystemVariable::GtMask:
|
||||
return Operation(OperationCode::ThreadGtMask);
|
||||
case SystemVariable::GeMask:
|
||||
return Operation(OperationCode::ThreadGeMask);
|
||||
default:
|
||||
UNREACHABLE();
|
||||
return Immediate(0u);
|
||||
}
|
||||
default:
|
||||
UNIMPLEMENTED_MSG("Unhandled system move: {}",
|
||||
static_cast<u32>(instr.sys20.Value()));
|
||||
@@ -272,10 +293,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
||||
SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::BAR: {
|
||||
UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
|
||||
bb.push_back(Operation(OperationCode::Barrier));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::MEMBAR: {
|
||||
UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
|
||||
UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
|
||||
bb.push_back(Operation(OperationCode::MemoryBarrierGL));
|
||||
const OperationCode type = [instr] {
|
||||
switch (instr.membar.type) {
|
||||
case Tegra::Shader::MembarType::CTA:
|
||||
return OperationCode::MemoryBarrierGroup;
|
||||
case Tegra::Shader::MembarType::GL:
|
||||
return OperationCode::MemoryBarrierGlobal;
|
||||
default:
|
||||
UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
|
||||
return OperationCode::MemoryBarrierGlobal;
|
||||
}
|
||||
}();
|
||||
bb.push_back(Operation(type));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::DEPBAR: {
|
||||
|
||||
@@ -226,9 +226,16 @@ enum class OperationCode {
|
||||
VoteEqual, /// (bool) -> bool
|
||||
|
||||
ThreadId, /// () -> uint
|
||||
ThreadEqMask, /// () -> uint
|
||||
ThreadGeMask, /// () -> uint
|
||||
ThreadGtMask, /// () -> uint
|
||||
ThreadLeMask, /// () -> uint
|
||||
ThreadLtMask, /// () -> uint
|
||||
ShuffleIndexed, /// (uint value, uint index) -> uint
|
||||
|
||||
MemoryBarrierGL, /// () -> void
|
||||
Barrier, /// () -> void
|
||||
MemoryBarrierGroup, /// () -> void
|
||||
MemoryBarrierGlobal, /// () -> void
|
||||
|
||||
Amount,
|
||||
};
|
||||
|
||||
@@ -41,7 +41,7 @@ struct Table {
|
||||
ComponentType alpha_component;
|
||||
bool is_srgb;
|
||||
};
|
||||
constexpr std::array<Table, 77> DefinitionTable = {{
|
||||
constexpr std::array<Table, 78> DefinitionTable = {{
|
||||
{TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
|
||||
{TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
|
||||
{TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
|
||||
@@ -98,6 +98,7 @@ constexpr std::array<Table, 77> DefinitionTable = {{
|
||||
{TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
|
||||
{TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
|
||||
{TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
|
||||
{TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
|
||||
{TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},
|
||||
|
||||
{TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/container/small_vector.hpp>
|
||||
#include <boost/icl/interval_map.hpp>
|
||||
#include <boost/range/iterator_range.hpp>
|
||||
|
||||
@@ -53,6 +54,7 @@ using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
|
||||
|
||||
template <typename TSurface, typename TView>
|
||||
class TextureCache {
|
||||
using VectorSurface = boost::container::small_vector<TSurface, 1>;
|
||||
|
||||
public:
|
||||
void InvalidateRegion(VAddr addr, std::size_t size) {
|
||||
@@ -308,18 +310,20 @@ public:
|
||||
dst_surface.first->MarkAsModified(true, Tick());
|
||||
}
|
||||
|
||||
TSurface TryFindFramebufferSurface(VAddr addr) {
|
||||
TSurface TryFindFramebufferSurface(VAddr addr) const {
|
||||
if (!addr) {
|
||||
return nullptr;
|
||||
}
|
||||
const VAddr page = addr >> registry_page_bits;
|
||||
std::vector<TSurface>& list = registry[page];
|
||||
for (auto& surface : list) {
|
||||
if (surface->GetCpuAddr() == addr) {
|
||||
return surface;
|
||||
}
|
||||
const auto it = registry.find(page);
|
||||
if (it == registry.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return nullptr;
|
||||
const auto& list = it->second;
|
||||
const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) {
|
||||
return surface->GetCpuAddr() == addr;
|
||||
});
|
||||
return found != list.end() ? *found : nullptr;
|
||||
}
|
||||
|
||||
u64 Tick() {
|
||||
@@ -498,7 +502,7 @@ private:
|
||||
* @param untopological Indicates to the recycler that the texture has no way
|
||||
* to match the overlaps due to topological reasons.
|
||||
**/
|
||||
RecycleStrategy PickStrategy(std::vector<TSurface>& overlaps, const SurfaceParams& params,
|
||||
RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params,
|
||||
const GPUVAddr gpu_addr, const MatchTopologyResult untopological) {
|
||||
if (Settings::IsGPULevelExtreme()) {
|
||||
return RecycleStrategy::Flush;
|
||||
@@ -538,9 +542,8 @@ private:
|
||||
* @param untopological Indicates to the recycler that the texture has no way to match the
|
||||
* overlaps due to topological reasons.
|
||||
**/
|
||||
std::pair<TSurface, TView> RecycleSurface(std::vector<TSurface>& overlaps,
|
||||
const SurfaceParams& params, const GPUVAddr gpu_addr,
|
||||
const bool preserve_contents,
|
||||
std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params,
|
||||
const GPUVAddr gpu_addr, const bool preserve_contents,
|
||||
const MatchTopologyResult untopological) {
|
||||
const bool do_load = preserve_contents && Settings::IsGPULevelExtreme();
|
||||
for (auto& surface : overlaps) {
|
||||
@@ -650,47 +653,65 @@ private:
|
||||
* @param params The parameters on the new surface.
|
||||
* @param gpu_addr The starting address of the new surface.
|
||||
**/
|
||||
std::optional<std::pair<TSurface, TView>> TryReconstructSurface(std::vector<TSurface>& overlaps,
|
||||
std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
|
||||
const SurfaceParams& params,
|
||||
const GPUVAddr gpu_addr) {
|
||||
GPUVAddr gpu_addr) {
|
||||
if (params.target == SurfaceTarget::Texture3D) {
|
||||
return {};
|
||||
return std::nullopt;
|
||||
}
|
||||
bool modified = false;
|
||||
const auto test_modified = [](TSurface& surface) { return surface->IsModified(); };
|
||||
TSurface new_surface = GetUncachedSurface(gpu_addr, params);
|
||||
u32 passed_tests = 0;
|
||||
|
||||
if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) {
|
||||
LoadSurface(new_surface);
|
||||
for (const auto& surface : overlaps) {
|
||||
Unregister(surface);
|
||||
}
|
||||
Register(new_surface);
|
||||
return {{new_surface, new_surface->GetMainView()}};
|
||||
}
|
||||
|
||||
std::size_t passed_tests = 0;
|
||||
for (auto& surface : overlaps) {
|
||||
const SurfaceParams& src_params = surface->GetSurfaceParams();
|
||||
if (src_params.is_layered || src_params.num_levels > 1) {
|
||||
// We send this cases to recycle as they are more complex to handle
|
||||
return {};
|
||||
}
|
||||
const std::size_t candidate_size = surface->GetSizeInBytes();
|
||||
auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
|
||||
const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
|
||||
if (!mipmap_layer) {
|
||||
continue;
|
||||
}
|
||||
const auto [layer, mipmap] = *mipmap_layer;
|
||||
if (new_surface->GetMipmapSize(mipmap) != candidate_size) {
|
||||
const auto [base_layer, base_mipmap] = *mipmap_layer;
|
||||
if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) {
|
||||
continue;
|
||||
}
|
||||
modified |= surface->IsModified();
|
||||
// Now we got all the data set up
|
||||
const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
|
||||
const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
|
||||
const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1);
|
||||
passed_tests++;
|
||||
ImageCopy(surface, new_surface, copy_params);
|
||||
++passed_tests;
|
||||
|
||||
// Copy all mipmaps and layers
|
||||
const u32 block_width = params.GetDefaultBlockWidth();
|
||||
const u32 block_height = params.GetDefaultBlockHeight();
|
||||
for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) {
|
||||
const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
|
||||
const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
|
||||
if (width < block_width || height < block_height) {
|
||||
// Current APIs forbid copying small compressed textures, avoid errors
|
||||
break;
|
||||
}
|
||||
const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
|
||||
src_params.depth);
|
||||
ImageCopy(surface, new_surface, copy_params);
|
||||
}
|
||||
}
|
||||
if (passed_tests == 0) {
|
||||
return {};
|
||||
// In Accurate GPU all tests should pass, else we recycle
|
||||
} else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
|
||||
return {};
|
||||
return std::nullopt;
|
||||
}
|
||||
if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
|
||||
// In Accurate GPU all tests should pass, else we recycle
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified);
|
||||
for (const auto& surface : overlaps) {
|
||||
Unregister(surface);
|
||||
}
|
||||
|
||||
new_surface->MarkAsModified(modified, Tick());
|
||||
Register(new_surface);
|
||||
return {{new_surface, new_surface->GetMainView()}};
|
||||
@@ -708,7 +729,7 @@ private:
|
||||
* @param preserve_contents Indicates that the new surface should be loaded from memory or
|
||||
* left blank.
|
||||
*/
|
||||
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
|
||||
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
|
||||
const SurfaceParams& params,
|
||||
const GPUVAddr gpu_addr,
|
||||
const VAddr cpu_addr,
|
||||
@@ -810,7 +831,7 @@ private:
|
||||
TSurface& current_surface = iter->second;
|
||||
const auto topological_result = current_surface->MatchesTopology(params);
|
||||
if (topological_result != MatchTopologyResult::FullMatch) {
|
||||
std::vector<TSurface> overlaps{current_surface};
|
||||
VectorSurface overlaps{current_surface};
|
||||
return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
|
||||
topological_result);
|
||||
}
|
||||
@@ -868,12 +889,9 @@ private:
|
||||
// two things either the candidate surface is a supertexture of the overlap
|
||||
// or they don't match in any known way.
|
||||
if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) {
|
||||
if (current_surface->GetGpuAddr() == gpu_addr) {
|
||||
std::optional<std::pair<TSurface, TView>> view =
|
||||
TryReconstructSurface(overlaps, params, gpu_addr);
|
||||
if (view) {
|
||||
return *view;
|
||||
}
|
||||
const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr);
|
||||
if (view) {
|
||||
return *view;
|
||||
}
|
||||
return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
|
||||
MatchTopologyResult::FullMatch);
|
||||
@@ -991,7 +1009,9 @@ private:
|
||||
params.target = target;
|
||||
params.is_tiled = false;
|
||||
params.srgb_conversion = false;
|
||||
params.is_layered = false;
|
||||
params.is_layered =
|
||||
target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray ||
|
||||
target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray;
|
||||
params.block_width = 0;
|
||||
params.block_height = 0;
|
||||
params.block_depth = 0;
|
||||
@@ -1124,23 +1144,25 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TSurface> GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
|
||||
VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) {
|
||||
if (size == 0) {
|
||||
return {};
|
||||
}
|
||||
const VAddr cpu_addr_end = cpu_addr + size;
|
||||
VAddr start = cpu_addr >> registry_page_bits;
|
||||
const VAddr end = (cpu_addr_end - 1) >> registry_page_bits;
|
||||
std::vector<TSurface> surfaces;
|
||||
while (start <= end) {
|
||||
std::vector<TSurface>& list = registry[start];
|
||||
for (auto& surface : list) {
|
||||
if (!surface->IsPicked() && surface->Overlaps(cpu_addr, cpu_addr_end)) {
|
||||
surface->MarkAsPicked(true);
|
||||
surfaces.push_back(surface);
|
||||
}
|
||||
VectorSurface surfaces;
|
||||
for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) {
|
||||
const auto it = registry.find(start);
|
||||
if (it == registry.end()) {
|
||||
continue;
|
||||
}
|
||||
for (auto& surface : it->second) {
|
||||
if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) {
|
||||
continue;
|
||||
}
|
||||
surface->MarkAsPicked(true);
|
||||
surfaces.push_back(surface);
|
||||
}
|
||||
start++;
|
||||
}
|
||||
for (auto& surface : surfaces) {
|
||||
surface->MarkAsPicked(false);
|
||||
|
||||
@@ -106,6 +106,9 @@ public:
|
||||
format.setVersion(4, 3);
|
||||
format.setProfile(QSurfaceFormat::CompatibilityProfile);
|
||||
format.setOption(QSurfaceFormat::FormatOption::DeprecatedFunctions);
|
||||
if (Settings::values.renderer_debug) {
|
||||
format.setOption(QSurfaceFormat::FormatOption::DebugContext);
|
||||
}
|
||||
// TODO: expose a setting for buffer value (ie default/single/double/triple)
|
||||
format.setSwapBehavior(QSurfaceFormat::DefaultSwapBehavior);
|
||||
format.setSwapInterval(0);
|
||||
|
||||
@@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() {
|
||||
Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();
|
||||
Settings::values.disable_cpu_opt =
|
||||
ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool();
|
||||
Settings::values.disable_macro_jit =
|
||||
ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();
|
||||
|
||||
qt_config->endGroup();
|
||||
}
|
||||
@@ -643,6 +645,8 @@ void Config::ReadRendererValues() {
|
||||
Settings::values.use_asynchronous_gpu_emulation =
|
||||
ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool();
|
||||
Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool();
|
||||
Settings::values.use_assembly_shaders =
|
||||
ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool();
|
||||
Settings::values.use_fast_gpu_time =
|
||||
ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool();
|
||||
Settings::values.force_30fps_mode =
|
||||
@@ -1009,6 +1013,7 @@ void Config::SaveDebuggingValues() {
|
||||
WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);
|
||||
WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false);
|
||||
WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false);
|
||||
WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);
|
||||
|
||||
qt_config->endGroup();
|
||||
}
|
||||
@@ -1090,6 +1095,8 @@ void Config::SaveRendererValues() {
|
||||
WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"),
|
||||
Settings::values.use_asynchronous_gpu_emulation, false);
|
||||
WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
|
||||
WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders,
|
||||
false);
|
||||
WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true);
|
||||
WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false);
|
||||
|
||||
|
||||
@@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {
|
||||
ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);
|
||||
ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());
|
||||
ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug);
|
||||
ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn());
|
||||
ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);
|
||||
}
|
||||
|
||||
void ConfigureDebug::ApplyConfiguration() {
|
||||
@@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {
|
||||
Settings::values.quest_flag = ui->quest_flag->isChecked();
|
||||
Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();
|
||||
Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
|
||||
Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
|
||||
Debugger::ToggleConsole();
|
||||
Log::Filter filter;
|
||||
filter.ParseFilterString(Settings::values.log_filter);
|
||||
|
||||
@@ -148,6 +148,19 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="disable_macro_jit">
|
||||
<property name="enabled">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
<property name="whatsThis">
|
||||
<string>When checked, it disables the macro Just In Time compiler. Enabled this makes games run slower</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Disable Macro JIT</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
</layout>
|
||||
</widget>
|
||||
</item>
|
||||
|
||||
@@ -12,6 +12,9 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
|
||||
|
||||
ui->setupUi(this);
|
||||
|
||||
// TODO: Remove this after assembly shaders are fully integrated
|
||||
ui->use_assembly_shaders->setVisible(false);
|
||||
|
||||
SetConfiguration();
|
||||
}
|
||||
|
||||
@@ -22,6 +25,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() {
|
||||
ui->gpu_accuracy->setCurrentIndex(static_cast<int>(Settings::values.gpu_accuracy));
|
||||
ui->use_vsync->setEnabled(runtime_lock);
|
||||
ui->use_vsync->setChecked(Settings::values.use_vsync);
|
||||
ui->use_assembly_shaders->setEnabled(runtime_lock);
|
||||
ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders);
|
||||
ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time);
|
||||
ui->force_30fps_mode->setEnabled(runtime_lock);
|
||||
ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode);
|
||||
@@ -33,6 +38,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() {
|
||||
auto gpu_accuracy = static_cast<Settings::GPUAccuracy>(ui->gpu_accuracy->currentIndex());
|
||||
Settings::values.gpu_accuracy = gpu_accuracy;
|
||||
Settings::values.use_vsync = ui->use_vsync->isChecked();
|
||||
Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked();
|
||||
Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked();
|
||||
Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked();
|
||||
Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex();
|
||||
|
||||
@@ -62,6 +62,16 @@
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="use_assembly_shaders">
|
||||
<property name="toolTip">
|
||||
<string>Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental.</string>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>Use assembly shaders (experimental, Nvidia OpenGL only)</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QCheckBox" name="force_30fps_mode">
|
||||
<property name="text">
|
||||
|
||||
@@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() {
|
||||
SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]);
|
||||
}
|
||||
}
|
||||
|
||||
UpdateButtonLabels();
|
||||
ApplyConfiguration();
|
||||
}
|
||||
|
||||
void ConfigureInputPlayer::ClearAll() {
|
||||
@@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() {
|
||||
}
|
||||
|
||||
UpdateButtonLabels();
|
||||
ApplyConfiguration();
|
||||
}
|
||||
|
||||
void ConfigureInputPlayer::UpdateButtonLabels() {
|
||||
|
||||
@@ -18,7 +18,7 @@ DiscordImpl::DiscordImpl() {
|
||||
|
||||
// The number is the client ID for yuzu, it's used for images and the
|
||||
// application name
|
||||
Discord_Initialize("471872241299226636", &handlers, 1, nullptr);
|
||||
Discord_Initialize("712465656758665259", &handlers, 1, nullptr);
|
||||
}
|
||||
|
||||
DiscordImpl::~DiscordImpl() {
|
||||
|
||||
@@ -65,6 +65,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual
|
||||
#include "common/logging/backend.h"
|
||||
#include "common/logging/filter.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "common/memory_detect.h"
|
||||
#include "common/microprofile.h"
|
||||
#include "common/scm_rev.h"
|
||||
#include "common/scope_exit.h"
|
||||
@@ -219,6 +220,10 @@ GMainWindow::GMainWindow()
|
||||
LOG_INFO(Frontend, "Host CPU: {}", Common::GetCPUCaps().cpu_string);
|
||||
#endif
|
||||
LOG_INFO(Frontend, "Host OS: {}", QSysInfo::prettyProductName().toStdString());
|
||||
LOG_INFO(Frontend, "Host RAM: {:.2f} GB",
|
||||
Common::GetMemInfo().TotalPhysicalMemory / 1024.0f / 1024 / 1024);
|
||||
LOG_INFO(Frontend, "Host Swap: {:.2f} GB",
|
||||
Common::GetMemInfo().TotalSwapMemory / 1024.0f / 1024 / 1024);
|
||||
UpdateWindowTitle();
|
||||
|
||||
show();
|
||||
|
||||
@@ -397,6 +397,8 @@ void Config::ReadValues() {
|
||||
sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
|
||||
Settings::values.use_vsync =
|
||||
static_cast<u16>(sdl2_config->GetInteger("Renderer", "use_vsync", 1));
|
||||
Settings::values.use_assembly_shaders =
|
||||
sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false);
|
||||
Settings::values.use_fast_gpu_time =
|
||||
sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true);
|
||||
|
||||
@@ -430,6 +432,8 @@ void Config::ReadValues() {
|
||||
Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);
|
||||
Settings::values.disable_cpu_opt =
|
||||
sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false);
|
||||
Settings::values.disable_macro_jit =
|
||||
sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);
|
||||
|
||||
const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");
|
||||
std::stringstream ss(title_list);
|
||||
|
||||
@@ -134,6 +134,10 @@ max_anisotropy =
|
||||
# 0 (default): Off, 1: On
|
||||
use_vsync =
|
||||
|
||||
# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required.
|
||||
# 0 (default): Off, 1: On
|
||||
use_assembly_shaders =
|
||||
|
||||
# Turns on the frame limiter, which will limit frames output to the target game speed
|
||||
# 0: Off, 1: On (default)
|
||||
use_frame_limit =
|
||||
@@ -287,6 +291,8 @@ quest_flag =
|
||||
# Determines whether or not JIT CPU optimizations are enabled
|
||||
# false: Optimizations Enabled, true: Optimizations Disabled
|
||||
disable_cpu_opt =
|
||||
# Enables/Disables the macro JIT compiler
|
||||
disable_macro_jit=false
|
||||
|
||||
[WebService]
|
||||
# Whether or not to enable telemetry
|
||||
|
||||
@@ -98,6 +98,9 @@ EmuWindow_SDL2_GL::EmuWindow_SDL2_GL(Core::System& system, bool fullscreen)
|
||||
SDL_GL_SetAttribute(SDL_GL_BLUE_SIZE, 8);
|
||||
SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 0);
|
||||
SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
|
||||
if (Settings::values.renderer_debug) {
|
||||
SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
|
||||
}
|
||||
SDL_GL_SetSwapInterval(0);
|
||||
|
||||
std::string window_title = fmt::format("yuzu {} | {}-{}", Common::g_build_fullname,
|
||||
|
||||
Reference in New Issue
Block a user