Compare commits

..

33 Commits

Author SHA1 Message Date
Liam
1b5c87ab6a kernel: match calls to Register and Unregister 2023-04-29 21:52:26 -04:00
bunnei
fe57f39676 Merge pull request #10082 from FernandoS27/the-testers-really-love-chocolate
Refactor Accelerate DMA and do downloads through TC.
2023-04-29 11:46:01 -07:00
Fernando Sahmkow
4bc5469f52 Texture Cache: Release stagging buffers on tick frame 2023-04-29 15:31:38 +02:00
Fernando Sahmkow
58d1c7c77a Address Feedback & Clang Format 2023-04-29 00:18:21 +02:00
Fernando Sahmkow
56c9730a16 Maxwell3D: only update parameters on High 2023-04-29 00:18:21 +02:00
Fernando Sahmkow
e3a2ca96bd Accelerate DMA: Use texture cache async downloads to perform the copies
to host.

WIP
2023-04-29 00:18:21 +02:00
Fernando Sahmkow
3fbee093b2 TextureCache: refactor DMA downloads to allow multiple buffers. 2023-04-29 00:18:21 +02:00
Fernando S
9bf19b04f6 Merge pull request #10051 from liamwhite/surface-capabilities
vulkan: pick alpha composite flags based on available values
2023-04-24 12:37:13 +02:00
Fernando S
47cd0586ee Merge pull request #10056 from vonchenplus/audout_u
core: audio: return result when audio_out initialize failed
2023-04-24 12:36:52 +02:00
Fernando S
2311fa7c84 Merge pull request #10069 from liamwhite/log
maxwell_3d: fix out of bounds array access in size estimation
2023-04-24 12:36:24 +02:00
Liam
eb7c2314f6 maxwell_3d: fix out of bounds array access in size estimation 2023-04-22 10:35:26 -04:00
Fernando S
d600183583 Merge pull request #10074 from Kelebek1/fermi_blit
Account for a pre-added offset when using Corner sample mode for 2D blits
2023-04-22 12:06:00 +02:00
bunnei
0f1ff5f34e Merge pull request #10076 from german77/TryPopMyFriend
core: am: Demote TryPopFromFriendInvitationStorageChannel Log level
2023-04-21 23:15:07 -07:00
bunnei
d0e6eafe23 Merge pull request #10068 from twitchax/twitchax/dr_bind_address
Allow passing `--bind-address` to dedicated room.
2023-04-21 23:13:51 -07:00
bunnei
74d203fbe3 Merge pull request #10060 from german77/no_dead
core: hid: Remove deadzone of virtual controller
2023-04-21 23:13:21 -07:00
german77
7ffc42c397 core: am: Demote TryPopFromFriendInvitationStorageChannel Log level 2023-04-21 22:35:45 -06:00
Kelebek1
4e14b64bfc Account for a pre-added offset when using Corner sample mode for 2D blits 2023-04-21 19:08:21 +01:00
bunnei
0cfeb2e8d7 Merge pull request #10057 from liamwhite/its-not-in-the-timeline
vulkan: use plain fences when timeline semaphores are not available
2023-04-19 16:50:24 -07:00
Aaron Roney
79e32127b3 Run clang-format to fix all. 2023-04-19 17:52:09 +00:00
bunnei
799579c8d2 Merge pull request #10053 from german77/nfp_full
service: nfp: Implement all interfaces
2023-04-19 10:36:28 -07:00
Aaron Roney
3e7af5fbd7 Fix formatting. 2023-04-19 16:26:49 +00:00
Aaron Roney
34d0d94df0 Allow passing bind_address to dedicated room. 2023-04-19 05:37:30 +00:00
FengChen
55a33342cc core: audio: return result when audio_out initialize failed 2023-04-16 12:31:54 +08:00
german77
70a97fb5c7 core: hid: Remove deadzone of virtual controller 2023-04-15 18:41:09 -06:00
Liam
e3fb9b5e00 vulkan: use plain fences when timeline semaphores are not available 2023-04-14 22:53:37 -04:00
bunnei
e0895a8581 Merge pull request #10030 from Wollnashorn/botw-amd-fix
shader_recompiler: Fix ImageGather rounding on AMD/Intel
2023-04-14 16:56:34 -07:00
Narr the Reg
07694609fb Merge pull request #10055 from v1993/patch-1
input_common: minor fix to mouse movement
2023-04-14 17:05:45 -06:00
Valeri
60c4032b68 input_common: minor fix to mouse movement 2023-04-14 21:27:35 +03:00
Liam
e37e1d24f9 vulkan: pick alpha composite flags based on available values 2023-04-13 16:38:20 -04:00
Wollnashorn
c0e5ecc399 video_core: Enable ImageGather rounding fix on AMD open source drivers 2023-04-12 17:11:02 +02:00
Wollnashorn
82b78cde73 shader_recompiler: Use vector arithmetic rather than component-wise in ImageGatherSubpixelOffset
Should be more efficient and better readable
2023-04-08 16:13:08 +02:00
Wollnashorn
fe91066f46 video_core: Enable ImageGather with subpixel offset on Intel 2023-04-08 16:12:44 +02:00
Wollnashorn
780240e697 shader_recompiler: Add subpixel offset for correct rounding at ImageGather
On AMD a subpixel offset of 1/512 of the texel size is applied to the texture coordinates at a ImageGather call to ensure the rounding at the texel centers is done the same way as in Maxwell or other Nvidia architectures.
See https://www.reedbeta.com/blog/texture-gathers-and-coordinate-precision/ for more details why this might be necessary.

This should fix shadow artifacts at object edges in Zelda: Breath of the Wild (#9957, #6956).
2023-04-08 16:12:30 +02:00
35 changed files with 555 additions and 174 deletions

View File

@@ -293,6 +293,7 @@ struct System::Impl {
ASSERT(Kernel::KProcess::Initialize(main_process, system, "main",
Kernel::KProcess::ProcessType::Userland, resource_limit)
.IsSuccess());
Kernel::KProcess::Register(system.Kernel(), main_process);
kernel.MakeApplicationProcess(main_process);
const auto [load_result, load_parameters] = app_loader->Load(*main_process, system);
if (load_result != Loader::ResultStatus::Success) {

View File

@@ -280,6 +280,10 @@ void EmulatedController::LoadVirtualGamepadParams() {
virtual_stick_params[Settings::NativeAnalog::LStick].Set("axis_y", 1);
virtual_stick_params[Settings::NativeAnalog::RStick].Set("axis_x", 2);
virtual_stick_params[Settings::NativeAnalog::RStick].Set("axis_y", 3);
virtual_stick_params[Settings::NativeAnalog::LStick].Set("deadzone", 0.0f);
virtual_stick_params[Settings::NativeAnalog::LStick].Set("range", 1.0f);
virtual_stick_params[Settings::NativeAnalog::RStick].Set("deadzone", 0.0f);
virtual_stick_params[Settings::NativeAnalog::RStick].Set("range", 1.0f);
}
void EmulatedController::ReloadInput() {

View File

@@ -182,8 +182,8 @@ public:
explicit KAutoObjectWithList(KernelCore& kernel) : KAutoObject(kernel) {}
static int Compare(const KAutoObjectWithList& lhs, const KAutoObjectWithList& rhs) {
const u64 lid = lhs.GetId();
const u64 rid = rhs.GetId();
const uintptr_t lid = reinterpret_cast<uintptr_t>(std::addressof(lhs));
const uintptr_t rid = reinterpret_cast<uintptr_t>(std::addressof(rhs));
if (lid < rid) {
return -1;

View File

@@ -95,7 +95,7 @@ struct KernelCore::Impl {
pt_heap_region.GetSize());
}
InitializeHackSharedMemory();
InitializeHackSharedMemory(kernel);
RegisterHostThread(nullptr);
}
@@ -216,10 +216,12 @@ struct KernelCore::Impl {
auto* main_thread{Kernel::KThread::Create(system.Kernel())};
main_thread->SetCurrentCore(core);
ASSERT(Kernel::KThread::InitializeMainThread(system, main_thread, core).IsSuccess());
KThread::Register(system.Kernel(), main_thread);
auto* idle_thread{Kernel::KThread::Create(system.Kernel())};
idle_thread->SetCurrentCore(core);
ASSERT(Kernel::KThread::InitializeIdleThread(system, idle_thread, core).IsSuccess());
KThread::Register(system.Kernel(), idle_thread);
schedulers[i]->Initialize(main_thread, idle_thread, core);
}
@@ -230,6 +232,7 @@ struct KernelCore::Impl {
const Core::Timing::CoreTiming& core_timing) {
system_resource_limit = KResourceLimit::Create(system.Kernel());
system_resource_limit->Initialize(&core_timing);
KResourceLimit::Register(kernel, system_resource_limit);
const auto sizes{memory_layout->GetTotalAndKernelMemorySizes()};
const auto total_size{sizes.first};
@@ -355,6 +358,7 @@ struct KernelCore::Impl {
ASSERT(KThread::InitializeHighPriorityThread(system, shutdown_threads[core_id], {}, {},
core_id)
.IsSuccess());
KThread::Register(system.Kernel(), shutdown_threads[core_id]);
}
}
@@ -729,7 +733,7 @@ struct KernelCore::Impl {
memory_manager->Initialize(management_region.GetAddress(), management_region.GetSize());
}
void InitializeHackSharedMemory() {
void InitializeHackSharedMemory(KernelCore& kernel) {
// Setup memory regions for emulated processes
// TODO(bunnei): These should not be hardcoded regions initialized within the kernel
constexpr std::size_t hid_size{0x40000};
@@ -746,14 +750,23 @@ struct KernelCore::Impl {
hid_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None,
Svc::MemoryPermission::Read, hid_size);
KSharedMemory::Register(kernel, hid_shared_mem);
font_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None,
Svc::MemoryPermission::Read, font_size);
KSharedMemory::Register(kernel, font_shared_mem);
irs_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None,
Svc::MemoryPermission::Read, irs_size);
KSharedMemory::Register(kernel, irs_shared_mem);
time_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None,
Svc::MemoryPermission::Read, time_size);
KSharedMemory::Register(kernel, time_shared_mem);
hidbus_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None,
Svc::MemoryPermission::Read, hidbus_size);
KSharedMemory::Register(kernel, hidbus_shared_mem);
}
std::mutex registered_objects_lock;
@@ -1072,12 +1085,15 @@ static std::jthread RunHostThreadFunc(KernelCore& kernel, KProcess* process,
// Commit the thread reservation.
thread_reservation.Commit();
// Register the thread.
KThread::Register(kernel, thread);
return std::jthread(
[&kernel, thread, thread_name{std::move(thread_name)}, func{std::move(func)}] {
// Set the thread name.
Common::SetCurrentThreadName(thread_name.c_str());
// Register the thread.
// Set the thread as current.
kernel.RegisterHostThread(thread);
// Run the callback.
@@ -1099,6 +1115,9 @@ std::jthread KernelCore::RunOnHostCoreProcess(std::string&& process_name,
// Ensure that we don't hold onto any extra references.
SCOPE_EXIT({ process->Close(); });
// Register the new process.
KProcess::Register(*this, process);
// Run the host thread.
return RunHostThreadFunc(*this, process, std::move(process_name), std::move(func));
}
@@ -1124,6 +1143,9 @@ void KernelCore::RunOnGuestCoreProcess(std::string&& process_name, std::function
// Ensure that we don't hold onto any extra references.
SCOPE_EXIT({ process->Close(); });
// Register the new process.
KProcess::Register(*this, process);
// Reserve a new thread from the process resource limit.
KScopedResourceReservation thread_reservation(process, LimitableResource::ThreadCountMax);
ASSERT(thread_reservation.Succeeded());
@@ -1136,6 +1158,9 @@ void KernelCore::RunOnGuestCoreProcess(std::string&& process_name, std::function
// Commit the thread reservation.
thread_reservation.Commit();
// Register the new thread.
KThread::Register(*this, thread);
// Begin running the thread.
ASSERT(R_SUCCEEDED(thread->Run()));
}

View File

@@ -1807,7 +1807,7 @@ void IApplicationFunctions::GetFriendInvitationStorageChannelEvent(HLERequestCon
}
void IApplicationFunctions::TryPopFromFriendInvitationStorageChannel(HLERequestContext& ctx) {
LOG_WARNING(Service_AM, "(STUBBED) called");
LOG_DEBUG(Service_AM, "(STUBBED) called");
IPC::ResponseBuilder rb{ctx, 2};
rb.Push(AM::ResultNoDataInChannel);

View File

@@ -49,12 +49,6 @@ public:
};
// clang-format on
RegisterHandlers(functions);
if (impl->GetSystem()
.Initialize(device_name, in_params, handle, applet_resource_user_id)
.IsError()) {
LOG_ERROR(Service_Audio, "Failed to initialize the AudioOut System!");
}
}
~IAudioOut() override {
@@ -287,6 +281,14 @@ void AudOutU::OpenAudioOut(HLERequestContext& ctx) {
auto audio_out = std::make_shared<IAudioOut>(system, *impl, new_session_id, device_name,
in_params, handle, applet_resource_user_id);
result = audio_out->GetImpl()->GetSystem().Initialize(device_name, in_params, handle,
applet_resource_user_id);
if (result.IsError()) {
LOG_ERROR(Service_Audio, "Failed to initialize the AudioOut System!");
IPC::ResponseBuilder rb{ctx, 2};
rb.Push(result);
return;
}
impl->sessions[new_session_id] = audio_out->GetImpl();
impl->applet_resource_user_ids[new_session_id] = applet_resource_user_id;

View File

@@ -156,6 +156,7 @@ public:
auto* session = Kernel::KSession::Create(kernel);
session->Initialize(nullptr, 0);
Kernel::KSession::Register(kernel, session);
auto next_manager = std::make_shared<Service::SessionRequestManager>(
kernel, manager->GetServerManager());

View File

@@ -25,6 +25,9 @@ ServiceContext::ServiceContext(Core::System& system_, std::string name_)
Kernel::KProcess::ProcessType::KernelInternal,
kernel.GetSystemResourceLimit())
.IsSuccess());
// Register the process.
Kernel::KProcess::Register(kernel, process);
process_created = true;
}

View File

@@ -12,6 +12,9 @@ Mutex::Mutex(Core::System& system) : m_system(system) {
m_event = Kernel::KEvent::Create(system.Kernel());
m_event->Initialize(nullptr);
// Register the event.
Kernel::KEvent::Register(system.Kernel(), m_event);
ASSERT(R_SUCCEEDED(m_event->Signal()));
}

View File

@@ -33,6 +33,9 @@ ServerManager::ServerManager(Core::System& system) : m_system{system}, m_serve_m
// Initialize event.
m_event = Kernel::KEvent::Create(system.Kernel());
m_event->Initialize(nullptr);
// Register event.
Kernel::KEvent::Register(system.Kernel(), m_event);
}
ServerManager::~ServerManager() {
@@ -160,6 +163,9 @@ Result ServerManager::ManageDeferral(Kernel::KEvent** out_event) {
// Initialize the event.
m_deferral_event->Initialize(nullptr);
// Register the event.
Kernel::KEvent::Register(m_system.Kernel(), m_deferral_event);
// Set the output.
*out_event = m_deferral_event;

View File

@@ -64,6 +64,9 @@ Result ServiceManager::RegisterService(std::string name, u32 max_sessions,
auto* port = Kernel::KPort::Create(kernel);
port->Initialize(ServerSessionCountMax, false, 0);
// Register the port.
Kernel::KPort::Register(kernel, port);
service_ports.emplace(name, port);
registered_services.emplace(name, handler);
if (deferral_event) {

View File

@@ -49,6 +49,9 @@ void Controller::CloneCurrentObject(HLERequestContext& ctx) {
// Commit the session reservation.
session_reservation.Commit();
// Register the session.
Kernel::KSession::Register(system.Kernel(), session);
// Register with server manager.
session_manager->GetServerManager().RegisterSession(&session->GetServerSession(),
session_manager);

View File

@@ -49,6 +49,7 @@ static void PrintHelp(const char* argv0) {
" [options] <filename>\n"
"--room-name The name of the room\n"
"--room-description The room description\n"
"--bind-address The bind address for the room\n"
"--port The port used for the room\n"
"--max_members The maximum number of players for this room\n"
"--password The password for the room\n"
@@ -195,6 +196,7 @@ int main(int argc, char** argv) {
std::string web_api_url;
std::string ban_list_file;
std::string log_file = "yuzu-room.log";
std::string bind_address;
u64 preferred_game_id = 0;
u32 port = Network::DefaultRoomPort;
u32 max_members = 16;
@@ -203,6 +205,7 @@ int main(int argc, char** argv) {
static struct option long_options[] = {
{"room-name", required_argument, 0, 'n'},
{"room-description", required_argument, 0, 'd'},
{"bind-address", required_argument, 0, 's'},
{"port", required_argument, 0, 'p'},
{"max_members", required_argument, 0, 'm'},
{"password", required_argument, 0, 'w'},
@@ -222,7 +225,8 @@ int main(int argc, char** argv) {
InitializeLogging(log_file);
while (optind < argc) {
int arg = getopt_long(argc, argv, "n:d:p:m:w:g:u:t:a:i:l:hv", long_options, &option_index);
int arg =
getopt_long(argc, argv, "n:d:s:p:m:w:g:u:t:a:i:l:hv", long_options, &option_index);
if (arg != -1) {
switch (static_cast<char>(arg)) {
case 'n':
@@ -231,6 +235,9 @@ int main(int argc, char** argv) {
case 'd':
room_description.assign(optarg);
break;
case 's':
bind_address.assign(optarg);
break;
case 'p':
port = strtoul(optarg, &endarg, 0);
break;
@@ -295,6 +302,9 @@ int main(int argc, char** argv) {
PrintHelp(argv[0]);
return -1;
}
if (bind_address.empty()) {
LOG_INFO(Network, "Bind address is empty: defaulting to 0.0.0.0");
}
if (port > UINT16_MAX) {
LOG_ERROR(Network, "Port needs to be in the range 0 - 65535!");
PrintHelp(argv[0]);
@@ -358,8 +368,8 @@ int main(int argc, char** argv) {
if (auto room = network.GetRoom().lock()) {
AnnounceMultiplayerRoom::GameInfo preferred_game_info{.name = preferred_game,
.id = preferred_game_id};
if (!room->Create(room_name, room_description, "", port, password, max_members, username,
preferred_game_info, std::move(verify_backend), ban_list,
if (!room->Create(room_name, room_description, bind_address, port, password, max_members,
username, preferred_game_info, std::move(verify_backend), ban_list,
enable_yuzu_mods)) {
LOG_INFO(Network, "Failed to create room: ");
return -1;

View File

@@ -135,7 +135,7 @@ void Mouse::Move(int x, int y, int center_x, int center_y) {
auto mouse_change =
(Common::MakeVec(x, y) - Common::MakeVec(center_x, center_y)).Cast<float>();
last_motion_change += {-mouse_change.y, -mouse_change.x, last_motion_change.z};
last_motion_change += {-mouse_change.y, -mouse_change.x, 0};
const auto move_distance = mouse_change.Length();
if (move_distance == 0) {

View File

@@ -143,6 +143,21 @@ IR::Inst* PrepareSparse(IR::Inst& inst) {
}
return sparse_inst;
}
std::string ImageGatherSubpixelOffset(const IR::TextureInstInfo& info, std::string_view texture,
std::string_view coords) {
switch (info.type) {
case TextureType::Color2D:
case TextureType::Color2DRect:
return fmt::format("{}+vec2(0.001953125)/vec2(textureSize({}, 0))", coords, texture);
case TextureType::ColorArray2D:
case TextureType::ColorCube:
return fmt::format("vec3({0}.xy+vec2(0.001953125)/vec2(textureSize({1}, 0)),{0}.z)", coords,
texture);
default:
return std::string{coords};
}
}
} // Anonymous namespace
void EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst& inst, const IR::Value& index,
@@ -340,6 +355,13 @@ void EmitImageGather(EmitContext& ctx, IR::Inst& inst, const IR::Value& index,
LOG_WARNING(Shader_GLSL, "Device does not support sparse texture queries. STUBBING");
ctx.AddU1("{}=true;", *sparse_inst);
}
std::string coords_with_subpixel_offset;
if (ctx.profile.need_gather_subpixel_offset) {
// Apply a subpixel offset of 1/512 the texel size of the texture to ensure same rounding on
// AMD hardware as on Maxwell or other Nvidia architectures.
coords_with_subpixel_offset = ImageGatherSubpixelOffset(info, texture, coords);
coords = coords_with_subpixel_offset;
}
if (!sparse_inst || !supports_sparse) {
if (offset.IsEmpty()) {
ctx.Add("{}=textureGather({},{},int({}));", texel, texture, coords,
@@ -387,6 +409,13 @@ void EmitImageGatherDref(EmitContext& ctx, IR::Inst& inst, const IR::Value& inde
LOG_WARNING(Shader_GLSL, "Device does not support sparse texture queries. STUBBING");
ctx.AddU1("{}=true;", *sparse_inst);
}
std::string coords_with_subpixel_offset;
if (ctx.profile.need_gather_subpixel_offset) {
// Apply a subpixel offset of 1/512 the texel size of the texture to ensure same rounding on
// AMD hardware as on Maxwell or other Nvidia architectures.
coords_with_subpixel_offset = ImageGatherSubpixelOffset(info, texture, coords);
coords = coords_with_subpixel_offset;
}
if (!sparse_inst || !supports_sparse) {
if (offset.IsEmpty()) {
ctx.Add("{}=textureGather({},{},{});", texel, texture, coords, dref);

View File

@@ -261,6 +261,30 @@ Id BitTest(EmitContext& ctx, Id mask, Id bit) {
const Id bit_value{ctx.OpBitwiseAnd(ctx.U32[1], shifted, ctx.Const(1u))};
return ctx.OpINotEqual(ctx.U1, bit_value, ctx.u32_zero_value);
}
Id ImageGatherSubpixelOffset(EmitContext& ctx, const IR::TextureInstInfo& info, Id texture,
Id coords) {
// Apply a subpixel offset of 1/512 the texel size of the texture to ensure same rounding on
// AMD hardware as on Maxwell or other Nvidia architectures.
const auto calculate_coords{[&](size_t dim) {
const Id nudge{ctx.Const(0x1p-9f)};
const Id image_size{ctx.OpImageQuerySizeLod(ctx.U32[dim], texture, ctx.u32_zero_value)};
Id offset{dim == 2 ? ctx.ConstantComposite(ctx.F32[dim], nudge, nudge)
: ctx.ConstantComposite(ctx.F32[dim], nudge, nudge, ctx.f32_zero_value)};
offset = ctx.OpFDiv(ctx.F32[dim], offset, ctx.OpConvertUToF(ctx.F32[dim], image_size));
return ctx.OpFAdd(ctx.F32[dim], coords, offset);
}};
switch (info.type) {
case TextureType::Color2D:
case TextureType::Color2DRect:
return calculate_coords(2);
case TextureType::ColorArray2D:
case TextureType::ColorCube:
return calculate_coords(3);
default:
return coords;
}
}
} // Anonymous namespace
Id EmitBindlessImageSampleImplicitLod(EmitContext&) {
@@ -423,6 +447,9 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id
const IR::Value& offset, const IR::Value& offset2) {
const auto info{inst->Flags<IR::TextureInstInfo>()};
const ImageOperands operands(ctx, offset, offset2);
if (ctx.profile.need_gather_subpixel_offset) {
coords = ImageGatherSubpixelOffset(ctx, info, TextureImage(ctx, info, index), coords);
}
return Emit(&EmitContext::OpImageSparseGather, &EmitContext::OpImageGather, ctx, inst,
ctx.F32[4], Texture(ctx, info, index), coords, ctx.Const(info.gather_component),
operands.MaskOptional(), operands.Span());
@@ -432,6 +459,9 @@ Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, const IR::Value& index,
const IR::Value& offset, const IR::Value& offset2, Id dref) {
const auto info{inst->Flags<IR::TextureInstInfo>()};
const ImageOperands operands(ctx, offset, offset2);
if (ctx.profile.need_gather_subpixel_offset) {
coords = ImageGatherSubpixelOffset(ctx, info, TextureImage(ctx, info, index), coords);
}
return Emit(&EmitContext::OpImageSparseDrefGather, &EmitContext::OpImageDrefGather, ctx, inst,
ctx.F32[4], Texture(ctx, info, index), coords, dref, operands.MaskOptional(),
operands.Span());

View File

@@ -52,6 +52,10 @@ struct Profile {
bool need_declared_frag_colors{};
/// Prevents fast math optimizations that may cause inaccuracies
bool need_fastmath_off{};
/// Some GPU vendors use a different rounding precision when calculating texture pixel
/// coordinates with the 16.8 format in the ImageGather instruction than the Maxwell
/// architecture. Applying an offset does fix this mismatching rounding behaviour.
bool need_gather_subpixel_offset{};
/// OpFClamp is broken and OpFMax + OpFMin should be used instead
bool has_broken_spirv_clamp{};

View File

@@ -77,6 +77,14 @@ void Fermi2D::Blit() {
const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
const bool delegate_to_gpu = src.width > 512 && src.height > 512 && bytes_per_pixel <= 8 &&
src.format != regs.dst.format;
auto srcX = args.src_x0;
auto srcY = args.src_y0;
if (args.sample_mode.origin == Origin::Corner) {
srcX -= (args.du_dx >> 33) << 32;
srcY -= (args.dv_dy >> 33) << 32;
}
Config config{
.operation = regs.operation,
.filter = args.sample_mode.filter,
@@ -86,10 +94,10 @@ void Fermi2D::Blit() {
.dst_y0 = args.dst_y0,
.dst_x1 = args.dst_x0 + args.dst_width,
.dst_y1 = args.dst_y0 + args.dst_height,
.src_x0 = static_cast<s32>(args.src_x0 >> 32),
.src_y0 = static_cast<s32>(args.src_y0 >> 32),
.src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
.src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
.src_x0 = static_cast<s32>(srcX >> 32),
.src_y0 = static_cast<s32>(srcY >> 32),
.src_x1 = static_cast<s32>((srcX + args.du_dx * args.dst_width) >> 32),
.src_y1 = static_cast<s32>((srcY + args.dv_dy * args.dst_height) >> 32),
};
const auto need_align_to_pitch =

View File

@@ -4,6 +4,7 @@
#include <cstring>
#include <optional>
#include "common/assert.h"
#include "common/bit_util.h"
#include "common/scope_exit.h"
#include "common/settings.h"
#include "core/core.h"
@@ -222,6 +223,9 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool
}
void Maxwell3D::RefreshParametersImpl() {
if (!Settings::IsGPULevelHigh()) {
return;
}
size_t current_index = 0;
for (auto& segment : macro_segments) {
if (segment.first == 0) {
@@ -259,12 +263,13 @@ u32 Maxwell3D::GetMaxCurrentVertices() {
size_t Maxwell3D::EstimateIndexBufferSize() {
GPUVAddr start_address = regs.index_buffer.StartAddress();
GPUVAddr end_address = regs.index_buffer.EndAddress();
static constexpr std::array<size_t, 4> max_sizes = {
std::numeric_limits<u8>::max(), std::numeric_limits<u16>::max(),
std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()};
static constexpr std::array<size_t, 3> max_sizes = {std::numeric_limits<u8>::max(),
std::numeric_limits<u16>::max(),
std::numeric_limits<u32>::max()};
const size_t byte_size = regs.index_buffer.FormatSizeInBytes();
const size_t log2_byte_size = Common::Log2Ceil64(byte_size);
return std::min<size_t>(
memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[byte_size]) /
memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[log2_byte_size]) /
byte_size,
static_cast<size_t>(end_address - start_address));
}

View File

@@ -176,6 +176,10 @@ public:
return vendor_name == "ATI Technologies Inc.";
}
bool IsIntel() const {
return vendor_name == "Intel";
}
bool CanReportMemoryUsage() const {
return can_report_memory;
}

View File

@@ -1287,8 +1287,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
}
const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing
: VideoCommon::ObtainBufferOperation::MarkAsWritten;
const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
const auto [buffer, offset] =
buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op);
@@ -1299,7 +1298,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
if constexpr (IS_IMAGE_UPLOAD) {
image->UploadMemory(buffer->Handle(), offset, copy_span);
} else {
image->DownloadMemory(buffer->Handle(), offset, copy_span);
texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span,
buffer_operand.address, buffer_size);
}
return true;
}

View File

@@ -218,6 +218,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
.lower_left_origin_mode = true,
.need_declared_frag_colors = true,
.need_fastmath_off = device.NeedsFastmathOff(),
.need_gather_subpixel_offset = device.IsAmd() || device.IsIntel(),
.has_broken_spirv_clamp = true,
.has_broken_unsigned_image_offsets = true,

View File

@@ -803,30 +803,40 @@ void Image::UploadMemory(const ImageBufferMap& map,
void Image::DownloadMemory(GLuint buffer_handle, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
std::array buffer_handles{buffer_handle};
std::array buffer_offsets{buffer_offset};
DownloadMemory(buffer_handles, buffer_offsets, copies);
}
void Image::DownloadMemory(std::span<GLuint> buffer_handles, std::span<size_t> buffer_offsets,
std::span<const VideoCommon::BufferImageCopy> copies) {
const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
if (is_rescaled) {
ScaleDown();
}
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
for (size_t i = 0; i < buffer_handles.size(); i++) {
auto& buffer_handle = buffer_handles[i];
glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
u32 current_row_length = std::numeric_limits<u32>::max();
u32 current_image_height = std::numeric_limits<u32>::max();
u32 current_row_length = std::numeric_limits<u32>::max();
u32 current_image_height = std::numeric_limits<u32>::max();
for (const VideoCommon::BufferImageCopy& copy : copies) {
if (copy.image_subresource.base_level >= gl_num_levels) {
continue;
for (const VideoCommon::BufferImageCopy& copy : copies) {
if (copy.image_subresource.base_level >= gl_num_levels) {
continue;
}
if (current_row_length != copy.buffer_row_length) {
current_row_length = copy.buffer_row_length;
glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length);
}
if (current_image_height != copy.buffer_image_height) {
current_image_height = copy.buffer_image_height;
glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height);
}
CopyImageToBuffer(copy, buffer_offsets[i]);
}
if (current_row_length != copy.buffer_row_length) {
current_row_length = copy.buffer_row_length;
glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length);
}
if (current_image_height != copy.buffer_image_height) {
current_image_height = copy.buffer_image_height;
glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height);
}
CopyImageToBuffer(copy, buffer_offset);
}
if (is_rescaled) {
ScaleUp(true);

View File

@@ -215,6 +215,9 @@ public:
void DownloadMemory(GLuint buffer_handle, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies);
void DownloadMemory(std::span<GLuint> buffer_handle, std::span<size_t> buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies);
void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies);
GLuint StorageHandle() noexcept;
@@ -376,6 +379,7 @@ struct TextureCacheParams {
using Sampler = OpenGL::Sampler;
using Framebuffer = OpenGL::Framebuffer;
using AsyncBuffer = u32;
using BufferType = GLuint;
};
using TextureCache = VideoCommon::TextureCache<TextureCacheParams>;

View File

@@ -10,7 +10,14 @@
namespace Vulkan {
MasterSemaphore::MasterSemaphore(const Device& device) {
MasterSemaphore::MasterSemaphore(const Device& device_) : device(device_) {
if (!device.HasTimelineSemaphore()) {
static constexpr VkFenceCreateInfo fence_ci{
.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, .pNext = nullptr, .flags = 0};
fence = device.GetLogical().CreateFence(fence_ci);
return;
}
static constexpr VkSemaphoreTypeCreateInfo semaphore_type_ci{
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
.pNext = nullptr,
@@ -42,4 +49,134 @@ MasterSemaphore::MasterSemaphore(const Device& device) {
MasterSemaphore::~MasterSemaphore() = default;
void MasterSemaphore::Refresh() {
if (!semaphore) {
// If we don't support timeline semaphores, there's nothing to refresh
return;
}
u64 this_tick{};
u64 counter{};
do {
this_tick = gpu_tick.load(std::memory_order_acquire);
counter = semaphore.GetCounter();
if (counter < this_tick) {
return;
}
} while (!gpu_tick.compare_exchange_weak(this_tick, counter, std::memory_order_release,
std::memory_order_relaxed));
}
void MasterSemaphore::Wait(u64 tick) {
if (!semaphore) {
// If we don't support timeline semaphores, use an atomic wait
while (true) {
u64 current_value = gpu_tick.load(std::memory_order_relaxed);
if (current_value >= tick) {
return;
}
gpu_tick.wait(current_value);
}
return;
}
// No need to wait if the GPU is ahead of the tick
if (IsFree(tick)) {
return;
}
// Update the GPU tick and try again
Refresh();
if (IsFree(tick)) {
return;
}
// If none of the above is hit, fallback to a regular wait
while (!semaphore.Wait(tick)) {
}
Refresh();
}
VkResult MasterSemaphore::SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
VkSemaphore wait_semaphore, u64 host_tick) {
if (semaphore) {
return SubmitQueueTimeline(cmdbuf, signal_semaphore, wait_semaphore, host_tick);
} else {
return SubmitQueueFence(cmdbuf, signal_semaphore, wait_semaphore, host_tick);
}
}
static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
};
VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf,
VkSemaphore signal_semaphore,
VkSemaphore wait_semaphore, u64 host_tick) {
const VkSemaphore timeline_semaphore = *semaphore;
const u32 num_signal_semaphores = signal_semaphore ? 2 : 1;
const std::array signal_values{host_tick, u64(0)};
const std::array signal_semaphores{timeline_semaphore, signal_semaphore};
const u32 num_wait_semaphores = wait_semaphore ? 2 : 1;
const std::array wait_values{host_tick - 1, u64(1)};
const std::array wait_semaphores{timeline_semaphore, wait_semaphore};
const VkTimelineSemaphoreSubmitInfo timeline_si{
.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
.pNext = nullptr,
.waitSemaphoreValueCount = num_wait_semaphores,
.pWaitSemaphoreValues = wait_values.data(),
.signalSemaphoreValueCount = num_signal_semaphores,
.pSignalSemaphoreValues = signal_values.data(),
};
const VkSubmitInfo submit_info{
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.pNext = &timeline_si,
.waitSemaphoreCount = num_wait_semaphores,
.pWaitSemaphores = wait_semaphores.data(),
.pWaitDstStageMask = wait_stage_masks.data(),
.commandBufferCount = 1,
.pCommandBuffers = cmdbuf.address(),
.signalSemaphoreCount = num_signal_semaphores,
.pSignalSemaphores = signal_semaphores.data(),
};
return device.GetGraphicsQueue().Submit(submit_info);
}
VkResult MasterSemaphore::SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
VkSemaphore wait_semaphore, u64 host_tick) {
const u32 num_signal_semaphores = signal_semaphore ? 1 : 0;
const u32 num_wait_semaphores = wait_semaphore ? 1 : 0;
const VkSubmitInfo submit_info{
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.pNext = nullptr,
.waitSemaphoreCount = num_wait_semaphores,
.pWaitSemaphores = &wait_semaphore,
.pWaitDstStageMask = wait_stage_masks.data(),
.commandBufferCount = 1,
.pCommandBuffers = cmdbuf.address(),
.signalSemaphoreCount = num_signal_semaphores,
.pSignalSemaphores = &signal_semaphore,
};
auto result = device.GetGraphicsQueue().Submit(submit_info, *fence);
if (result == VK_SUCCESS) {
fence.Wait();
fence.Reset();
gpu_tick.store(host_tick);
gpu_tick.notify_all();
}
return result;
}
} // namespace Vulkan

View File

@@ -4,6 +4,8 @@
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "common/common_types.h"
@@ -29,11 +31,6 @@ public:
return gpu_tick.load(std::memory_order_acquire);
}
/// Returns the timeline semaphore handle.
[[nodiscard]] VkSemaphore Handle() const noexcept {
return *semaphore;
}
/// Returns true when a tick has been hit by the GPU.
[[nodiscard]] bool IsFree(u64 tick) const noexcept {
return KnownGpuTick() >= tick;
@@ -45,37 +42,24 @@ public:
}
/// Refresh the known GPU tick
void Refresh() {
u64 this_tick{};
u64 counter{};
do {
this_tick = gpu_tick.load(std::memory_order_acquire);
counter = semaphore.GetCounter();
if (counter < this_tick) {
return;
}
} while (!gpu_tick.compare_exchange_weak(this_tick, counter, std::memory_order_release,
std::memory_order_relaxed));
}
void Refresh();
/// Waits for a tick to be hit on the GPU
void Wait(u64 tick) {
// No need to wait if the GPU is ahead of the tick
if (IsFree(tick)) {
return;
}
// Update the GPU tick and try again
Refresh();
if (IsFree(tick)) {
return;
}
// If none of the above is hit, fallback to a regular wait
while (!semaphore.Wait(tick)) {
}
Refresh();
}
void Wait(u64 tick);
/// Submits the device graphics queue, updating the tick as necessary
VkResult SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
VkSemaphore wait_semaphore, u64 host_tick);
private:
VkResult SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
VkSemaphore wait_semaphore, u64 host_tick);
VkResult SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
VkSemaphore wait_semaphore, u64 host_tick);
private:
const Device& device; ///< Device.
vk::Fence fence; ///< Fence.
vk::Semaphore semaphore; ///< Timeline semaphore.
std::atomic<u64> gpu_tick{0}; ///< Current known GPU tick.
std::atomic<u64> current_tick{1}; ///< Current logical tick.

View File

@@ -329,6 +329,11 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
.lower_left_origin_mode = false,
.need_declared_frag_colors = false,
.need_gather_subpixel_offset = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE ||
driver_id == VK_DRIVER_ID_MESA_RADV ||
driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS ||
driver_id == VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA,
.has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS,
.has_broken_spirv_position_input = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY,

View File

@@ -781,8 +781,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
}
const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height);
static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize;
const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing
: VideoCommon::ObtainBufferOperation::MarkAsWritten;
const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing;
const auto [buffer, offset] =
buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op);
@@ -793,7 +792,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info,
if constexpr (IS_IMAGE_UPLOAD) {
image->UploadMemory(buffer->Handle(), offset, copy_span);
} else {
image->DownloadMemory(buffer->Handle(), offset, copy_span);
texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span,
buffer_operand.address, buffer_size);
}
return true;
}

View File

@@ -212,45 +212,13 @@ void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_s
const u64 signal_value = master_semaphore->NextTick();
Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
cmdbuf.End();
const VkSemaphore timeline_semaphore = master_semaphore->Handle();
const u32 num_signal_semaphores = signal_semaphore ? 2U : 1U;
const std::array signal_values{signal_value, u64(0)};
const std::array signal_semaphores{timeline_semaphore, signal_semaphore};
const u32 num_wait_semaphores = wait_semaphore ? 2U : 1U;
const std::array wait_values{signal_value - 1, u64(1)};
const std::array wait_semaphores{timeline_semaphore, wait_semaphore};
static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
};
const VkTimelineSemaphoreSubmitInfo timeline_si{
.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
.pNext = nullptr,
.waitSemaphoreValueCount = num_wait_semaphores,
.pWaitSemaphoreValues = wait_values.data(),
.signalSemaphoreValueCount = num_signal_semaphores,
.pSignalSemaphoreValues = signal_values.data(),
};
const VkSubmitInfo submit_info{
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.pNext = &timeline_si,
.waitSemaphoreCount = num_wait_semaphores,
.pWaitSemaphores = wait_semaphores.data(),
.pWaitDstStageMask = wait_stage_masks.data(),
.commandBufferCount = 1,
.pCommandBuffers = cmdbuf.address(),
.signalSemaphoreCount = num_signal_semaphores,
.pSignalSemaphores = signal_semaphores.data(),
};
if (on_submit) {
on_submit();
}
switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info)) {
switch (const VkResult result = master_semaphore->SubmitQueue(
cmdbuf, signal_semaphore, wait_semaphore, signal_value)) {
case VK_SUCCESS:
break;
case VK_ERROR_DEVICE_LOST:

View File

@@ -65,6 +65,18 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi
return extent;
}
VkCompositeAlphaFlagBitsKHR ChooseAlphaFlags(const VkSurfaceCapabilitiesKHR& capabilities) {
if (capabilities.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR) {
return VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR;
} else if (capabilities.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR) {
return VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR;
} else {
LOG_ERROR(Render_Vulkan, "Unknown composite alpha flags value {:#x}",
capabilities.supportedCompositeAlpha);
return VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR;
}
}
} // Anonymous namespace
Swapchain::Swapchain(VkSurfaceKHR surface_, const Device& device_, Scheduler& scheduler_,
@@ -155,6 +167,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo
const auto formats{physical_device.GetSurfaceFormatsKHR(surface)};
const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)};
const VkCompositeAlphaFlagBitsKHR alpha_flags{ChooseAlphaFlags(capabilities)};
const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)};
present_mode = ChooseSwapPresentMode(present_modes);
@@ -185,7 +198,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
.preTransform = capabilities.currentTransform,
.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR,
.compositeAlpha = alpha_flags,
.presentMode = present_mode,
.clipped = VK_FALSE,
.oldSwapchain = nullptr,

View File

@@ -1,10 +1,11 @@
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-3.0-or-later
#include <algorithm>
#include <array>
#include <span>
#include <vector>
#include <boost/container/small_vector.hpp>
#include "common/bit_cast.h"
#include "common/bit_util.h"
@@ -1343,14 +1344,31 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag
void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
std::array buffer_handles{
buffer,
};
std::array buffer_offsets{
offset,
};
DownloadMemory(buffer_handles, buffer_offsets, copies);
}
void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<VkDeviceSize> offsets_span,
std::span<const VideoCommon::BufferImageCopy> copies) {
const bool is_rescaled = True(flags & ImageFlagBits::Rescaled);
if (is_rescaled) {
ScaleDown();
}
std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask);
boost::container::small_vector<VkBuffer, 1> buffers_vector{};
boost::container::small_vector<std::vector<VkBufferImageCopy>, 1> vk_copies;
for (size_t index = 0; index < buffers_span.size(); index++) {
buffers_vector.emplace_back(buffers_span[index]);
vk_copies.emplace_back(
TransformBufferImageCopies(copies, offsets_span[index], aspect_mask));
}
scheduler->RequestOutsideRenderPassOperationContext();
scheduler->Record([buffer, image = *original_image, aspect_mask = aspect_mask,
vk_copies](vk::CommandBuffer cmdbuf) {
scheduler->Record([buffers = std::move(buffers_vector), image = *original_image,
aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) {
const VkImageMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
@@ -1369,6 +1387,20 @@ void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, read_barrier);
for (size_t index = 0; index < buffers.size(); index++) {
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffers[index],
vk_copies[index]);
}
const VkMemoryBarrier memory_write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
};
const VkImageMemoryBarrier image_write_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
@@ -1387,15 +1419,6 @@ void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
const VkMemoryBarrier memory_write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, read_barrier);
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, memory_write_barrier, nullptr, image_write_barrier);
});
@@ -1405,7 +1428,13 @@ void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
}
void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
DownloadMemory(map.buffer, map.offset, copies);
std::array buffers{
map.buffer,
};
std::array offsets{
map.offset,
};
DownloadMemory(buffers, offsets, copies);
}
bool Image::IsRescaled() const noexcept {

View File

@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
@@ -141,6 +141,9 @@ public:
void DownloadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies);
void DownloadMemory(std::span<VkBuffer> buffers, std::span<VkDeviceSize> offsets,
std::span<const VideoCommon::BufferImageCopy> copies);
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
@@ -371,6 +374,7 @@ struct TextureCacheParams {
using Sampler = Vulkan::Sampler;
using Framebuffer = Vulkan::Framebuffer;
using AsyncBuffer = Vulkan::StagingBufferRef;
using BufferType = VkBuffer;
};
using TextureCache = VideoCommon::TextureCache<TextureCacheParams>;

View File

@@ -1,9 +1,10 @@
// SPDX-FileCopyrightText: 2021 yuzu Emulator Project
// SPDX-FileCopyrightText: 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
#include <unordered_set>
#include <boost/container/small_vector.hpp>
#include "common/alignment.h"
#include "common/settings.h"
@@ -17,15 +18,10 @@
namespace VideoCommon {
using Tegra::Texture::SwizzleSource;
using Tegra::Texture::TextureType;
using Tegra::Texture::TICEntry;
using Tegra::Texture::TSCEntry;
using VideoCore::Surface::GetFormatType;
using VideoCore::Surface::IsCopyCompatible;
using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::PixelFormatFromDepthFormat;
using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
using VideoCore::Surface::SurfaceType;
using namespace Common::Literals;
@@ -143,6 +139,13 @@ void TextureCache<P>::TickFrame() {
runtime.TickFrame();
critical_gc = 0;
++frame_tick;
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
for (auto& buffer : async_buffers_death_ring) {
runtime.FreeDeferredStagingBuffer(buffer);
}
async_buffers_death_ring.clear();
}
}
template <class P>
@@ -661,25 +664,39 @@ template <class P>
void TextureCache<P>::CommitAsyncFlushes() {
// This is intentionally passing the value by copy
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
const std::span<const ImageId> download_ids = uncommitted_downloads;
auto& download_ids = uncommitted_downloads;
if (download_ids.empty()) {
committed_downloads.emplace_back(std::move(uncommitted_downloads));
uncommitted_downloads.clear();
async_buffers.emplace_back(std::optional<AsyncBuffer>{});
async_buffers.emplace_back(std::move(uncommitted_async_buffers));
uncommitted_async_buffers.clear();
return;
}
size_t total_size_bytes = 0;
for (const ImageId image_id : download_ids) {
total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
size_t last_async_buffer_id = uncommitted_async_buffers.size();
bool any_none_dma = false;
for (PendingDownload& download_info : download_ids) {
if (download_info.is_swizzle) {
total_size_bytes +=
Common::AlignUp(slot_images[download_info.object_id].unswizzled_size_bytes, 64);
any_none_dma = true;
download_info.async_buffer_id = last_async_buffer_id;
}
}
auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
for (const ImageId image_id : download_ids) {
Image& image = slot_images[image_id];
const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(download_map, copies);
download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
if (any_none_dma) {
auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true);
for (const PendingDownload& download_info : download_ids) {
if (download_info.is_swizzle) {
Image& image = slot_images[download_info.object_id];
const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(download_map, copies);
download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64);
}
}
uncommitted_async_buffers.emplace_back(download_map);
}
async_buffers.emplace_back(download_map);
async_buffers.emplace_back(std::move(uncommitted_async_buffers));
uncommitted_async_buffers.clear();
}
committed_downloads.emplace_back(std::move(uncommitted_downloads));
uncommitted_downloads.clear();
@@ -691,39 +708,57 @@ void TextureCache<P>::PopAsyncFlushes() {
return;
}
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
const std::span<const ImageId> download_ids = committed_downloads.front();
const auto& download_ids = committed_downloads.front();
if (download_ids.empty()) {
committed_downloads.pop_front();
async_buffers.pop_front();
return;
}
auto download_map = *async_buffers.front();
std::span<u8> download_span = download_map.mapped_span;
auto download_map = std::move(async_buffers.front());
for (size_t i = download_ids.size(); i > 0; i--) {
const ImageBase& image = slot_images[download_ids[i - 1]];
const auto copies = FullDownloadCopies(image.info);
download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
std::span<u8> download_span_alt = download_span.subspan(download_map.offset);
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt,
swizzle_data_buffer);
auto& download_info = download_ids[i - 1];
auto& download_buffer = download_map[download_info.async_buffer_id];
if (download_info.is_swizzle) {
const ImageBase& image = slot_images[download_info.object_id];
const auto copies = FullDownloadCopies(image.info);
download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64);
std::span<u8> download_span =
download_buffer.mapped_span.subspan(download_buffer.offset);
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
swizzle_data_buffer);
} else {
const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id];
std::span<u8> download_span =
download_buffer.mapped_span.subspan(download_buffer.offset);
gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(),
buffer_info.size);
slot_buffer_downloads.erase(download_info.object_id);
}
}
for (auto& download_buffer : download_map) {
async_buffers_death_ring.emplace_back(download_buffer);
}
runtime.FreeDeferredStagingBuffer(download_map);
committed_downloads.pop_front();
async_buffers.pop_front();
} else {
const std::span<const ImageId> download_ids = committed_downloads.front();
const auto& download_ids = committed_downloads.front();
if (download_ids.empty()) {
committed_downloads.pop_front();
return;
}
size_t total_size_bytes = 0;
for (const ImageId image_id : download_ids) {
total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
for (const PendingDownload& download_info : download_ids) {
if (download_info.is_swizzle) {
total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes;
}
}
auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
const size_t original_offset = download_map.offset;
for (const ImageId image_id : download_ids) {
Image& image = slot_images[image_id];
for (const PendingDownload& download_info : download_ids) {
if (!download_info.is_swizzle) {
continue;
}
Image& image = slot_images[download_info.object_id];
const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(download_map, copies);
download_map.offset += image.unswizzled_size_bytes;
@@ -732,8 +767,11 @@ void TextureCache<P>::PopAsyncFlushes() {
runtime.Finish();
download_map.offset = original_offset;
std::span<u8> download_span = download_map.mapped_span;
for (const ImageId image_id : download_ids) {
const ImageBase& image = slot_images[image_id];
for (const PendingDownload& download_info : download_ids) {
if (!download_info.is_swizzle) {
continue;
}
const ImageBase& image = slot_images[download_info.object_id];
const auto copies = FullDownloadCopies(image.info);
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
swizzle_data_buffer);
@@ -833,6 +871,33 @@ std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::Dm
return {image, copy};
}
template <class P>
void TextureCache<P>::DownloadImageIntoBuffer(typename TextureCache<P>::Image* image,
typename TextureCache<P>::BufferType buffer,
size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies,
GPUVAddr address, size_t size) {
if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
const BufferDownload new_buffer_download{address, size};
auto slot = slot_buffer_downloads.insert(new_buffer_download);
const PendingDownload new_download{false, uncommitted_async_buffers.size(), slot};
uncommitted_downloads.emplace_back(new_download);
auto download_map = runtime.DownloadStagingBuffer(size, true);
uncommitted_async_buffers.emplace_back(download_map);
std::array buffers{
buffer,
download_map.buffer,
};
std::array buffer_offsets{
buffer_offset,
download_map.offset,
};
image->DownloadMemory(buffers, buffer_offsets, copies);
} else {
image->DownloadMemory(buffer, buffer_offset, copies);
}
}
template <class P>
void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
if (False(image.flags & ImageFlagBits::CpuModified)) {
@@ -2209,7 +2274,8 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id)
if (new_id) {
const ImageViewBase& old_view = slot_image_views[new_id];
if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) {
uncommitted_downloads.push_back(old_view.image_id);
const PendingDownload new_download{true, 0, old_view.image_id};
uncommitted_downloads.emplace_back(new_download);
}
}
*old_id = new_id;

View File

@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2021 yuzu Emulator Project
// SPDX-FileCopyrightText: 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
@@ -40,14 +40,9 @@ struct ChannelState;
namespace VideoCommon {
using Tegra::Texture::SwizzleSource;
using Tegra::Texture::TICEntry;
using Tegra::Texture::TSCEntry;
using VideoCore::Surface::GetFormatType;
using VideoCore::Surface::IsCopyCompatible;
using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::PixelFormatFromDepthFormat;
using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
using namespace Common::Literals;
struct ImageViewInOut {
@@ -119,6 +114,7 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using Sampler = typename P::Sampler;
using Framebuffer = typename P::Framebuffer;
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
struct BlitImages {
ImageId dst_id;
@@ -215,6 +211,10 @@ public:
const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand,
const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image);
void DownloadImageIntoBuffer(Image* image, BufferType buffer, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies,
GPUVAddr address = 0, size_t size = 0);
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
@@ -424,17 +424,32 @@ private:
u64 critical_memory;
size_t critical_gc;
struct BufferDownload {
GPUVAddr address;
size_t size;
};
struct PendingDownload {
bool is_swizzle;
size_t async_buffer_id;
SlotId object_id;
};
SlotVector<Image> slot_images;
SlotVector<ImageMapView> slot_map_views;
SlotVector<ImageView> slot_image_views;
SlotVector<ImageAlloc> slot_image_allocs;
SlotVector<Sampler> slot_samplers;
SlotVector<Framebuffer> slot_framebuffers;
SlotVector<BufferDownload> slot_buffer_downloads;
// TODO: This data structure is not optimal and it should be reworked
std::vector<ImageId> uncommitted_downloads;
std::deque<std::vector<ImageId>> committed_downloads;
std::deque<std::optional<AsyncBuffer>> async_buffers;
std::vector<PendingDownload> uncommitted_downloads;
std::deque<std::vector<PendingDownload>> committed_downloads;
std::vector<AsyncBuffer> uncommitted_async_buffers;
std::deque<std::vector<AsyncBuffer>> async_buffers;
std::deque<AsyncBuffer> async_buffers_death_ring;
struct LRUItemParams {
using ObjectType = ImageId;

View File

@@ -145,7 +145,6 @@
FEATURE_NAME(robustness2, robustImageAccess2) \
FEATURE_NAME(shader_demote_to_helper_invocation, shaderDemoteToHelperInvocation) \
FEATURE_NAME(shader_draw_parameters, shaderDrawParameters) \
FEATURE_NAME(timeline_semaphore, timelineSemaphore) \
FEATURE_NAME(variable_pointer, variablePointers) \
FEATURE_NAME(variable_pointer, variablePointersStorageBuffer)
@@ -158,6 +157,7 @@
FEATURE_NAME(provoking_vertex, provokingVertexLast) \
FEATURE_NAME(shader_float16_int8, shaderFloat16) \
FEATURE_NAME(shader_float16_int8, shaderInt8) \
FEATURE_NAME(timeline_semaphore, timelineSemaphore) \
FEATURE_NAME(transform_feedback, transformFeedback) \
FEATURE_NAME(uniform_buffer_standard_layout, uniformBufferStandardLayout) \
FEATURE_NAME(vertex_input_dynamic_state, vertexInputDynamicState)
@@ -493,6 +493,10 @@ public:
return extensions.shader_atomic_int64;
}
bool HasTimelineSemaphore() const {
return features.timeline_semaphore.timelineSemaphore;
}
/// Returns the minimum supported version of SPIR-V.
u32 SupportedSpirvVersion() const {
if (instance_version >= VK_API_VERSION_1_3) {