vulkan: Move unswizzle to GPU

This commit is contained in:
yzct12345
2021-08-11 05:32:32 +00:00
committed by GitHub
parent e6b80c2cf8
commit 118f8843cc
36 changed files with 1173 additions and 388 deletions

View File

@@ -2,11 +2,19 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/logging/log.h"
#include "core/device_memory.h"
namespace Core {
DeviceMemory::DeviceMemory() : buffer{DramMemoryMap::Size, 1ULL << 39} {}
// Constructs the emulated DRAM backing store. The backing base pointer must be
// non-null and 4 KiB-aligned; otherwise we abort immediately rather than fail
// later in a harder-to-diagnose way.
// NOTE(review): the alignment requirement is inferred from the 0xfff mask —
// presumably needed by the Vulkan host-memory import path; confirm.
DeviceMemory::DeviceMemory() : buffer{DramMemoryMap::Size, 1ULL << 39} {
    // View the base pointer as an integer so alignment can be tested.
    auto ptr = reinterpret_cast<std::size_t>(buffer.BackingBasePointer());
    // Any low 12 bits set => not 4096-byte aligned; a null base is also fatal.
    if (ptr & 0xfff || !ptr) {
        LOG_CRITICAL(HW_Memory, "Unaligned DeviceMemory");
        abort();
    }
}
DeviceMemory::~DeviceMemory() = default;
} // namespace Core

View File

@@ -12,7 +12,9 @@ namespace Core {
namespace DramMemoryMap {
enum : u64 {
Base = 0x80000000ULL,
Size = 0x100000000ULL,
GiB = 0x40000000ULL,
GiBs = 4,
Size = GiB * GiBs,
End = Base + Size,
KernelReserveBase = Base + 0x60000,
SlabHeapBase = KernelReserveBase + 0x85000,

View File

@@ -65,7 +65,11 @@ struct Memory::Impl {
return {};
}
return system.DeviceMemory().GetPointer(paddr) + vaddr;
u8* test = system.DeviceMemory().GetPointer(paddr);
// LOG_CRITICAL(Debug, "{:016X} {:016X} {:016X} va {:016X} {:016X}",
// (size_t)system.DeviceMemory().buffer.BackingBasePointer(),
// (size_t)test, paddr, vaddr, (size_t)(test + vaddr));
return test + vaddr;
}
u8 Read8(const VAddr addr) {
@@ -240,6 +244,42 @@ struct Memory::Impl {
ReadBlockImpl<true>(*system.CurrentProcess(), src_addr, dest_buffer, size);
}
// Walks the guest virtual range [src_addr, src_addr + size) of `process` and
// records, for each contiguous piece, an offset into the DRAM backing buffer
// plus a byte count — a scatter list instead of an actual copy. Entries are
// written BACKWARDS into result.data: producers store through result.tail and
// decrement it. Unmapped pieces are marked with the out-of-range sentinel
// backing_offset == DramMemoryMap::Size. Aborts if the fixed-size entry array
// is exhausted.
void ReadBlockPointersUnsafe(const Kernel::KProcess& process, const VAddr src_addr,
                             ReadPointers& result, const std::size_t size) {
    // Lowest valid slot; reaching it means result.data has no room left.
    const auto end = &result.data[0];
    const auto base_ptr = system.DeviceMemory().buffer.BackingBasePointer();
    auto& tail = result.tail;
    WalkBlock(
        process, src_addr, size,
        // Unmapped memory callback: log and store the sentinel offset.
        [src_addr, size, &tail](const std::size_t copy_amount, const VAddr current_vaddr) {
            LOG_ERROR(
                HW_Memory,
                "Unmapped ReadBlockPointers @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                current_vaddr, src_addr, size);
            tail->backing_offset = DramMemoryMap::Size;
        },
        // Plain mapped memory: record the host pointer's offset from DRAM base.
        [&tail, base_ptr](const std::size_t copy_amount, const u8* const src_ptr) {
            tail->backing_offset = src_ptr - base_ptr;
        },
        // Rasterizer-cached memory: same offset computation on the cached host
        // pointer. NOTE(review): no flush happens here (hence "Unsafe") —
        // confirm callers tolerate GPU-modified data being read as-is.
        [&tail, base_ptr](const VAddr current_vaddr, const std::size_t copy_amount,
                          const u8* const host_ptr) {
            tail->backing_offset = host_ptr - base_ptr;
        },
        // Advance callback: finalize the current entry's size, then move to the
        // next (lower) slot, aborting when the array is full.
        [&tail, end](const std::size_t copy_amount) {
            tail->copy_amount = static_cast<u32>(copy_amount);
            if (tail == end) {
                LOG_CRITICAL(Debug, "Trying to read too much???");
                abort();
            }
            --tail;
        });
}
// Convenience overload that walks the currently scheduled process.
void ReadBlockPointersUnsafe(const VAddr src_addr, ReadPointers& result,
                             const std::size_t size) {
    ReadBlockPointersUnsafe(*system.CurrentProcess(), src_addr, result, size);
}
template <bool UNSAFE>
void WriteBlockImpl(const Kernel::KProcess& process, const VAddr dest_addr,
const void* src_buffer, const std::size_t size) {
@@ -668,6 +708,11 @@ void Memory::ReadBlockUnsafe(const VAddr src_addr, void* dest_buffer, const std:
impl->ReadBlockUnsafe(src_addr, dest_buffer, size);
}
// Public forwarder to the Impl overload operating on the current process.
void Memory::ReadBlockPointersUnsafe(const VAddr src_addr, ReadPointers& result,
                                     const std::size_t size) {
    impl->ReadBlockPointersUnsafe(src_addr, result, size);
}
void Memory::WriteBlock(const Kernel::KProcess& process, VAddr dest_addr, const void* src_buffer,
std::size_t size) {
impl->WriteBlockImpl<false>(process, dest_addr, src_buffer, size);
@@ -691,4 +736,8 @@ void Memory::RasterizerMarkRegionCached(VAddr vaddr, u64 size, bool cached) {
impl->RasterizerMarkRegionCached(vaddr, size, cached);
}
// Exposes the system's DeviceMemory so video backends can reach the DRAM
// backing buffer directly (e.g. to import it as external host memory).
Core::DeviceMemory& Memory::GetDeviceMemory() {
    return system.DeviceMemory();
}
} // namespace Core::Memory

View File

@@ -14,8 +14,9 @@ struct PageTable;
}
namespace Core {
class DeviceMemory;
class System;
}
} // namespace Core
namespace Kernel {
class PhysicalMemory;
@@ -41,6 +42,17 @@ enum : VAddr {
DEFAULT_STACK_SIZE = 0x100000,
};
/// Maximum number of scatter entries one ReadBlockPointersUnsafe walk may emit.
constexpr u32 MAX_READ_POINTERS = 100000;

/// Scatter list describing a guest read as (length, DRAM-offset) entries.
/// Filled back-to-front: producers write through `tail` and decrement it, so
/// the valid entries are the ones above the final `tail` position.
/// NOTE(review): `tail` is not initialized here — confirm every caller points
/// it into `data` before the first walk.
struct ReadPointers {
    struct ReadPointer {
        u32 copy_amount; ///< Bytes to copy for this entry.
        u64 backing_offset; ///< Offset from the DRAM base; DramMemoryMap::Size marks unmapped.
    };
    std::array<ReadPointer, MAX_READ_POINTERS> data;
    ReadPointer* tail; ///< Next free slot; moves downward as entries are added.
};
/// Central class that handles all memory operations and state.
class Memory {
public:
@@ -348,6 +360,8 @@ public:
*/
void ReadBlockUnsafe(VAddr src_addr, void* dest_buffer, std::size_t size);
void ReadBlockPointersUnsafe(VAddr src_addr, ReadPointers& result, std::size_t size);
/**
* Writes a range of bytes into a given process' address space at the specified
* virtual address.
@@ -435,6 +449,8 @@ public:
*/
void RasterizerMarkRegionCached(VAddr vaddr, u64 size, bool cached);
Core::DeviceMemory& GetDeviceMemory();
private:
Core::System& system;

View File

@@ -132,6 +132,8 @@ add_library(video_core STATIC
renderer_vulkan/vk_fence_manager.h
renderer_vulkan/vk_graphics_pipeline.cpp
renderer_vulkan/vk_graphics_pipeline.h
renderer_vulkan/vk_host_memory.cpp
renderer_vulkan/vk_host_memory.h
renderer_vulkan/vk_master_semaphore.cpp
renderer_vulkan/vk_master_semaphore.h
renderer_vulkan/vk_pipeline_cache.cpp

View File

@@ -16,6 +16,7 @@ set(SHADER_FILES
vulkan_present.vert
vulkan_quad_indexed.comp
vulkan_uint8.comp
vulkan_unswizzle.comp
)
find_program(GLSLANGVALIDATOR "glslangValidator")

View File

@@ -0,0 +1,100 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require

layout (local_size_x = 64) in;

//layout (constant_id = 0) const int BYTES_PER_PIXEL = 1;

// The same storage buffer (binding 0, one 1 GiB DRAM page) aliased at several
// element widths so 1/2/4/8/16-byte texels can all be fetched from it.
layout(binding = 0, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = 0, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; };
layout(binding = 0, std430) readonly buffer InputBufferU32 { uint u32data[]; };
layout(binding = 0, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = 0, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; };

layout(binding = 1) writeonly uniform image2DArray output_image;

// Per-dispatch parameters; mirrors VideoCommon::UnswizzlePushConstants on the
// host side.
layout (push_constant) uniform constants {
    uint size;            // Number of swizzled bytes handled by this dispatch
    uint so_far;          // Swizzled bytes already handled by prior dispatches
    uint ptr;             // Byte offset of the texture within the bound page
    uint bytes_per_pixel;
    uint pitch;           // Upper bound for x (assumed bytes per row — TODO confirm units)
    uint height;
    uint depth;
    uint block_height;    // log2 of GOBs per block in Y
    uint block_depth;     // log2 of GOBs per block in Z
    uint gobs_in_x;
    uint dcl2;
};

const uint GiB = 0x40000000U;

// Fetches one texel of bytes_per_pixel bytes at byte `offset` within the bound
// page, widened to a uvec4. Offsets past the 1 GiB page return zero — this is
// also the sentinel path used for unmapped guest memory.
uvec4 ReadTexel(uint offset) {
    if (offset >= GiB) {
        return uvec4(0);
    }
    switch (bytes_per_pixel) {
    case 1:
        // return uvec4(0xFF, 0, 0, 0);
        return uvec4(u8data[offset], 0, 0, 0);
    case 2:
        return uvec4(u16data[offset / 2], 0, 0, 0);
    case 4:
        // return uvec4(0xFF, 0xFF, 0, 0xFF);
        uint data4 = u32data[offset / 4];
        // Components are extracted most-significant byte first — a byte swap
        // relative to the commented little-endian alternative below.
        // return uvec4(data4 & 0xffu, (data4 >> 8) & 0xffu, (data4 >> 16) & 0xffu, (data4 >> 24) & 0xffu);
        return uvec4((data4 >> 24) & 0xffu, (data4 >> 16) & 0xffu, (data4 >> 8) & 0xffu, data4 & 0xffu);
        // return uvec4(u32data[offset / 4], 0, 0, 0);
    case 8:
        // Only two 32-bit words available; upper components zero-filled.
        return uvec4(u64data[offset / 8], 0, 0);
    case 16:
        return u128data[offset / 16];
    }
    // Unsupported bytes_per_pixel value.
    return uvec4(0);
}
// One invocation per swizzled byte index: decodes the (x, y, z) linear
// coordinate this swizzled offset maps to, fetches the texel, and stores it
// into the output image. The bit manipulation below decodes a 512-byte tile
// ("GOB"-style) layout — NOTE(review): constants match Tegra GOB swizzling as
// implemented elsewhere in the codebase; verify against the CPU unswizzler.
void main() {
    if (gl_GlobalInvocationID.x >= size) {
        return;
    }
    // Absolute swizzled byte offset, accounting for previous dispatches.
    const uint swizzled_offset = gl_GlobalInvocationID.x + so_far;
    // const uint swizzled_offset = 0;
    const uint lesser_x_shift = block_height + block_depth;
    const uint lesser_slice_size = dcl2 * gobs_in_x;
    const uint block_height_mask = (1U << block_height) - 1;
    const uint block_depth_mask = (1U << block_depth) - 1;
    // Position inside the current 512-byte tile.
    const uint entry = swizzled_offset & 511U;
    const uint y_table = ((entry >> 5) & 6U) | ((entry >> 4) & 1U);
    const uint x_entry = ((entry >> 3) & 32U) | ((entry >> 1) & 16U) | (entry & 15U);
    // Which tile, then which block row/slice within the block arrangement.
    const uint base_swizzled_offset = swizzled_offset >> 9;
    const uint set_y = (base_swizzled_offset & block_height_mask) << 3;
    const uint set_z = (base_swizzled_offset >> block_height) & block_depth_mask;
    const uint inner_swizzled = base_swizzled_offset >> lesser_x_shift;
    const uint sli = inner_swizzled / lesser_slice_size;
    const uint gb = inner_swizzled % lesser_slice_size;
    const uint x_inner = (gb % gobs_in_x) << 6;
    const uint y_inner = (gb / gobs_in_x) << (block_height + 3);
    const uint z_inner = sli << block_depth;
    const uint x = x_inner + x_entry;
    const uint y = y_inner + set_y + y_table;
    const uint z = z_inner + set_z;
    // Discard padding bytes that fall outside the image.
    if (x >= pitch || y >= height || z >= depth) {
        return;
    }
    if (z != 0) {
        return; // TODO
    }
    const uvec4 texel = ReadTexel(ptr + swizzled_offset);
    // const uvec4 texel = ReadTexel(0);
    // imageStore(output_image, ivec3(x, y, z), texel);
    // Normalize the integer bytes to [0, 1] for the float image view.
    imageStore(output_image, ivec3(x, y, z), vec4(texel)/255);
    // imageStore(output_image, ivec3(x, y, z), uvec4(1, 1, 1, 1) * ((ptr >> 12) & 0xFF));
    // imageStore(output_image, ivec3(x * 4, y * 4, z), texel);
    // imageStore(output_image, ivec3(x * 4, y * 4, z), uvec4(255, 0, 255, 255));
    // imageStore(output_image, ivec3(x, y, z), uvec4(255, 0, 255, 255));
}

View File

@@ -6,6 +6,7 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/device_memory.h"
#include "core/hle/kernel/k_page_table.h"
#include "core/hle/kernel/k_process.h"
#include "core/memory.h"
@@ -312,6 +313,37 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
}
}
// GPU-side counterpart of Memory::ReadBlockPointersUnsafe: walks the GPU
// virtual range page by page, translates each page to a CPU address and lets
// the CPU memory walker append scatter entries for it. GPU pages with no CPU
// mapping get a single sentinel entry (backing_offset == DramMemoryMap::Size).
void MemoryManager::ReadBlockPointersUnsafe(GPUVAddr gpu_src_addr,
                                            Core::Memory::ReadPointers& result,
                                            const std::size_t size) const {
    std::size_t remaining_size{size};
    std::size_t page_index{gpu_src_addr >> page_bits};
    std::size_t page_offset{gpu_src_addr & page_mask};
    while (remaining_size > 0) {
        // Clamp each chunk to the end of the current GPU page.
        const std::size_t copy_amount{
            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
        if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
            const auto src_addr{*page_addr + page_offset};
            // The CPU walker may split this chunk further along CPU mappings.
            system.Memory().ReadBlockPointersUnsafe(src_addr, result, copy_amount);
        } else {
            // Unmapped GPU page: emit the sentinel entry directly, mirroring
            // the entry-advance logic of the CPU walker.
            auto& tail = result.tail;
            tail->backing_offset = Core::DramMemoryMap::Size;
            tail->copy_amount = static_cast<u32>(copy_amount);
            if (tail == &result.data[0]) {
                LOG_CRITICAL(Debug, "Trying to read too much???");
                abort();
            }
            --tail;
        }
        page_index++;
        page_offset = 0;
        remaining_size -= copy_amount;
    }
}
void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) {
std::size_t remaining_size{size};
std::size_t page_index{gpu_dest_addr >> page_bits};

View File

@@ -16,6 +16,9 @@ class RasterizerInterface;
namespace Core {
class System;
namespace Memory {
struct ReadPointers;
}
}
namespace Tegra {
@@ -111,6 +114,8 @@ public:
* being flushed.
*/
void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
void ReadBlockPointersUnsafe(GPUVAddr gpu_src_addr, Core::Memory::ReadPointers& result,
std::size_t size) const;
void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
/**

View File

@@ -671,8 +671,11 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
Image::~Image() = default;
void Image::UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies) {
void Image::UploadMemory(const ImageBufferMap& map, Tegra::MemoryManager& gpu_memory,
std::array<u8, VideoCommon::MAX_GUEST_SIZE>& scratch) {
const std::span<u8> mapped_span = map.mapped_span;
const auto copies =
VideoCommon::UnswizzleImage(gpu_memory, gpu_addr, info, scratch, mapped_span);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes);

View File

@@ -151,8 +151,8 @@ public:
Image(Image&&) = default;
Image& operator=(Image&&) = default;
void UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void UploadMemory(const ImageBufferMap& map, Tegra::MemoryManager& gpu_memory,
std::array<u8, VideoCommon::MAX_GUEST_SIZE>& scratch);
void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies);

View File

@@ -107,6 +107,7 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
surface(CreateSurface(instance, render_window)),
device(CreateDevice(instance, dld, *surface)),
host_memory(cpu_memory.GetDeviceMemory(), device),
memory_allocator(device, false),
state_tracker(gpu),
scheduler(device, state_tracker),
@@ -115,7 +116,7 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler,
screen_info),
rasterizer(render_window, gpu, gpu.MemoryManager(), cpu_memory, screen_info, device,
memory_allocator, state_tracker, scheduler) {
memory_allocator, state_tracker, scheduler, host_memory) {
Report();
} catch (const vk::Exception& exception) {
LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what());

View File

@@ -11,6 +11,7 @@
#include "common/dynamic_library.h"
#include "video_core/renderer_base.h"
#include "video_core/renderer_vulkan/vk_blit_screen.h"
#include "video_core/renderer_vulkan/vk_host_memory.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_state_tracker.h"
@@ -70,6 +71,7 @@ private:
VKScreenInfo screen_info;
Device device;
VulkanHostMemory host_memory;
MemoryAllocator memory_allocator;
StateTracker state_tracker;
VKScheduler scheduler;

View File

@@ -155,73 +155,7 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer,
std::memcpy(mapped_span.data(), &data, sizeof(data));
if (!use_accelerated) {
const u64 image_offset = GetRawImageOffset(framebuffer, image_index);
const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset;
const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr);
const size_t size_bytes = GetSizeInBytes(framebuffer);
// TODO(Rodrigo): Read this from HLE
constexpr u32 block_height_log2 = 4;
const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
Tegra::Texture::UnswizzleTexture(
mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes),
bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
const VkBufferImageCopy copy{
.bufferOffset = image_offset,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource =
{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.mipLevel = 0,
.baseArrayLayer = 0,
.layerCount = 1,
},
.imageOffset = {.x = 0, .y = 0, .z = 0},
.imageExtent =
{
.width = framebuffer.width,
.height = framebuffer.height,
.depth = 1,
},
};
scheduler.Record([this, copy, image_index](vk::CommandBuffer cmdbuf) {
const VkImage image = *raw_images[image_index];
const VkImageMemoryBarrier base_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = 0,
.dstAccessMask = 0,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.baseMipLevel = 0,
.levelCount = 1,
.baseArrayLayer = 0,
.layerCount = 1,
},
};
VkImageMemoryBarrier read_barrier = base_barrier;
read_barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
read_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
read_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
VkImageMemoryBarrier write_barrier = base_barrier;
write_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
write_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
read_barrier);
cmdbuf.CopyBufferToImage(*buffer, image, VK_IMAGE_LAYOUT_GENERAL, copy);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier);
});
// This seems unused
}
scheduler.Record(
[this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) {

View File

@@ -2,18 +2,15 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <memory>
#include <optional>
#include <utility>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "common/div_ceil.h"
#include "video_core/host_shaders/astc_decoder_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/vulkan_unswizzle_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -22,20 +19,16 @@
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
#include "video_core/texture_cache/types.h"
#include "video_core/textures/astc.h"
#include "video_core/textures/decoders.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
using Tegra::Texture::SWIZZLE_TABLE;
namespace {
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1;
constexpr size_t ASTC_NUM_BINDINGS = 2;
constexpr u32 BUFFER_TO_IMAGE_BINDING_INPUT_BUFFER = 0;
constexpr u32 BUFFER_TO_IMAGE_BINDING_OUTPUT_IMAGE = 1;
constexpr size_t BUFFER_TO_IMAGE_NUM_BINDINGS = 2;
template <size_t size>
inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{
@@ -71,24 +64,25 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.score = 2,
};
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
{
.binding = ASTC_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = ASTC_BINDING_OUTPUT_IMAGE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
constexpr std::array<VkDescriptorSetLayoutBinding, BUFFER_TO_IMAGE_NUM_BINDINGS>
BUFFER_TO_IMAGE_DESCRIPTOR_SET_BINDINGS{{
{
.binding = BUFFER_TO_IMAGE_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = BUFFER_TO_IMAGE_BINDING_OUTPUT_IMAGE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
constexpr DescriptorBankInfo ASTC_BANK_INFO{
constexpr DescriptorBankInfo BUFFER_TO_IMAGE_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 1,
.texture_buffers = 0,
@@ -107,22 +101,22 @@ constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMP
.stride = sizeof(DescriptorUpdateEntry),
};
constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, BUFFER_TO_IMAGE_NUM_BINDINGS>
BUFFER_TO_IMAGE_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{
.dstBinding = ASTC_BINDING_INPUT_BUFFER,
.dstBinding = BUFFER_TO_IMAGE_BINDING_INPUT_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.offset = BUFFER_TO_IMAGE_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
.dstBinding = BUFFER_TO_IMAGE_BINDING_OUTPUT_IMAGE,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.offset = ASTC_BINDING_OUTPUT_IMAGE * sizeof(DescriptorUpdateEntry),
.offset = BUFFER_TO_IMAGE_BINDING_OUTPUT_IMAGE * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
}};
@@ -308,14 +302,11 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_,
MemoryAllocator& memory_allocator_)
: ComputePass(device_, descriptor_pool_, ASTC_DESCRIPTOR_SET_BINDINGS,
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY, ASTC_BANK_INFO,
VKUpdateDescriptorQueue& update_descriptor_queue_)
: ComputePass(device_, descriptor_pool_, BUFFER_TO_IMAGE_DESCRIPTOR_SET_BINDINGS,
BUFFER_TO_IMAGE_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY, BUFFER_TO_IMAGE_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(AstcPushConstants)>, ASTC_DECODER_COMP_SPV),
scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
update_descriptor_queue{update_descriptor_queue_}, memory_allocator{memory_allocator_} {}
scheduler{scheduler_}, update_descriptor_queue{update_descriptor_queue_} {}
ASTCDecoderPass::~ASTCDecoderPass() = default;
@@ -415,4 +406,130 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
scheduler.Finish();
}
// Compute pass that unswizzles guest textures on the GPU: it reads swizzled
// texel data from an imported DRAM page (bound as a storage buffer) and
// writes linear texels into a storage image. Reuses the shared
// buffer-to-image descriptor layout and push-constant range.
UnswizzlePass::UnswizzlePass(const Device& device_, VKScheduler& scheduler_,
                             DescriptorPool& descriptor_pool_,
                             VKUpdateDescriptorQueue& update_descriptor_queue_,
                             VulkanHostMemory& vulkan_host_memory_)
    : ComputePass(device_, descriptor_pool_, BUFFER_TO_IMAGE_DESCRIPTOR_SET_BINDINGS,
                  BUFFER_TO_IMAGE_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY, BUFFER_TO_IMAGE_BANK_INFO,
                  COMPUTE_PUSH_CONSTANT_RANGE<sizeof(VideoCommon::UnswizzlePushConstants)>,
                  VULKAN_UNSWIZZLE_COMP_SPV),
      scheduler{scheduler_}, update_descriptor_queue{update_descriptor_queue_},
      vulkan_host_memory{vulkan_host_memory_} {}

UnswizzlePass::~UnswizzlePass() = default;
namespace {
// Pessimistic access masks covering every way the unswizzled image may be
// written or read afterwards (shader, color attachment, depth-stencil).
static constexpr VkAccessFlags UNSWIZZLE_WRITE_ACCESS_FLAGS =
    VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
    VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
static constexpr VkAccessFlags UNSWIZZLE_READ_ACCESS_FLAGS =
    VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
    VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
} // namespace
// Prepares `image` for a sequence of Assemble() dispatches: transitions every
// mip/layer to GENERAL for shader writes, binds the unswizzle pipeline, and
// resets the per-sequence page/level tracking.
// NOTE(review): last_layer is not reset here while last_page/last_level are —
// harmless today since the caching check in Assemble() is commented out, but
// confirm before re-enabling it.
void UnswizzlePass::Begin(Image& image) {
    // LOG_CRITICAL(Debug, "ReqOut");
    scheduler.RequestOutsideRenderPassOperationContext();
    // scheduler.Finish();
    // LOG_CRITICAL(Debug, "BeforeExchange");
    // First-time use needs no source-access synchronization.
    const bool is_initialized = image.ExchangeInitialization();
    scheduler.Record([vk_pipeline = *pipeline, vk_image = image.Handle(),
                      aspect_mask = image.AspectMask(), is_initialized](vk::CommandBuffer cmdbuf) {
        const VkImageMemoryBarrier image_barrier{
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = is_initialized ? UNSWIZZLE_WRITE_ACCESS_FLAGS : 0,
            .dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            // UNDEFINED old layout: previous contents are discarded.
            .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
            .newLayout = VK_IMAGE_LAYOUT_GENERAL,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .image = vk_image,
            .subresourceRange{
                .aspectMask = aspect_mask,
                .baseMipLevel = 0,
                .levelCount = VK_REMAINING_MIP_LEVELS,
                .baseArrayLayer = 0,
                .layerCount = VK_REMAINING_ARRAY_LAYERS,
            },
        };
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, image_barrier);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, vk_pipeline);
    });
    // LOG_CRITICAL(Debug, "AfterExchange");
    last_page = 0;
    last_level = 0;
}
// Records one unswizzle dispatch for `size` swizzled bytes starting at DRAM
// offset `ptr`. The DRAM address is split into a 1 GiB page index (selects
// which imported buffer to bind) and an in-page offset (passed to the shader
// via push constants). `so_far` carries the byte count handled by earlier
// dispatches of the same image so invocations compute absolute offsets.
void UnswizzlePass::Assemble(Image& image, VideoCommon::UnswizzlePushConstants& unswizzle, u64 ptr,
                             u32 size, u32 so_far, s32 level, s32 layer, bool aspect) {
    u32 new_page = static_cast<u32>(ptr / Core::DramMemoryMap::GiB);
    unswizzle.ptr = ptr % Core::DramMemoryMap::GiB;
    unswizzle.size = size;
    unswizzle.so_far = so_far;
    // A single dispatch must not straddle a page boundary; the shader only has
    // one page bound.
    if (unswizzle.ptr + unswizzle.size >= Core::DramMemoryMap::GiB) {
        LOG_CRITICAL(Debug, "swizzle page align");
        abort();
    }
    // Out-of-range pages (the unmapped-memory sentinel) bind page 0 and point
    // ptr past the page end, which makes the shader's ReadTexel return zeros.
    if (new_page >= Core::DramMemoryMap::GiBs) {
        new_page = 0;
        unswizzle.ptr = Core::DramMemoryMap::GiB;
    }
    // if (new_page == last_page && level == last_level && layer == last_layer) {
    //    return nullptr;
    // }
    // LOG_CRITICAL(Debug, "BeforeAcq");
    update_descriptor_queue.Acquire();
    // LOG_CRITICAL(Debug, "AfterAcq");
    vulkan_host_memory.BindPage(update_descriptor_queue, new_page);
    // LOG_CRITICAL(Debug, "AfterBind");
    update_descriptor_queue.AddImage(image.StorageImageView(level, layer, aspect));
    // Remember what was bound for a (currently disabled) redundancy check.
    last_page = new_page;
    last_level = level;
    last_layer = layer;
    // LOG_CRITICAL(Debug, "BeforeUpdate");
    const void* const descriptor_data{update_descriptor_queue.UpdateData()};
    // One invocation per swizzled byte, 64 invocations per workgroup (matches
    // the shader's local_size_x).
    const u32 num_dispatches_x = Common::DivCeil(unswizzle.size, 64U);
    // LOG_CRITICAL(Debug, "BeforeRecord");
    scheduler.Record(
        [this, num_dispatches_x, unswizzle, descriptor_data](vk::CommandBuffer cmdbuf) {
            // if (descriptor_data) {
            const VkDescriptorSet set = descriptor_allocator.Commit();
            device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
            cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
            // }
            cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, unswizzle);
            cmdbuf.Dispatch(num_dispatches_x, 1, 1);
        });
}
// Ends an unswizzle sequence: makes the compute-shader writes visible to any
// later consumer of the image (shader reads/writes, attachment use). Layout
// stays GENERAL.
void UnswizzlePass::Finish(Image& image) {
    scheduler.Record(
        [vk_image = image.Handle(), aspect_mask = image.AspectMask()](vk::CommandBuffer cmdbuf) {
            const VkImageMemoryBarrier image_barrier{
                .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
                .pNext = nullptr,
                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
                .dstAccessMask = UNSWIZZLE_READ_ACCESS_FLAGS | UNSWIZZLE_WRITE_ACCESS_FLAGS,
                .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
                .newLayout = VK_IMAGE_LAYOUT_GENERAL,
                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                .image = vk_image,
                .subresourceRange{
                    .aspectMask = aspect_mask,
                    .baseMipLevel = 0,
                    .levelCount = VK_REMAINING_MIP_LEVELS,
                    .baseArrayLayer = 0,
                    .layerCount = VK_REMAINING_ARRAY_LAYERS,
                },
            };
            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                   VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
        });
    // scheduler.Finish();
}
} // namespace Vulkan

View File

@@ -10,12 +10,14 @@
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_host_memory.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace VideoCommon {
struct SwizzleParameters;
}
struct UnswizzlePushConstants;
} // namespace VideoCommon
namespace Vulkan {
@@ -87,9 +89,7 @@ class ASTCDecoderPass final : public ComputePass {
public:
explicit ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_,
MemoryAllocator& memory_allocator_);
VKUpdateDescriptorQueue& update_descriptor_queue_);
~ASTCDecoderPass();
void Assemble(Image& image, const StagingBufferRef& map,
@@ -97,9 +97,31 @@ public:
private:
VKScheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
VKUpdateDescriptorQueue& update_descriptor_queue;
MemoryAllocator& memory_allocator;
};
class UnswizzlePass final : public ComputePass {
public:
explicit UnswizzlePass(const Device& device_, VKScheduler& scheduler_,
DescriptorPool& descriptor_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_,
VulkanHostMemory& vulkan_host_memory_);
~UnswizzlePass();
void Begin(Image& image);
void Assemble(Image& image, VideoCommon::UnswizzlePushConstants& swizzle, u64 ptr, u32 size,
u32 so_far, s32 level, s32 layer, bool aspect);
void Finish(Image& image);
private:
VKScheduler& scheduler;
VKUpdateDescriptorQueue& update_descriptor_queue;
VulkanHostMemory& vulkan_host_memory;
u32 last_page;
s32 last_level;
s32 last_layer;
};
} // namespace Vulkan

View File

@@ -458,6 +458,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
}
void GraphicsPipeline::ConfigureDraw() {
const void* const descriptor_data{update_descriptor_queue.UpdateData()};
texture_cache.UpdateRenderTargets(false);
scheduler.RequestRenderpass(texture_cache.GetFramebuffer());
@@ -469,7 +470,6 @@ void GraphicsPipeline::ConfigureDraw() {
});
}
const bool bind_pipeline{scheduler.UpdateGraphicsPipeline(this)};
const void* const descriptor_data{update_descriptor_queue.UpdateData()};
scheduler.Record([this, descriptor_data, bind_pipeline](vk::CommandBuffer cmdbuf) {
if (bind_pipeline) {
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline);

View File

@@ -0,0 +1,100 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
// Originally from
// https://github.com/google/vulkan_test_applications/blob/74e3a9790fb38303cd1646bbc098173fbb9200fa/application_sandbox/external_memory_host/main.cpp
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "common/logging/log.h"
#include "video_core/renderer_vulkan/vk_host_memory.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
namespace Vulkan {
namespace {
// Template create-info for one imported DRAM page: a 1 GiB buffer usable as a
// storage buffer (compute unswizzle) or transfer source.
const VkBufferCreateInfo BUFFER_CREATE_INFO = {
    .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
    .pNext = nullptr,
    .flags = 0,
    .size = Core::DramMemoryMap::GiB,
    .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
    .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
    .queueFamilyIndexCount = 0,
    .pQueueFamilyIndices = nullptr,
};
// Imports the emulated DRAM into Vulkan via VK_EXT_external_memory_host: for
// each 1 GiB slice of the backing allocation, creates a buffer, imports the
// corresponding host pointer as device memory, and binds them. Aborts if the
// driver's requirements (exact size, <= 4 KiB alignment, compatible memory
// types) cannot be satisfied — there is no fallback path.
VulkanHostMemory::VulkanHostMemory(Core::DeviceMemory& memory, Device& device) {
    VkImportMemoryHostPointerInfoEXT import_memory_info{
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
        .pNext = nullptr,
        .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
        .pHostPointer = nullptr, // Filled in per page below.
    };
    VkMemoryAllocateInfo allocate_info{
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .pNext = &import_memory_info,
        .allocationSize = Core::DramMemoryMap::GiB,
        .memoryTypeIndex = 0, // Filled in per page below.
    };
    const auto& logical = device.GetLogical();
    const auto memory_properties = device.GetPhysical().GetMemoryProperties();
    // Walk the DRAM backing allocation one GiB at a time.
    auto host = memory.buffer.BackingBasePointer();
    for (auto& page : pages) {
        page.second = logical.CreateBuffer(BUFFER_CREATE_INFO);
        auto requirements = logical.GetBufferMemoryRequirements(*page.second, nullptr);
        // The buffer must fit exactly into a 1 GiB imported allocation.
        if (requirements.size != Core::DramMemoryMap::GiB) {
            LOG_CRITICAL(Render_Vulkan, "Unexpected required size {}", requirements.size);
            abort();
        }
        // DeviceMemory guarantees 4 KiB alignment of the base pointer; a
        // stricter driver requirement cannot be honored.
        if (requirements.alignment > 4096) {
            LOG_CRITICAL(Render_Vulkan, "Unexpected required alignment {}", requirements.alignment);
            abort();
        }
        // Intersect the buffer's acceptable memory types with those the driver
        // reports as importable for this host pointer.
        u32 host_pointer_memory_type_bits = logical.GetMemoryHostPointerProperties(host);
        import_memory_info.pHostPointer = host;
        u32 memory_type_bits = requirements.memoryTypeBits & host_pointer_memory_type_bits;
        if (!memory_type_bits) {
            LOG_CRITICAL(
                Render_Vulkan,
                "Buffer memory bits({}) are not compatible with host pointer memory type bits ({})",
                requirements.memoryTypeBits, host_pointer_memory_type_bits);
            abort();
        }
        allocate_info.memoryTypeIndex =
            FindMemoryTypeIndex(memory_properties, memory_type_bits, false);
        page.first = logical.AllocateMemory(allocate_info);
        page.second.BindMemory(*page.first, 0);
        host += Core::DramMemoryMap::GiB;
    }
}
// Queues the buffer of the given 1 GiB DRAM page as the next storage-buffer
// descriptor on `update_descriptor_queue`. `page` must be a valid index into
// `pages` (i.e. < Core::DramMemoryMap::GiBs); out-of-range values abort,
// since binding the wrong page would silently feed garbage to the shader.
//
// NOTE: a leftover debug vkMapMemory/vkUnmapMemory round-trip (its writes
// were already commented out) was removed here; it had no functional effect
// and added per-bind driver overhead.
void VulkanHostMemory::BindPage(VKUpdateDescriptorQueue& update_descriptor_queue, u32 page) {
    if (page >= Core::DramMemoryMap::GiBs) {
        abort();
    }
    std::pair<vk::DeviceMemory, vk::Buffer>& pair = pages[page];
    update_descriptor_queue.AddBuffer(*pair.second, 0, Core::DramMemoryMap::GiB);
}
} // namespace Vulkan

View File

@@ -0,0 +1,23 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#pragma once

#include <array>
#include <utility>

#include "core/device_memory.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/vulkan_common/vulkan_device.h"

namespace Vulkan {

/// Imports the emulated DRAM into Vulkan as one host-pointer-backed buffer per
/// GiB page, so shaders can read guest memory without staging copies.
class VulkanHostMemory {
public:
    /// Creates one VkBuffer per DRAM GiB page, each bound to device memory
    /// imported from the backing host allocation.
    explicit VulkanHostMemory(Core::DeviceMemory& memory, Device& device);

    /// Pushes the buffer for the given DRAM page onto the descriptor queue.
    void BindPage(VKUpdateDescriptorQueue& update_descriptor_queue, u32 page);

private:
    /// Per-GiB (imported memory, buffer) pair covering guest DRAM.
    std::array<std::pair<vk::DeviceMemory, vk::Buffer>, Core::DramMemoryMap::GiBs> pages;
};

} // namespace Vulkan

View File

@@ -4,6 +4,7 @@
#include <algorithm>
#include <array>
#include <chrono>
#include <memory>
#include <mutex>
#include <vector>
@@ -125,7 +126,8 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
Tegra::MemoryManager& gpu_memory_,
Core::Memory::Memory& cpu_memory_, VKScreenInfo& screen_info_,
const Device& device_, MemoryAllocator& memory_allocator_,
StateTracker& state_tracker_, VKScheduler& scheduler_)
StateTracker& state_tracker_, VKScheduler& scheduler_,
VulkanHostMemory& host_memory_)
: RasterizerAccelerated{cpu_memory_}, gpu{gpu_},
gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()},
screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_},
@@ -133,12 +135,13 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler),
update_descriptor_queue(device, scheduler),
blit_image(device, scheduler, state_tracker, descriptor_pool),
astc_decoder_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue,
memory_allocator),
astc_decoder_pass(device, scheduler, descriptor_pool, update_descriptor_queue),
unswizzle_pass(device, scheduler, descriptor_pool, update_descriptor_queue, host_memory_),
render_pass_cache(device), texture_cache_runtime{device, scheduler,
memory_allocator, staging_pool,
blit_image, astc_decoder_pass,
render_pass_cache},
unswizzle_pass,
render_pass_cache, {}},
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
update_descriptor_queue, descriptor_pool),
@@ -155,6 +158,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
RasterizerVulkan::~RasterizerVulkan() = default;
void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
// auto t1 = std::chrono::high_resolution_clock::now();
MICROPROFILE_SCOPE(Vulkan_Drawing);
SCOPE_EXIT({ gpu.TickWork(); });
@@ -168,9 +172,11 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
}
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
pipeline->Configure(is_indexed);
// auto t2 = std::chrono::high_resolution_clock::now();
BeginTransformFeedback();
// auto t3 = std::chrono::high_resolution_clock::now();
UpdateDynamicStates();
const auto& regs{maxwell3d.regs};
@@ -187,6 +193,14 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
}
});
EndTransformFeedback();
// auto t4 = std::chrono::high_resolution_clock::now();
// auto count1 = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
// auto count2 = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
// auto count3 = std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3).count();
// auto count4 = std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t1).count();
// if (count4 > 1) {
// LOG_CRITICAL(Debug, "{} {} {}", count1, count2, count3);
// }
}
void RasterizerVulkan::Clear() {

View File

@@ -21,6 +21,7 @@
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_fence_manager.h"
#include "video_core/renderer_vulkan/vk_host_memory.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
@@ -67,7 +68,7 @@ public:
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
VKScreenInfo& screen_info_, const Device& device_,
MemoryAllocator& memory_allocator_, StateTracker& state_tracker_,
VKScheduler& scheduler_);
VKScheduler& scheduler_, VulkanHostMemory& host_memory_);
~RasterizerVulkan() override;
void Draw(bool is_indexed, bool is_instanced) override;
@@ -154,6 +155,7 @@ private:
VKUpdateDescriptorQueue update_descriptor_queue;
BlitImageHelper blit_image;
ASTCDecoderPass astc_decoder_pass;
UnswizzlePass unswizzle_pass;
RenderPassCache render_pass_cache;
TextureCacheRuntime texture_cache_runtime;

View File

@@ -61,6 +61,11 @@ std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& p
return std::nullopt;
}
// Maps a byte position to the index of the staging-pool region containing it.
size_t Region(size_t iterator) noexcept {
    const size_t region_index = iterator / REGION_SIZE;
    return region_index;
}
} // Anonymous namespace
u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
bool try_device_local) {
std::optional<u32> type;
@@ -80,11 +85,6 @@ u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_
throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
size_t Region(size_t iterator) noexcept {
return iterator / REGION_SIZE;
}
} // Anonymous namespace
StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
VKScheduler& scheduler_)
: device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {

View File

@@ -23,6 +23,9 @@ struct StagingBufferRef {
std::span<u8> mapped_span;
};
u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
bool try_device_local);
class StagingBufferPool {
public:
static constexpr size_t NUM_SYNCS = 16;

View File

@@ -4,6 +4,8 @@
#include <algorithm>
#include <array>
#include <chrono>
#include <iostream>
#include <span>
#include <vector>
@@ -21,6 +23,7 @@
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/textures/decoders.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -106,9 +109,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
UNREACHABLE_MSG("Invalid surface type");
}
}
if (info.storage) {
usage |= VK_IMAGE_USAGE_STORAGE_BIT;
}
// if (info.storage) {
usage |= VK_IMAGE_USAGE_STORAGE_BIT;
// }
return usage;
}
@@ -416,59 +419,6 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
}
}
// Records a buffer-to-image copy surrounded by the image memory barriers and
// layout transitions it requires: the image moves GENERAL (or UNDEFINED when
// not yet initialized) -> TRANSFER_DST_OPTIMAL for the copy, then back to
// GENERAL for shader/attachment access. Barriers cover all mips and layers.
void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image,
                       VkImageAspectFlags aspect_mask, bool is_initialized,
                       std::span<const VkBufferImageCopy> copies) {
    static constexpr VkAccessFlags WRITE_ACCESS_FLAGS =
        VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
    static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT |
                                                       VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
                                                       VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
    // Make prior writes visible to the transfer and transition to TRANSFER_DST.
    // UNDEFINED is used for the first upload so the old contents are discarded.
    const VkImageMemoryBarrier read_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = WRITE_ACCESS_FLAGS,
        .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED,
        .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange{
            .aspectMask = aspect_mask,
            .baseMipLevel = 0,
            .levelCount = VK_REMAINING_MIP_LEVELS,
            .baseArrayLayer = 0,
            .layerCount = VK_REMAINING_ARRAY_LAYERS,
        },
    };
    // Make the transfer write visible to subsequent reads/writes and return
    // the image to GENERAL, the layout the rest of the renderer expects.
    const VkImageMemoryBarrier write_barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS,
        .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .newLayout = VK_IMAGE_LAYOUT_GENERAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange{
            .aspectMask = aspect_mask,
            .baseMipLevel = 0,
            .levelCount = VK_REMAINING_MIP_LEVELS,
            .baseArrayLayer = 0,
            .layerCount = VK_REMAINING_ARRAY_LAYERS,
        },
    };
    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
                           read_barrier);
    cmdbuf.CopyBufferToImage(src_buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copies);
    // TODO: Move this to another API
    cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0,
                           write_barrier);
}
[[nodiscard]] VkImageBlit MakeImageBlit(const Region2D& dst_region, const Region2D& src_region,
const VkImageSubresourceLayers& dst_layers,
const VkImageSubresourceLayers& src_layers) {
@@ -625,7 +575,7 @@ void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst
return;
}
}
ASSERT(src.format == dst.format);
// ASSERT(src.format == dst.format);
ASSERT(!(is_dst_msaa && !is_src_msaa));
ASSERT(operation == Fermi2D::Operation::SrcCopy);
@@ -844,7 +794,8 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
: VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler},
image(MakeImage(runtime.device, info)),
commit(runtime.memory_allocator.Commit(image, MemoryUsage::DeviceLocal)),
aspect_mask(ImageAspectMask(info.format)) {
aspect_mask(ImageAspectMask(info.format)), unswizzle_pass{&runtime.unswizzle_pass},
read_pointers{&runtime.read_pointers} {
if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) {
if (Settings::values.accelerate_astc.GetValue()) {
flags |= VideoCommon::ImageFlagBits::AcceleratedUpload;
@@ -860,49 +811,197 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
.pNext = nullptr,
.usage = VK_IMAGE_USAGE_STORAGE_BIT,
};
const auto& device = runtime.device.GetLogical();
VkImageViewCreateInfo create_info{
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
.pNext = &storage_image_view_usage_create_info,
.flags = 0,
.image = *image,
.viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY,
.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32,
.components{
.r = VK_COMPONENT_SWIZZLE_IDENTITY,
.g = VK_COMPONENT_SWIZZLE_IDENTITY,
.b = VK_COMPONENT_SWIZZLE_IDENTITY,
.a = VK_COMPONENT_SWIZZLE_IDENTITY,
},
.subresourceRange{
.aspectMask = aspect_mask,
.baseMipLevel = ~0U,
.levelCount = 1,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
const u32 levels = static_cast<u32>(info.resources.levels);
const u32 layers = static_cast<u32>(info.resources.layers);
if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) {
const auto& device = runtime.device.GetLogical();
storage_image_views.reserve(info.resources.levels);
for (s32 level = 0; level < info.resources.levels; ++level) {
storage_image_views.push_back(device.CreateImageView(VkImageViewCreateInfo{
.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
.pNext = &storage_image_view_usage_create_info,
.flags = 0,
.image = *image,
.viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY,
.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32,
.components{
.r = VK_COMPONENT_SWIZZLE_IDENTITY,
.g = VK_COMPONENT_SWIZZLE_IDENTITY,
.b = VK_COMPONENT_SWIZZLE_IDENTITY,
.a = VK_COMPONENT_SWIZZLE_IDENTITY,
},
.subresourceRange{
.aspectMask = aspect_mask,
.baseMipLevel = static_cast<u32>(level),
.levelCount = 1,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
}));
storage_image_views.reserve(levels);
for (u32 level = 0; level < levels; ++level) {
create_info.subresourceRange.baseMipLevel = level;
storage_image_views.push_back(device.CreateImageView(create_info));
}
LOG_CRITICAL(Debug, "astc");
abort();
} else {
switch (info.type) {
case ImageType::e2D:
case ImageType::e3D:
break;
default:
LOG_CRITICAL(Debug, "info.type {}", info.type);
// abort();
return;
}
create_info.subresourceRange.layerCount = 1;
const bool depth_stencil_aspect =
aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
storage_image_views.reserve(levels * layers * (1 + depth_stencil_aspect));
for (u32 level = 0; level < levels; ++level) {
create_info.subresourceRange.baseMipLevel = level;
for (u32 layer = 0; layer < layers; ++layer) {
create_info.subresourceRange.baseArrayLayer = layer;
if (depth_stencil_aspect) {
create_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
storage_image_views.push_back(device.CreateImageView(create_info));
create_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
}
storage_image_views.push_back(device.CreateImageView(create_info));
}
}
}
}
Image::~Image() = default;
void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
// TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
const VkBuffer src_buffer = map.buffer;
const VkImage vk_image = *image;
const VkImageAspectFlags vk_aspect_mask = aspect_mask;
const bool is_initialized = std::exchange(initialized, true);
scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized,
vk_copies](vk::CommandBuffer cmdbuf) {
CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized, vk_copies);
});
// static int uploadmemorycount = 0;
// Uploads this image's texels by dispatching the GPU unswizzle pass instead of
// unswizzling on the CPU. The staging buffer and scratch parameters are unused
// in this path. Guest memory is gathered as (backing_offset, size) spans via
// ReadBlockPointersUnsafe, then each mip level/layer span is handed to
// UnswizzlePass::Assemble; Finish() submits the recorded work.
// NOTE(review): the chrono timestamps and LOG_CRITICAL at the end are leftover
// profiling instrumentation that logs on every upload.
void Image::UploadMemory(const StagingBufferRef&, Tegra::MemoryManager& gpu_memory,
                         std::array<u8, VideoCommon::MAX_GUEST_SIZE>&) {
    const auto t1 = std::chrono::high_resolution_clock::now();
    // const int debug_id = uploadmemorycount++;
    // LOG_CRITICAL(Debug, "UploadMemory Starting {}", debug_id);
    unswizzle_pass->Begin(*this);
    // LOG_CRITICAL(Debug, "UploadMemory Begun {}", debug_id);
    using namespace VideoCommon;
    const size_t actual_guest_size_bytes = CalculateGuestSizeInBytes(info);
    // if (actual_guest_size_bytes >= VideoCommon::MAX_GUEST_SIZE) {
    //     LOG_CRITICAL(Debug, "guest_size {}", actual_guest_size_bytes);
    //     abort();
    // }
    const u32 bpp_log2 = BytesPerBlockLog2(info.format);
    const Extent3D size = info.size;
    // Linear (pitch) images are not handled by this path yet.
    if (info.type == ImageType::Linear) {
        // abort(); // TODO
        LOG_CRITICAL(Debug, "Linear???");
        return;
    }
    // The read-pointer list is filled back-to-front: tail starts at the last
    // slot and ReadBlockPointersUnsafe moves it toward the front.
    read_pointers->tail = &read_pointers->data.back();
    // LOG_CRITICAL(Debug, "actually {}", actual_guest_size_bytes);
    const auto t2 = std::chrono::high_resolution_clock::now();
    gpu_memory.ReadBlockPointersUnsafe(gpu_addr, *read_pointers, actual_guest_size_bytes);
    const auto t3 = std::chrono::high_resolution_clock::now();
    const LevelInfo level_info = MakeLevelInfo(info);
    const s32 num_levels = info.resources.levels;
    const Extent2D tile_size = DefaultBlockSize(info.format);
    const std::array level_sizes = CalculateLevelSizes(level_info, num_levels);
    const Extent2D gob = GobSize(bpp_log2, info.block.height, info.tile_width_spacing);
    const u32 layer_size = CalculateLevelBytes(level_sizes, num_levels);
    const u32 layer_stride = AlignLayerSize(layer_size, size, level_info.block, tile_size.height,
                                            info.tile_width_spacing);
    const bool depth_stencil_aspect =
        aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
    UnswizzlePushConstants unswizzle{};
    const auto guest_end = read_pointers->tail;
    // Walks `size_` bytes through the (tail, offset) cursor over the gathered
    // read-pointer spans, invoking `action(backing_offset, span_bytes)` for
    // each contiguous piece. The cursor moves from data.back() toward
    // guest_end; hitting guest_end means the requested range ran past what
    // was gathered.
    const auto advance_tail = [guest_end](Core::Memory::ReadPointers::ReadPointer*& tail,
                                          u32& offset, u32 size_, auto action) {
        if (tail == guest_end) {
            LOG_CRITICAL(Debug, "tail guest already");
            return;
        }
        while (size_ && size_ + offset >= tail->copy_amount) {
            const auto ptr_size = tail->copy_amount - offset;
            if (!ptr_size) {
                LOG_CRITICAL(Debug, "ptr???size {} {} {}", size_, offset, tail->copy_amount);
                abort();
            }
            action(tail->backing_offset + offset, ptr_size);
            size_ -= ptr_size;
            offset = 0;
            --tail;
            if (tail == guest_end) {
                // LOG_CRITICAL(Debug, "tail guest end");
                ++tail;
                // abort();
                offset = 0;
                return;
            }
        }
        if (size_) {
            action(tail->backing_offset + offset, size_);
        }
        offset += size_;
    };
    auto guest_tail = &read_pointers->data.back();
    u32 guest_offset = 0;
    for (s32 level = 0; level < num_levels; ++level) {
        // if (const auto levelsz = level_sizes[level]; true) {
        //     LOG_CRITICAL(Debug, "levelsz {} layersz {}", levelsz, layer_stride);
        // }
        const Extent3D level_size = AdjustMipSize(size, level);
        const Extent3D num_tiles = AdjustTileSize(level_size, tile_size);
        const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level);
        const u32 stride_alignment = StrideAlignment(num_tiles, info.block, gob, bpp_log2);
        // Fills the push constants describing this level's block-linear layout.
        Tegra::Texture::CalculateUnswizzle(unswizzle, 1U << bpp_log2, num_tiles.width,
                                           num_tiles.height, num_tiles.depth, block.height,
                                           block.depth, stride_alignment);
        auto guest_layer_tail = guest_tail;
        u32 guest_layer_offset = 0;
        const auto level_advance_size = level_sizes[level];
        for (s32 layer = 0; layer < info.resources.layers; ++layer) {
            // Dispatch one Assemble per contiguous guest span of this
            // level/layer; `aspect` selects stencil vs depth for combined
            // depth-stencil images.
            const auto assemble = [this, &unswizzle, level, layer, guest_layer_tail,
                                   guest_layer_offset, level_advance_size,
                                   advance_tail](bool aspect) {
                auto assemble_tail = guest_layer_tail;
                auto assemble_offset = guest_layer_offset;
                u32 so_far = 0;
                advance_tail(
                    assemble_tail, assemble_offset, level_advance_size,
                    [this, &unswizzle, level, layer, aspect, &so_far](u64 ptr, u32 ptr_size) {
                        // LOG_CRITICAL(Debug, "ptr_size
                        // {}", ptr_size);
                        unswizzle_pass->Assemble(*this, unswizzle, ptr, ptr_size, so_far, level,
                                                 layer, aspect);
                        so_far += ptr_size;
                    });
                // LOG_CRITICAL(Debug, "total so_far {}", so_far);
            };
            assemble(false);
            if (depth_stencil_aspect) {
                assemble(true);
            }
            // Skip to the next layer; the empty action just moves the cursor.
            advance_tail(guest_layer_tail, guest_layer_offset, layer_stride, [](u64, u32) {});
        }
        // Advance the level cursor past this level's data.
        advance_tail(guest_tail, guest_offset, level_advance_size, [](u64, u32) {});
    }
    // LOG_CRITICAL(Debug, "UploadMemory Finishing {}", debug_id);
    const auto t4 = std::chrono::high_resolution_clock::now();
    unswizzle_pass->Finish(*this);
    const auto t5 = std::chrono::high_resolution_clock::now();
    // const auto count0 = std::chrono::duration_cast<std::chrono::microseconds>(t5 - t1).count();
    const auto count1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    const auto count2 = std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count();
    const auto count3 = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count();
    const auto count4 = std::chrono::duration_cast<std::chrono::microseconds>(t5 - t4).count();
    // if (count0 > 1) {
    LOG_CRITICAL(Debug, "{} {} {} {} bpp {}", count1, count2, count3, count4, 1U << bpp_log2);
    // }
    // LOG_CRITICAL(Debug, "UploadMemory Done {}", debug_id);
    // sleep(1);
    // if (debug_id == 23) {
    //     __asm__("int3");
    //     abort();
    // }
}
void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {

View File

@@ -7,6 +7,7 @@
#include <compare>
#include <span>
#include "core/memory.h"
#include "shader_recompiler/shader_info.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/texture_cache/texture_cache_base.h"
@@ -29,6 +30,7 @@ class ImageView;
class Framebuffer;
class RenderPassCache;
class StagingBufferPool;
class UnswizzlePass;
class VKScheduler;
struct TextureCacheRuntime {
@@ -38,7 +40,9 @@ struct TextureCacheRuntime {
StagingBufferPool& staging_buffer_pool;
BlitImageHelper& blit_image_helper;
ASTCDecoderPass& astc_decoder_pass;
UnswizzlePass& unswizzle_pass;
RenderPassCache& render_pass_cache;
Core::Memory::ReadPointers read_pointers;
void Finish();
@@ -90,8 +94,8 @@ public:
Image(Image&&) = default;
Image& operator=(Image&&) = default;
void UploadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void UploadMemory(const StagingBufferRef& map, Tegra::MemoryManager& gpu_memory,
std::array<u8, VideoCommon::MAX_GUEST_SIZE>& scratch);
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
@@ -104,8 +108,15 @@ public:
return aspect_mask;
}
[[nodiscard]] VkImageView StorageImageView(s32 level) const noexcept {
return *storage_image_views[level];
[[nodiscard]] VkImageView StorageImageView(s32 level, s32 layer = 0,
bool aspect = false) const noexcept {
const auto idx = static_cast<u32>(layer + info.resources.layers *
(level + aspect * info.resources.levels));
if (idx >= storage_image_views.size()) {
LOG_CRITICAL(Debug, "{} {} {}", idx, storage_image_views.size(), aspect);
abort();
}
return *storage_image_views[idx];
}
/// Returns true when the image is already initialized and mark it as initialized
@@ -121,6 +132,8 @@ private:
std::vector<vk::ImageView> storage_image_views;
VkImageAspectFlags aspect_mask = 0;
bool initialized = false;
UnswizzlePass* unswizzle_pass;
Core::Memory::ReadPointers* read_pointers;
};
class ImageView : public VideoCommon::ImageViewBase {

View File

@@ -77,7 +77,7 @@ private:
DescriptorUpdateEntry* payload_cursor = nullptr;
const DescriptorUpdateEntry* upload_start = nullptr;
std::array<DescriptorUpdateEntry, 0x10000> payload;
std::array<DescriptorUpdateEntry, 0x100000> payload;
};
} // namespace Vulkan

View File

@@ -272,6 +272,7 @@ void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table,
std::span<ImageViewId> cached_image_view_ids,
std::span<const u32> indices,
std::span<ImageViewId> image_view_ids) {
// auto t1 = std::chrono::high_resolution_clock::now();
ASSERT(indices.size() <= image_view_ids.size());
do {
has_deleted_images = false;
@@ -279,24 +280,40 @@ void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table,
return VisitImageView(table, cached_image_view_ids, index);
});
} while (has_deleted_images);
// auto t2 = std::chrono::high_resolution_clock::now();
// auto count1 = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
// if (count1 > 1) {
// LOG_CRITICAL(Debug, "{}", count1);
// }
}
template <class P>
ImageViewId TextureCache<P>::VisitImageView(DescriptorTable<TICEntry>& table,
std::span<ImageViewId> cached_image_view_ids,
u32 index) {
// auto t1 = std::chrono::high_resolution_clock::now();
if (index > table.Limit()) {
LOG_DEBUG(HW_GPU, "Invalid image view index={}", index);
return NULL_IMAGE_VIEW_ID;
}
const auto [descriptor, is_new] = table.Read(index);
ImageViewId& image_view_id = cached_image_view_ids[index];
// auto t2 = std::chrono::high_resolution_clock::now();
if (is_new) {
image_view_id = FindImageView(descriptor);
}
// auto t3 = std::chrono::high_resolution_clock::now();
if (image_view_id != NULL_IMAGE_VIEW_ID) {
PrepareImageView(image_view_id, false, false);
}
// auto t4 = std::chrono::high_resolution_clock::now();
// auto count1 = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
// auto count2 = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
// auto count3 = std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3).count();
// auto count4 = std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t1).count();
// if (count4 > 1) {
// LOG_CRITICAL(Debug, "{} {} {} {}", count1, count2, count3, count4);
// }
return image_view_id;
}
@@ -539,29 +556,26 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
UploadImageContents(image);
runtime.InsertUploadMemoryBarrier();
}
template <class P>
template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
const std::span<u8> mapped_span = staging.mapped_span;
const GPUVAddr gpu_addr = image.gpu_addr;
void TextureCache<P>::UploadImageContents(Image& image) {
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
if (True(image.flags & ImageFlagBits::AcceleratedUpload)) {
gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes());
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, uploads);
// gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(),
// mapped_span.size_bytes()); const auto uploads = FullUploadSwizzles(image.info);
// runtime.AccelerateImageUpload(image, staging, uploads);
abort();
} else if (True(image.flags & ImageFlagBits::Converted)) {
std::vector<u8> unswizzled_data(image.unswizzled_size_bytes);
auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data);
ConvertImage(unswizzled_data, image.info, mapped_span, copies);
image.UploadMemory(staging, copies);
// std::vector<u8> unswizzled_data(image.unswizzled_size_bytes);
// auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data);
// ConvertImage(unswizzled_data, image.info, mapped_span, copies);
// image.UploadMemory(staging, copies);
abort();
} else {
const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span);
image.UploadMemory(staging, copies);
image.UploadMemory(staging, gpu_memory, unswizzle_scratch);
}
}
@@ -1378,8 +1392,14 @@ void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool
TrackImage(image, image_id);
}
} else {
// auto t1 = std::chrono::high_resolution_clock::now();
RefreshContents(image, image_id);
SynchronizeAliases(image_id);
// auto t2 = std::chrono::high_resolution_clock::now();
// auto count1 = std::chrono::duration_cast<std::chrono::milliseconds>(t2 -
// t1).count(); if (count1 > 1) {
// LOG_CRITICAL(Debug, "{}", count1);
// }
}
if (is_modification) {
MarkModification(image);

View File

@@ -217,9 +217,8 @@ private:
/// Refresh the contents (pixel data) of an image
void RefreshContents(Image& image, ImageId image_id);
/// Upload data from guest to an image
template <typename StagingBuffer>
void UploadImageContents(Image& image, StagingBuffer& staging_buffer);
/// Upload data from guest to an imag
void UploadImageContents(Image& image);
/// Find or create an image view from a guest descriptor
[[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
@@ -328,6 +327,7 @@ private:
Tegra::Engines::Maxwell3D& maxwell3d;
Tegra::Engines::KeplerCompute& kepler_compute;
Tegra::MemoryManager& gpu_memory;
std::array<u8, MAX_GUEST_SIZE> unswizzle_scratch;
DescriptorTable<TICEntry> graphics_image_table{gpu_memory};
DescriptorTable<TSCEntry> graphics_sampler_table{gpu_memory};

View File

@@ -145,4 +145,18 @@ struct SwizzleParameters {
s32 level;
};
// Push-constant block consumed by the GPU unswizzle compute shader. Filled by
// Tegra::Texture::CalculateUnswizzle plus per-dispatch fields set at Assemble
// time. Field order and sizes must match the shader's push-constant layout
// exactly -- do not reorder.
struct UnswizzlePushConstants {
    u32 size;            // bytes covered by this dispatch's guest span
    u32 ptr;             // offset of the span within guest DRAM -- TODO confirm units
    u32 so_far;          // bytes of this level/layer already processed by prior dispatches
    u32 bytes_per_pixel; // bytes per block/pixel (1 << bpp_log2)
    u32 pitch;           // row pitch -- presumably bytes; verify against the shader
    u32 height;
    u32 depth;
    u32 block_height;    // block-linear block height -- presumably log2 GOBs; confirm
    u32 block_depth;     // block-linear block depth -- presumably log2 GOBs; confirm
    u32 gobs_in_x;       // GOB count along X -- TODO confirm
    u32 dcl2;            // meaning not evident from this file -- TODO document
};
} // namespace VideoCommon

View File

@@ -52,8 +52,6 @@
namespace VideoCommon {
namespace {
using Tegra::Texture::GOB_SIZE;
using Tegra::Texture::GOB_SIZE_SHIFT;
using Tegra::Texture::GOB_SIZE_X;
@@ -80,15 +78,7 @@ using VideoCore::Surface::SurfaceType;
constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM);
// Per-image parameters used by the level-size helpers below to compute
// block-linear mip level footprints.
struct LevelInfo {
    Extent3D size;          // image dimensions in texels
    Extent3D block;         // block-linear block dimensions -- presumably log2 GOBs; confirm
    Extent2D tile_size;     // compressed-format tile footprint in texels
    u32 bpp_log2;           // log2 of bytes per block
    u32 tile_width_spacing; // extra width alignment between tiles -- TODO confirm semantics
};
[[nodiscard]] constexpr u32 AdjustTileSize(u32 shift, u32 unit_factor, u32 dimension) {
[[nodiscard]] u32 AdjustTileSize(u32 shift, u32 unit_factor, u32 dimension) {
if (shift == 0) {
return 0;
}
@@ -104,11 +94,11 @@ struct LevelInfo {
return shift;
}
[[nodiscard]] constexpr u32 AdjustMipSize(u32 size, u32 level) {
[[nodiscard]] u32 AdjustMipSize(u32 size, u32 level) {
return std::max<u32>(size >> level, 1);
}
[[nodiscard]] constexpr Extent3D AdjustMipSize(Extent3D size, s32 level) {
[[nodiscard]] Extent3D AdjustMipSize(Extent3D size, s32 level) {
return Extent3D{
.width = AdjustMipSize(size.width, level),
.height = AdjustMipSize(size.height, level),
@@ -126,7 +116,7 @@ struct LevelInfo {
}
template <u32 GOB_EXTENT>
[[nodiscard]] constexpr u32 AdjustMipBlockSize(u32 num_tiles, u32 block_size, u32 level) {
[[nodiscard]] u32 AdjustMipBlockSize(u32 num_tiles, u32 block_size, u32 level) {
do {
while (block_size > 0 && num_tiles <= (1U << (block_size - 1)) * GOB_EXTENT) {
--block_size;
@@ -135,8 +125,7 @@ template <u32 GOB_EXTENT>
return block_size;
}
[[nodiscard]] constexpr Extent3D AdjustMipBlockSize(Extent3D num_tiles, Extent3D block_size,
u32 level) {
[[nodiscard]] Extent3D AdjustMipBlockSize(Extent3D num_tiles, Extent3D block_size, u32 level) {
return {
.width = AdjustMipBlockSize<GOB_SIZE_X>(num_tiles.width, block_size.width, level),
.height = AdjustMipBlockSize<GOB_SIZE_Y>(num_tiles.height, block_size.height, level),
@@ -144,7 +133,7 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr Extent3D AdjustTileSize(Extent3D size, Extent2D tile_size) {
[[nodiscard]] Extent3D AdjustTileSize(Extent3D size, Extent2D tile_size) {
return {
.width = Common::DivCeil(size.width, tile_size.width),
.height = Common::DivCeil(size.height, tile_size.height),
@@ -152,28 +141,28 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr u32 BytesPerBlockLog2(u32 bytes_per_block) {
[[nodiscard]] u32 BytesPerBlockLog2(u32 bytes_per_block) {
return std::countl_zero(bytes_per_block) ^ 0x1F;
}
[[nodiscard]] constexpr u32 BytesPerBlockLog2(PixelFormat format) {
[[nodiscard]] u32 BytesPerBlockLog2(PixelFormat format) {
return BytesPerBlockLog2(BytesPerBlock(format));
}
[[nodiscard]] constexpr u32 NumBlocks(Extent3D size, Extent2D tile_size) {
[[nodiscard]] u32 NumBlocks(Extent3D size, Extent2D tile_size) {
const Extent3D num_blocks = AdjustTileSize(size, tile_size);
return num_blocks.width * num_blocks.height * num_blocks.depth;
}
[[nodiscard]] constexpr u32 AdjustSize(u32 size, u32 level, u32 block_size) {
[[nodiscard]] u32 AdjustSize(u32 size, u32 level, u32 block_size) {
return Common::DivCeil(AdjustMipSize(size, level), block_size);
}
[[nodiscard]] constexpr Extent2D DefaultBlockSize(PixelFormat format) {
[[nodiscard]] Extent2D DefaultBlockSize(PixelFormat format) {
return {DefaultBlockWidth(format), DefaultBlockHeight(format)};
}
[[nodiscard]] constexpr Extent3D NumLevelBlocks(const LevelInfo& info, u32 level) {
[[nodiscard]] Extent3D NumLevelBlocks(const LevelInfo& info, u32 level) {
return Extent3D{
.width = AdjustSize(info.size.width, level, info.tile_size.width) << info.bpp_log2,
.height = AdjustSize(info.size.height, level, info.tile_size.height),
@@ -181,7 +170,7 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr Extent3D TileShift(const LevelInfo& info, u32 level) {
[[nodiscard]] Extent3D TileShift(const LevelInfo& info, u32 level) {
const Extent3D blocks = NumLevelBlocks(info, level);
return Extent3D{
.width = AdjustTileSize(info.block.width, GOB_SIZE_X, blocks.width),
@@ -190,21 +179,19 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr Extent2D GobSize(u32 bpp_log2, u32 block_height, u32 tile_width_spacing) {
[[nodiscard]] Extent2D GobSize(u32 bpp_log2, u32 block_height, u32 tile_width_spacing) {
return Extent2D{
.width = GOB_SIZE_X_SHIFT - bpp_log2 + tile_width_spacing,
.height = GOB_SIZE_Y_SHIFT + block_height,
};
}
[[nodiscard]] constexpr bool IsSmallerThanGobSize(Extent3D num_tiles, Extent2D gob,
u32 block_depth) {
[[nodiscard]] bool IsSmallerThanGobSize(Extent3D num_tiles, Extent2D gob, u32 block_depth) {
return num_tiles.width <= (1U << gob.width) || num_tiles.height <= (1U << gob.height) ||
num_tiles.depth < (1U << block_depth);
}
[[nodiscard]] constexpr u32 StrideAlignment(Extent3D num_tiles, Extent3D block, Extent2D gob,
u32 bpp_log2) {
[[nodiscard]] u32 StrideAlignment(Extent3D num_tiles, Extent3D block, Extent2D gob, u32 bpp_log2) {
if (IsSmallerThanGobSize(num_tiles, gob, block.depth)) {
return GOB_SIZE_X_SHIFT - bpp_log2;
} else {
@@ -212,13 +199,13 @@ template <u32 GOB_EXTENT>
}
}
[[nodiscard]] constexpr u32 StrideAlignment(Extent3D num_tiles, Extent3D block, u32 bpp_log2,
u32 tile_width_spacing) {
[[nodiscard]] u32 StrideAlignment(Extent3D num_tiles, Extent3D block, u32 bpp_log2,
u32 tile_width_spacing) {
const Extent2D gob = GobSize(bpp_log2, block.height, tile_width_spacing);
return StrideAlignment(num_tiles, block, gob, bpp_log2);
}
[[nodiscard]] constexpr Extent2D NumGobs(const LevelInfo& info, u32 level) {
[[nodiscard]] Extent2D NumGobs(const LevelInfo& info, u32 level) {
const Extent3D blocks = NumLevelBlocks(info, level);
const Extent2D gobs{
.width = Common::DivCeilLog2(blocks.width, GOB_SIZE_X_SHIFT),
@@ -233,7 +220,7 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr Extent3D LevelTiles(const LevelInfo& info, u32 level) {
[[nodiscard]] Extent3D LevelTiles(const LevelInfo& info, u32 level) {
const Extent3D blocks = NumLevelBlocks(info, level);
const Extent3D tile_shift = TileShift(info, level);
const Extent2D gobs = NumGobs(info, level);
@@ -244,7 +231,7 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr u32 CalculateLevelSize(const LevelInfo& info, u32 level) {
[[nodiscard]] u32 CalculateLevelSize(const LevelInfo& info, u32 level) {
const Extent3D tile_shift = TileShift(info, level);
const Extent3D tiles = LevelTiles(info, level);
const u32 num_tiles = tiles.width * tiles.height * tiles.depth;
@@ -252,7 +239,7 @@ template <u32 GOB_EXTENT>
return num_tiles << shift;
}
[[nodiscard]] constexpr LevelArray CalculateLevelSizes(const LevelInfo& info, u32 num_levels) {
[[nodiscard]] LevelArray CalculateLevelSizes(const LevelInfo& info, u32 num_levels) {
ASSERT(num_levels <= MAX_MIP_LEVELS);
LevelArray sizes{};
for (u32 level = 0; level < num_levels; ++level) {
@@ -265,8 +252,8 @@ template <u32 GOB_EXTENT>
return std::reduce(sizes.begin(), sizes.begin() + num_levels, 0U);
}
[[nodiscard]] constexpr LevelInfo MakeLevelInfo(PixelFormat format, Extent3D size, Extent3D block,
u32 tile_width_spacing) {
[[nodiscard]] LevelInfo MakeLevelInfo(PixelFormat format, Extent3D size, Extent3D block,
u32 tile_width_spacing) {
const u32 bytes_per_block = BytesPerBlock(format);
return {
.size =
@@ -282,12 +269,12 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr LevelInfo MakeLevelInfo(const ImageInfo& info) {
[[nodiscard]] LevelInfo MakeLevelInfo(const ImageInfo& info) {
return MakeLevelInfo(info.format, info.size, info.block, info.tile_width_spacing);
}
[[nodiscard]] constexpr u32 CalculateLevelOffset(PixelFormat format, Extent3D size, Extent3D block,
u32 tile_width_spacing, u32 level) {
[[nodiscard]] u32 CalculateLevelOffset(PixelFormat format, Extent3D size, Extent3D block,
u32 tile_width_spacing, u32 level) {
const LevelInfo info = MakeLevelInfo(format, size, block, tile_width_spacing);
u32 offset = 0;
for (u32 current_level = 0; current_level < level; ++current_level) {
@@ -296,8 +283,8 @@ template <u32 GOB_EXTENT>
return offset;
}
[[nodiscard]] constexpr u32 AlignLayerSize(u32 size_bytes, Extent3D size, Extent3D block,
u32 tile_size_y, u32 tile_width_spacing) {
[[nodiscard]] u32 AlignLayerSize(u32 size_bytes, Extent3D size, Extent3D block, u32 tile_size_y,
u32 tile_width_spacing) {
// https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L134
if (tile_width_spacing > 0) {
const u32 alignment_log2 = GOB_SIZE_SHIFT + tile_width_spacing + block.height + block.depth;
@@ -483,7 +470,7 @@ template <u32 GOB_EXTENT>
};
}
[[nodiscard]] constexpr u32 NumBlocksPerLayer(const ImageInfo& info, Extent2D tile_size) noexcept {
[[nodiscard]] u32 NumBlocksPerLayer(const ImageInfo& info, Extent2D tile_size) noexcept {
u32 num_blocks = 0;
for (s32 level = 0; level < info.resources.levels; ++level) {
const Extent3D mip_size = AdjustMipSize(info.size, level);
@@ -574,8 +561,6 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
ASSERT(host_offset - copy.buffer_offset == copy.buffer_size);
}
} // Anonymous namespace
u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept {
if (info.type == ImageType::Buffer) {
return info.size.width * BytesPerBlock(info.format);
@@ -783,8 +768,15 @@ bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config
}
std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
const ImageInfo& info, std::span<u8> output) {
const ImageInfo& info,
std::array<u8, MAX_GUEST_SIZE>& scratch,
std::span<u8> output) {
auto t1 = std::chrono::high_resolution_clock::now();
const size_t guest_size_bytes = CalculateGuestSizeInBytes(info);
if (guest_size_bytes >= MAX_GUEST_SIZE) {
LOG_CRITICAL(Debug, "guest_size {}", guest_size_bytes);
abort();
}
const u32 bpp_log2 = BytesPerBlockLog2(info.format);
const Extent3D size = info.size;
@@ -807,9 +799,12 @@ std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP
.image_extent = size,
}};
}
const auto input_data = std::make_unique<u8[]>(guest_size_bytes);
gpu_memory.ReadBlockUnsafe(gpu_addr, input_data.get(), guest_size_bytes);
const std::span<const u8> input(input_data.get(), guest_size_bytes);
auto t2 = std::chrono::high_resolution_clock::now();
auto t3 = std::chrono::high_resolution_clock::now();
gpu_memory.ReadBlockUnsafe(gpu_addr, scratch.data(), guest_size_bytes);
auto t4 = std::chrono::high_resolution_clock::now();
const std::span<const u8> input(scratch.data(), guest_size_bytes);
auto t5 = std::chrono::high_resolution_clock::now();
const LevelInfo level_info = MakeLevelInfo(info);
const s32 num_layers = info.resources.layers;
@@ -850,13 +845,27 @@ std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP
for (s32 layer = 0; layer < info.resources.layers; ++layer) {
const std::span<u8> dst = output.subspan(host_offset);
const std::span<const u8> src = input.subspan(guest_offset + guest_layer_offset);
UnswizzleTexture(dst, src, 1U << bpp_log2, num_tiles.width, num_tiles.height,
num_tiles.depth, block.height, block.depth, stride_alignment);
const std::span<const u8> src_limit =
src.first(std::min(src.size(), static_cast<size_t>(level_sizes[level])));
if (1) {
UnswizzleTexture(dst, src_limit, 1U << bpp_log2, num_tiles.width, num_tiles.height,
num_tiles.depth, block.height, block.depth, stride_alignment);
}
guest_layer_offset += layer_stride;
host_offset += host_bytes_per_layer;
}
guest_offset += level_sizes[level];
}
auto t6 = std::chrono::high_resolution_clock::now();
auto count1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
auto count2 = std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count();
auto count3 = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count();
auto count4 = std::chrono::duration_cast<std::chrono::microseconds>(t5 - t4).count();
auto count5 = std::chrono::duration_cast<std::chrono::microseconds>(t6 - t5).count();
auto count0 = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t1).count();
if (count0 > 1) {
LOG_CRITICAL(Debug, "{} {} {} {} {}", count1, count2, count3, count4, count5);
}
return copies;
}
@@ -1171,52 +1180,4 @@ u32 MapSizeBytes(const ImageBase& image) {
}
}
// Compile-time sanity checks for the block-linear layout math above. Each
// expected value was cross-checked against hardware captures / Ryujinx's
// SizeCalculator.
// NOTE(review): these static_asserts require CalculateLevelSize and
// CalculateLevelOffset to remain constexpr; the signatures earlier in this
// revision appear to have dropped constexpr — confirm this still compiles.
static_assert(CalculateLevelSize(LevelInfo{{1920, 1080, 1}, {0, 2, 0}, {1, 1}, 2, 0}, 0) ==
0x7f8000);
static_assert(CalculateLevelSize(LevelInfo{{32, 32, 1}, {0, 0, 4}, {1, 1}, 4, 0}, 0) == 0x4000);
static_assert(CalculateLevelOffset(PixelFormat::R8_SINT, {1920, 1080, 1}, {0, 2, 0}, 0, 7) ==
0x2afc00);
static_assert(CalculateLevelOffset(PixelFormat::ASTC_2D_12X12_UNORM, {8192, 4096, 1}, {0, 2, 0}, 0,
12) == 0x50d200);
// Mip chain offsets for a 1024x1024 RGBA8 texture; each level starts where the
// previous one (aligned) ends.
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 0) ==
0);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 1) ==
0x400000);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 2) ==
0x500000);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 3) ==
0x540000);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 4) ==
0x550000);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 5) ==
0x554000);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 6) ==
0x555000);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 7) ==
0x555400);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 8) ==
0x555600);
static_assert(CalculateLevelOffset(PixelFormat::A8B8G8R8_UNORM, {1024, 1024, 1}, {0, 4, 0}, 0, 9) ==
0x555800);
// Computes the aligned byte size of one array layer for the given format and
// layout parameters; used only by the static_asserts below.
constexpr u32 ValidateLayerSize(PixelFormat format, u32 width, u32 height, u32 block_height,
u32 tile_width_spacing, u32 level) {
const Extent3D size{width, height, 1};
const Extent3D block{0, block_height, 0};
// Offset of the first byte past the last mip level == unaligned layer size.
const u32 offset = CalculateLevelOffset(format, size, block, tile_width_spacing, level);
return AlignLayerSize(offset, size, block, DefaultBlockHeight(format), tile_width_spacing);
}
static_assert(ValidateLayerSize(PixelFormat::ASTC_2D_12X12_UNORM, 8192, 4096, 2, 0, 12) ==
0x50d800);
static_assert(ValidateLayerSize(PixelFormat::A8B8G8R8_UNORM, 1024, 1024, 2, 0, 10) == 0x556000);
static_assert(ValidateLayerSize(PixelFormat::BC3_UNORM, 128, 128, 2, 0, 8) == 0x6000);
static_assert(ValidateLayerSize(PixelFormat::A8B8G8R8_UNORM, 518, 572, 4, 3, 1) == 0x190000,
"Tile width spacing is not working");
static_assert(ValidateLayerSize(PixelFormat::BC5_UNORM, 1024, 1024, 3, 4, 11) == 0x160000,
"Compressed tile width spacing is not working");
} // namespace VideoCommon

View File

@@ -28,6 +28,77 @@ struct OverlapResult {
SubresourceExtent resources;
};
// This ought to be enough for anybody
constexpr size_t MAX_GUEST_SIZE = 0x4000000;
struct LevelInfo {
Extent3D size;
Extent3D block;
Extent2D tile_size;
u32 bpp_log2;
u32 tile_width_spacing;
};
[[nodiscard]] u32 AdjustTileSize(u32 shift, u32 unit_factor, u32 dimension);
[[nodiscard]] u32 AdjustMipSize(u32 size, u32 level);
[[nodiscard]] Extent3D AdjustMipSize(Extent3D size, s32 level);
[[nodiscard]] Extent3D AdjustSamplesSize(Extent3D size, s32 num_samples);
template <u32 GOB_EXTENT>
[[nodiscard]] u32 AdjustMipBlockSize(u32 num_tiles, u32 block_size, u32 level);
[[nodiscard]] Extent3D AdjustMipBlockSize(Extent3D num_tiles, Extent3D block_size, u32 level);
[[nodiscard]] Extent3D AdjustTileSize(Extent3D size, Extent2D tile_size);
[[nodiscard]] u32 BytesPerBlockLog2(u32 bytes_per_block);
[[nodiscard]] u32 BytesPerBlockLog2(PixelFormat format);
[[nodiscard]] u32 NumBlocks(Extent3D size, Extent2D tile_size);
[[nodiscard]] u32 AdjustSize(u32 size, u32 level, u32 block_size);
[[nodiscard]] Extent2D DefaultBlockSize(PixelFormat format);
[[nodiscard]] Extent3D NumLevelBlocks(const LevelInfo& info, u32 level);
[[nodiscard]] Extent3D TileShift(const LevelInfo& info, u32 level);
[[nodiscard]] Extent2D GobSize(u32 bpp_log2, u32 block_height, u32 tile_width_spacing);
[[nodiscard]] bool IsSmallerThanGobSize(Extent3D num_tiles, Extent2D gob, u32 block_depth);
[[nodiscard]] u32 StrideAlignment(Extent3D num_tiles, Extent3D block, Extent2D gob, u32 bpp_log2);
[[nodiscard]] u32 StrideAlignment(Extent3D num_tiles, Extent3D block, u32 bpp_log2,
u32 tile_width_spacing);
[[nodiscard]] Extent2D NumGobs(const LevelInfo& info, u32 level);
[[nodiscard]] Extent3D LevelTiles(const LevelInfo& info, u32 level);
[[nodiscard]] u32 CalculateLevelSize(const LevelInfo& info, u32 level);
[[nodiscard]] LevelArray CalculateLevelSizes(const LevelInfo& info, u32 num_levels);
[[nodiscard]] u32 CalculateLevelBytes(const LevelArray& sizes, u32 num_levels);
[[nodiscard]] LevelInfo MakeLevelInfo(PixelFormat format, Extent3D size, Extent3D block,
u32 tile_width_spacing);
[[nodiscard]] LevelInfo MakeLevelInfo(const ImageInfo& info);
[[nodiscard]] u32 CalculateLevelOffset(PixelFormat format, Extent3D size, Extent3D block,
u32 tile_width_spacing, u32 level);
[[nodiscard]] u32 AlignLayerSize(u32 size_bytes, Extent3D size, Extent3D block, u32 tile_size_y,
u32 tile_width_spacing);
[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapEqualAddress(const ImageInfo& new_info,
const ImageBase& overlap,
bool strict_size);
[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress3D(
const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size);
[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress2D(
const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size);
[[nodiscard]] std::optional<OverlapResult> ResolveOverlapRightAddress(const ImageInfo& new_info,
GPUVAddr gpu_addr,
VAddr cpu_addr,
const ImageBase& overlap,
bool strict_size);
[[nodiscard]] std::optional<OverlapResult> ResolveOverlapLeftAddress(const ImageInfo& new_info,
GPUVAddr gpu_addr,
VAddr cpu_addr,
const ImageBase& overlap,
bool strict_size);
[[nodiscard]] Extent2D PitchLinearAlignedSize(const ImageInfo& info);
[[nodiscard]] Extent3D BlockLinearAlignedSize(const ImageInfo& info, u32 level);
[[nodiscard]] u32 NumBlocksPerLayer(const ImageInfo& info, Extent2D tile_size) noexcept;
[[nodiscard]] u32 NumSlices(const ImageInfo& info) noexcept;
void SwizzlePitchLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
const ImageInfo& info, const BufferImageCopy& copy,
std::span<const u8> memory);
void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
const ImageInfo& info, const BufferImageCopy& copy,
std::span<const u8> input);
[[nodiscard]] u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept;
[[nodiscard]] u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept;
@@ -61,6 +132,7 @@ struct OverlapResult {
[[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
GPUVAddr gpu_addr, const ImageInfo& info,
std::array<u8, MAX_GUEST_SIZE>& scratch,
std::span<u8> output);
[[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,

View File

@@ -18,15 +18,113 @@
namespace Tegra::Texture {
namespace {
// Debug helper: inverts the block-linear swizzle for a single byte offset.
// Given a swizzled byte offset it recovers the (x, y, z) texel byte
// coordinates and the corresponding linear (unswizzled) byte offset.
// Returns ~0U when the decoded coordinate falls outside the surface or the
// output buffer (i.e. the swizzled byte is alignment padding).
[[maybe_unused]] u32 CalcUnswiz(u32 swizzled_offset, u32 block_height_mask, u32 block_height,
u32 block_depth_mask, u32 lesser_x_shift, u32 lesser_slice_size,
u32 gobs_in_x, u32 block_depth, u32 pitch, u32 height, u32 depth,
u32 pitch_height, size_t output_size) {
// The low 9 bits address a byte within one 512-byte GOB.
const u32 entry = swizzled_offset & 0b111111111;
// Undo the intra-GOB swizzle pattern used by the forward path:
//   table = ((y & 6) << 5) | ((y & 1) << 4)
//   x-part = ((x & 32) << 3) | ((x & 16) << 1) | (x & 15)
const u32 y_table = ((entry >> 5) & 6) | ((entry >> 4) & 1);
const u32 x_entry = ((entry >> 3) & 32) | ((entry >> 1) & 16) | (entry & 15);
// Remaining bits form the GOB index.
const u32 base_swizzled_offset = swizzled_offset >> 9;
// GOB position within its block: 8 rows per GOB in Y (<< 3), then the depth
// slot within the block.
const u32 set_y = (base_swizzled_offset & block_height_mask) << 3;
const u32 set_z = (base_swizzled_offset >> block_height) & block_depth_mask;
// Block index; split into slice (Z) and GOB column within that slice.
const u32 inner_swizzled = base_swizzled_offset >> lesser_x_shift;
const u32 sli = inner_swizzled / lesser_slice_size;
const u32 gb = inner_swizzled % lesser_slice_size;
// GOBs are 64 bytes wide (<< 6).
const u32 x_inner = (gb % gobs_in_x) << 6;
const u32 y_inner = (gb / gobs_in_x) << (block_height + 3);
const u32 z_inner = sli << block_depth;
const u32 x = x_inner + x_entry;
const u32 y = y_inner + set_y + y_table;
const u32 z = z_inner + set_z;
if (x >= pitch || y >= height || z >= depth) {
// Padding inside the swizzled layout; no linear counterpart exists.
return ~0U;
}
const u32 z_pitch_height = z * pitch_height;
const u32 z_pitch_height_y_pitch = z_pitch_height + y * pitch;
const u32 unswizzled_offset = z_pitch_height_y_pitch + x;
if (unswizzled_offset >= output_size) {
return ~0U;
}
return unswizzled_offset;
}
// Converts a block-linear (swizzled) texture into linear memory, one texel at
// a time, iterating over the *linear* output offsets and computing each source
// offset in the swizzled input.
// NOTE(review): this revision still carries debug scaffolding — every texel is
// cross-checked against CalcUnswiz and the process abort()s on mismatch.
// TODO: remove the cross-check before shipping; it doubles the per-texel work.
template <u32 BYTES_PER_PIXEL>
void UnswizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height,
u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
// Linear row pitch of the destination, in bytes.
const u32 pitch = width * BYTES_PER_PIXEL;
// Swizzled row stride, aligned as the layout requires.
const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL;
// Number of 64-byte-wide GOB columns per row (2^6 = 64).
const u32 gobs_in_x = Common::DivCeilLog2(stride, 6U);
// log2 of GOBs per block (Y * Z); +9 converts GOB count to bytes (512 B/GOB).
const u32 lesser_x_shift = block_height + block_depth;
const u32 x_shift = 9 + lesser_x_shift;
[[maybe_unused]] const u32 block_size = gobs_in_x << x_shift;
// Number of blocks vertically: each block spans 2^block_height GOBs of 8 rows.
const u32 dcl2 = Common::DivCeilLog2(height, block_height + 3);
// Blocks per depth slice, in blocks and in bytes respectively.
const u32 lesser_slice_size = dcl2 * gobs_in_x;
[[maybe_unused]] const u32 slice_size = lesser_slice_size << x_shift;
const u32 block_height_mask = (1U << block_height) - 1;
const u32 block_depth_mask = (1U << block_depth) - 1;
[[maybe_unused]] const u32 pitch_height = pitch * height;
// Alternative direction (iterate over swizzled input), kept for reference:
// for (u32 swizzled_offset = 0; swizzled_offset < input.size();
// swizzled_offset += BYTES_PER_PIXEL) {
// u32 unswizzled_offset =
// CalcUnswiz(swizzled_offset, block_height_mask, block_height, block_depth_mask,
// lesser_x_shift, lesser_slice_size, gobs_in_x, block_depth, pitch,
// height, depth, pitch_height, output.size());
// if (!~unswizzled_offset) {
// continue;
// }
// u8* const dst = &output[unswizzled_offset];
// const u8* const src = &input[swizzled_offset];
// std::memcpy(dst, src, BYTES_PER_PIXEL);
// }
for (u32 unswizzled_offset = 0; unswizzled_offset < output.size();
unswizzled_offset += BYTES_PER_PIXEL) {
// Recover (x, y, z) byte coordinates from the linear offset.
const u32 unswizzled_offset_pitch = unswizzled_offset / pitch;
const u32 z = unswizzled_offset_pitch / height;
const u32 y = unswizzled_offset_pitch % height;
const u32 x = unswizzled_offset % pitch;
// Byte offset of the containing slice, block row and GOB column.
const u32 offset_z =
(z >> block_depth) * slice_size + ((z & block_depth_mask) << (9 + block_height));
const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
const u32 offset_y =
(block_y >> block_height) * block_size + ((block_y & block_height_mask) << 9);
const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
// Intra-GOB swizzle of the low bits of x and y.
const u32 table = ((y & 6) << 5) | ((y & 1) << 4);
const u32 entry = ((x & 32) << 3) | ((x & 16) << 1) | (x & 15) | table;
const u32 swizzled_offset = base_swizzled_offset | entry;
// Debug: independently invert the swizzle and compare below.
u32 other = CalcUnswiz(swizzled_offset, block_height_mask, block_height, block_depth_mask,
lesser_x_shift, lesser_slice_size, gobs_in_x, block_depth, pitch,
height, depth, pitch_height, output.size());
if (swizzled_offset >= input.size()) {
// Source byte lies past the provided input; skip (leaves dst untouched).
continue;
}
if (x >= pitch || y >= height || z >= depth) {
// if (~other) {
// LOG_CRITICAL(Debug, "E2 {} != {}", unswizzled_offset, other);
// abort();
// }
continue;
}
// if (z != 0) {
// continue;
// }
// Hard failure if the forward and inverse mappings disagree (debug only).
if (other != unswizzled_offset) {
LOG_CRITICAL(Debug, "E3 {} != {} | {} {} {} {} {} {} {} {} {} {} {} {} | {} {} {}",
unswizzled_offset, other, swizzled_offset, block_height_mask, block_height,
block_depth_mask, lesser_x_shift, lesser_slice_size, gobs_in_x,
block_depth, pitch, height, depth, pitch_height, x, y, z);
abort();
}
u8* const dst = &output[unswizzled_offset];
const u8* const src = &input[swizzled_offset];
std::memcpy(dst, src, BYTES_PER_PIXEL);
}
}
template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth,
u32 block_height, u32 block_depth, u32 stride_alignment) {
// The origin of the transformation can be configured here, leave it as zero as the current API
// doesn't expose it.
static constexpr u32 origin_x = 0;
static constexpr u32 origin_y = 0;
static constexpr u32 origin_z = 0;
// We can configure here a custom pitch
// As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
const u32 pitch = width * BYTES_PER_PIXEL;
@@ -42,32 +140,34 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32
const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
for (u32 slice = 0; slice < depth; ++slice) {
const u32 z = slice + origin_z;
const u32 z = slice;
const u32 offset_z = (z >> block_depth) * slice_size +
((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
const u32 slice_pitch_height = slice * pitch * height;
for (u32 line = 0; line < height; ++line) {
const u32 y = line + origin_y;
const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y];
const u32 y = line;
const u32 table = ((y & 6) << 5) | ((y & 1) << 4);
const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
const u32 offset_y = (block_y >> block_height) * block_size +
((block_y & block_height_mask) << GOB_SIZE_SHIFT);
const u32 line_pitch = line * pitch;
for (u32 column = 0; column < width; ++column) {
const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
const u32 x = column * BYTES_PER_PIXEL;
const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];
const u32 entry = ((x & 32) << 3) | ((x & 16) << 1) | (x & 15) | table;
const u32 swizzled_offset = base_swizzled_offset | entry;
const u32 unswizzled_offset =
slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
const u32 unswizzled_offset = slice_pitch_height + line_pitch + x;
if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset);
offset >= input.size()) {
// TODO(Rodrigo): This is an out of bounds access that should never happen. To
// avoid crashing the emulator, break.
ASSERT_MSG(false, "offset {} exceeds input size {}!", offset, input.size());
// ASSERT_MSG(false, "offset {} exceeds input size {}!", offset, input.size());
break;
}
@@ -84,36 +184,61 @@ template <bool TO_LINEAR>
void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
switch (bytes_per_pixel) {
case 1:
return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height,
#define BPP_CASE(x) \
case x: \
return SwizzleImpl<TO_LINEAR, x>(output, input, width, height, depth, block_height, \
block_depth, stride_alignment);
case 2:
return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
case 3:
return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
case 4:
return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
case 6:
return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
case 8:
return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
case 12:
return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
case 16:
return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height,
block_depth, stride_alignment);
BPP_CASE(1)
BPP_CASE(2)
BPP_CASE(3)
BPP_CASE(4)
BPP_CASE(6)
BPP_CASE(8)
BPP_CASE(12)
BPP_CASE(16)
#undef BPP_CASE
default:
UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
}
}
// Converts a block-linear texture into linear memory, dispatching on the
// texel size so UnswizzleImpl is compiled with a statically-known
// BYTES_PER_PIXEL for each supported format width.
[[maybe_unused]] void Unswizzle(std::span<u8> output, std::span<const u8> input,
                                u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
                                u32 block_height, u32 block_depth, u32 stride_alignment) {
    switch (bytes_per_pixel) {
    case 1:
        return UnswizzleImpl<1>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 2:
        return UnswizzleImpl<2>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 3:
        return UnswizzleImpl<3>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 4:
        return UnswizzleImpl<4>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 6:
        return UnswizzleImpl<6>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 8:
        return UnswizzleImpl<8>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 12:
        return UnswizzleImpl<12>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    case 16:
        return UnswizzleImpl<16>(output, input, width, height, depth, block_height, block_depth,
                                stride_alignment);
    default:
        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
    }
}
} // Anonymous namespace
// Fills the push-constant block consumed by the GPU unswizzle shader with the
// same block-linear layout parameters the CPU path (UnswizzleImpl) derives.
// result: destination push-constant struct (all fields written).
void CalculateUnswizzle(VideoCommon::UnswizzlePushConstants& result, u32 bytes_per_pixel, u32 width,
                        u32 height, u32 depth, u32 block_height, u32 block_depth,
                        u32 stride_alignment) {
    // Swizzled row stride in bytes, aligned as the layout requires.
    const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel;
    result.bytes_per_pixel = bytes_per_pixel;
    // Linear row pitch of the destination, in bytes.
    result.pitch = width * bytes_per_pixel;
    result.height = height;
    result.depth = depth;
    result.block_height = block_height; // log2 of GOBs per block in Y
    result.block_depth = block_depth;   // log2 of GOBs per block in Z
    // Number of 64-byte-wide GOB columns per row (2^6 = 64).
    result.gobs_in_x = Common::DivCeilLog2(stride, 6U);
    // Number of blocks vertically, matching UnswizzleImpl's
    // `dcl2 = DivCeilLog2(height, block_height + 3)`. This was previously a
    // copy-paste of the gobs_in_x expression, which made the shader's slice
    // size (dcl2 * gobs_in_x) wrong for any surface where the two differ.
    result.dcl2 = Common::DivCeilLog2(height, block_height + 3);
}
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
u32 stride_alignment) {

View File

@@ -7,6 +7,7 @@
#include <span>
#include "common/common_types.h"
#include "video_core/texture_cache/types.h"
#include "video_core/textures/texture.h"
namespace Tegra::Texture {
@@ -40,6 +41,10 @@ constexpr SwizzleTable MakeSwizzleTable() {
}
constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable();
void CalculateUnswizzle(VideoCommon::UnswizzlePushConstants& result, u32 bytes_per_pixel, u32 width,
u32 height, u32 depth, u32 block_height, u32 block_depth,
u32 stride_alignment);
/// Unswizzles a block linear texture into linear memory.
void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,

View File

@@ -14,6 +14,7 @@
#include "common/assert.h"
#include "common/settings.h"
#include "core/device_memory.h"
#include "video_core/vulkan_common/nsight_aftermath_tracker.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -42,6 +43,7 @@ enum class NvidiaArchitecture {
constexpr std::array REQUIRED_EXTENSIONS{
VK_KHR_MAINTENANCE1_EXTENSION_NAME,
VK_KHR_MAINTENANCE3_EXTENSION_NAME,
VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME,
VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME,
VK_KHR_16BIT_STORAGE_EXTENSION_NAME,
@@ -63,6 +65,7 @@ constexpr std::array REQUIRED_EXTENSIONS{
#endif
#ifdef __unix__
VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
#endif
};
@@ -754,6 +757,7 @@ void Device::CheckSuitability(bool requires_swapchain) const {
const VkPhysicalDeviceLimits& limits{properties.limits};
const std::array limits_report{
LimitTuple{65536, limits.maxUniformBufferRange, "maxUniformBufferRange"},
LimitTuple{134217728, limits.maxStorageBufferRange, "maxUniformBufferRange"},
LimitTuple{16, limits.maxViewports, "maxViewports"},
LimitTuple{8, limits.maxColorAttachments, "maxColorAttachments"},
LimitTuple{8, limits.maxClipDistances, "maxClipDistances"},
@@ -900,6 +904,27 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
VkPhysicalDeviceProperties2KHR physical_properties;
physical_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
{
VkPhysicalDeviceMaintenance3Properties properties3{};
properties3.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES;
properties3.pNext = nullptr;
VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_properties{};
host_properties.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT;
host_properties.pNext = &properties3;
physical_properties.pNext = &host_properties;
physical.GetProperties2KHR(physical_properties);
if (properties3.maxMemoryAllocationSize < Core::DramMemoryMap::GiB) {
LOG_CRITICAL(Render_Vulkan, "Not enough memory for Vulkan host memory {} < {}",
properties3.maxMemoryAllocationSize, Core::DramMemoryMap::GiB);
abort();
}
if (host_properties.minImportedHostPointerAlignment > 4096) {
LOG_CRITICAL(Render_Vulkan, "Unexpected minImportedHostPointerAlignment {}",
host_properties.minImportedHostPointerAlignment);
abort();
}
}
if (has_khr_shader_float16_int8) {
VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8_features;

View File

@@ -180,6 +180,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
#ifdef _WIN32
X(vkGetMemoryWin32HandleKHR);
#endif
X(vkGetMemoryHostPointerPropertiesEXT);
X(vkGetQueryPoolResults);
X(vkGetPipelineExecutablePropertiesKHR);
X(vkGetPipelineExecutableStatisticsKHR);
@@ -811,6 +812,17 @@ VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noe
return requirements;
}
// Queries which Vulkan memory types can back a host allocation imported at
// 'ptr' via VK_EXT_external_memory_host. Returns the memoryTypeBits mask
// reported by the driver.
u32 Device::GetMemoryHostPointerProperties(const void* ptr) const noexcept {
    VkMemoryHostPointerPropertiesEXT properties{};
    properties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
    properties.pNext = nullptr;
    properties.memoryTypeBits = 0;
    Check(dld->vkGetMemoryHostPointerPropertiesEXT(
        handle, VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, ptr, &properties));
    return properties.memoryTypeBits;
}
std::vector<VkPipelineExecutablePropertiesKHR> Device::GetPipelineExecutablePropertiesKHR(
VkPipeline pipeline) const {
const VkPipelineInfoKHR info{

View File

@@ -295,6 +295,7 @@ struct DeviceDispatch : InstanceDispatch {
#ifdef _WIN32
PFN_vkGetMemoryWin32HandleKHR vkGetMemoryWin32HandleKHR{};
#endif
PFN_vkGetMemoryHostPointerPropertiesEXT vkGetMemoryHostPointerPropertiesEXT{};
PFN_vkGetPipelineExecutablePropertiesKHR vkGetPipelineExecutablePropertiesKHR{};
PFN_vkGetPipelineExecutableStatisticsKHR vkGetPipelineExecutableStatisticsKHR{};
PFN_vkGetQueryPoolResults vkGetQueryPoolResults{};
@@ -881,6 +882,8 @@ public:
VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept;
u32 GetMemoryHostPointerProperties(const void* ptr) const noexcept;
std::vector<VkPipelineExecutablePropertiesKHR> GetPipelineExecutablePropertiesKHR(
VkPipeline pipeline) const;