From bb9eeba670aa6fd7db1963d1a724b18474cd68f8 Mon Sep 17 00:00:00 2001 From: Subv Date: Thu, 20 Sep 2018 21:13:02 -0500 Subject: [PATCH] GPU/DMA: Fixed the Linear->Tiled and Linear->Linear transfer modes. This fixes the loading bar in Has-Been Heroes. The Tiled->Tiled transfer mode is not implemented yet and will assert. --- src/video_core/engines/maxwell_dma.cpp | 74 +++++++++++++++++--------- src/video_core/textures/decoders.cpp | 16 ++++++ src/video_core/textures/decoders.h | 5 ++ 3 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 9cf1f83e96..b9309b81d5 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -49,40 +49,58 @@ void MaxwellDMA::HandleCopy() { ASSERT(regs.dst_params.pos_x == 0); ASSERT(regs.dst_params.pos_y == 0); - size_t copy_size = regs.x_count; - - // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D - // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count). - if (regs.exec.enable_2d) { - copy_size = copy_size * regs.y_count; + if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + // If both the source and the destination are in block layout, assert. + UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented"); + return; } - if (regs.exec.is_dst_linear == regs.exec.is_src_linear) { - ASSERT(regs.src_params.pos_x == 0); - ASSERT(regs.src_params.pos_y == 0); - ASSERT(regs.dst_pitch == 1); - ASSERT(regs.src_pitch == 1); + if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { + // TODO(Subv): For now assume that the size of the destination rectangle matches exactly the + // destination pitch. + ASSERT(regs.dst_pitch == regs.x_count); - // CopyBlock already takes care of flushing and invalidating the cache at the affected - // addresses. 
- Memory::CopyBlock(dest_cpu, source_cpu, copy_size); + // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D + // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count, + // y_count). + if (!regs.exec.enable_2d) { + Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count); + return; + } + + // If both the source and the destination are in linear layout, perform a line-by-line + // copy. We're going to take a subrect of size (x_count, y_count) from the source + // rectangle. There is no need to manually flush/invalidate the regions because + // CopyBlock does that for us. + for (u32 line = 0; line < regs.y_count; ++line) { + const VAddr source_line = source_cpu + line * regs.src_pitch; + const VAddr dest_line = dest_cpu + line * regs.dst_pitch; + Memory::CopyBlock(dest_line, source_line, regs.dst_pitch); + } return; } ASSERT(regs.exec.enable_2d == 1); - // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated copying. - rasterizer.FlushRegion(source_cpu, copy_size); + size_t copy_size = regs.x_count * regs.y_count; - // We have to invalidate the destination region to evict any outdated surfaces from the cache. - // We do this before actually writing the new data because the destination address might contain - // a dirty surface that will have to be written back to memory. - rasterizer.InvalidateRegion(dest_cpu, copy_size); + const auto FlushAndInvalidate = [&](u32 src_size, u32 dst_size) { + // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated + // copying. + rasterizer.FlushRegion(source_cpu, src_size); + + // We have to invalidate the destination region to evict any outdated surfaces from the + // cache. We do this before actually writing the new data because the destination address + // might contain a dirty surface that will have to be written back to memory. 
+ rasterizer.InvalidateRegion(dest_cpu, dst_size); + }; u8* src_buffer = Memory::GetPointer(source_cpu); u8* dst_buffer = Memory::GetPointer(dest_cpu); if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + ASSERT(regs.src_params.size_z == 1); + // If the input is tiled and the output is linear, deswizzle the input and copy it over. // Copy the data to a staging buffer first to make applying the src and dst offsets easier @@ -93,6 +111,8 @@ void MaxwellDMA::HandleCopy() { u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; u32 dst_bytes_per_pixel = regs.dst_pitch; + FlushAndInvalidate(staging_buffer.size(), copy_size * dst_bytes_per_pixel); + Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y, src_bytes_per_pixel, dst_bytes_per_pixel, src_buffer, staging_buffer.data(), true, regs.src_params.BlockHeight()); @@ -101,13 +121,17 @@ void MaxwellDMA::HandleCopy() { regs.dst_pitch; std::memcpy(dst_buffer, staging_buffer.data() + src_offset, copy_size * regs.dst_pitch); } else { - // TODO(Subv): Source offsets are not yet implemented for this mode. - ASSERT(regs.src_params.pos_x == 0); - ASSERT(regs.src_params.pos_y == 0); + ASSERT(regs.dst_params.size_z == 1); + ASSERT(regs.src_pitch == regs.x_count); + + u32 src_bpp = regs.src_pitch / regs.x_count; + + FlushAndInvalidate(regs.src_pitch * regs.y_count, + regs.dst_params.size_x * regs.dst_params.size_y * src_bpp); // If the input is linear and the output is tiled, swizzle the input and copy it over. 
-        Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
-                                  src_buffer, false, regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
+                                src_bpp, dest_cpu, source_cpu, regs.dst_params.BlockHeight());
     }
 }
 
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 20ba6d4f6b..ec64da0065 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -88,6 +88,28 @@ void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_da
     }
 }
 
+/// Copies a linear (untiled) subrectangle into a tiled surface, both addressed by VAddr. Every
+/// pixel (x, line) of the subrectangle is written at the offset GetSwizzleOffset returns for it.
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height) {
+    for (u32 line = 0; line < subrect_height; ++line) {
+        // The start of the row is the same for every pixel in it. Multiply at VAddr width so a
+        // large line * source_pitch product cannot wrap in u32 (assuming VAddr is wider than u32).
+        const VAddr source_line = unswizzled_data + static_cast<VAddr>(line) * source_pitch;
+
+        for (u32 x = 0; x < subrect_width; ++x) {
+            const u32 swizzled_offset =
+                GetSwizzleOffset(x, line, swizzled_width, bytes_per_pixel, block_height);
+
+            const VAddr source_addr = source_line + static_cast<VAddr>(x) * bytes_per_pixel;
+            const VAddr dest_addr = swizzled_data + swizzled_offset;
+
+            Memory::CopyBlock(dest_addr, source_addr, bytes_per_pixel);
+        }
+    }
+}
+
 u32 BytesPerPixel(TextureFormat format) {
     switch (format) {
     case TextureFormat::DXT1:
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 1f7b731bee..23afa5a058 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -26,6 +26,21 @@ std::vector UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 wid
 void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
                       u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);
 
+/**
+ * Copies an untiled subrectangle into a tiled surface.
+ * @param subrect_width Width of the subrectangle to copy, in pixels.
+ * @param subrect_height Height of the subrectangle to copy, in lines.
+ * @param source_pitch Distance in bytes between two consecutive lines of the untiled source.
+ * @param swizzled_width Width of the tiled destination surface.
+ * @param bytes_per_pixel Size of one pixel in bytes.
+ * @param swizzled_data Address of the tiled destination surface.
+ * @param unswizzled_data Address of the first pixel of the untiled source subrectangle.
+ * @param block_height Block height of the tiled destination surface.
+ */
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height);
+
 /**
  * Decodes an unswizzled texture into a A8R8G8B8 texture.
  */