From bb9eeba670aa6fd7db1963d1a724b18474cd68f8 Mon Sep 17 00:00:00 2001
From: Subv <subv2112@gmail.com>
Date: Thu, 20 Sep 2018 21:13:02 -0500
Subject: [PATCH] GPU/DMA: Fixed the Linear->Tiled and Linear->Linear transfer
 modes.

This fixes the loading bar in Has-Been Heroes.

The Tiled->Tiled transfer mode is not implemented yet and will assert.
---
 src/video_core/engines/maxwell_dma.cpp | 74 +++++++++++++++++---------
 src/video_core/textures/decoders.cpp   | 16 ++++++
 src/video_core/textures/decoders.h     |  5 ++
 3 files changed, 70 insertions(+), 25 deletions(-)

diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 9cf1f83e96..b9309b81d5 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -49,40 +49,58 @@ void MaxwellDMA::HandleCopy() {
     ASSERT(regs.dst_params.pos_x == 0);
     ASSERT(regs.dst_params.pos_y == 0);
 
-    size_t copy_size = regs.x_count;
-
-    // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
-    // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count).
-    if (regs.exec.enable_2d) {
-        copy_size = copy_size * regs.y_count;
+    if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        // If both the source and the destination are in block layout, assert.
+        UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+        return;
     }
 
-    if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.pos_x == 0);
-        ASSERT(regs.src_params.pos_y == 0);
-        ASSERT(regs.dst_pitch == 1);
-        ASSERT(regs.src_pitch == 1);
+    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
+        // TODO(Subv): For now assume that the size of the destination rectangle matches exactly the
+        // destination pitch.
+        ASSERT(regs.dst_pitch == regs.x_count);
 
-        // CopyBlock already takes care of flushing and invalidating the cache at the affected
-        // addresses.
-        Memory::CopyBlock(dest_cpu, source_cpu, copy_size);
+        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
+        // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count,
+        // y_count).
+        if (!regs.exec.enable_2d) {
+            Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count);
+            return;
+        }
+
+        // If both the source and the destination are in linear layout, perform a line-by-line
+        // copy. We're going to take a subrect of size (x_count, y_count) from the source
+        // rectangle. There is no need to manually flush/invalidate the regions because
+        // CopyBlock does that for us.
+        for (u32 line = 0; line < regs.y_count; ++line) {
+            const VAddr source_line = source_cpu + line * regs.src_pitch;
+            const VAddr dest_line = dest_cpu + line * regs.dst_pitch;
+            Memory::CopyBlock(dest_line, source_line, regs.dst_pitch);
+        }
         return;
     }
 
     ASSERT(regs.exec.enable_2d == 1);
 
-    // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated copying.
-    rasterizer.FlushRegion(source_cpu, copy_size);
+    size_t copy_size = regs.x_count * regs.y_count;
 
-    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
-    // We do this before actually writing the new data because the destination address might contain
-    // a dirty surface that will have to be written back to memory.
-    rasterizer.InvalidateRegion(dest_cpu, copy_size);
+    const auto FlushAndInvalidate = [&](u32 src_size, u32 dst_size) {
+        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
+        // copying.
+        rasterizer.FlushRegion(source_cpu, src_size);
+
+        // We have to invalidate the destination region to evict any outdated surfaces from the
+        // cache. We do this before actually writing the new data because the destination address
+        // might contain a dirty surface that will have to be written back to memory.
+        rasterizer.InvalidateRegion(dest_cpu, dst_size);
+    };
 
     u8* src_buffer = Memory::GetPointer(source_cpu);
     u8* dst_buffer = Memory::GetPointer(dest_cpu);
 
     if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        ASSERT(regs.src_params.size_z == 1);
+
         // If the input is tiled and the output is linear, deswizzle the input and copy it over.
 
         // Copy the data to a staging buffer first to make applying the src and dst offsets easier
@@ -93,6 +111,8 @@ void MaxwellDMA::HandleCopy() {
         u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
         u32 dst_bytes_per_pixel = regs.dst_pitch;
 
+        FlushAndInvalidate(staging_buffer.size(), copy_size * dst_bytes_per_pixel);
+
         Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y,
                                   src_bytes_per_pixel, dst_bytes_per_pixel, src_buffer,
                                   staging_buffer.data(), true, regs.src_params.BlockHeight());
@@ -101,13 +121,17 @@ void MaxwellDMA::HandleCopy() {
                          regs.dst_pitch;
         std::memcpy(dst_buffer, staging_buffer.data() + src_offset, copy_size * regs.dst_pitch);
     } else {
-        // TODO(Subv): Source offsets are not yet implemented for this mode.
-        ASSERT(regs.src_params.pos_x == 0);
-        ASSERT(regs.src_params.pos_y == 0);
+        ASSERT(regs.dst_params.size_z == 1);
+        ASSERT(regs.src_pitch == regs.x_count);
+
+        u32 src_bpp = regs.src_pitch / regs.x_count;
+
+        FlushAndInvalidate(regs.src_pitch * regs.y_count,
+                           regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);
 
         // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
-                                  src_buffer, false, regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
+                                src_bpp, dest_cpu, source_cpu, regs.dst_params.BlockHeight());
     }
 }
 
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 20ba6d4f6b..ec64da0065 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -88,6 +88,22 @@ void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_da
     }
 }
 
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height) {
+    // Scatter the linear subrectangle into the tiled surface one pixel at a time.
+    for (u32 y = 0; y < subrect_height; ++y) {
+        // Address of the first pixel of row `y` in the linear source.
+        const VAddr row_base = unswizzled_data + y * source_pitch;
+        for (u32 x = 0; x < subrect_width; ++x) {
+            const u32 tiled_offset =
+                GetSwizzleOffset(x, y, swizzled_width, bytes_per_pixel, block_height);
+            Memory::CopyBlock(swizzled_data + tiled_offset, row_base + x * bytes_per_pixel,
+                              bytes_per_pixel);
+        }
+    }
+}
+
 u32 BytesPerPixel(TextureFormat format) {
     switch (format) {
     case TextureFormat::DXT1:
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 1f7b731bee..23afa5a058 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -26,6 +26,11 @@ std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 wid
 void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
                       u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);
 
+/// Swizzles a subrect_width x subrect_height linear rectangle into the tiled surface, per pixel.
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height);
+
 /**
  * Decodes an unswizzled texture into a A8R8G8B8 texture.
  */