GPU/DMA: Fixed the Linear->Tiled and Linear->Linear transfer modes.

This fixes the loading bar in Has-Been Heroes. The Tiled->Tiled transfer mode is not implemented yet and will assert.
2018-09-20 21:13:02 -05:00
parent 47826fd090
commit bb9eeba670
3 changed files with 70 additions and 25 deletions
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -49,40 +49,58 @@ void MaxwellDMA::HandleCopy() {
    ASSERT(regs.dst_params.pos_x == 0);
    ASSERT(regs.dst_params.pos_y == 0);

-    size_t copy_size = regs.x_count;
-
-    // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
-    // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count).
-    if (regs.exec.enable_2d) {
-        copy_size = copy_size * regs.y_count;
+    if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        // If both the source and the destination are in block layout, assert.
+        UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+        return;
    }

-    if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.pos_x == 0);
-        ASSERT(regs.src_params.pos_y == 0);
-        ASSERT(regs.dst_pitch == 1);
-        ASSERT(regs.src_pitch == 1);
+    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
+        // TODO(Subv): For now assume that the size of the destination rectangle matches exactly the
+        // destination pitch.
+        ASSERT(regs.dst_pitch == regs.x_count);

-        // CopyBlock already takes care of flushing and invalidating the cache at the affected
-        // addresses.
-        Memory::CopyBlock(dest_cpu, source_cpu, copy_size);
+        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
+        // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count,
+        // y_count).
+        if (!regs.exec.enable_2d) {
+            Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count);
+            return;
+        }
+
+        // If both the source and the destination are in linear layout, perform a line-by-line
+        // copy. We're going to take a subrect of size (x_count, y_count) from the source
+        // rectangle. There is no need to manually flush/invalidate the regions because
+        // CopyBlock does that for us.
+        for (u32 line = 0; line < regs.y_count; ++line) {
+            const VAddr source_line = source_cpu + line * regs.src_pitch;
+            const VAddr dest_line = dest_cpu + line * regs.dst_pitch;
+            Memory::CopyBlock(dest_line, source_line, regs.dst_pitch);
+        }
        return;
    }

    ASSERT(regs.exec.enable_2d == 1);

-    // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated copying.
-    rasterizer.FlushRegion(source_cpu, copy_size);
+    size_t copy_size = regs.x_count * regs.y_count;

-    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
-    // We do this before actually writing the new data because the destination address might contain
-    // a dirty surface that will have to be written back to memory.
-    rasterizer.InvalidateRegion(dest_cpu, copy_size);
+    const auto FlushAndInvalidate = [&](u32 src_size, u32 dst_size) {
+        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
+        // copying.
+        rasterizer.FlushRegion(source_cpu, src_size);
+
+        // We have to invalidate the destination region to evict any outdated surfaces from the
+        // cache. We do this before actually writing the new data because the destination address
+        // might contain a dirty surface that will have to be written back to memory.
+        rasterizer.InvalidateRegion(dest_cpu, dst_size);
+    };

    u8* src_buffer = Memory::GetPointer(source_cpu);
    u8* dst_buffer = Memory::GetPointer(dest_cpu);

    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        ASSERT(regs.src_params.size_z == 1);
+
        // If the input is tiled and the output is linear, deswizzle the input and copy it over.

        // Copy the data to a staging buffer first to make applying the src and dst offsets easier
@@ -93,6 +111,8 @@ void MaxwellDMA::HandleCopy() {
        u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
        u32 dst_bytes_per_pixel = regs.dst_pitch;

+        FlushAndInvalidate(staging_buffer.size(), copy_size * dst_bytes_per_pixel);
+
        Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y,
                                  src_bytes_per_pixel, dst_bytes_per_pixel, src_buffer,
                                  staging_buffer.data(), true, regs.src_params.BlockHeight());
@@ -101,13 +121,17 @@ void MaxwellDMA::HandleCopy() {
                         regs.dst_pitch;
        std::memcpy(dst_buffer, staging_buffer.data() + src_offset, copy_size * regs.dst_pitch);
    } else {
-        // TODO(Subv): Source offsets are not yet implemented for this mode.
-        ASSERT(regs.src_params.pos_x == 0);
-        ASSERT(regs.src_params.pos_y == 0);
+        ASSERT(regs.dst_params.size_z == 1);
+        ASSERT(regs.src_pitch == regs.x_count);
+
+        u32 src_bpp = regs.src_pitch / regs.x_count;
+
+        FlushAndInvalidate(regs.src_pitch * regs.y_count,
+                           regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);

        // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
-                                  src_buffer, false, regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
+                                src_bpp, dest_cpu, source_cpu, regs.dst_params.BlockHeight());
    }
 }

--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -88,6 +88,22 @@ void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_da
    }
 }

+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height) {
+    for (u32 line = 0; line < subrect_height; ++line) {
+        for (u32 x = 0; x < subrect_width; ++x) {
+            u32 swizzled_offset =
+                GetSwizzleOffset(x, line, swizzled_width, bytes_per_pixel, block_height);
+            // Source address: skip `line` strides of source_pitch, then `x` pixels.
+            const VAddr source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
+            const VAddr dest_addr = swizzled_data + swizzled_offset;
+            // One CopyBlock call per pixel; each call moves exactly bytes_per_pixel bytes.
+            Memory::CopyBlock(dest_addr, source_line, bytes_per_pixel);
+        }
+    }
+}
+
 u32 BytesPerPixel(TextureFormat format) {
    switch (format) {
    case TextureFormat::DXT1:
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -26,6 +26,11 @@ std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 wid
 void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
                      u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);

+/// Copies an untiled (pitch-addressed) subrectangle, pixel by pixel, into a tiled surface.
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height);
+
 /**
 * Decodes an unswizzled texture into a A8R8G8B8 texture.
 */