GPU: Partially implemented the Maxwell DMA engine.

Only tiled->linear and linear->tiled copies whose source and destination offsets are both zero are supported for now. Queries are not supported. Swizzled copies are not supported.
2018-06-12 11:27:36 -05:00
9 changed files with 277 additions and 34 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -9,6 +9,8 @@ add_library(video_core STATIC
    engines/maxwell_3d.h
    engines/maxwell_compute.cpp
    engines/maxwell_compute.h
+    engines/maxwell_dma.cpp
+    engines/maxwell_dma.h
    engines/shader_bytecode.h
    gpu.cpp
    gpu.h
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -16,6 +16,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
@@ -60,8 +61,11 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params)
    case EngineID::MAXWELL_COMPUTE_B:
        maxwell_compute->WriteReg(method, value);
        break;
+    case EngineID::MAXWELL_DMA_COPY_A:
+        maxwell_dma->WriteReg(method, value);
+        break;
    default:
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("Unimplemented engine");
    }
 }

--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -47,6 +47,7 @@ void Fermi2D::HandleSurfaceCopy() {

    if (regs.src.linear == regs.dst.linear) {
        // If the input layout and the output layout are the same, just perform a raw copy.
+        ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
        Memory::CopyBlock(dest_cpu, source_cpu,
                          src_bytes_per_pixel * regs.dst.width * regs.dst.height);
        return;
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -0,0 +1,69 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/memory.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra {
+namespace Engines {
+/// Stores a reference to the given memory manager for later use by the engine.
+MaxwellDMA::MaxwellDMA(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+
+/// Stores `value` in the register selected by `method`. A write to the `exec` register
+/// additionally performs the copy described by the current register state.
+void MaxwellDMA::WriteReg(u32 method, u32 value) {
+    // Word index of the `exec` register within the register file.
+    constexpr u32 exec_index = offsetof(Regs, exec) / sizeof(u32);
+
+    ASSERT_MSG(method < Regs::NUM_REGS,
+               "Invalid MaxwellDMA register, increase the size of the Regs structure");
+
+    // Record the raw value first so the handler below observes the updated register.
+    regs.reg_array[method] = value;
+
+    if (method != exec_index) {
+        return;
+    }
+
+    HandleCopy();
+}
+
+/// Runs the configured copy; only 2D tiled<->linear copies starting at (0, 0) are handled.
+void MaxwellDMA::HandleCopy() {
+    NGLOG_WARNING(HW_GPU, "Requested a DMA copy");
+
+    // TODO(Subv): Perform more research and implement all features of this engine.
+    ASSERT(regs.exec.enable_swizzle == 0);
+    ASSERT(regs.exec.enable_2d == 1);
+    ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
+    ASSERT(regs.exec.query_intr == Regs::QueryIntr::None);
+    ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2);
+    ASSERT(regs.src_params.pos_x == 0);
+    ASSERT(regs.src_params.pos_y == 0);
+    ASSERT(regs.dst_params.pos_x == 0);
+    ASSERT(regs.dst_params.pos_y == 0);
+    ASSERT(regs.exec.is_dst_linear != regs.exec.is_src_linear);
+
+    // The translation results are dereferenced below, so make sure they hold a value first.
+    const auto source_cpu = memory_manager.GpuToCpuAddress(regs.src_address.Address());
+    const auto dest_cpu = memory_manager.GpuToCpuAddress(regs.dst_address.Address());
+    ASSERT_MSG(source_cpu && dest_cpu, "DMA copy uses an unmapped GPU address");
+    u8* const src_buffer = Memory::GetPointer(*source_cpu);
+    u8* const dst_buffer = Memory::GetPointer(*dest_cpu);
+    ASSERT_MSG(src_buffer && dst_buffer, "DMA copy has no host pointer for its buffers");
+
+    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+        Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y, 1, 1, src_buffer,
+                                  dst_buffer, true, regs.src_params.BlockHeight());
+    } else {
+        // If the input is linear and the output is tiled, swizzle the input and copy it over.
+        Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
+                                  src_buffer, false, regs.dst_params.BlockHeight());
+    }
+}
+
+} // namespace Engines
+} // namespace Tegra
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -0,0 +1,155 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+namespace Engines {
+
+/// Emulates the Maxwell DMA engine: register writes are stored in `regs`, and a write to the
+/// `exec` register performs the copy described by the current register state.
+class MaxwellDMA final {
+public:
+    explicit MaxwellDMA(MemoryManager& memory_manager);
+    ~MaxwellDMA() = default;
+
+    /// Write the value to the register identified by method.
+    void WriteReg(u32 method, u32 value);
+
+    /// Register file, addressable through named fields or by raw method index (`reg_array`).
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x1D6;
+
+        /// Six-word parameter block; one instance each exists for the source and destination.
+        struct Parameters {
+            union {
+                BitField<0, 4, u32> block_depth;
+                BitField<4, 4, u32> block_height;
+                BitField<8, 4, u32> block_width;
+            };
+            u32 size_x;
+            u32 size_y;
+            u32 size_z;
+            u32 pos_z;
+            union {
+                BitField<0, 16, u32> pos_x;
+                BitField<16, 16, u32> pos_y;
+            };
+
+            /// Returns 2 raised to the power of the `block_height` field.
+            u32 BlockHeight() const {
+                return 1 << block_height;
+            }
+        };
+
+        static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
+
+        enum class CopyMode : u32 {
+            None = 0,
+            Unk1 = 1,
+            Unk2 = 2,
+        };
+
+        enum class QueryMode : u32 {
+            None = 0,
+            Short = 1,
+            Long = 2,
+        };
+
+        enum class QueryIntr : u32 {
+            None = 0,
+            Block = 1,
+            NonBlock = 2,
+        };
+
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0xC0);
+                /// Copy control word; WriteReg performs a copy when this register is written.
+                struct {
+                    union {
+                        BitField<0, 2, CopyMode> copy_mode;
+                        BitField<2, 1, u32> flush;
+                        BitField<3, 2, QueryMode> query_mode;
+                        BitField<5, 2, QueryIntr> query_intr;
+                        BitField<7, 1, u32> is_src_linear;
+                        BitField<8, 1, u32> is_dst_linear;
+                        BitField<9, 1, u32> enable_2d;
+                        BitField<10, 1, u32> enable_swizzle;
+                    };
+                } exec;
+                INSERT_PADDING_WORDS(0x3F);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    /// Returns the GPU address formed from the high and low register words.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } src_address;
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    /// Returns the GPU address formed from the high and low register words.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } dst_address;
+
+                u32 src_pitch;
+                u32 dst_pitch;
+                u32 x_count;
+                u32 y_count;
+                INSERT_PADDING_WORDS(0xBB);
+                Parameters dst_params;
+                INSERT_PADDING_WORDS(1);
+                Parameters src_params;
+                // Pads the named view from word 0x1D0 up to NUM_REGS (0x1D6) words.
+                INSERT_PADDING_WORDS(0x6);
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
+    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Regs has wrong size");
+
+    /// Memory manager used to translate GPU addresses when performing a copy.
+    MemoryManager& memory_manager;
+
+private:
+    /// Performs the copy from the source buffer to the destination buffer as configured in the
+    /// registers.
+    void HandleCopy();
+};
+
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4,                          \
+                  "Field " #field_name " has invalid position")
+// Each named register must sit at the listed word index (compared as byte offset, index * 4).
+ASSERT_REG_POSITION(exec, 0xC0);
+ASSERT_REG_POSITION(src_address, 0x100);
+ASSERT_REG_POSITION(dst_address, 0x102);
+ASSERT_REG_POSITION(src_pitch, 0x104);
+ASSERT_REG_POSITION(dst_pitch, 0x105);
+ASSERT_REG_POSITION(x_count, 0x106);
+ASSERT_REG_POSITION(y_count, 0x107);
+ASSERT_REG_POSITION(dst_params, 0x1C3);
+ASSERT_REG_POSITION(src_params, 0x1CA);
+
+#undef ASSERT_REG_POSITION
+
+} // namespace Engines
+} // namespace Tegra
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -213,6 +213,7 @@ union Instruction {
    BitField<28, 8, Register> gpr28;
    BitField<39, 8, Register> gpr39;
    BitField<48, 16, u64> opcode;
+    BitField<50, 1, u64> saturate_a;

    union {
        BitField<20, 19, u64> imm20_19;
@@ -221,7 +222,7 @@ union Instruction {
        BitField<46, 1, u64> abs_a;
        BitField<48, 1, u64> negate_a;
        BitField<49, 1, u64> abs_b;
-        BitField<50, 1, u64> saturate_d;
+        BitField<50, 1, u64> abs_d;
        BitField<56, 1, u64> negate_imm;

        union {
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -5,6 +5,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"

 namespace Tegra {
@@ -14,6 +15,7 @@ GPU::GPU() {
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(*memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
    maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
 }

 GPU::~GPU() = default;
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -63,6 +63,7 @@ namespace Engines {
 class Fermi2D;
 class Maxwell3D;
 class MaxwellCompute;
+class MaxwellDMA;
 } // namespace Engines

 enum class EngineID {
@@ -103,6 +104,8 @@ private:
    std::unique_ptr<Engines::Fermi2D> fermi_2d;
    /// Compute engine
    std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
+    /// DMA engine
+    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
 };

 } // namespace Tegra
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -299,15 +299,13 @@ public:
     * @param value The code representing the value to assign.
     * @param dest_num_components Number of components in the destination.
     * @param value_num_components Number of components in the value.
-     * @param is_saturated Optional, when True, saturates the provided value.
+     * @param is_abs Optional, when True, applies absolute value to output.
     * @param dest_elem Optional, the destination element to use for the operation.
     */
    void SetRegisterToFloat(const Register& reg, u64 elem, const std::string& value,
-                            u64 dest_num_components, u64 value_num_components,
-                            bool is_saturated = false, u64 dest_elem = 0) {
-
-        SetRegister(reg, elem, is_saturated ? "clamp(" + value + ", 0.0, 1.0)" : value,
-                    dest_num_components, value_num_components, dest_elem);
+                            u64 dest_num_components, u64 value_num_components, bool is_abs = false,
+                            u64 dest_elem = 0) {
+        SetRegister(reg, elem, value, dest_num_components, value_num_components, is_abs, dest_elem);
    }

    /**
@@ -317,21 +315,18 @@ public:
     * @param value The code representing the value to assign.
     * @param dest_num_components Number of components in the destination.
     * @param value_num_components Number of components in the value.
-     * @param is_saturated Optional, when True, saturates the provided value.
+     * @param is_abs Optional, when True, applies absolute value to output.
     * @param dest_elem Optional, the destination element to use for the operation.
     */
    void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem,
                              const std::string& value, u64 dest_num_components,
-                              u64 value_num_components, bool is_saturated = false,
-                              u64 dest_elem = 0) {
-        ASSERT_MSG(!is_saturated, "Unimplemented");
-
+                              u64 value_num_components, bool is_abs = false, u64 dest_elem = 0) {
        const std::string func = GetGLSLConversionFunc(
            is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger,
            GLSLRegister::Type::Float);

        SetRegister(reg, elem, func + '(' + value + ')', dest_num_components, value_num_components,
-                    dest_elem);
+                    is_abs, dest_elem);
    }

    /**
@@ -505,10 +500,12 @@ private:
     * @param value The code representing the value to assign.
     * @param dest_num_components Number of components in the destination.
     * @param value_num_components Number of components in the value.
+     * @param is_abs Optional, when True, applies absolute value to output.
     * @param dest_elem Optional, the destination element to use for the operation.
     */
    void SetRegister(const Register& reg, u64 elem, const std::string& value,
-                     u64 dest_num_components, u64 value_num_components, u64 dest_elem) {
+                     u64 dest_num_components, u64 value_num_components, bool is_abs,
+                     u64 dest_elem) {
        std::string dest = GetRegister(reg, dest_elem);
        if (dest_num_components > 1) {
            dest += GetSwizzle(elem);
@@ -519,6 +516,8 @@ private:
            src += GetSwizzle(elem);
        }

+        src = is_abs ? "abs(" + src + ')' : src;
+
        shader.AddLine(dest + " = " + src + ';');
    }

@@ -539,7 +538,7 @@ private:
            // vertex shader, and what's the value of the fourth element when inside a Tess Eval
            // shader.
            ASSERT(stage == Maxwell3D::Regs::ShaderStage::Vertex);
-            return "vec4(0, 0, uintBitsToFloat(gl_InstanceID), uintBitsToFloat(gl_VertexID))";
+            return "vec4(0, 0, gl_InstanceID, gl_VertexID)";
        default:
            const u32 index{static_cast<u32>(attribute) -
                            static_cast<u32>(Attribute::Index::Attribute_0)};
@@ -809,8 +808,9 @@ private:
            case OpCode::Id::FMUL_C:
            case OpCode::Id::FMUL_R:
            case OpCode::Id::FMUL_IMM: {
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
-                                        instr.alu.saturate_d);
+                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, instr.alu.abs_d);
                break;
            }
            case OpCode::Id::FMUL32_IMM: {
@@ -823,39 +823,41 @@ private:
            case OpCode::Id::FADD_C:
            case OpCode::Id::FADD_R:
            case OpCode::Id::FADD_IMM: {
-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
-                                        instr.alu.saturate_d);
+                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, instr.alu.abs_d);
                break;
            }
            case OpCode::Id::MUFU: {
+                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+
                switch (instr.sub_op) {
                case SubOp::Cos:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
-                                            instr.alu.saturate_d);
+                                            instr.alu.abs_d);
                    break;
                case SubOp::Sin:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "sin(" + op_a + ')', 1, 1,
-                                            instr.alu.saturate_d);
+                                            instr.alu.abs_d);
                    break;
                case SubOp::Ex2:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "exp2(" + op_a + ')', 1, 1,
-                                            instr.alu.saturate_d);
+                                            instr.alu.abs_d);
                    break;
                case SubOp::Lg2:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "log2(" + op_a + ')', 1, 1,
-                                            instr.alu.saturate_d);
+                                            instr.alu.abs_d);
                    break;
                case SubOp::Rcp:
-                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1,
-                                            instr.alu.saturate_d);
+                    regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1, instr.alu.abs_d);
                    break;
                case SubOp::Rsq:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "inversesqrt(" + op_a + ')', 1, 1,
-                                            instr.alu.saturate_d);
+                                            instr.alu.abs_d);
                    break;
                case SubOp::Min:
                    regs.SetRegisterToFloat(instr.gpr0, 0, "min(" + op_a + "," + op_b + ')', 1, 1,
-                                            instr.alu.saturate_d);
+                                            instr.alu.abs_d);
                    break;
                default:
                    NGLOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
@@ -1026,8 +1028,8 @@ private:
            case OpCode::Id::IADD_C:
            case OpCode::Id::IADD_R:
            case OpCode::Id::IADD_IMM: {
-                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
-                                          instr.alu.saturate_d);
+                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+                regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1);
                break;
            }
            case OpCode::Id::ISCADD_C:
@@ -1049,6 +1051,8 @@ private:
            break;
        }
        case OpCode::Type::Ffma: {
+            ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+
            std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
            std::string op_b = instr.ffma.negate_b ? "-" : "";
            std::string op_c = instr.ffma.negate_c ? "-" : "";
@@ -1082,13 +1086,13 @@ private:
            }
            }

-            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1,
-                                    instr.alu.saturate_d);
+            regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1);
            break;
        }
        case OpCode::Type::Conversion: {
            ASSERT_MSG(instr.conversion.size == Register::Size::Word, "Unimplemented");
            ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
+            ASSERT_MSG(!instr.saturate_a, "Unimplemented");

            switch (opcode->GetId()) {
            case OpCode::Id::I2I_R: {
@@ -1102,7 +1106,7 @@ private:
                }

                regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
-                                          1, instr.alu.saturate_d);
+                                          1);
                break;
            }
            case OpCode::Id::I2F_R: {
@@ -1118,6 +1122,8 @@ private:
                break;
            }
            case OpCode::Id::F2F_R: {
+                ASSERT_MSG(!instr.saturate_a, "Unimplemented");
+
                std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);

                switch (instr.conversion.f2f.rounding) {
@@ -1143,7 +1149,7 @@ private:
                    op_a = "abs(" + op_a + ')';
                }

-                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d);
+                regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
                break;
            }
            case OpCode::Id::F2I_R: {