GPU/DMA: Fixed the Linear->Tiled and Linear->Linear transfer modes.
This fixes the loading bar in Has-Been Heroes. The Tiled->Tiled transfer mode is not implemented yet and will assert.
This commit is contained in:
@@ -49,40 +49,58 @@ void MaxwellDMA::HandleCopy() {
|
||||
ASSERT(regs.dst_params.pos_x == 0);
|
||||
ASSERT(regs.dst_params.pos_y == 0);
|
||||
|
||||
size_t copy_size = regs.x_count;
|
||||
|
||||
// When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
|
||||
// buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count).
|
||||
if (regs.exec.enable_2d) {
|
||||
copy_size = copy_size * regs.y_count;
|
||||
if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
|
||||
// If both the source and the destination are in block layout, assert.
|
||||
UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
|
||||
return;
|
||||
}
|
||||
|
||||
if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
|
||||
ASSERT(regs.src_params.pos_x == 0);
|
||||
ASSERT(regs.src_params.pos_y == 0);
|
||||
ASSERT(regs.dst_pitch == 1);
|
||||
ASSERT(regs.src_pitch == 1);
|
||||
if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
|
||||
// TODO(Subv): For now assume that the size of the destination rectangle matches exactly the
|
||||
// destination pitch.
|
||||
ASSERT(regs.dst_pitch == regs.x_count);
|
||||
|
||||
// CopyBlock already takes care of flushing and invalidating the cache at the affected
|
||||
// addresses.
|
||||
Memory::CopyBlock(dest_cpu, source_cpu, copy_size);
|
||||
// When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
|
||||
// buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count,
|
||||
// y_count).
|
||||
if (!regs.exec.enable_2d) {
|
||||
Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count);
|
||||
return;
|
||||
}
|
||||
|
||||
// If both the source and the destination are in linear layout, perform a line-by-line
|
||||
// copy. We're going to take a subrect of size (x_count, y_count) from the source
|
||||
// rectangle. There is no need to manually flush/invalidate the regions because
|
||||
// CopyBlock does that for us.
|
||||
for (u32 line = 0; line < regs.y_count; ++line) {
|
||||
const VAddr source_line = source_cpu + line * regs.src_pitch;
|
||||
const VAddr dest_line = dest_cpu + line * regs.dst_pitch;
|
||||
Memory::CopyBlock(dest_line, source_line, regs.dst_pitch);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
ASSERT(regs.exec.enable_2d == 1);
|
||||
|
||||
// TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated copying.
|
||||
rasterizer.FlushRegion(source_cpu, copy_size);
|
||||
size_t copy_size = regs.x_count * regs.y_count;
|
||||
|
||||
// We have to invalidate the destination region to evict any outdated surfaces from the cache.
|
||||
// We do this before actually writing the new data because the destination address might contain
|
||||
// a dirty surface that will have to be written back to memory.
|
||||
rasterizer.InvalidateRegion(dest_cpu, copy_size);
|
||||
const auto FlushAndInvalidate = [&](u32 src_size, u32 dst_size) {
|
||||
// TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
|
||||
// copying.
|
||||
rasterizer.FlushRegion(source_cpu, src_size);
|
||||
|
||||
// We have to invalidate the destination region to evict any outdated surfaces from the
|
||||
// cache. We do this before actually writing the new data because the destination address
|
||||
// might contain a dirty surface that will have to be written back to memory.
|
||||
rasterizer.InvalidateRegion(dest_cpu, dst_size);
|
||||
};
|
||||
|
||||
u8* src_buffer = Memory::GetPointer(source_cpu);
|
||||
u8* dst_buffer = Memory::GetPointer(dest_cpu);
|
||||
|
||||
if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
|
||||
ASSERT(regs.src_params.size_z == 1);
|
||||
|
||||
// If the input is tiled and the output is linear, deswizzle the input and copy it over.
|
||||
|
||||
// Copy the data to a staging buffer first to make applying the src and dst offsets easier
|
||||
@@ -93,6 +111,8 @@ void MaxwellDMA::HandleCopy() {
|
||||
u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
|
||||
u32 dst_bytes_per_pixel = regs.dst_pitch;
|
||||
|
||||
FlushAndInvalidate(staging_buffer.size(), copy_size * dst_bytes_per_pixel);
|
||||
|
||||
Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y,
|
||||
src_bytes_per_pixel, dst_bytes_per_pixel, src_buffer,
|
||||
staging_buffer.data(), true, regs.src_params.BlockHeight());
|
||||
@@ -101,13 +121,17 @@ void MaxwellDMA::HandleCopy() {
|
||||
regs.dst_pitch;
|
||||
std::memcpy(dst_buffer, staging_buffer.data() + src_offset, copy_size * regs.dst_pitch);
|
||||
} else {
|
||||
// TODO(Subv): Source offsets are not yet implemented for this mode.
|
||||
ASSERT(regs.src_params.pos_x == 0);
|
||||
ASSERT(regs.src_params.pos_y == 0);
|
||||
ASSERT(regs.dst_params.size_z == 1);
|
||||
ASSERT(regs.src_pitch == regs.x_count);
|
||||
|
||||
u32 src_bpp = regs.src_pitch / regs.x_count;
|
||||
|
||||
FlushAndInvalidate(regs.src_pitch * regs.y_count,
|
||||
regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);
|
||||
|
||||
// If the input is linear and the output is tiled, swizzle the input and copy it over.
|
||||
Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
|
||||
src_buffer, false, regs.dst_params.BlockHeight());
|
||||
Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
|
||||
src_bpp, dest_cpu, source_cpu, regs.dst_params.BlockHeight());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -88,6 +88,22 @@ void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_da
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes an untiled subrectangle into a tiled surface, one pixel at a time.
///
/// For every pixel (x, y) of the subrectangle, the destination offset inside the tiled surface
/// is obtained from GetSwizzleOffset and `bytes_per_pixel` bytes are copied from the linear
/// source to that location with Memory::CopyBlock.
///
/// @param subrect_width   Number of pixels copied from each source row.
/// @param subrect_height  Number of source rows copied.
/// @param source_pitch    Distance between the starts of two consecutive source rows.
/// @param swizzled_width  Width handed to GetSwizzleOffset for the tiled destination surface.
/// @param bytes_per_pixel Size of a single pixel; also the size of every individual copy.
/// @param swizzled_data   Base address of the tiled destination surface.
/// @param unswizzled_data Base address of the linear source data.
/// @param block_height    Block height handed to GetSwizzleOffset for the tiled surface.
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
                    u32 block_height) {
    u32 row = 0;
    while (row < subrect_height) {
        // Address of the first pixel of this row in the linear source.
        const VAddr row_start = unswizzled_data + row * source_pitch;

        u32 column = 0;
        while (column < subrect_width) {
            // Where this pixel lands inside the tiled destination surface.
            const u32 tiled_offset =
                GetSwizzleOffset(column, row, swizzled_width, bytes_per_pixel, block_height);

            const VAddr pixel_source = row_start + column * bytes_per_pixel;
            const VAddr pixel_dest = swizzled_data + tiled_offset;

            Memory::CopyBlock(pixel_dest, pixel_source, bytes_per_pixel);
            ++column;
        }
        ++row;
    }
}
|
||||
|
||||
u32 BytesPerPixel(TextureFormat format) {
|
||||
switch (format) {
|
||||
case TextureFormat::DXT1:
|
||||
|
||||
@@ -26,6 +26,11 @@ std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 wid
|
||||
void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
|
||||
u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);
|
||||
|
||||
/// Copies an untiled subrectangle into a tiled surface.
///
/// The copy is performed pixel by pixel: each pixel (x, line) of the subrectangle is read from
/// the linear source and written at the offset computed for it inside the tiled surface.
///
/// @param subrect_width   Width of the subrectangle, in pixels.
/// @param subrect_height  Height of the subrectangle, in lines.
/// @param source_pitch    Stride between consecutive lines of the untiled source.
/// @param swizzled_width  Width of the tiled destination surface used for offset computation.
/// @param bytes_per_pixel Size of one pixel; this many bytes are copied per pixel.
/// @param swizzled_data   Address of the tiled destination surface.
/// @param unswizzled_data Address of the untiled source data.
/// @param block_height    Block height of the tiled destination surface.
|
||||
void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
|
||||
u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
|
||||
u32 block_height);
|
||||
|
||||
/**
|
||||
* Decodes an unswizzled texture into a A8R8G8B8 texture.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user