Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add heuristic for memory->framebuffer copies, fixing video flicker in Naruto UNH 2 #18008

Merged
merged 2 commits into from
Aug 29, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 100 additions & 61 deletions GPU/Common/FramebufferManagerCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1842,21 +1842,31 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size,
// Or at least this should be like the other ones, gathering possible candidates
// with the ability to list them out for debugging.

VirtualFramebuffer *dstBuffer = nullptr;
VirtualFramebuffer *srcBuffer = nullptr;
bool ignoreDstBuffer = flags & GPUCopyFlag::FORCE_DST_MATCH_MEM;
bool ignoreSrcBuffer = flags & (GPUCopyFlag::FORCE_SRC_MATCH_MEM | GPUCopyFlag::MEMSET);
RasterChannel channel = flags & GPUCopyFlag::DEPTH_REQUESTED ? RASTER_DEPTH : RASTER_COLOR;

u32 dstY = (u32)-1;
u32 dstH = 0;
u32 srcY = (u32)-1;
u32 srcH = 0;
struct CopyCandidate {
VirtualFramebuffer *vfb;
int score;

VirtualFramebuffer *dstBuffer = nullptr;
VirtualFramebuffer *srcBuffer = nullptr;

u32 dstY = (u32)-1;
u32 dstH = 0;
u32 srcY = (u32)-1;
u32 srcH = 0;
};

TinySet<CopyCandidate, 4> candidates;
for (auto vfb : vfbs_) {
if (vfb->fb_stride == 0 || channel != RASTER_COLOR) {
continue;
}

CopyCandidate candidate{ vfb, 10 };

// We only remove the kernel and uncached bits when comparing.
const u32 vfb_address = vfb->fb_address;
const u32 vfb_size = vfb->BufferByteSize(RASTER_COLOR);
Expand All @@ -1868,121 +1878,150 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size,
if (!ignoreDstBuffer && vfb_address == dst && ((size == 0x44000 && vfb_size == 0x88000) || (size == 0x88000 && vfb_size == 0x44000))) {
// Not likely to be a correct color format copy for this buffer. Ignore it, there will either be RAM
// that can be displayed from, or another matching buffer with the right format if rendering is going on.
WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x conspicuously not matching copy size %08x in NotifyFramebufferCopy. Ignoring.", size, vfb_size);
continue;
// If we had scoring here, we should strongly penalize this target instead of ignoring it.
WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x conspicuously not matching copy size %08x in NotifyFramebufferCopy. Downscoring.", size, vfb_size);
candidate.score = 2;
}

if ((u32)size > vfb_size + 0x1000 && vfb->fb_format != GE_FORMAT_8888 && vfb->last_frame_render < gpuStats.numFlips) {
// Seems likely we are looking at a potential copy of 32-bit pixels (like video) to an old 16-bit buffer,
// which is very likely simply the wrong target, so skip it. See issue #17740 where this happens in Naruto Ultimate Ninja Heroes 2.
// If we had scoring here, we should strongly penalize this target instead of ignoring it.
WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x too small for %08x bytes of data and also 16-bit (%s), and not rendered to this frame. Downscoring.", vfb_size, size, GeBufferFormatToString(vfb->fb_format));
candidate.score = 1;
}

if (!ignoreDstBuffer && dst >= vfb_address && (dst + size <= vfb_address + vfb_size || dst == vfb_address)) {
const u32 offset = dst - vfb_address;
const u32 yOffset = offset / vfb_byteStride;
if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0) && yOffset < dstY) {
dstBuffer = vfb;
dstY = yOffset;
dstH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height);
if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0)) {
candidate.dstBuffer = vfb;
candidate.dstY = yOffset;
candidate.dstH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height);
}
}

if (!ignoreSrcBuffer && src >= vfb_address && (src + size <= vfb_address + vfb_size || src == vfb_address)) {
const u32 offset = src - vfb_address;
const u32 yOffset = offset / vfb_byteStride;
if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0) && yOffset < srcY) {
srcBuffer = vfb;
srcY = yOffset;
srcH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height);
} else if ((offset % vfb_byteStride) == 0 && size == vfb->fb_stride && yOffset < srcY) {
if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0)) {
candidate.srcBuffer = vfb;
candidate.srcY = yOffset;
candidate.srcH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height);
} else if ((offset % vfb_byteStride) == 0 && size == vfb->fb_stride) {
// Valkyrie Profile reads 512 bytes at a time, rather than 2048. So, let's whitelist fb_stride also.
srcBuffer = vfb;
srcY = yOffset;
srcH = 1;
} else if (yOffset == 0 && yOffset < srcY) {
candidate.srcBuffer = vfb;
candidate.srcY = yOffset;
candidate.srcH = 1;
} else if (yOffset == 0) {
// Okay, last try - it might be a clut.
if (vfb->usageFlags & FB_USAGE_CLUT) {
srcBuffer = vfb;
srcY = yOffset;
srcH = 1;
candidate.srcBuffer = vfb;
candidate.srcY = yOffset;
candidate.srcH = 1;
}
}
}
candidates.push_back(candidate);
}

if (channel == RASTER_DEPTH) {
srcBuffer = nullptr;
dstBuffer = nullptr;
// Let's assume exact matches only for simplicity.
for (auto vfb : vfbs_) {
CopyCandidate candidate{ vfb, 10 };
candidate.srcBuffer = nullptr;
candidate.dstBuffer = nullptr;
if (!ignoreDstBuffer && dst == vfb->z_address && size == vfb->z_stride * 2 * vfb->height) {
if (!dstBuffer || dstBuffer->depthBindSeq < vfb->depthBindSeq) {
dstBuffer = vfb;
dstY = 0;
dstH = vfb->height;
if (!candidate.dstBuffer || candidate.dstBuffer->depthBindSeq < vfb->depthBindSeq) {
candidate.dstBuffer = vfb;
candidate.dstY = 0;
candidate.dstH = vfb->height;
}
}
if (!ignoreSrcBuffer && src == vfb->z_address && size == vfb->z_stride * 2 * vfb->height) {
if (!srcBuffer || srcBuffer->depthBindSeq < vfb->depthBindSeq) {
srcBuffer = vfb;
srcY = 0;
srcH = vfb->height;
if (!candidate.srcBuffer || candidate.srcBuffer->depthBindSeq < vfb->depthBindSeq) {
candidate.srcBuffer = vfb;
candidate.srcY = 0;
candidate.srcH = vfb->height;
}
}
candidates.push_back(candidate);
}
}

// Currently we find the first or best one, but maybe we should just copy to all?
CopyCandidate *best = nullptr;
for (size_t i = 0; i < candidates.size(); i++) {
if (!best) {
best = &candidates[i];
}
if (candidates[i].score > best->score) {
best = &candidates[i];
}
// In the old code, lower Y won, so let's replicate that.
if (candidates[i].score == best->score && candidates[i].dstY < best->dstY) {
best = &candidates[i];
}
}
if (!best) {
return false;
}

if (!useBufferedRendering_) {
// If we're copying into a recently used display buf, it's probably destined for the screen.
if (channel == RASTER_DEPTH || srcBuffer || (dstBuffer != displayFramebuf_ && dstBuffer != prevDisplayFramebuf_)) {
if (channel == RASTER_DEPTH || best->srcBuffer || (best->dstBuffer != displayFramebuf_ && best->dstBuffer != prevDisplayFramebuf_)) {
return false;
}
}

if (!dstBuffer && srcBuffer && channel != RASTER_DEPTH) {
if (!best->dstBuffer && best->srcBuffer && channel != RASTER_DEPTH) {
// Note - if we're here, we're in a memcpy, not a block transfer. Not allowing IntraVRAMBlockTransferAllowCreateFB.
// Technically, that makes BlockTransferAllowCreateFB a bit of a misnomer.
if (PSP_CoreParameter().compat.flags().BlockTransferAllowCreateFB && !(flags & GPUCopyFlag::DISALLOW_CREATE_VFB)) {
dstBuffer = CreateRAMFramebuffer(dst, srcBuffer->width, srcBuffer->height, srcBuffer->fb_stride, srcBuffer->fb_format);
dstY = 0;
best->dstBuffer = CreateRAMFramebuffer(dst, best->srcBuffer->width, best->srcBuffer->height, best->srcBuffer->fb_stride, best->srcBuffer->fb_format);
best->dstY = 0;
}
}
if (dstBuffer) {
dstBuffer->last_frame_used = gpuStats.numFlips;
if (channel == RASTER_DEPTH && !srcBuffer)
dstBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH;
if (best->dstBuffer) {
best->dstBuffer->last_frame_used = gpuStats.numFlips;
if (channel == RASTER_DEPTH && !best->srcBuffer)
best->dstBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH;
}
if (srcBuffer && channel == RASTER_DEPTH && !dstBuffer)
srcBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH;
if (best->srcBuffer && channel == RASTER_DEPTH && !best->dstBuffer)
best->srcBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH;

if (dstBuffer && srcBuffer) {
if (srcBuffer == dstBuffer) {
if (best->dstBuffer && best->srcBuffer) {
if (best->srcBuffer == best->dstBuffer) {
WARN_LOG_ONCE(dstsrccpy, G3D, "Intra-buffer memcpy (not supported) %08x -> %08x (size: %x)", src, dst, size);
} else {
WARN_LOG_ONCE(dstnotsrccpy, G3D, "Inter-buffer memcpy %08x -> %08x (size: %x)", src, dst, size);
// Just do the blit!
BlitFramebuffer(dstBuffer, 0, dstY, srcBuffer, 0, srcY, srcBuffer->width, srcH, 0, channel, "Blit_InterBufferMemcpy");
SetColorUpdated(dstBuffer, skipDrawReason);
BlitFramebuffer(best->dstBuffer, 0, best->dstY, best->srcBuffer, 0, best->srcY, best->srcBuffer->width, best->srcH, 0, channel, "Blit_InterBufferMemcpy");
SetColorUpdated(best->dstBuffer, skipDrawReason);
RebindFramebuffer("RebindFramebuffer - Inter-buffer memcpy");
}
return false;
} else if (dstBuffer) {
} else if (best->dstBuffer) {
if (flags & GPUCopyFlag::MEMSET) {
gpuStats.numClears++;
}
WARN_LOG_ONCE(btucpy, G3D, "Memcpy fbo upload %08x -> %08x (size: %x)", src, dst, size);
FlushBeforeCopy();
const u8 *srcBase = Memory::GetPointerUnchecked(src);
GEBufferFormat srcFormat = channel == RASTER_DEPTH ? GE_FORMAT_DEPTH16 : dstBuffer->fb_format;
int srcStride = channel == RASTER_DEPTH ? dstBuffer->z_stride : dstBuffer->fb_stride;
DrawPixels(dstBuffer, 0, dstY, srcBase, srcFormat, srcStride, dstBuffer->width, dstH, channel, "MemcpyFboUpload_DrawPixels");
SetColorUpdated(dstBuffer, skipDrawReason);
GEBufferFormat srcFormat = channel == RASTER_DEPTH ? GE_FORMAT_DEPTH16 : best->dstBuffer->fb_format;
int srcStride = channel == RASTER_DEPTH ? best->dstBuffer->z_stride : best->dstBuffer->fb_stride;
DrawPixels(best->dstBuffer, 0, best->dstY, srcBase, srcFormat, srcStride, best->dstBuffer->width, best->dstH, channel, "MemcpyFboUpload_DrawPixels");
SetColorUpdated(best->dstBuffer, skipDrawReason);
RebindFramebuffer("RebindFramebuffer - Memcpy fbo upload");
// This is a memcpy, let's still copy just in case.
return false;
} else if (srcBuffer) {
} else if (best->srcBuffer) {
WARN_LOG_ONCE(btdcpy, G3D, "Memcpy fbo download %08x -> %08x", src, dst);
FlushBeforeCopy();
if (srcH == 0 || srcY + srcH > srcBuffer->bufferHeight) {
WARN_LOG_ONCE(btdcpyheight, G3D, "Memcpy fbo download %08x -> %08x skipped, %d+%d is taller than %d", src, dst, srcY, srcH, srcBuffer->bufferHeight);
} else if (!g_Config.bSkipGPUReadbacks && (!srcBuffer->memoryUpdated || channel == RASTER_DEPTH)) {
ReadFramebufferToMemory(srcBuffer, 0, srcY, srcBuffer->width, srcH, channel, Draw::ReadbackMode::BLOCK);
srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
if (best->srcH == 0 || best->srcY + best->srcH > best->srcBuffer->bufferHeight) {
WARN_LOG_ONCE(btdcpyheight, G3D, "Memcpy fbo download %08x -> %08x skipped, %d+%d is taller than %d", src, dst, best->srcY, best->srcH, best->srcBuffer->bufferHeight);
} else if (!g_Config.bSkipGPUReadbacks && (!best->srcBuffer->memoryUpdated || channel == RASTER_DEPTH)) {
ReadFramebufferToMemory(best->srcBuffer, 0, best->srcY, best->srcBuffer->width, best->srcH, channel, Draw::ReadbackMode::BLOCK);
best->srcBuffer->usageFlags = (best->srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
}
return false;
} else {
Expand Down Expand Up @@ -2106,7 +2145,7 @@ bool FramebufferManagerCommon::FindTransferFramebuffer(u32 basePtr, int stride_p
}

const BlockTransferRect *best = nullptr;
// Sort candidates by just recency for now, we might add other.
// Use some heuristics to find the best candidate.
for (size_t i = 0; i < candidates.size(); i++) {
const BlockTransferRect *candidate = &candidates[i];

Expand Down