Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

softgpu: Check depth test early on simple stencil #16001

Merged
merged 1 commit into from
Sep 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions GPU/Software/DrawPixel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,10 @@ static inline bool DepthTestPassed(GEComparison func, int x, int y, int stride,
}
}

// Externally linked entry point for the depth comparison.
// DepthTestPassed is static inline in this file, so code outside it cannot call
// it directly; this wrapper forwards every argument unchanged and hands back
// DepthTestPassed's result as-is.
bool CheckDepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const bool passed = DepthTestPassed(func, x, y, stride, z);
	return passed;
}

static inline u32 ApplyLogicOp(GELogicOp op, u32 old_color, u32 new_color) {
// All of the operations here intentionally preserve alpha/stencil.
switch (op) {
Expand Down Expand Up @@ -400,7 +404,7 @@ template <bool clearMode, GEBufferFormat fbFormat>
void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg color_in, const PixelFuncID &pixelID) {
Vec4<int> prim_color = Vec4<int>(color_in).Clamp(0, 255);
// Depth range test - applied in clear mode, if not through mode.
if (pixelID.applyDepthRange)
if (pixelID.applyDepthRange && !pixelID.earlyZChecks)
if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
return;

Expand Down Expand Up @@ -436,14 +440,14 @@ void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg colo
}

// Also apply depth at the same time. If disabled, same as passing.
if (pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
if (!pixelID.earlyZChecks && pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZFail(), stencil);
SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
return;
}

stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZPass(), stencil);
} else {
} else if (!pixelID.earlyZChecks) {
if (pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
return;
}
Expand Down
2 changes: 2 additions & 0 deletions GPU/Software/DrawPixel.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ SingleFunc GetSingleFunc(const PixelFuncID &id);
void Init();
void Shutdown();

// Runs the depth comparison `func` for depth value `z` at pixel (x, y).
// `stride` is the depth buffer stride (callers pass pixelID.cached.depthbufStride).
// NOTE(review): presumably returns true when the pixel passes the test and should
// be drawn - confirm against DepthTestPassed in DrawPixel.cpp.
bool CheckDepthTestPassed(GEComparison func, int x, int y, int stride, u16 z);

bool DescribeCodePtr(const u8 *ptr, std::string &name);

struct PixelBlendState {
Expand Down
10 changes: 5 additions & 5 deletions GPU/Software/DrawPixelX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {
if (!regCache_.Has(RegCache::GEN_COLOR_OFF)) {
Describe("GetColorOff");
if (id.useStandardStride && !id.dithering) {
bool loadDepthOff = id.depthWrite || id.DepthTestFunc() != GE_COMP_ALWAYS;
bool loadDepthOff = id.depthWrite || (id.DepthTestFunc() != GE_COMP_ALWAYS && !id.earlyZChecks);
X64Reg depthTemp = INVALID_REG;
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
Expand Down Expand Up @@ -345,7 +345,7 @@ void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {
}

bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
if (id.applyDepthRange) {
if (id.applyDepthRange && !id.earlyZChecks) {
Describe("ApplyDepthR");
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
X64Reg idReg = GetPixelID();
Expand All @@ -365,7 +365,7 @@ bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
// Since this is early on, try to free up the z reg if we don't need it anymore.
if (id.clearMode && !id.DepthClear())
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
else if (!id.clearMode && !id.depthWrite && id.DepthTestFunc() == GE_COMP_ALWAYS)
else if (!id.clearMode && !id.depthWrite && (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks))
regCache_.ForceRelease(RegCache::GEN_ARG_Z);

return true;
Expand Down Expand Up @@ -721,7 +721,7 @@ bool PixelJitCache::Jit_StencilTest(const PixelFuncID &id, RegCache::Reg stencil
}

bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, RegCache::Reg stencilReg) {
if (id.DepthTestFunc() == GE_COMP_ALWAYS)
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
return true;

X64Reg depthOffReg = GetDepthOff(id);
Expand Down Expand Up @@ -964,7 +964,7 @@ bool PixelJitCache::Jit_WriteStencilOnly(const PixelFuncID &id, RegCache::Reg st
}

bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) {
if (id.DepthTestFunc() == GE_COMP_ALWAYS)
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
return true;

if (id.DepthTestFunc() == GE_COMP_NEVER) {
Expand Down
7 changes: 7 additions & 0 deletions GPU/Software/FuncId.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,13 @@ void ComputePixelFuncID(PixelFuncID *id, bool throughMode) {

id->applyLogicOp = gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY;
id->applyFog = gstate.isFogEnabled() && !throughMode;

id->earlyZChecks = id->DepthTestFunc() != GE_COMP_ALWAYS;
if (id->stencilTest && id->earlyZChecks) {
// Can't do them early if stencil might need to write.
if (id->SFail() != GE_STENCILOP_KEEP || id->ZFail() != GE_STENCILOP_KEEP)
id->earlyZChecks = false;
}
}

// Cache some values for later convenience.
Expand Down
3 changes: 2 additions & 1 deletion GPU/Software/FuncId.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ struct PixelFuncID {
uint8_t sFail : 3;
uint8_t zFail : 3;
uint8_t zPass : 3;
// 60 bits, 4 free.
bool earlyZChecks : 1;
// 61 bits, 3 free.
};
};

Expand Down
95 changes: 81 additions & 14 deletions GPU/Software/Rasterizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,11 @@ void DrawTriangleSlice(
const bool flatColor1 = flatColorAll || (v0.color1 == v1.color1 && v0.color1 == v2.color1);
const bool noFog = clearMode || !pixelID.applyFog || (v0.fogdepth >= 1.0f && v1.fogdepth >= 1.0f && v2.fogdepth >= 1.0f);

if (pixelID.applyDepthRange && flatZ) {
if (v0.screenpos.z < pixelID.cached.minz || v0.screenpos.z > pixelID.cached.maxz)
return;
}

#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
uint32_t bpp = pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
std::string tag = StringFromFormat("DisplayListT_%08x", state.listPC);
Expand Down Expand Up @@ -754,6 +759,32 @@ void DrawTriangleSlice(
if (AnyMask<useSSE4>(mask)) {
Vec4<float> wsum_recip = EdgeRecip(w0, w1, w2);

Vec4<int> z;
if (flatZ) {
z = Vec4<int>::AssignToAll(v2.screenpos.z);
} else {
// Z is interpolated pretty much directly.
Vec4<float> zfloats = w0.Cast<float>() * v0.screenpos.z + w1.Cast<float>() * v1.screenpos.z + w2.Cast<float>() * v2.screenpos.z;
z = (zfloats * wsum_recip).Cast<int>();
}

if (pixelID.earlyZChecks) {
for (int i = 0; i < 4; ++i) {
if (pixelID.applyDepthRange) {
if (z[i] < pixelID.cached.minz || z[i] > pixelID.cached.maxz)
mask[i] = -1;
}
if (mask[i] < 0)
continue;

int x = p.x + (i & 1);
int y = p.y + (i / 2);
if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z[i])) {
mask[i] = -1;
}
}
}

// Color interpolation is not perspective corrected on the PSP.
Vec4<int> prim_color[4];
if (!flatColor0) {
Expand Down Expand Up @@ -816,15 +847,6 @@ void DrawTriangleSlice(
}
}

Vec4<int> z;
if (flatZ) {
z = Vec4<int>::AssignToAll(v2.screenpos.z);
} else {
// Z is interpolated pretty much directly.
Vec4<float> zfloats = w0.Cast<float>() * v0.screenpos.z + w1.Cast<float>() * v1.screenpos.z + w2.Cast<float>() * v2.screenpos.z;
z = (zfloats * wsum_recip).Cast<int>();
}

PROFILE_THIS_SCOPE("draw_tri_px");
DrawingCoords subp = p;
for (int i = 0; i < 4; ++i) {
Expand Down Expand Up @@ -947,6 +969,12 @@ void DrawRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &
Vec4<int> z = Vec4<int>::AssignToAll(v1.screenpos.z);
Vec3<int> sec_color = v1.color1;

if (state.pixelID.applyDepthRange) {
// We can bail early since the Z is flat.
if (v1.screenpos.z < state.pixelID.cached.minz || v1.screenpos.z > state.pixelID.cached.maxz)
return;
}

#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
std::string tag = StringFromFormat("DisplayListR_%08x", state.listPC);
Expand All @@ -972,6 +1000,19 @@ void DrawRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &
prim_color[i] = v1.color0;
}

if (state.pixelID.earlyZChecks) {
for (int i = 0; i < 4; ++i) {
if (mask[i] < 0)
continue;

int x = p.x + (i & 1);
int y = p.y + (i / 2);
if (!CheckDepthTestPassed(state.pixelID.DepthTestFunc(), x, y, state.pixelID.cached.depthbufStride, z[i])) {
mask[i] = -1;
}
}
}

if (state.enableTextures) {
Vec4<float> s, t;
s = Vec4<float>::AssignToAll(st.s()) + sto4;
Expand Down Expand Up @@ -1038,6 +1079,20 @@ void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerSta
auto &pixelID = state.pixelID;
auto &samplerID = state.samplerID;

DrawingCoords p = TransformUnit::ScreenToDrawing(pos);
u16 z = pos.z;

if (pixelID.earlyZChecks) {
if (pixelID.applyDepthRange) {
if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
return;
}

if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), p.x, p.y, pixelID.cached.depthbufStride, z)) {
return;
}
}

if (state.enableTextures) {
float s = v0.texturecoords.s();
float t = v0.texturecoords.t();
Expand All @@ -1060,9 +1115,6 @@ void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerSta
if (!pixelID.clearMode)
prim_color += Vec4<int>(sec_color, 0);

DrawingCoords p = TransformUnit::ScreenToDrawing(pos);
u16 z = pos.z;

u8 fog = 255;
if (pixelID.applyFog) {
fog = ClampFogDepth(v0.fogdepth);
Expand Down Expand Up @@ -1302,7 +1354,23 @@ void DrawLine(const VertexData &v0, const VertexData &v1, const BinCoords &range
double z = a.z;
const int steps1 = steps == 0 ? 1 : steps;
for (int i = 0; i < steps; i++) {
if (x >= range.x1 && y >= range.y1 && x <= range.x2 && y <= range.y2) {
DrawingCoords p = TransformUnit::ScreenToDrawing(x, y);

bool maskOK = x >= range.x1 && y >= range.y1 && x <= range.x2 && y <= range.y2;
if (maskOK) {
if (pixelID.earlyZChecks) {
if (pixelID.applyDepthRange) {
if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
maskOK = false;
}

if (!CheckDepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
maskOK = false;
}
}
}

if (maskOK) {
// Interpolate between the two points.
Vec4<int> prim_color;
Vec3<int> sec_color;
Expand Down Expand Up @@ -1368,7 +1436,6 @@ void DrawLine(const VertexData &v0, const VertexData &v1, const BinCoords &range
prim_color += Vec4<int>(sec_color, 0);

PROFILE_THIS_SCOPE("draw_px");
DrawingCoords p = TransformUnit::ScreenToDrawing(x, y);
state.drawPixel(p.x, p.y, z, fog, ToVec4IntArg(prim_color), pixelID);

#if defined(SOFTGPU_MEMORY_TAGGING_DETAILED) || defined(SOFTGPU_MEMORY_TAGGING_BASIC)
Expand Down