From c9edf4ddaa46cc3bdc62046e643c59fdca7c3f92 Mon Sep 17 00:00:00 2001 From: Eric Wasylishen Date: Tue, 7 Nov 2023 23:55:21 -0700 Subject: [PATCH] vis: huge speedup from fixing false sharing with stat counters gmsp3v2.bsp, 32 threads 3950x, 157s -> 61s --- include/vis/vis.hh | 41 +++++++++++++++++++++------- vis/flow.cc | 48 ++++++++++++++++----------------- vis/vis.cc | 66 +++++++++++++++++++++++++++------------------- 3 files changed, 93 insertions(+), 62 deletions(-) diff --git a/include/vis/vis.hh b/include/vis/vis.hh index 64d956d3..46919291 100644 --- a/include/vis/vis.hh +++ b/include/vis/vis.hh @@ -196,15 +196,45 @@ struct pstack_t // important for perf as a ton of these are stack allocated, needs to be be just a pointer bump static_assert(std::is_trivially_default_constructible_v); +struct visstats_t +{ + int64_t c_portaltest = 0; + int64_t c_portalpass = 0; + int64_t c_portalcheck = 0; + int64_t c_mightseeupdate = 0; + int64_t c_noclip = 0; + int64_t c_vistest = 0; + int64_t c_mighttest = 0; + int64_t c_chains = 0; + int64_t c_leafskip = 0; + int64_t c_portalskip = 0; + + visstats_t operator+(const visstats_t& other) const { + visstats_t result; + result.c_portaltest = this->c_portaltest + other.c_portaltest; + result.c_portalpass = this->c_portalpass + other.c_portalpass; + result.c_portalcheck = this->c_portalcheck + other.c_portalcheck; + result.c_mightseeupdate = this->c_mightseeupdate + other.c_mightseeupdate; + result.c_noclip = this->c_noclip + other.c_noclip; + result.c_vistest = this->c_vistest + other.c_vistest; + result.c_mighttest = this->c_mighttest + other.c_mighttest; + result.c_chains = this->c_chains + other.c_chains; + result.c_leafskip = this->c_leafskip + other.c_leafskip; + result.c_portalskip = this->c_portalskip + other.c_portalskip; + return result; + } +}; + viswinding_t *AllocStackWinding(pstack_t &stack); void FreeStackWinding(viswinding_t *&w, pstack_t &stack); -viswinding_t *ClipStackWinding(viswinding_t *in, pstack_t &stack, const qplane3d &split); +viswinding_t *ClipStackWinding(visstats_t &stats, viswinding_t *in, pstack_t &stack, const qplane3d &split); struct threaddata_t { leafbits_t &leafvis; visportal_t *base; pstack_t pstack_head; + visstats_t stats; }; extern int numportals; @@ -214,13 +244,6 @@ extern int portalleafs_real; extern std::vector portals; // always numportals * 2; front and back extern std::vector leafs; -extern int c_noclip; -extern int c_portaltest, c_portalpass, c_portalcheck; -extern int c_vistest, c_mighttest; -extern unsigned long c_chains; - -extern bool showgetleaf; - extern std::vector uncompressed; extern int leafbytes; extern int leafbytes_real; @@ -230,7 +253,7 @@ extern fs::path portalfile, statefile, statetmpfile; void BasePortalVis(void); -void PortalFlow(visportal_t *p); +visstats_t PortalFlow(visportal_t *p); void CalcAmbientSounds(mbsp_t *bsp); diff --git a/vis/flow.cc b/vis/flow.cc index 5c2f2a6f..170e3b55 100644 --- a/vis/flow.cc +++ b/vis/flow.cc @@ -4,12 +4,6 @@ #include #include -unsigned long c_chains; -int c_vistest, c_mighttest; - -static int c_portalskip; -static int c_leafskip; - /* ============== ClipToSeparators @@ -30,7 +24,7 @@ static int c_leafskip; pointer, was measurably faster ============== */ -static void ClipToSeparators(const viswinding_t *source, const qplane3d src_pl, const viswinding_t *pass, +static void ClipToSeparators(visstats_t &stats, const viswinding_t *source, const qplane3d src_pl, const viswinding_t *pass, viswinding_t *&target, unsigned int test, pstack_t &stack) { int i, j, k, l; @@ -114,7 +108,7 @@ static void ClipToSeparators(const viswinding_t *source, const qplane3d src_pl, stack.numseparators[test]++; } - target = ClipStackWinding(target, stack, sep); + target = ClipStackWinding(stats, target, stack, sep); if (!target) return; // target is not visible @@ -150,7 +144,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs leaf_t *leaf; int i, j, err, numblocks; - ++c_chains; + ++thread->stats.c_chains; leaf = &leafs[leafnum]; @@ -193,7 +187,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs p = leaf->portals[i]; if (!(*prevstack.mightsee)[p->leaf]) { - c_leafskip++; + thread->stats.c_leafskip++; continue; // can't possibly see it } @@ -201,10 +195,10 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs // if the portal can't see anything we haven't allready seen, skip it if (p->status == pstat_done) { - c_vistest++; + thread->stats.c_vistest++; test = p->visbits.data(); } else { - c_mighttest++; + thread->stats.c_mighttest++; test = p->mightsee.data(); } @@ -217,7 +211,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs if (!more) { // can't see anything new - c_portalskip++; + thread->stats.c_portalskip++; continue; } // get plane of portal, point normal into the neighbor leaf @@ -227,7 +221,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs if (qv::epsilonEqual(prevstack.portalplane.normal, backplane.normal, VIS_EQUAL_EPSILON)) continue; // can't go out a coplanar face - c_portalcheck++; + thread->stats.c_portalcheck++; stack.portal = p; stack.next = NULL; @@ -244,7 +238,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs */ /* Clip any part of the target portal behind the source portal */ - stack.pass = ClipStackWinding(p->winding.get(), stack, thread->pstack_head.portalplane); + stack.pass = ClipStackWinding(thread->stats, p->winding.get(), stack, thread->pstack_head.portalplane); if (!stack.pass) continue; @@ -257,31 +251,31 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs } /* Clip any part of the target portal behind the pass portal */ - stack.pass = ClipStackWinding(stack.pass, stack, prevstack.portalplane); + stack.pass = ClipStackWinding(thread->stats, stack.pass, stack, prevstack.portalplane); if (!stack.pass) continue; /* Clip any part of the source portal in front of the target portal */ - stack.source = ClipStackWinding(prevstack.source, stack, backplane); + stack.source = ClipStackWinding(thread->stats, prevstack.source, stack, backplane); if (!stack.source) { FreeStackWinding(stack.pass, stack); continue; } - c_portaltest++; + thread->stats.c_portaltest++; /* TEST 0 :: source -> pass -> target */ if (vis_options.level.value() > 0) { if (stack.numseparators[0]) { for (j = 0; j < stack.numseparators[0]; j++) { - stack.pass = ClipStackWinding(stack.pass, stack, stack.separators[0][j]); + stack.pass = ClipStackWinding(thread->stats, stack.pass, stack, stack.separators[0][j]); if (!stack.pass) break; } } else { /* Using prevstack source for separator cache correctness */ ClipToSeparators( - prevstack.source, thread->pstack_head.portalplane, prevstack.pass, stack.pass, 0, stack); + thread->stats, prevstack.source, thread->pstack_head.portalplane, prevstack.pass, stack.pass, 0, stack); } if (!stack.pass) { FreeStackWinding(stack.source, stack); @@ -293,13 +287,13 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs if (vis_options.level.value() > 1) { if (stack.numseparators[1]) { for (j = 0; j < stack.numseparators[1]; j++) { - stack.pass = ClipStackWinding(stack.pass, stack, stack.separators[1][j]); + stack.pass = ClipStackWinding(thread->stats, stack.pass, stack, stack.separators[1][j]); if (!stack.pass) break; } } else { /* Using prevstack source for separator cache correctness */ - ClipToSeparators(prevstack.pass, prevstack.portalplane, prevstack.source, stack.pass, 1, stack); + ClipToSeparators(thread->stats, prevstack.pass, prevstack.portalplane, prevstack.source, stack.pass, 1, stack); } if (!stack.pass) { FreeStackWinding(stack.source, stack); @@ -309,7 +303,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs /* TEST 2 :: target -> pass -> source */ if (vis_options.level.value() > 2) { - ClipToSeparators(stack.pass, stack.portalplane, prevstack.pass, stack.source, 2, stack); + ClipToSeparators(thread->stats, stack.pass, stack.portalplane, prevstack.pass, stack.source, 2, stack); if (!stack.source) { FreeStackWinding(stack.pass, stack); continue; @@ -318,14 +312,14 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs /* TEST 3 :: pass -> target -> source */ if (vis_options.level.value() > 3) { - ClipToSeparators(prevstack.pass, prevstack.portalplane, stack.pass, stack.source, 3, stack); + ClipToSeparators(thread->stats, prevstack.pass, prevstack.portalplane, stack.pass, stack.source, 3, stack); if (!stack.source) { FreeStackWinding(stack.pass, stack); continue; } } - c_portalpass++; + thread->stats.c_portalpass++; // flow through it for real RecursiveLeafFlow(p->leaf, thread, stack); @@ -340,7 +334,7 @@ static void RecursiveLeafFlow(int leafnum, threaddata_t *thread, pstack_t &prevs PortalFlow =============== */ -void PortalFlow(visportal_t *p) +visstats_t PortalFlow(visportal_t *p) { threaddata_t data{p->visbits}; @@ -357,6 +351,8 @@ void PortalFlow(visportal_t *p) data.pstack_head.mightsee = &p->mightsee; RecursiveLeafFlow(p->leaf, &data, data.pstack_head); + + return data.stats; } /* diff --git a/vis/vis.cc b/vis/vis.cc index 31487633..0ad86577 100644 --- a/vis/vis.cc +++ b/vis/vis.cc @@ -3,6 +3,7 @@ #include #include #include // for std::countr_zero +#include // for std::accumulate #include #include @@ -24,11 +25,6 @@ int portalleafs_real; /* real no. of leafs after expanding PRT2 clusters. Not us std::vector portals; // always numportals * 2; front and back std::vector leafs; -int c_portaltest, c_portalpass, c_portalcheck, c_mightseeupdate; -int c_noclip = 0; - -bool showgetleaf = true; - static std::vector vismap; uint32_t originalvismapsize; @@ -116,7 +112,7 @@ void FreeStackWinding(viswinding_t *&w, pstack_t &stack) is returned. ================== */ -viswinding_t *ClipStackWinding(viswinding_t *in, pstack_t &stack, const qplane3d &split) +viswinding_t *ClipStackWinding(visstats_t &stats, viswinding_t *in, pstack_t &stack, const qplane3d &split) { vec_t *dists = (vec_t *)alloca(sizeof(vec_t) * (in->size() + 1)); int *sides = (int *)alloca(sizeof(int) * (in->size() + 1)); @@ -216,7 +212,7 @@ viswinding_t *ClipStackWinding(viswinding_t *in, pstack_t &stack, const qplane3d noclip: FreeStackWinding(neww, stack); - c_noclip++; + stats.c_noclip++; return in; } @@ -271,7 +267,7 @@ visportal_t *GetNextPortal(void) Called with the lock held. ============= */ -static void UpdateMightsee(const leaf_t &source, const leaf_t &dest) +static void UpdateMightsee(visstats_t &stats, const leaf_t &source, const leaf_t &dest) { size_t leafnum = &dest - leafs.data(); for (size_t i = 0; i < source.numportals; i++) { @@ -282,7 +278,7 @@ static void UpdateMightsee(const leaf_t &source, const leaf_t &dest) if (p->mightsee[leafnum]) { p->mightsee[leafnum] = false; p->nummightsee--; - c_mightseeupdate++; + stats.c_mightseeupdate++; } } } @@ -297,7 +293,7 @@ static void UpdateMightsee(const leaf_t &source, const leaf_t &dest) Called with the lock held. ============= */ -static void PortalCompleted(visportal_t *completed) +static void PortalCompleted(visstats_t &stats, visportal_t *completed) { int i, j, k, bit, numblocks; int leafnum; @@ -349,7 +345,7 @@ static void PortalCompleted(visportal_t *completed) bit = std::countr_zero(changed); changed &= ~nth_bit(bit); leafnum = (j << leafbits_t::shift) + bit; - UpdateMightsee(leafs[leafnum], myleaf); + UpdateMightsee(stats, leafs[leafnum], myleaf); } } } @@ -365,7 +361,7 @@ static duration stateinterval; LeafThread ============== */ -void LeafThread(size_t) +static visstats_t LeafThread() { visportal_t *p; @@ -380,14 +376,16 @@ void LeafThread(size_t) p = GetNextPortal(); if (!p) - return; + return {}; - PortalFlow(p); + visstats_t stats = PortalFlow(p); - PortalCompleted(p); + PortalCompleted(stats, p); logging::print(logging::flag::VERBOSE, "portal:{:4} mightsee:{:4} cansee:{:4}\n", (ptrdiff_t)(p - portals.data()), p->nummightsee, p->numcansee); + + return stats; } /* @@ -503,7 +501,7 @@ static void ClusterFlow(int clusternum, leafbits_t &buffer, mbsp_t *bsp) CalcPortalVis ================== */ -void CalcPortalVis(const mbsp_t *bsp) +visstats_t CalcPortalVis(const mbsp_t *bsp) { // fastvis just uses mightsee for a very loose bound if (vis_options.fast.value()) { @@ -511,7 +509,7 @@ void CalcPortalVis(const mbsp_t *bsp) p.visbits = p.mightsee; p.status = pstat_done; } - return; + return {}; } /* @@ -525,14 +523,26 @@ void CalcPortalVis(const mbsp_t *bsp) } portalIndex = startcount; - logging::parallel_for(startcount, numportals * 2, LeafThread); + + std::vector stats_perportal; + stats_perportal.resize(numportals * 2); + + logging::parallel_for(startcount, numportals * 2, [&](size_t i) { + stats_perportal[i] = LeafThread(); + }); + + const visstats_t stats = std::accumulate(stats_perportal.begin(), + stats_perportal.end(), + visstats_t{}); SaveVisState(); - logging::print(logging::flag::VERBOSE, "portalcheck: {} portaltest: {} portalpass: {}\n", c_portalcheck, - c_portaltest, c_portalpass); - logging::print(logging::flag::VERBOSE, "c_vistest: {} c_mighttest: {} c_mightseeupdate {}\n", c_vistest, - c_mighttest, c_mightseeupdate); + logging::print(logging::flag::VERBOSE, "portalcheck: {} portaltest: {} portalpass: {}\n", stats.c_portalcheck, + stats.c_portaltest, stats.c_portalpass); + logging::print(logging::flag::VERBOSE, "c_vistest: {} c_mighttest: {} c_mightseeupdate {}\n", stats.c_vistest, + stats.c_mighttest, stats.c_mightseeupdate); + + return stats; } /* @@ -540,7 +550,7 @@ void CalcPortalVis(const mbsp_t *bsp) CalcVis ================== */ -void CalcVis(mbsp_t *bsp) +visstats_t CalcVis(mbsp_t *bsp) { int i; @@ -552,7 +562,7 @@ void CalcVis(mbsp_t *bsp) } logging::print("Calculating Full Vis:\n"); - CalcPortalVis(bsp); + auto stats = CalcPortalVis(bsp); // // assemble the leaf vis lists by oring and compressing the portal lists @@ -575,6 +585,8 @@ void CalcVis(mbsp_t *bsp) logging::print("average leafs visible: {}\n", avg); } + + return stats; } // =========================================================================== @@ -747,10 +759,10 @@ int vis_main(int argc, const char **argv) uncompressed.resize(portalleafs * leafbytes); } - CalcVis(&bsp); + auto stats = CalcVis(&bsp); - logging::print("c_noclip: {}\n", c_noclip); - logging::print("c_chains: {}\n", c_chains); + logging::print("c_noclip: {}\n", stats.c_noclip); + logging::print("c_chains: {}\n", stats.c_chains); bsp.dvis.bits = std::move(vismap); bsp.dvis.bits.shrink_to_fit();