mitigate overhead of jobsystem

For jobs that do very little work, the jobsystem can introduce a lot of overhead. We mitigate this by: - not waking up worker threads when scheduling several very small jobs, such as the per-face jobs. - not waiting for per-face jobs to finish -- we only did that to avoid a copy of the job's data. - not using multi-threading at all if the job has too little work. We evaluate the amount of work using the scanline length and the number of samples.
google · Jun 26, 2019 · 921c2bc · 921c2bc
1 parent 6ea8ed0
commit 921c2bc
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 42 deletions.
diff --git a/filament/src/Texture.cpp b/filament/src/Texture.cpp
@@ -477,7 +477,7 @@ void FTexture::generatePrefilterMipmap(FEngine& engine,
     for (ssize_t i = baseExp; i >= 0; --i) {
         const size_t dim = 1U << i;
         const size_t level = baseExp - i;
-        const float lod = saturate(level / (numLevels - 1.0));
+        const float lod = saturate(level / (numLevels - 1.0f));
         const float linearRoughness = lod * lod;
 
         Image image;

diff --git a/libs/ibl/src/CubemapIBL.cpp b/libs/ibl/src/CubemapIBL.cpp
@@ -298,7 +298,7 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
     std::atomic_uint progress = {0};
 
     if (linearRoughness == 0) {
-        CubemapUtils::process<CubemapUtils::EmptyState>(dst, js, [&]
+        auto scanline = [&]
                 (CubemapUtils::EmptyState&, size_t y, Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
                     if (UTILS_UNLIKELY(updater)) {
                         size_t p = progress.fetch_add(1, std::memory_order_relaxed) + 1;
@@ -311,11 +311,17 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
                         // FIXME: we should pick the proper LOD here and do trilinear filtering
                         Cubemap::writeAt(data, cm.sampleAt(N));
                     }
-                });
+        };
+        // at least 256 pixel cubemap before we use multithreading -- the overhead of launching
+        // jobs is too large compared to the work above.
+        if (dst.getDimensions() <= 256) {
+            CubemapUtils::processSingleThreaded<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
+        } else {
+            CubemapUtils::process<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
+        }
         return;
     }
 
-
     // be careful w/ the size of this structure, the smaller the better
     struct CacheEntry {
         float3 L;
@@ -391,15 +397,12 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
         return lhs.brdf_NoL < rhs.brdf_NoL;
     });
 
-    CubemapUtils::process<CubemapUtils::EmptyState>(dst, js,
-            [&](CubemapUtils::EmptyState&, size_t y,
-                    Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
-
+    auto scanline = [&](CubemapUtils::EmptyState&, size_t y,
+            Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
         if (UTILS_UNLIKELY(updater)) {
             size_t p = progress.fetch_add(1, std::memory_order_relaxed) + 1;
             updater(0, (float)p / (dim * 6));
         }
-
         mat3 R;
         const size_t numSamples = cache.size();
         for (size_t x = 0; x < dim; ++x, ++data) {
@@ -423,7 +426,15 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
             }
             Cubemap::writeAt(data, Cubemap::Texel(Li));
         }
-    });
+    };
+
+    // don't use the jobsystem unless we have enough work per scanline -- or the overhead of
+    // launching jobs will prevail.
+    if (dst.getDimensions() * maxNumSamples <= 256) {
+        CubemapUtils::processSingleThreaded<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
+    } else {
+        CubemapUtils::process<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
+    }
 }
 
 /*

diff --git a/libs/ibl/src/CubemapUtils.cpp b/libs/ibl/src/CubemapUtils.cpp
@@ -332,8 +332,8 @@ void CubemapUtils::generateUVGrid(JobSystem& js, Cubemap& cml, size_t gridFreque
     const float uvGridHDRIntensity = 5.0f;
     size_t gridSizeX = cml.getDimensions() / gridFrequencyX;
     size_t gridSizeY = cml.getDimensions() / gridFrequencyY;
-    CubemapUtils::process<CubemapUtils::EmptyState>(cml, js,
-            [ & ](CubemapUtils::EmptyState&,
+    CubemapUtils::process<EmptyState>(cml, js,
+            [ & ](EmptyState&,
                     size_t y, Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
                 for (size_t x = 0; x < dim; ++x, ++data) {
                     bool grid = bool(((x / gridSizeX) ^ (y / gridSizeY)) & 1);

diff --git a/libs/ibl/src/CubemapUtilsImpl.h b/libs/ibl/src/CubemapUtilsImpl.h
@@ -19,6 +19,7 @@
 
 #include <ibl/CubemapUtils.h>
 
+#include <utils/compiler.h>
 #include <utils/JobSystem.h>
 
 namespace filament {
@@ -42,34 +43,40 @@ void CubemapUtils::process(
 
     JobSystem::Job* parent = js.createJob();
     for (size_t faceIndex = 0; faceIndex < 6; faceIndex++) {
-        const Cubemap::Face f = (Cubemap::Face)faceIndex;
-        JobSystem::Job* face = jobs::createJob(js, parent,
-                [faceIndex, &states, f, &cm, &dim, &proc]
-                        (utils::JobSystem& js, utils::JobSystem::Job* parent) {
-                    STATE& s = states[faceIndex];
-                    Image& image(cm.getImageForFace(f));
-
-                    auto parallelJobTask = [&image, &proc, &s, dim, f](size_t y0, size_t c) {
-                        for (size_t y = y0; y < y0 + c; y++) {
-                            Cubemap::Texel* data =
-                                    static_cast<Cubemap::Texel*>(image.getPixelRef(0, y));
-                            proc(s, y, f, data, dim);
-                        }
-                    };
-
-                    if (std::is_same<STATE, CubemapUtils::EmptyState>::value) {
-                        auto job = jobs::parallel_for(js, parent, 0, uint32_t(dim),
-                                std::ref(parallelJobTask), jobs::CountSplitter<64, 8>());
-
-                        // we need to wait here because parallelJobTask is passed by reference
-                        js.runAndWait(job);
-                    } else {
-                        // if we have a per-thread STATE, we can't parallel_for()
-                        parallelJobTask(0, dim);
-                    }
-                }, std::ref(js), parent);
-        js.run(face);
+
+        auto perFaceJob = [faceIndex, &states, &cm, dim, &proc]
+                (utils::JobSystem& js, utils::JobSystem::Job* parent) {
+            STATE& s = states[faceIndex];
+            Image& image(cm.getImageForFace((Cubemap::Face)faceIndex));
+
+            // here we must limit how much we capture so we can use this closure
+            // by value.
+            auto parallelJobTask = [&s, &image, &proc, dim = uint16_t(dim),
+                                    faceIndex = uint8_t(faceIndex)](size_t y0, size_t c) {
+                for (size_t y = y0; y < y0 + c; y++) {
+                    Cubemap::Texel* data =
+                            static_cast<Cubemap::Texel*>(image.getPixelRef(0, y));
+                    proc(s, y, (Cubemap::Face)faceIndex, data, dim);
+                }
+            };
+
+            constexpr bool isStateLess = std::is_same<STATE, CubemapUtils::EmptyState>::value;
+            if (UTILS_LIKELY(isStateLess)) {
+                // create the job, copying it by value
+                auto job = jobs::parallel_for(js, parent, 0, uint32_t(dim),
+                        parallelJobTask, jobs::CountSplitter<64, 8>());
+                // no need to signal here, since we're just scheduling work
+                js.run(job, JobSystem::DONT_SIGNAL);
+            } else {
+                // if we have a per-thread STATE, we can't parallel_for()
+                parallelJobTask(0, dim);
+            }
+        };
+
+        // no need to signal here, since we're just scheduling work
+        js.run(jobs::createJob(js, parent, perFaceJob, std::ref(js), parent), JobSystem::DONT_SIGNAL);
     }
+
     // wait for all our threads to finish
     js.runAndWait(parent);
 

diff --git a/libs/utils/src/JobSystem.cpp b/libs/utils/src/JobSystem.cpp
@@ -384,6 +384,8 @@ void JobSystem::release(JobSystem::Job*& job) noexcept {
 void JobSystem::run(JobSystem::Job*& job, uint32_t flags) noexcept {
 #if HEAVY_SYSTRACE
     SYSTRACE_CALL();
+#else
+    SYSTRACE_CONTEXT();
 #endif
 
     ThreadState& state(getState());
@@ -395,14 +397,14 @@ void JobSystem::run(JobSystem::Job*& job, uint32_t flags) noexcept {
 
     put(state.workQueue, job);
 
-    SYSTRACE_CONTEXT();
     SYSTRACE_VALUE32("JobSystem::activeJobs", activeJobs + 1);
 
     // wake-up a thread if needed...
     if (!(flags & DONT_SIGNAL)) {
-        // wake-up a queue
+        // wake-up multiple queues because there could be multiple jobs queued
+        // especially if DONT_SIGNAL was used
         { std::lock_guard<Mutex> lock(mLooperLock); }
-        mLooperCondition.notify_one();
+        mLooperCondition.notify_all();
     }
 
     // after run() returns, the job is virtually invalid (it'll die on its own)