Skip to content

Commit

Permalink
mitigate overhead of jobsystem
Browse files Browse the repository at this point in the history
For jobs that do very little work, the jobsystem can introduce
a lot of overhead. We mitigate this by:

- don't wake up worker threads when scheduling several very small jobs,
like when scheduling the per-face jobs.

- don't wait for per-face jobs to finish -- we only did that to avoid
a copy of the job's data.

- don't use multi-threading at all if the job has too little work. We
evaluate the work using the scanline length and number of samples.
  • Loading branch information
pixelflinger committed Jun 26, 2019
1 parent 6ea8ed0 commit 921c2bc
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 42 deletions.
2 changes: 1 addition & 1 deletion filament/src/Texture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ void FTexture::generatePrefilterMipmap(FEngine& engine,
for (ssize_t i = baseExp; i >= 0; --i) {
const size_t dim = 1U << i;
const size_t level = baseExp - i;
const float lod = saturate(level / (numLevels - 1.0));
const float lod = saturate(level / (numLevels - 1.0f));
const float linearRoughness = lod * lod;

Image image;
Expand Down
29 changes: 20 additions & 9 deletions libs/ibl/src/CubemapIBL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
std::atomic_uint progress = {0};

if (linearRoughness == 0) {
CubemapUtils::process<CubemapUtils::EmptyState>(dst, js, [&]
auto scanline = [&]
(CubemapUtils::EmptyState&, size_t y, Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
if (UTILS_UNLIKELY(updater)) {
size_t p = progress.fetch_add(1, std::memory_order_relaxed) + 1;
Expand All @@ -311,11 +311,17 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
// FIXME: we should pick the proper LOD here and do trilinear filtering
Cubemap::writeAt(data, cm.sampleAt(N));
}
});
};
// at least 256 pixel cubemap before we use multithreading -- the overhead of launching
// jobs is too large compared to the work above.
if (dst.getDimensions() <= 256) {
CubemapUtils::processSingleThreaded<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
} else {
CubemapUtils::process<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
}
return;
}


// be careful w/ the size of this structure, the smaller the better
struct CacheEntry {
float3 L;
Expand Down Expand Up @@ -391,15 +397,12 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
return lhs.brdf_NoL < rhs.brdf_NoL;
});

CubemapUtils::process<CubemapUtils::EmptyState>(dst, js,
[&](CubemapUtils::EmptyState&, size_t y,
Cubemap::Face f, Cubemap::Texel* data, size_t dim) {

auto scanline = [&](CubemapUtils::EmptyState&, size_t y,
Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
if (UTILS_UNLIKELY(updater)) {
size_t p = progress.fetch_add(1, std::memory_order_relaxed) + 1;
updater(0, (float)p / (dim * 6));
}

mat3 R;
const size_t numSamples = cache.size();
for (size_t x = 0; x < dim; ++x, ++data) {
Expand All @@ -423,7 +426,15 @@ void CubemapIBL::roughnessFilter(JobSystem& js, Cubemap& dst, const std::vector<
}
Cubemap::writeAt(data, Cubemap::Texel(Li));
}
});
};

// don't use the jobsystem unless we have enough work per scanline -- or the overhead of
// launching jobs will prevail.
if (dst.getDimensions() * maxNumSamples <= 256) {
CubemapUtils::processSingleThreaded<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
} else {
CubemapUtils::process<CubemapUtils::EmptyState>(dst, js, std::ref(scanline));
}
}

/*
Expand Down
4 changes: 2 additions & 2 deletions libs/ibl/src/CubemapUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,8 @@ void CubemapUtils::generateUVGrid(JobSystem& js, Cubemap& cml, size_t gridFreque
const float uvGridHDRIntensity = 5.0f;
size_t gridSizeX = cml.getDimensions() / gridFrequencyX;
size_t gridSizeY = cml.getDimensions() / gridFrequencyY;
CubemapUtils::process<CubemapUtils::EmptyState>(cml, js,
[ & ](CubemapUtils::EmptyState&,
CubemapUtils::process<EmptyState>(cml, js,
[ & ](EmptyState&,
size_t y, Cubemap::Face f, Cubemap::Texel* data, size_t dim) {
for (size_t x = 0; x < dim; ++x, ++data) {
bool grid = bool(((x / gridSizeX) ^ (y / gridSizeY)) & 1);
Expand Down
61 changes: 34 additions & 27 deletions libs/ibl/src/CubemapUtilsImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <ibl/CubemapUtils.h>

#include <utils/compiler.h>
#include <utils/JobSystem.h>

namespace filament {
Expand All @@ -42,34 +43,40 @@ void CubemapUtils::process(

JobSystem::Job* parent = js.createJob();
for (size_t faceIndex = 0; faceIndex < 6; faceIndex++) {
const Cubemap::Face f = (Cubemap::Face)faceIndex;
JobSystem::Job* face = jobs::createJob(js, parent,
[faceIndex, &states, f, &cm, &dim, &proc]
(utils::JobSystem& js, utils::JobSystem::Job* parent) {
STATE& s = states[faceIndex];
Image& image(cm.getImageForFace(f));

auto parallelJobTask = [&image, &proc, &s, dim, f](size_t y0, size_t c) {
for (size_t y = y0; y < y0 + c; y++) {
Cubemap::Texel* data =
static_cast<Cubemap::Texel*>(image.getPixelRef(0, y));
proc(s, y, f, data, dim);
}
};

if (std::is_same<STATE, CubemapUtils::EmptyState>::value) {
auto job = jobs::parallel_for(js, parent, 0, uint32_t(dim),
std::ref(parallelJobTask), jobs::CountSplitter<64, 8>());

// we need to wait here because parallelJobTask is passed by reference
js.runAndWait(job);
} else {
// if we have a per-thread STATE, we can't parallel_for()
parallelJobTask(0, dim);
}
}, std::ref(js), parent);
js.run(face);

auto perFaceJob = [faceIndex, &states, &cm, dim, &proc]
(utils::JobSystem& js, utils::JobSystem::Job* parent) {
STATE& s = states[faceIndex];
Image& image(cm.getImageForFace((Cubemap::Face)faceIndex));

// here we must limit how much we capture so we can use this closure
// by value.
auto parallelJobTask = [&s, &image, &proc, dim = uint16_t(dim),
faceIndex = uint8_t(faceIndex)](size_t y0, size_t c) {
for (size_t y = y0; y < y0 + c; y++) {
Cubemap::Texel* data =
static_cast<Cubemap::Texel*>(image.getPixelRef(0, y));
proc(s, y, (Cubemap::Face)faceIndex, data, dim);
}
};

constexpr bool isStateLess = std::is_same<STATE, CubemapUtils::EmptyState>::value;
if (UTILS_LIKELY(isStateLess)) {
// create the job, copying it by value
auto job = jobs::parallel_for(js, parent, 0, uint32_t(dim),
parallelJobTask, jobs::CountSplitter<64, 8>());
// not need to signal here, since we're just scheduling work
js.run(job, JobSystem::DONT_SIGNAL);
} else {
// if we have a per-thread STATE, we can't parallel_for()
parallelJobTask(0, dim);
}
};

// not need to signal here, since we're just scheduling work
js.run(jobs::createJob(js, parent, perFaceJob, std::ref(js), parent), JobSystem::DONT_SIGNAL);
}

// wait for all our threads to finish
js.runAndWait(parent);

Expand Down
8 changes: 5 additions & 3 deletions libs/utils/src/JobSystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,8 @@ void JobSystem::release(JobSystem::Job*& job) noexcept {
void JobSystem::run(JobSystem::Job*& job, uint32_t flags) noexcept {
#if HEAVY_SYSTRACE
SYSTRACE_CALL();
#else
SYSTRACE_CONTEXT();
#endif

ThreadState& state(getState());
Expand All @@ -395,14 +397,14 @@ void JobSystem::run(JobSystem::Job*& job, uint32_t flags) noexcept {

put(state.workQueue, job);

SYSTRACE_CONTEXT();
SYSTRACE_VALUE32("JobSystem::activeJobs", activeJobs + 1);

// wake-up a thread if needed...
if (!(flags & DONT_SIGNAL)) {
// wake-up a queue
// wake-up multiple queues because there could be multiple jobs queued
// especially if DONT_SIGNAL was used
{ std::lock_guard<Mutex> lock(mLooperLock); }
mLooperCondition.notify_one();
mLooperCondition.notify_all();
}

// after run() returns, the job is virtually invalid (it'll die on its own)
Expand Down

0 comments on commit 921c2bc

Please sign in to comment.