diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 7974ff768591..9d14d3a14d03 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -133,34 +133,44 @@ class ThreadGroup::Impl {
 #endif
     }
     if (exclude_worker0) {  // master thread run task
-#if defined(__ANDROID__)
-      SetFullCpuAffinity();
-#else
-      // if we set TVM_BIND_MASTER_THREAD to be 1, we will bind master thread
-      // to core 0.
-      const char* bind_master_thread = getenv("TVM_BIND_MASTER_THREAD");
-      if (bind_master_thread && atoi(bind_master_thread) == 1) {
-        cpu_set_t cpuset;
-        CPU_ZERO(&cpuset);
-        if (reverse) {
-          CPU_SET(sorted_order_[sorted_order_.size() - 1], &cpuset);
-        } else {
-          CPU_SET(sorted_order_[0], &cpuset);
-        }
-        pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
-      }
-      pthread_atfork(nullptr, nullptr, ThreadGroup::Impl::SetFullCpuAffinity);
-#endif
+      // The master thread is free to migrate among the permitted cores.
+      // Typically the OS schedules the master thread onto core 0, which is
+      // idle while the other workers are running.
+      // See the comment inside SetMasterThreadFullCpuAffinity for details.
+      SetMasterThreadFullCpuAffinity(reverse);
     }
 #endif
   }
 
-  static void SetFullCpuAffinity() {
+  void SetMasterThreadFullCpuAffinity(bool reverse) {
#if defined(__linux__) || defined(__ANDROID__)
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
-    for (unsigned i = 0; i < std::thread::hardware_concurrency(); i++) {
-      CPU_SET(i, &cpuset);
+    // For example, suppose we have 2xA72 + 4xA53 (core ids 0 - 5, where 4 and 5
+    // are the A72 big cores) and the config_threadpool API restricts us to the
+    // 4xA53, so sorted_order_ is [4, 5, 0, 1, 2, 3].
+    // By the time this function is called, SetAffinity has already spawned the
+    // worker threads on the little cores, so the TVM master thread should also
+    // run on the little cores, not the big cores (4, 5).
+
+    // Note: this also works on x86. Because x86 has no big.LITTLE, our
+    // implementation uses kBig mode by default, which keeps the master thread
+    // on the intended cores.
+    if (reverse) {
+      for (int i = 0; i < little_count_; ++i) {
+        CPU_SET(sorted_order_[sorted_order_.size() - i - 1], &cpuset);
+      }
+    } else {
+      int big_count = big_count_;
+      // Imagine an x86 CPU with cores 0 - 7: physical cores are 0 - 3, logical
+      // (hyper-threaded) cores are 4 - 7, and big_count_ is 8. We want to run
+      // on the physical cores only, to avoid hyper-threading contention.
+#if defined(_M_X64) || defined(__x86_64__)
+      big_count /= 2;  // ignore hyper-threading
+#endif
+      for (int i = 0; i < big_count; ++i) {
+        CPU_SET(sorted_order_[i], &cpuset);
+      }
     }
 #if defined(__ANDROID__)
     sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
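
As context for the hunk above, here is a minimal self-contained C++ sketch of the affinity mechanism the patch builds on: collect a chosen subset of core ids into a cpu_set_t with CPU_ZERO/CPU_SET, then bind the calling thread with pthread_setaffinity_np. The helper name PinCurrentThreadToCores and the hard-coded core list are illustrative assumptions, not TVM API; the real patch derives the core list from sorted_order_, big_count_, and little_count_ as shown in the diff.

    // Minimal standalone sketch (assumed names, not TVM code).
    // Compile on Linux with g++, which defines _GNU_SOURCE by default;
    // CPU_SET and pthread_setaffinity_np require it.
    #include <pthread.h>
    #include <sched.h>

    #include <cstdio>
    #include <vector>

    // Hypothetical helper: pin the calling thread to an explicit list of
    // core ids, the same way the patch pins the master thread to a slice
    // of sorted_order_. Returns true on success.
    bool PinCurrentThreadToCores(const std::vector<int>& core_ids) {
      cpu_set_t cpuset;
      CPU_ZERO(&cpuset);
      for (int id : core_ids) {
        CPU_SET(id, &cpuset);  // add each allowed core to the mask
      }
      return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0;
    }

    int main() {
      // Mirroring the example in the patch comments: on a 2-big + 4-little
      // SoC with sorted_order_ = [4, 5, 0, 1, 2, 3], the reverse (little)
      // branch pins to the last four entries, i.e. cores {0, 1, 2, 3}.
      std::vector<int> little_cores = {0, 1, 2, 3};
      if (PinCurrentThreadToCores(little_cores)) {
        std::printf("pinned to little cores\n");
      } else {
        std::printf("failed to set affinity\n");
      }
      return 0;
    }

Bionic historically did not expose pthread_setaffinity_np, which is likely why the patch keeps a separate sched_setaffinity path behind #if defined(__ANDROID__).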