This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Basic CPU Kernel OMP selection based upon whether GPU has been used #7854

Merged 58 commits on Oct 13, 2017
Changes from all commits
Commits (58)
0dd7e61
Basic CPU Kernel OMP selection based upon whether GPU has been used
Sep 11, 2017
5ad31e3
lint
Sep 11, 2017
a74a7ef
Disabling the test_CSVIter for now (#7829)
goswamig Sep 11, 2017
6d3ed8d
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 12, 2017
7994d0d
Use OMP thread count as test in Kernel, set count for Kernel loop
Sep 12, 2017
f0ff547
lint
Sep 12, 2017
b55b470
removed
Sep 12, 2017
388f430
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 13, 2017
7cb8a57
Remove assert
Sep 14, 2017
fc70a1a
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 14, 2017
f4e59ce
Adjust DefaultOMPThreadsPerWorker
Sep 14, 2017
9857eb8
remove -1 from omp_cores
Sep 14, 2017
f512139
Trigger build
Sep 14, 2017
f8c3a53
It is not clear why pylint claims that this is re-imported. It is not…
Sep 14, 2017
bafd91b
lint
Sep 14, 2017
5796f7d
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 18, 2017
2d2d92f
lint
Sep 19, 2017
33d3a21
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 20, 2017
d0df4b3
Change getter/setter naming style
Sep 20, 2017
f66ebd9
allow env override
Sep 20, 2017
31ee41a
check environment directly, since OMP_NUM_THREADS may have odd forma…
Sep 20, 2017
a0ae0ac
CR comments
Sep 22, 2017
aea5dbd
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 22, 2017
ab6e7a3
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 25, 2017
48890a3
Squashed commit of the following:
Sep 25, 2017
78b23ef
Update mxnet_predict0.cc
cjolivier01 Sep 26, 2017
ac8c8ff
Update mxnet_predict0.cc
cjolivier01 Sep 26, 2017
8746845
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 26, 2017
ed21c19
Merge branch 'optimize_basic_omp' of github.com:/cjolivier01/mxnet in…
Sep 26, 2017
ee3caae
fix oversight with bracket
Sep 26, 2017
e195b12
Binary scatter working on CPU and GPU
Sep 26, 2017
916d7dc
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 26, 2017
4ace075
return unchanged
Sep 26, 2017
2eb41a0
This test case is BS. I can't even tell what's wrong on the CI build …
Sep 27, 2017
ec28b23
inconsequential cleanup
Sep 27, 2017
3a58a20
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 27, 2017
b611025
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Sep 27, 2017
a26982f
Update test_kvstore.py
cjolivier01 Sep 28, 2017
363809a
Update CMakeLists.txt
cjolivier01 Sep 28, 2017
99ec6d3
Update CMakeLists.txt
cjolivier01 Oct 4, 2017
0cf6cf8
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Oct 9, 2017
03e4437
Merge branch 'optimize_basic_omp' of github.com:/cjolivier01/mxnet in…
Oct 9, 2017
8e3cf54
force fail
Oct 9, 2017
ec4dbe6
remove forced error
Oct 9, 2017
702930b
test clean every make
Oct 9, 2017
ce84554
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Oct 9, 2017
e6cad58
Test
Oct 9, 2017
99083f1
Copy Jenkinsfile from upstream/master to fix the build.
indhub Oct 10, 2017
6746828
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Oct 10, 2017
3a7176e
Merge branch 'optimize_basic_omp' of github.com:/cjolivier01/mxnet in…
Oct 10, 2017
f0682a2
logic was reversed
Oct 10, 2017
95d7d7b
Update threaded_engine.h
cjolivier01 Oct 10, 2017
5f4554c
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Oct 11, 2017
ead033c
Trigger rebuild
Oct 11, 2017
72c2345
Merge remote-tracking branch 'apache/master' into optimize_basic_omp
Oct 11, 2017
84569a5
Trigger build
Oct 12, 2017
9f878ad
Merge branch 'master' into optimize_basic_omp
cjolivier01 Oct 12, 2017
a8d39b7
Trigger build
Oct 12, 2017
4 changes: 3 additions & 1 deletion amalgamation/mxnet_predict0.cc
@@ -78,7 +78,8 @@
#include "src/operator/tensor/elemwise_binary_broadcast_op_basic.cc"
#include "src/operator/tensor/elemwise_binary_op_basic.cc"
#include "src/operator/tensor/elemwise_binary_scalar_op_basic.cc"
#include "src/operator/tensor/elemwise_unary_op.cc"
#include "src/operator/tensor/elemwise_unary_op_basic.cc"
#include "src/operator/tensor/elemwise_unary_op_trig.cc"
#include "src/operator/tensor/matrix_op.cc"

#include "src/storage/storage.cc"
@@ -90,3 +91,4 @@
#include "src/c_api/c_api_symbolic.cc"
#include "src/c_api/c_api_ndarray.cc"
#include "src/c_api/c_api_error.cc"

10 changes: 10 additions & 0 deletions include/mxnet/engine.h
@@ -267,6 +267,16 @@ class MXNET_API Engine {
}
read_vars->resize(rtop - read_vars->begin());
}

/*! \brief Return the number of OMP threads that should be used per worker
* \return Number of OMP threads that should be used per worker
*/
virtual int num_omp_threads_per_worker() const = 0;

/*! \brief Set the number of OMP threads that should be used per worker
 * \param num_omp_threads_per_worker Number of OMP threads to be used per worker
*/
virtual void set_num_omp_threads_per_worker(int num_omp_threads_per_worker) = 0;
}; // class Engine
#endif // DMLC_USE_CXX11
} // namespace mxnet
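The two virtual methods above are the whole interface change to Engine: a per-worker OMP thread-count hint that CPU kernels read and that other components may adjust. A minimal caller-side sketch follows, assuming only the existing mxnet::Engine::Get() singleton; the helper name WithFewerOmpThreads is hypothetical and not part of this PR.

#include <mxnet/engine.h>

// Hypothetical helper (illustration only): temporarily narrow the per-worker OMP
// hint around a block of CPU work, then restore the previous value.
void WithFewerOmpThreads(int temporary_count) {
  mxnet::Engine *engine = mxnet::Engine::Get();
  const int saved = engine->num_omp_threads_per_worker();
  engine->set_num_omp_threads_per_worker(temporary_count);
  // ... run CPU operators here; Kernel<OP, cpu>::Launch (see mxnet_op.h below)
  // sizes its omp parallel region from this hint ...
  engine->set_num_omp_threads_per_worker(saved);
}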
18 changes: 18 additions & 0 deletions src/engine/naive_engine.cc
@@ -26,6 +26,7 @@
#include <thread>
#include "./engine_impl.h"
#include "./profiler.h"
#include "threaded_engine.h"

namespace mxnet {
namespace engine {
@@ -46,6 +47,7 @@ class NaiveEngine final : public Engine {
};

NaiveEngine() {
set_num_omp_threads_per_worker(ThreadedEngine::DefaultOMPThreadsPerWorker());
}
// virtual destructor
virtual ~NaiveEngine() {
@@ -187,6 +189,20 @@ class NaiveEngine final : public Engine {
shutdown_phase_.store(true);
}

/*! \brief Return the number of OMP threads that should be used per worker
* \return Number of OMP threads that should be used per worker
*/
int num_omp_threads_per_worker() const override {
return num_omp_threads_per_worker_;
}

/*! \brief Set the number of OMP threads that should be used per worker
* \param num_threads_per_worker Number of OMP threads to be used per worker
*/
void set_num_omp_threads_per_worker(int num_threads_per_worker) override {
num_omp_threads_per_worker_ = num_threads_per_worker;
}

private:
// callback to oncomplete
static void OnComplete(Engine *engine, void *param) {
@@ -202,6 +218,8 @@
mshadow::Stream<cpu> cpu_stream_;
// GPU streams
std::vector<mshadow::Stream<gpu>*> streams_;
/*! \brief Number of OMP threads to be used per worker */
int num_omp_threads_per_worker_{0};
}; // class NaiveEngine


44 changes: 44 additions & 0 deletions src/engine/threaded_engine.h
@@ -28,6 +28,7 @@

#include <dmlc/base.h>
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <vector>
#include <functional>
#include <condition_variable>
@@ -284,6 +285,10 @@ class ThreadedEngine : public Engine {
objpool_blk_ref_ = common::ObjectPool<OprBlock>::_GetSharedRef();
objpool_varblk_ref_ = common::ObjectPool<VersionedVarBlock>::_GetSharedRef();
objpool_var_ref_ = common::ObjectPool<ThreadedVar>::_GetSharedRef();

/*! \brief Initialize the number of OMP threads per kernel worker to the default */
set_num_omp_threads_per_worker(DefaultOMPThreadsPerWorker());
CHECK_GT(num_omp_threads_per_worker(), 0);
}
~ThreadedEngine() {
{
@@ -293,6 +298,25 @@
finished_cv_.notify_all();
}

/*! \brief Return the default OMP thread count per worker: omp_get_max_threads() if
 * OMP_NUM_THREADS is set, otherwise the number of processors, not counting hyperthreading
 * \warning Do not call this in any performance-sensitive use-case, since checking the
 * environment is slow
 */
static int DefaultOMPThreadsPerWorker() {
#ifdef _OPENMP
// If OMP_NUM_THREADS is set, use omp_get_max_threads(), which is the value the OMP
// implementation interpreted from the OMP_NUM_THREADS environment variable.
// Otherwise, return the number of processors, not counting hyperthreading.
// Test for a set OMP_NUM_THREADS by checking against a nonsensical default value.
const int max_threads = dmlc::GetEnv("OMP_NUM_THREADS", INT_MIN) == INT_MIN ?
omp_get_num_procs() : omp_get_max_threads();
return max_threads;
#else
return 0;
#endif
}

protected:
/*!
* \brief Push the opr block to execution queue to be executed.
@@ -360,6 +384,21 @@
}
}

/*! \brief Return the number of OMP threads that should be used per worker
* \return Number of OMP threads that should be used per worker
*/
int num_omp_threads_per_worker() const override {
return num_omp_threads_per_worker_;
}

/*! \brief Set the number of OMP threads that should be used per worker
 * \param num_omp_threads_per_worker Number of OMP threads to be used per worker
* TODO(cjolivier01) Dynamically adjust based upon number of concurrent CPU jobs
*/
void set_num_omp_threads_per_worker(int num_omp_threads_per_worker) override {
num_omp_threads_per_worker_ = num_omp_threads_per_worker;
}

private:
/*!
 * \brief check if there is duplication in const_vars and mutable_vars.
@@ -405,6 +444,10 @@
std::shared_ptr<common::ObjectPool<OprBlock> > objpool_blk_ref_;
std::shared_ptr<common::ObjectPool<VersionedVarBlock> > objpool_varblk_ref_;
std::shared_ptr<common::ObjectPool<ThreadedVar> > objpool_var_ref_;

/*! \brief Number of OMP threads to be used per worker */
int num_omp_threads_per_worker_{0};

/*!
* \brief Disallow copy construction and assignment.
*/
@@ -413,4 +456,5 @@

} // namespace engine
} // namespace mxnet

#endif // MXNET_ENGINE_THREADED_ENGINE_H_
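DefaultOMPThreadsPerWorker() above hinges on distinguishing "OMP_NUM_THREADS was never set" from any value a user might set, which is why dmlc::GetEnv is called with an INT_MIN sentinel default. The following standalone sketch shows the same selection logic using std::getenv instead of dmlc; it is illustrative only, not MXNet code, and must be built with OpenMP enabled (e.g. -fopenmp).

#include <cstdlib>
#include <iostream>
#include <omp.h>

// Honor the OMP runtime's configured thread count when the user set OMP_NUM_THREADS,
// otherwise fall back to the processor count.
static int DefaultThreadsSketch() {
  const bool env_set = std::getenv("OMP_NUM_THREADS") != nullptr;  // nullptr means unset
  return env_set ? omp_get_max_threads() : omp_get_num_procs();
}

int main() {
  std::cout << "OMP threads per worker: " << DefaultThreadsSketch() << std::endl;
  return 0;
}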
8 changes: 6 additions & 2 deletions src/engine/threaded_engine_perdevice.cc
@@ -113,7 +113,9 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
if (is_copy) {
auto ptr =
gpu_copy_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
// Signify to kernel that GPU is being used, no Kernel Launch OMP (temporary behavior)
Engine::Get()->set_num_omp_threads_per_worker(0);
auto blk = new ThreadWorkerBlock<kCopyQueue>();
blk->pool.reset(new ThreadPool(
nthread,
[this, ctx, is_copy, blk]
@@ -131,6 +133,8 @@
}
} else {
auto ptr = gpu_normal_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
// Signify to kernel that GPU is being used, no Kernel Launch OMP (temporary behavior)
Engine::Get()->set_num_omp_threads_per_worker(0);
auto blk = new ThreadWorkerBlock<kWorkerQueue>();
blk->pool.reset(new ThreadPool(
nthread,
@@ -230,7 +234,7 @@
}
}

/*! \brief Signal a single queue for shutdown */
template<typename Object>
static inline void SignalQueueForKill(common::LazyAllocArray<Object> *array) {
array->ForEach([](size_t i, Object *block) {
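Both GPU worker pools above are created lazily, so it is the first GPU operation on a device that zeroes the per-worker OMP hint; CPU kernels launched after that point run their loops serially. A simplified sketch of that pattern follows; EnsureGpuWorker and the std::call_once guard are illustrative stand-ins (the real code uses common::LazyAllocArray and a per-device lambda), not part of this PR.

#include <mutex>
#include <mxnet/engine.h>

static std::once_flag gpu_worker_created;  // stand-in for the lazily allocated worker slot

void EnsureGpuWorker() {
  std::call_once(gpu_worker_created, []() {
    // Same effect as the two lines added in each lambda above: record that a GPU is in
    // use so later CPU Kernel<OP, cpu>::Launch calls run serially (temporary behavior).
    mxnet::Engine::Get()->set_num_omp_threads_per_worker(0);
    // ... allocate the ThreadWorkerBlock / ThreadPool here ...
  });
}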
24 changes: 19 additions & 5 deletions src/operator/mxnet_op.h
@@ -27,8 +27,10 @@

#include <dmlc/omp.h>
#include <mxnet/base.h>
#include <mxnet/engine.h>
#include <mxnet/op_attr_types.h>
#include <algorithm>

#ifdef __CUDACC__
#include "../common/cuda_utils.h"
#endif // __CUDACC__
@@ -241,13 +243,25 @@ struct Kernel;
template<typename OP>
struct Kernel<OP, cpu> {
template<typename ...Args>
inline static void Launch(mshadow::Stream<cpu> *s, const int N, Args... args) {
#ifdef _OPENMP
const int omp_cores = Engine::Get()->num_omp_threads_per_worker();
if (omp_cores <= 1) {
// Zero means not to use OMP, but don't interfere with external OMP behavior
for (int i = 0; i < N; ++i) {
OP::Map(i, args...);
}
} else {
#pragma omp parallel for num_threads(omp_cores)
for (int i = 0; i < N; ++i) {
OP::Map(i, args...);
}
}
#else
for (int i = 0; i < N; ++i) {
OP::Map(i, args...);
}
#endif
}
};

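To show how the modified Launch path above is exercised, here is a hedged usage sketch: a toy element-wise op whose static Map is invoked once per index by Kernel<OP, cpu>::Launch. The names SaxpyOp and RunSaxpy (and the include paths) are illustrative; the Kernel and MSHADOW_XINLINE machinery is the existing mxnet_op.h API.

#include <mxnet/base.h>
#include "mxnet_op.h"  // the header modified above; include path shown for illustration

// Toy element-wise op: out[i] = alpha * x[i] + y[i].
struct SaxpyOp {
  MSHADOW_XINLINE static void Map(int i, float *out, const float *x,
                                  const float *y, float alpha) {
    out[i] = alpha * x[i] + y[i];
  }
};

void RunSaxpy(mshadow::Stream<mshadow::cpu> *s, float *out, const float *x,
              const float *y, float alpha, int n) {
  // With this change, the loop inside Launch runs under
  // "#pragma omp parallel for num_threads(omp_cores)" only while
  // Engine::Get()->num_omp_threads_per_worker() is greater than 1; once a GPU worker
  // has zeroed that hint, the loop runs serially.
  mxnet::op::mxnet_op::Kernel<SaxpyOp, mshadow::cpu>::Launch(s, n, out, x, y, alpha);
}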