diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite
index 862f8a664..7672fdc62 160000
--- a/3rdparty/ps-lite
+++ b/3rdparty/ps-lite
@@ -1 +1 @@
-Subproject commit 862f8a6644ddee9a2220244097e18a9bccf9e1da
+Subproject commit 7672fdc62c50ff7bb13586bee6872a8efd98d966
diff --git a/README.md b/README.md
index 395e2364d..f49d901f2 100644
--- a/README.md
+++ b/README.md
@@ -5,32 +5,27 @@
 BytePS is a high performance and general distributed training framework. It supports TensorFlow, Keras, PyTorch, and MXNet, and can run on either TCP or RDMA network.
 
-BytePS outperforms existing open-sourced distributed training frameworks by a large margin. For example, on a popular public cloud and with the same number of GPUs, BytePS can *double the training speed* (see below), compared with [Horovod](https://github.com/horovod/horovod)+[NCCL](https://github.com/NVIDIA/nccl).
+BytePS outperforms existing open-sourced distributed training frameworks by a large margin. For example, on BERT-large training, BytePS can achieve ~90% scaling efficiency with 256 GPUs (see below), which is much higher than [Horovod](https://github.com/horovod/horovod)+[NCCL](https://github.com/NVIDIA/nccl).
 
 ## News
-- Use [the ssh launcher](launcher/) to launch your distributed jobs
-- Asynchronous training support for
-[PyTorch](https://github.com/bytedance/byteps/pull/121),
-[TensorFlow](https://github.com/bytedance/byteps/pull/122),
-[MXNet](https://github.com/bytedance/byteps/pull/114)
-- Find your training stragglers using [server timeline](docs/timeline.md)
-- [Improved key distribution strategy for better load-balancing](https://github.com/bytedance/byteps/pull/116)
+- [New Server](https://github.com/bytedance/byteps/pull/151): We improve the server performance by a large margin, and it is now independent of MXNet KVStore. Try our [new docker images](docker/).
+- Use [the ssh launcher](launcher/) to launch your distributed jobs
+- [Improved key distribution strategy for better load-balancing](https://github.com/bytedance/byteps/pull/116)
 - [Improved RDMA robustness](https://github.com/bytedance/byteps/pull/91)
 
 ## Performance
-For demonstration, we test two models: VGG16 (communication-intensive) and Resnet50 (computation-intensive). Both models are trained using fp32.
+We show our experiment on BERT-large training, based on the GluonNLP toolkit. The model uses mixed precision.
 
-We use Tesla V100 16GB GPUs and set batch size equal to 64 *per GPU*. The machines are in fact VMs on a popular public cloud. Each machine has 8 V100 GPUs with NVLink-enabled. Machines are inter-connected with 20 Gbps TCP/IP network.
+We use Tesla V100 32GB GPUs and set the batch size to 64 per GPU. Each machine has 8 V100 GPUs (32GB memory) with NVLink enabled. Machines are inter-connected with a 100 Gbps RoCEv2 network.
 
-BytePS outperforms Horovod (NCCL) by 44% for Resnet50, and 100% for VGG16.
+BytePS achieves ~90% scaling efficiency for BERT-large. The code is available [here](https://github.com/ymjiang/gluon-nlp/tree/bert-byteps/scripts/bert).
 
-
+![BERT-Large](https://user-images.githubusercontent.com/13852819/69874496-1ca43600-12f6-11ea-997b-b023e4c93360.png)
 
-You can reproduce the results using the Dockerfiles and example scripts we provide.
-Evaluation on RDMA networks can be found at [performance.md](docs/performance.md).
+More evaluation in different scenarios can be found at [performance.md](docs/performance.md).
 
 ## Goodbye MPI, Hello Cloud
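Reviewer note on the ~90% claim above: scaling efficiency is the usual throughput ratio, measured N-GPU throughput over N times the single-GPU throughput. A minimal sketch of the arithmetic; the throughput figures below are made-up placeholders, not measurements from this PR:

```cpp
#include <cstdio>

// Scaling efficiency = measured N-GPU throughput / (N x single-GPU throughput).
// The sample numbers are hypothetical, for illustration only.
int main() {
  const double single_gpu = 50.0;    // samples/sec on 1 GPU (placeholder)
  const int n_gpus = 256;
  const double measured = 11520.0;   // samples/sec on 256 GPUs (placeholder)
  const double ideal = single_gpu * n_gpus;
  std::printf("scaling efficiency = %.1f%%\n", 100.0 * measured / ideal);
  return 0;
}
```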
diff --git a/byteps/common/common.h b/byteps/common/common.h
index 6a39a4663..739e2ac28 100644
--- a/byteps/common/common.h
+++ b/byteps/common/common.h
@@ -17,8 +17,11 @@
 #ifndef BYTEPS_COMMON_H
 #define BYTEPS_COMMON_H
 
+#ifndef BYTEPS_BUILDING_SERVER
 #include <cuda_runtime.h>
 #include <nccl.h>
+#endif
+
 #include
 #include
 #include
@@ -217,7 +220,9 @@ enum class RequestType {
 int GetCommandType(RequestType requestType, int d);
 
+#ifndef BYTEPS_BUILDING_SERVER
 ncclDataType_t getNcclDataType(DataType dtype);
+#endif
 
 int getDataTypeLength(int dtype);
diff --git a/byteps/common/cpu_reducer.cc b/byteps/common/cpu_reducer.cc
index a7699fb79..d145b5341 100644
--- a/byteps/common/cpu_reducer.cc
+++ b/byteps/common/cpu_reducer.cc
@@ -13,19 +13,32 @@
 // limitations under the License.
 // =============================================================================
 
+#ifndef BYTEPS_BUILDING_SERVER
 #include "global.h"
+#endif
+
+#include "cpu_reducer.h"
 
 namespace byteps {
 namespace common {
 
 CpuReducer::CpuReducer(std::shared_ptr<BytePSComm> comm) {
+
+#ifndef BYTEPS_BUILDING_SERVER
   std::vector<int> peers;
   auto pcie_size = BytePSGlobal::GetPcieSwitchSize();
   for (int i = BytePSGlobal::GetLocalRank() % pcie_size;
        i < BytePSGlobal::GetLocalSize();
        i += pcie_size) {
     peers.push_back(i);
   }
-  _comm = std::make_shared<BytePSCommSocket>(comm, std::string("cpu"), peers);
+  if (comm) {
+    _comm = std::make_shared<BytePSCommSocket>(comm, std::string("cpu"), peers);
+  }
+  else {
+    _comm = nullptr;
+  }
+#endif
+
   if (getenv("BYTEPS_OMP_THREAD_PER_GPU")) {
     _num_threads = atoi(getenv("BYTEPS_OMP_THREAD_PER_GPU"));
   } else {
@@ -34,9 +47,14 @@ CpuReducer::CpuReducer(std::shared_ptr<BytePSComm> comm) {
   return;
 }
 
+#ifndef BYTEPS_BUILDING_SERVER
 bool CpuReducer::isRoot() {
+  if (!_comm) {
+    return false;
+  }
   return (_comm->getRoot() == BytePSGlobal::GetLocalRank());
 }
+#endif
 
 int CpuReducer::sum(void* dst, void* src, size_t len, DataType dtype) {
   switch (dtype) {
@@ -64,7 +82,7 @@ int CpuReducer::sum(void* dst, void* src, size_t len, DataType dtype) {
       BPS_CHECK(0) << "Unsupported data type: " << dtype;
   }
   return 0;
-}
+}
 
 template <typename T>
 int CpuReducer::_sum(T* dst, T* src, size_t len) {
@@ -190,5 +208,19 @@ int CpuReducer::_sum_float16(void* dst, void* src1, void* src2, size_t len) {
   return 0;
 }
 
+int CpuReducer::copy(void* dst, void* src, size_t len) {
+  auto in = (float*)src;
+  auto out = (float*)dst;
+#pragma omp parallel for simd num_threads(_num_threads)
+  for (size_t i = 0; i < len / 4; ++i) {
+    out[i] = in[i];
+  }
+  if (len % 4) {
+    std::memcpy(out + len / 4, in + len / 4, len % 4);
+  }
+  return 0;
+}
+
+
 } // namespace common
 } // namespace byteps
diff --git a/byteps/common/cpu_reducer.h b/byteps/common/cpu_reducer.h
index 92e8bdd7a..a6e682c55 100644
--- a/byteps/common/cpu_reducer.h
+++ b/byteps/common/cpu_reducer.h
@@ -22,10 +22,16 @@
 #endif
 
 #include
+#include
 #include "common.h"
-#include "communicator.h"
 #include "logging.h"
 
+#ifndef BYTEPS_BUILDING_SERVER
+#include "communicator.h"
+#else
+typedef void BytePSComm;
+#endif
+
 #include
 
 namespace byteps {
@@ -41,8 +47,17 @@ class CpuReducer {
   int sum(void* dst, void* src, size_t len, DataType dtype);
   int sum(void* dst, void* src1, void* src2, size_t len, DataType dtype);
+  int copy(void* dst, void* src, size_t len);
+
+#ifndef BYTEPS_BUILDING_SERVER
   bool isRoot();
   std::shared_ptr<BytePSComm> getComm() { return _comm; }
+#endif
+
+
+  DataType GetDataType(int dtype) {
+    return static_cast<DataType>(dtype);
+  }
 
  private:
 #if __AVX__ && __F16C__
diff --git a/byteps/common/global.cc b/byteps/common/global.cc
index 3e3d080f9..03e4a0fe3 100644
--- a/byteps/common/global.cc
+++ b/byteps/common/global.cc
@@ -338,7 +338,7 @@ uint64_t BytePSGlobal::Hash_DJB2(uint64_t key) {
   auto str = std::to_string(key).c_str();
   uint64_t hash = 5381;
   int c;
-  while (c = *str) { // hash(i) = hash(i-1) * 33 ^ str[i]
+  while ((c = *str)) { // hash(i) = hash(i-1) * 33 ^ str[i]
     hash = ((hash << 5) + hash) + c;
     str++;
   }
@@ -349,7 +349,7 @@ uint64_t BytePSGlobal::Hash_SDBM(uint64_t key) {
   auto str = std::to_string(key).c_str();
   uint64_t hash = 0;
   int c;
-  while (c = *str) { // hash(i) = hash(i-1) * 65599 + str[i]
+  while ((c = *str)) { // hash(i) = hash(i-1) * 65599 + str[i]
     hash = c + (hash << 6) + (hash << 16) - hash;
     str++;
   }
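Reviewer note: the new `CpuReducer::copy` in cpu_reducer.cc above moves the bulk of the buffer as 4-byte words under OpenMP and falls back to `memcpy` for the trailing bytes. A standalone sketch of the same pattern, with `_num_threads` replaced by a local parameter (everything else here is a stand-in, not BytePS code):

```cpp
#include <cstddef>
#include <cstring>

// Copy `len` bytes: vectorize over 4-byte words, then memcpy the remainder,
// mirroring the word/tail split used by CpuReducer::copy.
void chunked_copy(void* dst, void* src, size_t len, int num_threads) {
  auto in = static_cast<float*>(src);
  auto out = static_cast<float*>(dst);
#pragma omp parallel for simd num_threads(num_threads)
  for (size_t i = 0; i < len / 4; ++i) {
    out[i] = in[i];
  }
  if (len % 4) {  // trailing 1-3 bytes that don't form a full word
    std::memcpy(out + len / 4, in + len / 4, len % 4);
  }
}
```

Copying through `float` lvalues is purely a bitwise trick to give the compiler a vectorizable loop; it assumes both buffers are at least 4-byte aligned.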
diff --git a/byteps/common/shared_memory.cc b/byteps/common/shared_memory.cc
index ec86d166f..e4ff185f2 100644
--- a/byteps/common/shared_memory.cc
+++ b/byteps/common/shared_memory.cc
@@ -54,22 +54,26 @@ std::vector<void*> BytePSSharedMemory::openPcieSharedMemory(uint64_t key,
   for (int i = 0; i < BytePSGlobal::GetPcieSwitchNum(); i++) {
     auto prefix = std::string("BytePS_Pcie") + std::to_string(i) + "_Shm_";
     if (BytePSGlobal::IsDistributed()) {
-      if (i <= numa_max_node()) {
-        numa_set_preferred(i);
+      if (BytePSGlobal::IsCrossPcieSwitch()) {
+        if (i <= numa_max_node()) {
+          numa_set_preferred(i);
+          r.push_back(openSharedMemory(prefix, key, size));
+          numa_set_preferred(-1);
+        } else {
+          numa_set_preferred(numa_max_node());
+          r.push_back(openSharedMemory(prefix, key, size));
+          numa_set_preferred(-1);
+        }
       } else {
-        numa_set_preferred(numa_max_node());
+        r.push_back(openSharedMemory(prefix, key, size));
       }
-      r.push_back(openSharedMemory(prefix, key, size));
-      numa_set_preferred(-1);
     } else {
       if (BytePSGlobal::IsCrossPcieSwitch()) {
         numa_set_interleave_mask(numa_all_nodes_ptr);
         r.push_back(openSharedMemory(prefix, key, size));
         numa_set_interleave_mask(numa_no_nodes_ptr);
       } else {
-        numa_set_preferred(0);
         r.push_back(openSharedMemory(prefix, key, size));
-        numa_set_preferred(-1);
       }
     }
   }
diff --git a/byteps/server/__init__.py b/byteps/server/__init__.py
new file mode 100644
index 000000000..bb9500d64
--- /dev/null
+++ b/byteps/server/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import ctypes
+import os
+from byteps.common import get_ext_suffix
+
+dll_path = os.path.join(os.path.dirname(__file__),
+                        'c_lib' + get_ext_suffix())
+SERVER_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL)
+SERVER_LIB_CTYPES.byteps_server()
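Reviewer note: `byteps/server/__init__.py` starts the server by loading the compiled extension with ctypes (with global symbol visibility, so ps-lite symbols resolve) and calling its exported `byteps_server()` entry point. The same bootstrap can be written against the library from C++; a hedged sketch, where the `./c_lib.so` path is a placeholder for whatever `'c_lib' + get_ext_suffix()` resolves to:

```cpp
#include <dlfcn.h>
#include <cstdio>
#include <cstdlib>

// Mirror of byteps/server/__init__.py: load the server library with
// RTLD_GLOBAL and invoke the extern "C" byteps_server() symbol.
int main() {
  // Placeholder path; Python computes 'c_lib' + get_ext_suffix() instead.
  void* handle = dlopen("./c_lib.so", RTLD_NOW | RTLD_GLOBAL);
  if (!handle) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return EXIT_FAILURE;
  }
  auto server_fn = reinterpret_cast<void (*)()>(dlsym(handle, "byteps_server"));
  if (!server_fn) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
    return EXIT_FAILURE;
  }
  server_fn();  // blocks until the server is shut down
  return EXIT_SUCCESS;
}
```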
diff --git a/byteps/server/queue.h b/byteps/server/queue.h
new file mode 100644
index 000000000..fe3f7f57c
--- /dev/null
+++ b/byteps/server/queue.h
@@ -0,0 +1,110 @@
+// Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef BYTEPS_SERVER_QUEUE_H
+#define BYTEPS_SERVER_QUEUE_H
+
+#include <algorithm>
+#include <condition_variable>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+namespace byteps {
+namespace server {
+
+/**
+ * \brief thread-safe queue allowing push and waited pop
+ */
+class PriorityQueue {
+ public:
+  PriorityQueue(bool is_schedule) {
+    enable_schedule_ = is_schedule;
+    if (enable_schedule_) {
+      std::make_heap(queue_.begin(), queue_.end(),
+                     [this](const BytePSEngineMessage& a, const BytePSEngineMessage& b) {
+                       return ComparePriority(a, b);
+                     });
+    }
+  }
+  ~PriorityQueue() { }
+
+  /**
+   * \brief push a value and sort using heap. threadsafe.
+   * \param new_value the value
+   */
+  void Push(BytePSEngineMessage new_value) {
+    mu_.lock();
+    auto key = new_value.key;  // read the key before the message is moved below
+    queue_.push_back(std::move(new_value));
+    if (enable_schedule_) {
+      ++push_cnt_[key];
+      std::push_heap(queue_.begin(), queue_.end(),
+                     [this](const BytePSEngineMessage& a, const BytePSEngineMessage& b) {
+                       return ComparePriority(a, b);
+                     });
+    }
+    mu_.unlock();
+    cond_.notify_all();
+  }
+
+  /**
+   * \brief wait until pop an element from the beginning, threadsafe
+   * \param value the popped value
+   */
+  void WaitAndPop(BytePSEngineMessage* value) {
+    std::unique_lock<std::mutex> lk(mu_);
+    cond_.wait(lk, [this] { return !queue_.empty(); });
+    if (enable_schedule_) {
+      std::pop_heap(queue_.begin(), queue_.end(),
+                    [this](const BytePSEngineMessage& a, const BytePSEngineMessage& b) {
+                      return ComparePriority(a, b);
+                    });
+      *value = queue_.back();
+      queue_.pop_back();
+    } else {
+      *value = std::move(queue_.front());
+      queue_.erase(queue_.begin());
+    }
+  }
+
+  void ClearCounter(uint64_t key) {
+    if (!enable_schedule_) return;
+    std::unique_lock<std::mutex> lk(mu_);
+    push_cnt_[key] = 0;
+  }
+
+  bool ComparePriority(const BytePSEngineMessage& a, const BytePSEngineMessage& b) {
+    if (push_cnt_[a.key] == push_cnt_[b.key]) {
+      return (a.id > b.id);
+    } else {
+      return (push_cnt_[a.key] > push_cnt_[b.key]);
+    }
+  }
+
+ private:
+  mutable std::mutex mu_;
+  std::vector<BytePSEngineMessage> queue_;
+  std::condition_variable cond_;
+  std::unordered_map<uint64_t, uint64_t> push_cnt_;
+  volatile bool enable_schedule_ = false;
+};
+
+}  // namespace server
+}  // namespace byteps
+
+#endif  // BYTEPS_SERVER_QUEUE_H
\ No newline at end of file
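Reviewer note: a standalone illustration of the ordering that `ComparePriority` produces (this is not the class itself; `Msg` is a trimmed stand-in for `BytePSEngineMessage` from server.h below). With scheduling on, the message whose key has accumulated the fewest pushes surfaces first, and ties fall back to FIFO order by `id`:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

// Trimmed stand-in for BytePSEngineMessage: only the fields the
// comparator reads.
struct Msg { uint64_t id; uint64_t key; };

int main() {
  std::unordered_map<uint64_t, uint64_t> push_cnt;
  std::vector<Msg> heap;
  auto cmp = [&](const Msg& a, const Msg& b) {
    if (push_cnt[a.key] == push_cnt[b.key]) return a.id > b.id;
    return push_cnt[a.key] > push_cnt[b.key];
  };
  // Two pushes for key 7, one for key 42 -- same bookkeeping as Push().
  for (Msg m : {Msg{0, 7}, Msg{1, 7}, Msg{2, 42}}) {
    ++push_cnt[m.key];
    heap.push_back(m);
    std::push_heap(heap.begin(), heap.end(), cmp);
  }
  while (!heap.empty()) {
    std::pop_heap(heap.begin(), heap.end(), cmp);
    Msg m = heap.back();
    heap.pop_back();
    // Prints: key=42 (fewest pushes) first, then key=7 in FIFO order (id 0, 1).
    std::printf("pop id=%llu key=%llu\n",
                (unsigned long long)m.id, (unsigned long long)m.key);
  }
  return 0;
}
```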
diff --git a/byteps/server/server.cc b/byteps/server/server.cc
new file mode 100644
index 000000000..3f91fcdfe
--- /dev/null
+++ b/byteps/server/server.cc
@@ -0,0 +1,403 @@
+// Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "server.h"
+#include "queue.h"
+
+namespace byteps {
+namespace server {
+
+using namespace ps;
+
+// engine related
+std::vector<PriorityQueue*> engine_queues_;
+std::vector<std::thread*> engine_threads_;
+
+void SendPushResponse(uint64_t key, const ps::KVMeta& req, ps::KVServer<char>* server) {
+  auto iterator = push_response_map_.find(key);
+  if (iterator == push_response_map_.end()) { // new key
+    ps::KVPairs<char> response;
+    response.keys.push_back(key);
+    push_response_map_[key] = response; // add to the map
+    server->Response(req, response);
+  } else { // not new key, then reuse the memory address to avoid ibv_reg_mr on RDMA data path
+    ps::KVPairs<char>* response = &iterator->second;
+    response->keys[0] = key;
+    server->Response(req, *response);
+  }
+}
+
+void SendPullResponse(const DataHandleType type,
+                      const uint64_t key,
+                      const ps::KVMeta& req_meta,
+                      ps::KVServer<char>* server) {
+  std::lock_guard<std::mutex> lock(pullresp_mu_);
+  auto& stored = store_[key];
+  CHECK(stored.tensor) << "init " << key << " first";
+  // safe, as the server only responds once store_ is ready in this case
+  auto len = stored.len;
+  // send pull response
+  auto iterator = pull_response_map_.find(key);
+  if (iterator == pull_response_map_.end()) { // new key
+    ps::KVPairs<char> response;
+    response.keys = {EncodeKey(key)};
+    response.lens = {len};
+    response.vals = ps::SArray<char>(stored.tensor, len, false); // zero copy
+    pull_response_map_[key] = response; // add to the map
+    server->Response(req_meta, response);
+  } else { // not new key, then reuse the memory address to avoid ibv_reg_mr on RDMA data path
+    ps::KVPairs<char>* response = &iterator->second;
+    // keys and lens remain unchanged, just update vals
+    auto p = static_cast<char*>(stored.tensor);
+    CHECK(p);
+    response->vals = ps::SArray<char>(p, len, false);
+    server->Response(req_meta, *response);
+  }
+}
+
+void BytePSServerEngineThread(int i) {
+  auto& q = engine_queues_[i];
+  while (true) {
+    BytePSEngineMessage msg;
+    q->WaitAndPop(&msg);
+    if (msg.ops == TERMINATE) break;
+    // do some check
+    CHECK(msg.dst);
+    CHECK(msg.src);
+
+    bool is_debug = (debug_mode_ && (debug_key_ == msg.key));
+    switch (msg.ops) {
+      case COPY_MERGED: {
+        if (is_debug) {
+          std::lock_guard<std::mutex> lock(debug_mu_);
+          LOG(INFO) << "stage: ENGINE_COPY_MERGED_TO_STORE_BEFORE \t"
+                    << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t"
+                    << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t"
+                    << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t"
+                    << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t";
+        }
+        bps_reducer_->copy(msg.dst, msg.src, msg.len);
+        if (is_debug) {
+          std::lock_guard<std::mutex> lock(debug_mu_);
+          LOG(INFO) << "stage: ENGINE_COPY_MERGED_TO_STORE_AFTER \t"
+                    << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t"
+                    << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t"
+                    << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t"
+                    << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t";
+        }
+        std::lock_guard<std::mutex> lock(flag_mu_[i]);
+        if (is_push_finished_[i].find(msg.key) == is_push_finished_[i].end()) {
+          is_push_finished_[i][msg.key] = false;
+          pull_cnt_[i][msg.key] = 0;
+        }
+        is_push_finished_[i][msg.key] = true;
+        for (auto& req_meta : q_pull_reqmeta_[i][msg.key]) {
+          SendPullResponse(msg.type, msg.key, req_meta, byteps_server_);
+          pull_cnt_[i][msg.key] += 1;
+          if (pull_cnt_[i][msg.key] == (size_t) ps::NumWorkers()) {
+            is_push_finished_[i][msg.key] = false;
+            pull_cnt_[i][msg.key] = 0;
+          }
+        }
+        q_pull_reqmeta_[i][msg.key].clear();
+        break;
+      }
+      case SUM_RECV: {
+        auto bps_type = bps_reducer_->GetDataType(msg.type.dtype);
+        if (is_debug) {
+          std::lock_guard<std::mutex> lock(debug_mu_);
+          LOG(INFO) << "stage: ENGINE_SUM_RECV_BEFORE \t"
+                    << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t"
+                    << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t"
+                    << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t"
+                    << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t";
+        }
+        CHECK_GE(bps_reducer_->sum(msg.dst,
+                                   msg.src,
+                                   msg.len,
+                                   bps_type), 0);
+        if (is_debug) {
+          std::lock_guard<std::mutex> lock(debug_mu_);
+          LOG(INFO) << "stage: ENGINE_SUM_RECV_AFTER \t"
+                    << "dst: " << DEBUG_PRINT_TENSOR_VALUE(msg.dst) << "\t"
+                    << "src: " << DEBUG_PRINT_TENSOR_VALUE(msg.src) << "\t"
+                    << "dst_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.dst) << "\t"
+                    << "src_addr: " << DEBUG_PRINT_TENSOR_ADDRESS(msg.src) << "\t";
+        }
+        break;
+      }
+      default:
+        CHECK(0);
+    }
+  }
+}
+
+void BytePSHandler(const ps::KVMeta& req_meta,
+                   const ps::KVPairs<char>& req_data, ps::KVServer<char>* server) {
+  std::lock_guard<std::mutex> lock(handle_mu_); // push & pull may have racing
+  DataHandleType type = DepairDataHandleType(req_meta.cmd);
+  CHECK_EQ(type.requestType, RequestType::kDefaultPushPull);
+  // do some check
+  CHECK_EQ(req_data.keys.size(), (size_t)1);
+  if (log_key_info_) {
+    if (req_meta.push) {
+      CHECK_EQ(req_data.lens.size(), (size_t)1);
+      CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]);
+      LOG(INFO) << "push key="
+                << DecodeKey(req_data.keys[0])
+                << "\t sender=" << req_meta.sender
+                << "\t size=" << (size_t) req_data.lens[0];
+    } else {
+      LOG(INFO) << "pull key="
+                << (uint64_t) DecodeKey(req_data.keys[0])
+                << "\t sender=" << req_meta.sender;
+    }
+  }
+  uint64_t key = DecodeKey(req_data.keys[0]);
+  if (req_meta.push) { // push request
+    CHECK_EQ(req_data.lens.size(), (size_t)1);
+    CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]);
+    auto& stored = store_[key];
+    auto len = (size_t) req_data.lens[0];
+    auto recved = reinterpret_cast<char*>(req_data.vals.data());
+    if (!stored.tensor) {
+      if (sync_mode_ && (update_buf_.find(key) == update_buf_.end())) {
+        update_buf_[key].merged.len = len;
+        update_buf_[key].merged.dtype = type.dtype;
+      }
+      // buffer the request meta
+      auto& updates = update_buf_[key];
+      updates.request.push_back(req_meta);
+      // should send response after collecting all init push
+      if (updates.request.size() < (size_t) ps::NumWorkers()) return;
+      if (log_key_info_) {
+        LOG(INFO) << "Collected all " << updates.request.size()
+                  << " requests for key=" << key
+                  << ", init the store buffer size=" << (size_t) req_data.lens[0];
+      }
+      // initialization
+      stored.tensor = (char*) malloc(len);
+      stored.len = len;
+      stored.dtype = type.dtype;
+      CHECK(stored.tensor);
+      bps_reducer_->copy(stored.tensor, recved, len); // we may not need this copy
+      for (const auto& req : updates.request) {
+        SendPushResponse(key, req, server);
+      }
+      updates.request.clear();
+    } else {
+      auto& updates = update_buf_[key];
+      auto tid = GetThreadID(key, len);
+      if (updates.request.empty()) { // from the first incoming worker
+        if (sync_mode_) {
+          if (is_engine_blocking_) {
+            bps_reducer_->copy(updates.merged.tensor, recved, len);
+          } else { // non-blocking
+            if (debug_mode_ && (debug_key_ == key)) {
+              std::lock_guard<std::mutex> lock(debug_mu_);
+              LOG(INFO) << "stage: FIRST_WORKER_RECV \t"
+                        << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored.tensor) << "\t"
+                        << "recved: " << DEBUG_PRINT_TENSOR_VALUE(recved) << "\t"
+                        << "len: " << len << "\t"
+                        << "addr: " << DEBUG_PRINT_TENSOR_ADDRESS(recved);
+            }
+            // zero copy
+            updates.merged.tensor = recved;
+            updates.merged.tmp_sarray = req_data;
+          }
+        } else { // async mode, directly add to the buffer
+          if (is_engine_blocking_) {
+            CHECK_GE(bps_reducer_->sum((void *) stored.tensor,
+                                       (void *) recved,
+                                       len,
+                                       bps_reducer_->GetDataType(stored.dtype)), 0);
+          } else {
+            BytePSEngineMessage msg = {timestamp_++, type, key, stored.tensor, recved, len, SUM_RECV, req_data};
+            engine_queues_[tid]->Push(msg);
+          }
+        }
+      } else { // from other workers
+        CHECK(sync_mode_);
+        CHECK(updates.merged.tensor);
+        if (is_engine_blocking_) {
+          CHECK_GE(bps_reducer_->sum((void *) updates.merged.tensor,
+                                     (void *) recved,
+                                     len,
+                                     bps_reducer_->GetDataType(updates.merged.dtype)), 0);
+        } else { // non-blocking
+          if (debug_mode_ && (debug_key_ == key)) {
+            std::lock_guard<std::mutex> lock(debug_mu_);
+            LOG(INFO) << "stage: OTHER_WORKER_SUM \t"
+                      << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored.tensor) << "\t"
+                      << "merged: " << DEBUG_PRINT_TENSOR_VALUE(updates.merged.tensor) << "\t"
+                      << "recved: " << DEBUG_PRINT_TENSOR_VALUE(recved) << "\t"
+                      << "len: " << len << "\t"
+                      << "addr: " << DEBUG_PRINT_TENSOR_ADDRESS(recved);
+          }
+          BytePSEngineMessage msg = {timestamp_++, type, key, updates.merged.tensor, recved, len, SUM_RECV, req_data, req_meta};
+          engine_queues_[tid]->Push(msg);
+        }
+      }
+      // add a worker information (request.size() is the # workers received)
+      updates.request.push_back(req_meta);
+      SendPushResponse(key, req_meta, server);
+      if (sync_mode_ && updates.request.size() == (size_t) ps::NumWorkers()) {
+        auto& stored = store_[key];
+        auto& update = updates.merged;
+        if (is_engine_blocking_) {
+          bps_reducer_->copy(stored.tensor, updates.merged.tensor, len);
+        } else {
+          if (debug_mode_ && (debug_key_ == key)) {
+            std::lock_guard<std::mutex> lock(debug_mu_);
+            LOG(INFO) << "stage: COPY_MERGED_TO_STORE \t"
+                      << "stored: " << DEBUG_PRINT_TENSOR_VALUE(stored.tensor) << "\t"
+                      << "merged: " << DEBUG_PRINT_TENSOR_VALUE(updates.merged.tensor) << "\t"
+                      << "recved: " << DEBUG_PRINT_TENSOR_VALUE(recved);
+          }
+          BytePSEngineMessage msg = {timestamp_++, type, key, stored.tensor, update.tensor, len, COPY_MERGED};
+          engine_queues_[tid]->Push(msg);
+          engine_queues_[tid]->ClearCounter(key);
+        }
+        updates.request.clear();
+      } else if (!sync_mode_) {
+        // async: clean the request buffer
+        updates.request.clear();
+        engine_queues_[tid]->ClearCounter(key);
+      }
+    }
+  } else { // pull request
+    auto& stored = store_[key];
+    CHECK(stored.tensor) << "Processing pull request when the NDArray of key "
+                         << key << " has not been inited yet, which is not expected.";
+    if (is_engine_blocking_) {
+      SendPullResponse(type, key, req_meta, server);
+    } else {
+      auto tid = GetThreadID(key, 0);
+      std::lock_guard<std::mutex> lock(flag_mu_[tid]);
+      if (is_push_finished_[tid].find(key) == is_push_finished_[tid].end()) {
+        is_push_finished_[tid][key] = false;
+        pull_cnt_[tid][key] = 0;
+      }
+      if (is_push_finished_[tid][key]) { // push already finished
+        SendPullResponse(type, key, req_meta, server);
+        pull_cnt_[tid][key] += 1;
+        if (pull_cnt_[tid][key] == (size_t) ps::NumWorkers()) {
+          is_push_finished_[tid][key] = false;
+          pull_cnt_[tid][key] = 0;
+          // check: remain should be 0
+          auto remain = q_pull_reqmeta_[tid][key].size();
+          CHECK_EQ(remain, 0) << remain;
+        }
+      } else { // push not finished, put into the queue, and wait for the engine
+        q_pull_reqmeta_[tid][key].push_back(req_meta);
+      }
+    }
+  }
+}
+
+void init_global_env() {
+  // enable to print key profile
+  log_key_info_ = GetEnv("PS_KEY_LOG", false);
+
+  // sync or async training
+  sync_mode_ = !GetEnv("BYTEPS_ENABLE_ASYNC", false);
+  if (!sync_mode_) LOG(INFO) << "BytePS server is enabled with asynchronous training";
+
+  // debug mode
+  debug_mode_ = GetEnv("BYTEPS_SERVER_DEBUG", false);
+  debug_key_ = GetEnv("BYTEPS_SERVER_DEBUG_KEY", 0);
+  if (debug_mode_) LOG(INFO) << "Debug mode enabled! Printing key " << debug_key_;
+
+  // enable engine block mode (default disabled)
+  is_engine_blocking_ = GetEnv("BYTEPS_SERVER_ENGINE_BLOCKING", false);
+  if (is_engine_blocking_) LOG(INFO) << "Enable blocking mode of the server engine";
+
+  // number of engine threads
+  // invalid if is_engine_blocking_ = true
+  engine_thread_num_ = GetEnv("BYTEPS_SERVER_ENGINE_THREAD", 4);
+  LOG(INFO) << "BytePS server engine uses " << engine_thread_num_ << " threads"
+            << ", consider increasing BYTEPS_SERVER_ENGINE_THREAD for higher performance";
+  CHECK_GE(engine_thread_num_, 1);
+
+  // enable scheduling for server engine
+  enable_schedule_ = GetEnv("BYTEPS_SERVER_ENABLE_SCHEDULE", false);
+  if (enable_schedule_) LOG(INFO) << "Enable engine scheduling for BytePS server";
+}
+
+extern "C" void byteps_server() {
+  init_global_env();
+
+  // cpu reducer
+  bps_reducer_ = new byteps::common::CpuReducer(nullptr);
+
+  // flag mu and its protected map
+  std::vector<std::mutex> tmp_flagmu(engine_thread_num_);
+  std::vector<std::unordered_map<uint64_t, bool> > tmp_ispushfinished(engine_thread_num_);
+  std::vector<std::unordered_map<uint64_t, std::vector<ps::KVMeta> > > tmp_qpullreqmeta(engine_thread_num_);
+  std::vector<std::unordered_map<uint64_t, size_t> > tmp_pullcnt(engine_thread_num_);
+  flag_mu_.swap(tmp_flagmu);
+  is_push_finished_.swap(tmp_ispushfinished);
+  q_pull_reqmeta_.swap(tmp_qpullreqmeta);
+  pull_cnt_.swap(tmp_pullcnt);
+  CHECK_EQ(flag_mu_.size(), engine_thread_num_);
+  CHECK_EQ(is_push_finished_.size(), engine_thread_num_);
+  CHECK_EQ(q_pull_reqmeta_.size(), engine_thread_num_);
+  CHECK_EQ(pull_cnt_.size(), engine_thread_num_);
+
+  // init the engine
+  for (size_t i = 0; i < engine_thread_num_; ++i) {
+    acc_load_.push_back(0);
+  }
+  for (size_t i = 0; i < engine_thread_num_; ++i) {
+    auto q = new PriorityQueue(enable_schedule_);
+    engine_queues_.push_back(q);
+  }
+  for (size_t i = 0; i < engine_thread_num_; ++i) {
+    auto t = new std::thread(&BytePSServerEngineThread, i);
+    engine_threads_.push_back(t);
+  }
+
+  // init server instance
+  byteps_server_ = new KVServer<SERVER_DATA_TYPE>(0);
+  byteps_server_->set_request_handle(BytePSHandler);
+  StartAsync(0, "byteps_server\0");
+  if (!Postoffice::Get()->is_recovery()) {
+    Postoffice::Get()->Barrier(0,
+      ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler);
+  }
+
+  // clean the server resource
+  Finalize(0, true);
+  if (byteps_server_) {
+    delete byteps_server_;
+    byteps_server_ = nullptr;
+  }
+  if (bps_reducer_) {
+    delete bps_reducer_;
+    bps_reducer_ = nullptr;
+  }
+  BytePSEngineMessage msg;
+  msg.ops = TERMINATE;
+  for (auto q : engine_queues_) q->Push(msg);
+  for (auto t : engine_threads_) t->join();
+  for (auto& it : store_) free(it.second.tensor);
+  for (auto& it : update_buf_) free(it.second.merged.tensor);
+  LOG(INFO) << "byteps has been shutdown";
+
+  return;
+}
+
+} // namespace server
+} // namespace byteps
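Reviewer note: since `byteps_server()` has C linkage, it can be driven from a plain C++ process as well as from the Python bootstrap. A hedged launcher sketch; the `BYTEPS_*` knobs are the ones read by `init_global_env()` above, while the `DMLC_*` variables belong to ps-lite's rendezvous (normally set by the BytePS launcher) and are shown with placeholder values:

```cpp
#include <cstdlib>

extern "C" void byteps_server();  // exported by the BytePS server library

int main() {
  // ps-lite rendezvous settings -- placeholder values for illustration.
  setenv("DMLC_ROLE", "server", 1);
  setenv("DMLC_PS_ROOT_URI", "10.0.0.1", 1);  // scheduler IP (placeholder)
  setenv("DMLC_PS_ROOT_PORT", "9000", 1);     // scheduler port (placeholder)
  setenv("DMLC_NUM_WORKER", "2", 1);
  setenv("DMLC_NUM_SERVER", "1", 1);

  // knobs read by init_global_env()
  setenv("BYTEPS_SERVER_ENGINE_THREAD", "8", 1);    // more engine threads
  setenv("BYTEPS_SERVER_ENABLE_SCHEDULE", "1", 1);  // priority scheduling

  byteps_server();  // blocks until ps-lite finalizes
  return 0;
}
```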
diff --git a/byteps/server/server.h b/byteps/server/server.h
new file mode 100644
index 000000000..5b6e30fcf
--- /dev/null
+++ b/byteps/server/server.h
@@ -0,0 +1,169 @@
+// Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef BYTEPS_SERVER_H
+#define BYTEPS_SERVER_H
+
+#include
+#include
+#include
+#include "ps/ps.h"
+#include "../common/cpu_reducer.h"
+
+namespace byteps {
+namespace server {
+
+#define SERVER_KEY_TYPE uint64_t
+#define SERVER_DATA_TYPE char
+#define DEBUG_PRINT_TENSOR_VALUE(X) (*((float *)(X) + 0))
+#define DEBUG_PRINT_TENSOR_ADDRESS(X) (reinterpret_cast<uint64_t>(X))
+
+using namespace ps;
+
+enum class RequestType {
+  kDefaultPushPull, kRowSparsePushPull, kCompressedPushPull
+};
+
+enum BytePSEngineOperation {
+  SUM_RECV, COPY_MERGED, TERMINATE
+};
+
+struct PSKV {
+  SArray<ps::Key> keys;  // n keys
+  SArray<int> lens;      // the length of the i-th value
+};
+
+struct DataHandleType {
+  RequestType requestType;
+  int dtype;
+};
+
+struct BytePSArray {
+  char* tensor;
+  size_t len;
+  int dtype;
+  ps::KVPairs<char> tmp_sarray;
+};
+
+struct UpdateBuf {
+  std::vector<ps::KVMeta> request;
+  BytePSArray merged;
+};
+
+struct BytePSEngineMessage {
+  uint64_t id;
+  DataHandleType type;
+  uint64_t key;
+  void* dst;
+  void* src;
+  size_t len;
+  BytePSEngineOperation ops;
+  ps::KVPairs<char> sarray;  // to temporarily hold it and auto release
+  ps::KVMeta req_meta;
+};
+
+static DataHandleType DepairDataHandleType(int cmd) {
+  int w = std::floor((std::sqrt(8 * cmd + 1) - 1) / 2);
+  int t = ((w * w) + w) / 2;
+  int y = cmd - t;
+  int x = w - y;
+  CHECK_GE(x, 0);
+  CHECK_GE(y, 0);
+  DataHandleType type;
+  type.requestType = static_cast<RequestType>(x);
+  type.dtype = y;
+  return type;
+}
+
+
+KVServer<SERVER_DATA_TYPE>* byteps_server_;
+byteps::common::CpuReducer* bps_reducer_;
+std::unordered_map<uint64_t, ps::KVPairs<char> > mem_map_;
+std::mutex pullresp_mu_;
+std::unordered_map<uint64_t, ps::KVPairs<char> > push_response_map_;
+std::unordered_map<uint64_t, ps::KVPairs<char> > pull_response_map_;
+
+// push & pull flag
+std::vector<std::mutex> flag_mu_;
+std::vector<std::unordered_map<uint64_t, bool> > is_push_finished_;
+std::vector<std::unordered_map<uint64_t, std::vector<ps::KVMeta> > > q_pull_reqmeta_;
+std::vector<std::unordered_map<uint64_t, size_t> > pull_cnt_;
+
+// address map
+std::mutex handle_mu_;
+std::unordered_map<uint64_t, BytePSArray> store_;
+std::unordered_map<uint64_t, UpdateBuf> update_buf_;
+
+// hash function
+std::mutex hash_mu_;
+std::unordered_map<uint64_t, size_t> hash_cache_;
+std::vector<uint64_t> acc_load_;  // accumulated tensor size for an engine thread
+
+// global knob
+uint64_t timestamp_ = 0;
+size_t engine_thread_num_ = 4;
+volatile bool is_engine_blocking_ = false;
+volatile bool log_key_info_ = false;
+volatile bool sync_mode_ = true;
+volatile bool debug_mode_ = false;
+volatile bool enable_schedule_ = false;
+
+// debug
+uint64_t debug_key_;
+std::mutex debug_mu_;
+
+
+uint64_t DecodeKey(ps::Key key) {
+  auto kr = ps::Postoffice::Get()->GetServerKeyRanges()[ps::MyRank()];
+  return key - kr.begin();
+}
+
+uint64_t EncodeKey(ps::Key key) {
+  auto kr = ps::Postoffice::Get()->GetServerKeyRanges()[ps::MyRank()];
+  return key + kr.begin();
+}
+
+size_t GetThreadID(uint64_t key, size_t len) {
+  std::lock_guard<std::mutex> lock(hash_mu_);
+  if (len == 0) {  // pull
+    CHECK_NE(hash_cache_.find(key), hash_cache_.end());
+    return hash_cache_[key];
+  }
+  if (hash_cache_.find(key) != hash_cache_.end()) {
+    return hash_cache_[key];
+  }
+  CHECK_GT(len, 0);
+  CHECK_EQ(acc_load_.size(), engine_thread_num_);
+  auto min_index = -1;
+  auto min_load = std::numeric_limits<uint64_t>::max();
+  for (size_t i = 0; i < engine_thread_num_; ++i) {
+    if (acc_load_[i] < min_load) {
+      min_load = acc_load_[i];
+      min_index = i;
+    }
+  }
+  CHECK_GE(min_index, 0);
+  CHECK_LT(min_index, engine_thread_num_);
+  acc_load_[min_index] += len;
+  hash_cache_[key] = min_index;
+  return hash_cache_[key];
+}
+
+extern "C" void byteps_server();
+
+} // namespace server
+} // namespace byteps
+
+#endif  // BYTEPS_SERVER_H
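Reviewer note: `DepairDataHandleType` above inverts a Cantor pairing of `(requestType, dtype)` that the worker packs into the ps-lite `cmd` field (the pairing side appears to be `GetCommandType` in byteps/common). A quick round-trip check of the arithmetic, with a local `pair_cmd` stand-in for the worker side:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

// Cantor pairing: pair(x, y) = (x + y) * (x + y + 1) / 2 + y.
// Local stand-in for the worker-side encoder, for illustration only.
int pair_cmd(int x, int y) { return (x + y) * (x + y + 1) / 2 + y; }

// Inverse, mirroring DepairDataHandleType() above.
void depair_cmd(int cmd, int* x, int* y) {
  int w = std::floor((std::sqrt(8.0 * cmd + 1) - 1) / 2);
  int t = (w * w + w) / 2;
  *y = cmd - t;
  *x = w - *y;
}

int main() {
  // requestType = 0 (kDefaultPushPull), dtype = 5 -> cmd -> back again.
  int cmd = pair_cmd(0, 5), x, y;
  depair_cmd(cmd, &x, &y);
  std::printf("cmd=%d -> x=%d y=%d\n", cmd, x, y);  // cmd=20 -> x=0 y=5
  assert(x == 0 && y == 5);
  return 0;
}
```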
diff --git a/docker/Dockerfile.mix.mxnet15 b/docker/Dockerfile.mix.mxnet15
deleted file mode 100644
index 683ec86c1..000000000
--- a/docker/Dockerfile.mix.mxnet15
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-FROM nvidia/cuda:10.0-devel-ubuntu16.04
-ENV CUDA_VERSION=10.0
-ARG REGION
-
-RUN rm -f /tmp/pip.conf &&\
-    echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
-
-RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi
-
-ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64
-ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
-ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH
-ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH
-
-ENV BYTEPS_BASE_PATH /usr/local
-ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
-ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
-
-ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION
-
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends --fix-missing \
-        build-essential \
-        ca-certificates \
-        git \
-        curl \
-        wget \
-        vim \
-        libopenblas-dev \
-        liblapack-dev \
-        libopencv-dev \
-        python \
-        python-pip \
-        python-dev \
-        python-setuptools \
-        libjemalloc-dev \
-        graphviz \
-        cmake \
-        libjpeg-dev \
-        libpng-dev \
-        iftop \
-        lsb-release \
-        libcudnn7=${CUDNN_VERSION} \
-        libnuma-dev \
-        gcc-4.9 \
-        g++-4.9 \
-        gcc-4.9-base \
-        python3 \
-        python3-dev \
-        python3-pip \
-        python3-setuptools
-
-
-RUN python -m pip install --upgrade pip &&\
-    pip --no-cache-dir install \
-        matplotlib \
-        numpy==1.15.2 \
-        scipy \
-        sklearn \
-        pandas \
-        graphviz==0.9.0 \
-        mxboard \
-        tensorboard==1.0.0a6
-
-RUN pip3 install --upgrade pip &&\
-    python3 -m pip --no-cache-dir install \
-        matplotlib \
-        numpy==1.15.2 \
-        scipy \
-        sklearn \
-        pandas \
-        graphviz==0.9.0 \
-        mxboard \
-        tensorboard==1.0.0a6
-
-# Install NCCL
-ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881
-
-RUN cd / && \
-    wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \
-    cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \
-    mkdir -p /usr/local/nccl && \
-    tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \
-    echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
-    ldconfig && rm -rf /nccl-$NCCL_VERSION
-
-WORKDIR /root/
-
-RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
-    ldconfig
-
-RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
-    ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
-    ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
-    ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
-
-
-############ build server
-# To enable RDMA, add `USE_RDMA=1` to `SERVER_BUILD_OPTS` below.
-ENV SERVER_BUILD_OPTS "USE_BLAS=openblas USE_MKL=1 USE_DIST_KVSTORE=1"
-ENV BYTEPS_SERVER_MXNET_PATH /root/incubator-mxnet
-ENV MXNET_SERVER_LINK https://github.com/bytedance/incubator-mxnet
-
-ENV BYTEPS_BASE_PATH /usr/local
-ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
-ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
-
-WORKDIR /root/
-
-RUN git clone --single-branch --branch byteps --recurse-submodules $MXNET_SERVER_LINK
-
-RUN cd $BYTEPS_SERVER_MXNET_PATH && \
-    make clean_all && make -j16 $SERVER_BUILD_OPTS
-
-################################ install your framework ################################
-# install mxnet
-ARG FRAMEWORK_VERSION=1.5.0
-RUN python -m pip --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION
-RUN pip3 --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION
-
-################################ install your framework ################################
-
-
-RUN cd $BYTEPS_BASE_PATH &&\
-    git clone --recurse-submodules $BYTEPS_GIT_LINK
-
-# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet.
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \
-    update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \
-    update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \
-    update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \
-    update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \
-    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \
-    update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200
-
-
-# Install BytePS
-ARG BYTEPS_NCCL_LINK=shared
-RUN cd $BYTEPS_PATH &&\
-    BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python3 setup.py install &&\
-    BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python3 setup.py bdist_wheel
-RUN cd $BYTEPS_PATH &&\
-    BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\
-    BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel
-
-# Remove GCC pinning
-RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
-    update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \
-    update-alternatives --remove g++ /usr/bin/g++-4.9 && \
-    update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9
-
-RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \
-    rm -rf /usr/local/cuda/lib64/libcuda.so.1
diff --git a/docker/Dockerfile.worker.mxnet.cu100.rdma b/docker/Dockerfile.mxnet
similarity index 65%
rename from docker/Dockerfile.worker.mxnet.cu100.rdma
rename to docker/Dockerfile.mxnet
index a38055c71..68ed1639e 100644
--- a/docker/Dockerfile.worker.mxnet.cu100.rdma
+++ b/docker/Dockerfile.mxnet
@@ -1,25 +1,7 @@
-# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
 FROM nvidia/cuda:10.0-devel-ubuntu18.04
-ENV CUDA_VERSION=10.0
-ARG REGION
-RUN rm -f /tmp/pip.conf &&\
-    echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
-
-RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi
+ARG https_proxy
+ARG http_proxy
 
 ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64
 ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
@@ -30,11 +12,9 @@ ENV BYTEPS_BASE_PATH /usr/local
 ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
 ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
 
-ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION
-
 ARG DEBIAN_FRONTEND=noninteractive
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
+RUN apt-get update -qq
+RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
     build-essential \
     tzdata \
     ca-certificates \
@@ -42,47 +22,22 @@ RUN apt-get update &&\
     curl \
     wget \
     vim \
-    libopenblas-dev \
-    liblapack-dev \
-    libopencv-dev \
-    python \
-    python-dev \
-    python-setuptools \
-    libjemalloc-dev \
-    graphviz \
    cmake \
-    libjpeg-dev \
-    libpng-dev \
-    iftop \
    lsb-release \
-    libcudnn7=${CUDNN_VERSION} \
-    libnuma-dev
-
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
-    ibverbs-providers \
-    librdmacm-dev \
-    ibverbs-utils \
-    rdmacm-utils \
-    libibverbs-dev
-
-RUN apt-get update &&\
-    apt-get -y install python-pip &&\
-    pip install --upgrade pip
-
-RUN pip --no-cache-dir install \
-    matplotlib \
-    numpy==1.15.2 \
-    scipy \
-    sklearn \
-    pandas \
-    graphviz==0.9.0 \
-    mxboard \
-    tensorboard==1.0.0a6
+    libcudnn7=7.6.0.64-1+cuda10.0 \
+    libnuma-dev \
+    ibverbs-providers \
+    librdmacm-dev \
+    ibverbs-utils \
+    rdmacm-utils \
+    libibverbs-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    python3-setuptools
 
 # Install NCCL
 ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881
-
 RUN cd / && \
     wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \
     cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \
@@ -91,31 +46,8 @@ RUN cd / && \
     echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
     ldconfig && rm -rf /nccl-$NCCL_VERSION
 
-
 WORKDIR /root/
 
-RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
-    ldconfig
-
-RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
-    ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
-    ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
-    ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
-
-
-################################ install your framework ################################
-# install mxnet
-ARG FRAMEWORK_VERSION=1.4.1
-RUN pip --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION
-
-################################ install your framework ################################
-
-RUN cd $BYTEPS_BASE_PATH &&\
-    git clone --recurse-submodules $BYTEPS_GIT_LINK
-
 # install gcc 4.9
 RUN mkdir -p /root/gcc/ && cd /root/gcc &&\
     wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\
@@ -147,12 +79,32 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \
     update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200
 
+
+RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
+    echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
+    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
+    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
+    ldconfig
+
+RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
+    ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
+    ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
+    ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
+
+# install mxnet
+ARG FRAMEWORK_VERSION=1.5.0
+RUN python3 -m pip --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION
+
 # Install BytePS
 ARG BYTEPS_NCCL_LINK=shared
 ARG BYTEPS_USE_RDMA=1
+ARG BYTEPS_WITHOUT_PYTORCH=1
+ARG BYTEPS_WITHOUT_TENSORFLOW=1
+ARG BYTEPS_BRANCH=master
+RUN cd $BYTEPS_BASE_PATH &&\
+    git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK
 RUN cd $BYTEPS_PATH &&\
-    BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\
-    BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel
+    python3 setup.py install
 
 # Remove GCC pinning
 RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
@@ -162,5 +114,3 @@ RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
 
 RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \
     rm -rf /usr/local/cuda/lib64/libcuda.so.1
-
-
diff --git a/docker/Dockerfile.worker.pytorch.cu100.rdma b/docker/Dockerfile.pytorch
similarity index 64%
rename from docker/Dockerfile.worker.pytorch.cu100.rdma
rename to docker/Dockerfile.pytorch
index e67c9a359..a6f0c1d28 100644
--- a/docker/Dockerfile.worker.pytorch.cu100.rdma
+++ b/docker/Dockerfile.pytorch
@@ -1,25 +1,7 @@
-# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
 FROM nvidia/cuda:10.0-devel-ubuntu18.04
-ENV CUDA_VERSION=10.0
-ARG REGION
-RUN rm -f /tmp/pip.conf &&\
-    echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
-
-RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi
+ARG https_proxy
+ARG http_proxy
 
 ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64
 ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
@@ -30,11 +12,9 @@ ENV BYTEPS_BASE_PATH /usr/local
 ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
 ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
 
-ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION
-
 ARG DEBIAN_FRONTEND=noninteractive
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
+RUN apt-get update -qq
+RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
    build-essential \
    tzdata \
    ca-certificates \
@@ -42,47 +22,22 @@ RUN apt-get update &&\
    curl \
    wget \
    vim \
-    libopenblas-dev \
-    liblapack-dev \
-    libopencv-dev \
-    python \
-    python-dev \
-    python-setuptools \
-    libjemalloc-dev \
-    graphviz \
    cmake \
-    libjpeg-dev \
-    libpng-dev \
-    iftop \
    lsb-release \
-    libcudnn7=${CUDNN_VERSION} \
-    libnuma-dev
-
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
-    ibverbs-providers \
-    librdmacm-dev \
-    ibverbs-utils \
-    rdmacm-utils \
-    libibverbs-dev
-
-RUN apt-get update &&\
-    apt-get -y install python-pip &&\
-    pip install --upgrade pip
-
-RUN pip --no-cache-dir install \
-    matplotlib \
-    numpy==1.15.2 \
-    scipy \
-    sklearn \
-    pandas \
-    graphviz==0.9.0 \
-    mxboard \
-    tensorboard==1.0.0a6
+    libcudnn7=7.6.0.64-1+cuda10.0 \
+    libnuma-dev \
+    ibverbs-providers \
+    librdmacm-dev \
+    ibverbs-utils \
+    rdmacm-utils \
+    libibverbs-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    python3-setuptools
 
 # Install NCCL
 ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881
-
 RUN cd / && \
     wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \
     cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \
@@ -91,40 +46,8 @@ RUN cd / && \
     echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
     ldconfig && rm -rf /nccl-$NCCL_VERSION
 
-
 WORKDIR /root/
 
-RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
-    ldconfig
-
-RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
-    ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
-    ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
-    ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
-
-
-################################ install your framework ################################
-# install pytorch
-ARG FRAMEWORK_VERSION=1.0.1
-RUN pip --no-cache-dir install \
-    future \
-    numpy \
-    pyyaml \
-    setuptools \
-    six \
-    typing \
-    protobuf \
-    torchvision==0.2.2 \
-    torch==$FRAMEWORK_VERSION
-
-################################ install your framework ################################
-
-RUN cd $BYTEPS_BASE_PATH &&\
-    git clone --recurse-submodules $BYTEPS_GIT_LINK
-
 # install gcc 4.9
 RUN mkdir -p /root/gcc/ && cd /root/gcc &&\
     wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\
@@ -156,12 +79,33 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \
     update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200
 
+
+RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
+    echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
+    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
+    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
+    ldconfig
+
+RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
+    ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
+    ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
+    ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
+
+# install pytorch
+ARG FRAMEWORK_VERSION=1.1.0
+ARG TORCHVISION_VERSION=0.2.2
+RUN python3 -m pip --no-cache-dir install torch==$FRAMEWORK_VERSION torchvision==$TORCHVISION_VERSION
+
 # Install BytePS
 ARG BYTEPS_NCCL_LINK=shared
 ARG BYTEPS_USE_RDMA=1
+ARG BYTEPS_WITHOUT_TENSORFLOW=1
+ARG BYTEPS_WITHOUT_MXNET=1
+ARG BYTEPS_BRANCH=master
+RUN cd $BYTEPS_BASE_PATH &&\
+    git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK
 RUN cd $BYTEPS_PATH &&\
-    BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\
-    BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel
+    python3 setup.py install
 
 # Remove GCC pinning
 RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server
deleted file mode 100644
index f6bf291f2..000000000
--- a/docker/Dockerfile.server
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-FROM ubuntu:16.04
-
-ARG REGION
-
-RUN rm -f /tmp/pip.conf &&\
-    echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
-
-RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi
-
-ENV LD_LIBRARY_PATH /root/incubator-mxnet/lib/:/usr/local/lib:$LD_LIBRARY_PATH
-
-# To enable RDMA, add `USE_RDMA=1` to `SERVER_BUILD_OPTS` below.
-ENV SERVER_BUILD_OPTS "USE_BLAS=openblas USE_MKL=1 USE_DIST_KVSTORE=1"
-ENV BYTEPS_SERVER_MXNET_PATH /root/incubator-mxnet
-ENV MXNET_SERVER_LINK https://github.com/bytedance/incubator-mxnet
-
-ENV BYTEPS_BASE_PATH /usr/local
-ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
-ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
-
-RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
-    build-essential \
-    ca-certificates \
-    git \
-    curl \
-    wget \
-    vim \
-    libopenblas-dev \
-    liblapack-dev \
-    libopencv-dev \
-    python \
-    python-dev \
-    python-setuptools \
-    libjemalloc-dev \
-    graphviz \
-    cmake \
-    libjpeg-dev \
-    libpng-dev \
-    iftop \
-    lsb-release
-
-RUN apt-get update &&\
-    apt-get -y install python-pip &&\
-    pip install --upgrade pip
-
-RUN pip --no-cache-dir install \
-    matplotlib \
-    numpy==1.15.2 \
-    scipy \
-    sklearn \
-    pandas \
-    graphviz==0.9.0 \
-    mxboard \
-    tensorboard==1.0.0a6
-
-WORKDIR /root/
-
-RUN git clone --single-branch --branch byteps --recurse-submodules $MXNET_SERVER_LINK
-
-RUN cd $BYTEPS_SERVER_MXNET_PATH && \
-    make clean_all && make -j16 $SERVER_BUILD_OPTS
-
-RUN cd $BYTEPS_SERVER_MXNET_PATH && \
-    cd python && \
-    python setup.py build && \
-    python setup.py install &&\
-    python setup.py bdist_wheel
-
-RUN cd $BYTEPS_BASE_PATH &&\
-    git clone --recurse-submodules $BYTEPS_GIT_LINK
-
diff --git a/docker/Dockerfile.server.rdma b/docker/Dockerfile.server.rdma
deleted file mode 100644
index f86c2abc4..000000000
--- a/docker/Dockerfile.server.rdma
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-FROM ubuntu:18.04
-
-ARG REGION
-
-RUN rm -f /tmp/pip.conf &&\
-    echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
-
-RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi
-
-ENV LD_LIBRARY_PATH /root/incubator-mxnet/lib/:/usr/local/lib:$LD_LIBRARY_PATH
-
-# To enable RDMA, add `USE_RDMA=1` to `SERVER_BUILD_OPTS` below.
-ENV SERVER_BUILD_OPTS "USE_BLAS=openblas USE_MKL=1 USE_DIST_KVSTORE=1 USE_RDMA=1"
-ENV BYTEPS_SERVER_MXNET_PATH /root/incubator-mxnet
-ENV MXNET_SERVER_LINK https://github.com/bytedance/incubator-mxnet
-
-ENV BYTEPS_BASE_PATH /usr/local
-ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
-ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
-
-ARG DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
-    tzdata \
-    build-essential \
-    ca-certificates \
-    git \
-    curl \
-    wget \
-    vim \
-    libopenblas-dev \
-    liblapack-dev \
-    libopencv-dev \
-    python \
-    python-dev \
-    python-setuptools \
-    libjemalloc-dev \
-    graphviz \
-    cmake \
-    libjpeg-dev \
-    libpng-dev \
-    iftop \
-    lsb-release \
-    librdmacm-dev
-
-RUN apt-get update &&\
-    apt-get -y install python-pip ibverbs-providers &&\
-    pip install --upgrade pip
-
-RUN pip --no-cache-dir install \
-    matplotlib \
-    numpy==1.15.2 \
-    scipy \
-    sklearn \
-    pandas \
-    graphviz==0.9.0 \
-    mxboard \
-    tensorboard==1.0.0a6
-
-WORKDIR /root/
-
-RUN git clone --single-branch --branch byteps --recurse-submodules $MXNET_SERVER_LINK
-
-RUN cd $BYTEPS_SERVER_MXNET_PATH && \
-    make clean_all && make -j16 $SERVER_BUILD_OPTS
-
-RUN cd $BYTEPS_SERVER_MXNET_PATH && \
-    cd python && \
-    python setup.py build && \
-    python setup.py install &&\
-    python setup.py bdist_wheel
-
-RUN cd $BYTEPS_BASE_PATH &&\
-    git clone --recurse-submodules $BYTEPS_GIT_LINK
-
diff --git a/docker/Dockerfile.worker.tensorflow.cu100.rdma b/docker/Dockerfile.tensorflow
similarity index 65%
rename from docker/Dockerfile.worker.tensorflow.cu100.rdma
rename to docker/Dockerfile.tensorflow
index e07f902b0..1159b4d9a 100644
--- a/docker/Dockerfile.worker.tensorflow.cu100.rdma
+++ b/docker/Dockerfile.tensorflow
@@ -1,25 +1,7 @@
-# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
 FROM nvidia/cuda:10.0-devel-ubuntu18.04
-ENV CUDA_VERSION=10.0
-ARG REGION
-RUN rm -f /tmp/pip.conf &&\
-    echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
-
-RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi
+ARG https_proxy
+ARG http_proxy
 
 ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64
 ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
@@ -30,11 +12,9 @@ ENV BYTEPS_BASE_PATH /usr/local
 ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps
 ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps
 
-ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION
-
 ARG DEBIAN_FRONTEND=noninteractive
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
+RUN apt-get update -qq
+RUN apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
    build-essential \
    tzdata \
    ca-certificates \
@@ -42,47 +22,22 @@ RUN apt-get update &&\
    curl \
    wget \
    vim \
-    libopenblas-dev \
-    liblapack-dev \
-    libopencv-dev \
-    python \
-    python-dev \
-    python-setuptools \
-    libjemalloc-dev \
-    graphviz \
    cmake \
-    libjpeg-dev \
-    libpng-dev \
-    iftop \
    lsb-release \
-    libcudnn7=${CUDNN_VERSION} \
-    libnuma-dev
-
-RUN apt-get update &&\
-    apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \
-    ibverbs-providers \
-    librdmacm-dev \
-    ibverbs-utils \
-    rdmacm-utils \
-    libibverbs-dev
-
-RUN apt-get update &&\
-    apt-get -y install python-pip &&\
-    pip install --upgrade pip
-
-RUN pip --no-cache-dir install \
-    matplotlib \
-    numpy==1.15.2 \
-    scipy \
-    sklearn \
-    pandas \
-    graphviz==0.9.0 \
-    mxboard \
-    tensorboard==1.0.0a6
+    libcudnn7=7.6.0.64-1+cuda10.0 \
+    libnuma-dev \
+    ibverbs-providers \
+    librdmacm-dev \
+    ibverbs-utils \
+    rdmacm-utils \
+    libibverbs-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    python3-setuptools
 
 # Install NCCL
 ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881
-
 RUN cd / && \
     wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \
     cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \
@@ -91,33 +46,8 @@ RUN cd / && \
     echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
     ldconfig && rm -rf /nccl-$NCCL_VERSION
 
-
 WORKDIR /root/
 
-RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \
-    echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
-    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \
-    ldconfig
-
-RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \
-    ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\
-    ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \
-    ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
-
-
-################################ install your framework ################################
-# install tensorflow
-ARG FRAMEWORK_VERSION=1.14.0
-RUN pip --no-cache-dir install tensorflow-gpu==$FRAMEWORK_VERSION && \
-    rm -rf /tmp/pip && \
-    rm -rf /root/.cache
-
-################################ install your framework ################################
-
-RUN cd $BYTEPS_BASE_PATH &&\
-    git clone --recurse-submodules $BYTEPS_GIT_LINK
-
 # install gcc 4.9
 RUN mkdir -p /root/gcc/
&& cd /root/gcc &&\ wget http://launchpadlibrarian.net/247707088/libmpfr4_3.1.4-1_amd64.deb &&\ @@ -149,12 +79,33 @@ RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 + +RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ + echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ + ldconfig + +RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ + ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ + ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ + ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 + +# install tensorflow +ARG FRAMEWORK_VERSION=1.14.0 +RUN python3 -m pip --no-cache-dir install tensorflow-gpu==$FRAMEWORK_VERSION &&\ + rm -rf /tmp/pip && rm -rf /root/.cache + # Install BytePS ARG BYTEPS_NCCL_LINK=shared ARG BYTEPS_USE_RDMA=1 +ARG BYTEPS_WITHOUT_PYTORCH=1 +ARG BYTEPS_WITHOUT_MXNET=1 +ARG BYTEPS_BRANCH=master +RUN cd $BYTEPS_BASE_PATH &&\ + git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_PYTORCH=1 python setup.py install &&\ - BYTEPS_WITHOUT_PYTORCH=1 python setup.py bdist_wheel + python3 setup.py install # Remove GCC pinning RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ diff --git a/docker/Dockerfile.worker.mxnet.cu100 b/docker/Dockerfile.worker.mxnet.cu100 deleted file mode 100644 index 38cd85c2e..000000000 --- a/docker/Dockerfile.worker.mxnet.cu100 +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= -FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDA_VERSION=10.0 -ARG REGION - -RUN rm -f /tmp/pip.conf &&\ - echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf - -RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION - -RUN apt-get update &&\ - apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - libopenblas-dev \ - liblapack-dev \ - libopencv-dev \ - python \ - python-dev \ - python-setuptools \ - libjemalloc-dev \ - graphviz \ - cmake \ - libjpeg-dev \ - libpng-dev \ - iftop \ - lsb-release \ - libcudnn7=${CUDNN_VERSION} \ - libnuma-dev \ - gcc-4.9 \ - g++-4.9 \ - gcc-4.9-base - -RUN apt-get update &&\ - apt-get -y install python-pip &&\ - pip install --upgrade pip - -RUN pip --no-cache-dir install \ - matplotlib \ - numpy==1.15.2 \ - scipy \ - sklearn \ - pandas \ - graphviz==0.9.0 \ - mxboard \ - tensorboard==1.0.0a6 - -# Install NCCL -ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 - -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - - -WORKDIR /root/ - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - - -################################ install your framework ################################ -# install mxnet -ARG FRAMEWORK_VERSION=1.4.1 -RUN pip --no-cache-dir install mxnet-cu100==$FRAMEWORK_VERSION - -################################ install your framework ################################ - - -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recurse-submodules $BYTEPS_GIT_LINK - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\ - BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/Dockerfile.worker.mxnet.cu90 b/docker/Dockerfile.worker.mxnet.cu90 deleted file mode 100644 index 08c3a5578..000000000 --- a/docker/Dockerfile.worker.mxnet.cu90 +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= -FROM nvidia/cuda:9.0-devel-ubuntu16.04 -ENV CUDA_VERSION=9.0 -ARG REGION - -RUN rm -f /tmp/pip.conf &&\ - echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf - -RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION - -RUN apt-get update &&\ - apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - libopenblas-dev \ - liblapack-dev \ - libopencv-dev \ - python \ - python-dev \ - python-setuptools \ - libjemalloc-dev \ - graphviz \ - cmake \ - libjpeg-dev \ - libpng-dev \ - iftop \ - lsb-release \ - libcudnn7=${CUDNN_VERSION} \ - libnuma-dev \ - gcc-4.9 \ - g++-4.9 \ - gcc-4.9-base - -RUN apt-get update &&\ - apt-get -y install python-pip &&\ - pip install --upgrade pip - -RUN pip --no-cache-dir install \ - matplotlib \ - numpy==1.15.2 \ - scipy \ - sklearn \ - pandas \ - graphviz==0.9.0 \ - mxboard \ - tensorboard==1.0.0a6 - -# Install NCCL -ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 - -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - - -WORKDIR /root/ - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - - -################################ install your framework ################################ -# install mxnet -ARG FRAMEWORK_VERSION=1.4.1 -RUN pip --no-cache-dir install mxnet-cu90==$FRAMEWORK_VERSION - -################################ install your framework ################################ - - -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recurse-submodules $BYTEPS_GIT_LINK - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\ - BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/Dockerfile.worker.pytorch.cu100 b/docker/Dockerfile.worker.pytorch.cu100 deleted file mode 100644 index 73a74db1d..000000000 --- a/docker/Dockerfile.worker.pytorch.cu100 +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDA_VERSION=10.0 -ARG REGION - -RUN rm -f /tmp/pip.conf &&\ - echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf - -RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION - -RUN apt-get update &&\ - apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - libopenblas-dev \ - liblapack-dev \ - libopencv-dev \ - python \ - python-dev \ - python-setuptools \ - libjemalloc-dev \ - graphviz \ - cmake \ - libjpeg-dev \ - libpng-dev \ - iftop \ - lsb-release \ - libcudnn7=${CUDNN_VERSION} \ - libnuma-dev \ - gcc-4.9 \ - g++-4.9 \ - gcc-4.9-base - -RUN apt-get update &&\ - apt-get -y install python-pip &&\ - pip install --upgrade pip - -RUN pip --no-cache-dir install \ - matplotlib \ - numpy==1.15.2 \ - scipy \ - sklearn \ - pandas \ - graphviz==0.9.0 \ - mxboard \ - tensorboard==1.0.0a6 - -# Install NCCL -ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 - -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - - -WORKDIR /root/ - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - - -################################ install your framework ################################ -# install pytorch -ARG FRAMEWORK_VERSION=1.0.1 -RUN pip --no-cache-dir install \ - future \ - numpy \ - pyyaml \ - setuptools \ - six \ - typing \ - protobuf \ - torchvision==0.2.2 \ - torch==$FRAMEWORK_VERSION - -################################ install your framework ################################ - - -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recurse-submodules $BYTEPS_GIT_LINK - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\ - BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/Dockerfile.worker.pytorch.cu90 b/docker/Dockerfile.worker.pytorch.cu90 deleted file mode 100644 index df2833793..000000000 --- a/docker/Dockerfile.worker.pytorch.cu90 +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -FROM nvidia/cuda:9.0-devel-ubuntu16.04 -ENV CUDA_VERSION=9.0 -ARG REGION - -RUN rm -f /tmp/pip.conf &&\ - echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf - -RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION - -RUN apt-get update &&\ - apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - libopenblas-dev \ - liblapack-dev \ - libopencv-dev \ - python \ - python-dev \ - python-setuptools \ - libjemalloc-dev \ - graphviz \ - cmake \ - libjpeg-dev \ - libpng-dev \ - iftop \ - lsb-release \ - libcudnn7=${CUDNN_VERSION} \ - libnuma-dev \ - gcc-4.9 \ - g++-4.9 \ - gcc-4.9-base - -RUN apt-get update &&\ - apt-get -y install python-pip &&\ - pip install --upgrade pip - -RUN pip --no-cache-dir install \ - matplotlib \ - numpy==1.15.2 \ - scipy \ - sklearn \ - pandas \ - graphviz==0.9.0 \ - mxboard \ - tensorboard==1.0.0a6 - -# Install NCCL -ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 - -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - - -WORKDIR /root/ - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - - -################################ install your framework ################################ -# install pytorch -ARG FRAMEWORK_VERSION=1.0.1 -RUN pip --no-cache-dir install \ - future \ - numpy \ - pyyaml \ - setuptools \ - six \ - typing \ - protobuf \ - torchvision==0.2.2 \ - torch==$FRAMEWORK_VERSION - -################################ install your framework ################################ - - -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recurse-submodules $BYTEPS_GIT_LINK - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py install &&\ - BYTEPS_WITHOUT_TENSORFLOW=1 python setup.py bdist_wheel - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/Dockerfile.worker.tensorflow.cu100 b/docker/Dockerfile.worker.tensorflow.cu100 deleted file mode 100644 index 7ff2fae93..000000000 --- a/docker/Dockerfile.worker.tensorflow.cu100 +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDA_VERSION=10.0 -ARG REGION - -RUN rm -f /tmp/pip.conf &&\ - echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf - -RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION - -RUN apt-get update &&\ - apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - libopenblas-dev \ - liblapack-dev \ - libopencv-dev \ - python \ - python-dev \ - python-setuptools \ - libjemalloc-dev \ - graphviz \ - cmake \ - libjpeg-dev \ - libpng-dev \ - iftop \ - lsb-release \ - libcudnn7=${CUDNN_VERSION} \ - libnuma-dev \ - gcc-4.9 \ - g++-4.9 \ - gcc-4.9-base - -RUN apt-get update &&\ - apt-get -y install python-pip &&\ - pip install --upgrade pip - -RUN pip --no-cache-dir install \ - matplotlib \ - numpy==1.15.2 \ - scipy \ - sklearn \ - pandas \ - graphviz==0.9.0 \ - mxboard \ - tensorboard==1.0.0a6 - -# Install NCCL -ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 - -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - - -WORKDIR /root/ - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - - -################################ install your framework ################################ -# install tensorflow -ARG FRAMEWORK_VERSION=1.12.0 -RUN pip --no-cache-dir install tensorflow-gpu==$FRAMEWORK_VERSION && \ - rm -rf /tmp/pip && \ - rm -rf /root/.cache - -################################ install your framework ################################ - - -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recurse-submodules $BYTEPS_GIT_LINK - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_PYTORCH=1 python setup.py install &&\ - BYTEPS_WITHOUT_PYTORCH=1 python setup.py bdist_wheel - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/Dockerfile.worker.tensorflow.cu90 b/docker/Dockerfile.worker.tensorflow.cu90 deleted file mode 100644 index 50b393a5a..000000000 --- a/docker/Dockerfile.worker.tensorflow.cu90 +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= - -FROM nvidia/cuda:9.0-devel-ubuntu16.04 -ENV CUDA_VERSION=9.0 -ARG REGION - -RUN rm -f /tmp/pip.conf &&\ - echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf - -RUN if [ "x$REGION" = "xchina" ]; then mkdir -p ~/.pip && mv /tmp/pip.conf ~/.pip/; fi - -ENV USE_CUDA_PATH /usr/local/cuda:/usr/local/cudnn/lib64 -ENV PATH /usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /usr/local/cudnn/lib64:/usr/local/cuda/lib64:$LIBRARY_PATH - -ENV BYTEPS_BASE_PATH /usr/local -ENV BYTEPS_PATH $BYTEPS_BASE_PATH/byteps -ENV BYTEPS_GIT_LINK https://github.com/bytedance/byteps - -ARG CUDNN_VERSION=7.4.1.5-1+cuda$CUDA_VERSION - -RUN apt-get update &&\ - apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends \ - build-essential \ - ca-certificates \ - git \ - curl \ - wget \ - vim \ - libopenblas-dev \ - liblapack-dev \ - libopencv-dev \ - python \ - python-dev \ - python-setuptools \ - libjemalloc-dev \ - graphviz \ - cmake \ - libjpeg-dev \ - libpng-dev \ - iftop \ - lsb-release \ - libcudnn7=${CUDNN_VERSION} \ - libnuma-dev \ - gcc-4.9 \ - g++-4.9 \ - gcc-4.9-base - -RUN apt-get update &&\ - apt-get -y install python-pip &&\ - pip install --upgrade pip - -RUN pip --no-cache-dir install \ - matplotlib \ - numpy==1.15.2 \ - scipy \ - sklearn \ - pandas \ - graphviz==0.9.0 \ - mxboard \ - tensorboard==1.0.0a6 - -# Install NCCL -ENV NCCL_VERSION=d7a58cfa5865c4f627a128c3238cc72502649881 - -RUN cd / && \ - wget -q -O - https://github.com/NVIDIA/nccl/archive/$NCCL_VERSION.tar.gz | tar -xzf - && \ - cd nccl-$NCCL_VERSION && make -j src.build && make pkg.txz.build && \ - mkdir -p /usr/local/nccl && \ - tar -Jxf /nccl-$NCCL_VERSION/build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ - echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig && rm -rf /nccl-$NCCL_VERSION - - -WORKDIR /root/ - -RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/cudnn/lib64" >> /etc/ld.so.conf.d/cuda.conf && \ - echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf && \ - ldconfig - -RUN ln -sf /usr/local/cudnn/include/cudnn.h /usr/local/cuda/include/ && \ - ln -sf /usr/local/cudnn/lib64/libcudnn* /usr/local/cuda/lib64 &&\ - ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so && \ - ln -sf /usr/local/cuda/lib64/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 - - -################################ install your framework ################################ -# install tensorflow -ARG FRAMEWORK_VERSION=1.12.0 -RUN pip --no-cache-dir install tensorflow-gpu==$FRAMEWORK_VERSION && \ - rm -rf /tmp/pip && \ - rm -rf /root/.cache - -################################ install your framework ################################ - - -RUN cd $BYTEPS_BASE_PATH &&\ - git clone --recurse-submodules $BYTEPS_GIT_LINK - -# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet. 
-RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ - update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ - update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 - - -# Install BytePS -ARG BYTEPS_NCCL_LINK=shared -RUN cd $BYTEPS_PATH &&\ - BYTEPS_WITHOUT_PYTORCH=1 python setup.py install &&\ - BYTEPS_WITHOUT_PYTORCH=1 python setup.py bdist_wheel - -# Remove GCC pinning -RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ - update-alternatives --remove g++ /usr/bin/g++-4.9 && \ - update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 - -RUN rm -rf /usr/local/cuda/lib64/libcuda.so && \ - rm -rf /usr/local/cuda/lib64/libcuda.so.1 - - diff --git a/docker/README.md b/docker/README.md index a84763ee2..16b8ddcaf 100644 --- a/docker/README.md +++ b/docker/README.md @@ -5,13 +5,6 @@ You may need to manually build them to get the latest functionalities of BytePS, | Docker Image Name | Source Dockerfile | Description | | --- | --- | --- | -| bytepsimage/worker_mxnet | Dockerfile.worker.mxnet.cu90 | worker image for MXNet (CUDA 9.0) | -| bytepsimage/worker_pytorch | Dockerfile.worker.pytorch.cu90 | worker image for PyTorch (CUDA 9.0) | -| bytepsimage/worker_tensorflow | Dockerfile.worker.tensorflow.cu90 | worker image for TensorFlow (CUDA 9.0) | -| bytepsimage/worker_mxnet_rdma | Dockerfile.worker.mxnet.cu100.rdma | worker image for MXNet with RDMA support (CUDA 10.0) | -| bytepsimage/worker_pytorch_rdma | Dockerfile.worker.pytorch.cu100.rdma | worker image for PyTorch with RDMA support (CUDA 10.0) | -| bytepsimage/worker_tensorflow_rdma | Dockerfile.worker.tensorflow.cu100.rdma | worker image for TensorFlow with RDMA support (CUDA 10.0) | -| bytepsimage/byteps_server | Dockerfile.server | server/scheduler image | -| bytepsimage/byteps_server_rdma | Dockerfile.server.rdma | server/scheduler image with RDMA support | -| bytepsimage/mxnet15 | Dockerfile.mix.mxnet15 | all-in-one image with MXNet 1.5.0 (CUDA 10.0), applicable to worker/server/scheduler | - +| bytepsimage/mxnet | Dockerfile.mxnet | Image for MXNet | +| bytepsimage/pytorch | Dockerfile.pytorch | Image for PyTorch | +| bytepsimage/tensorflow | Dockerfile.tensorflow | Image for TensorFlow | diff --git a/docs/architecture.md b/docs/architecture.md index ff2e21f80..e6a7dd940 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -5,7 +5,8 @@ We highly recommend you to read [BytePS's rationale](./rationale.md) first befor From application views, BytePS is a communication library just like Horovod. The plugins handle framework-specific transformation (e.g., on data structure), and put communication tasks into BytePS priority queues. The BytePS Core then gets the tasks (priority-aware, not FIFO) and handles the actual communication. 
- +![byteps_architecture](https://user-images.githubusercontent.com/13852819/69873605-c3d39e00-12f3-11ea-942d-97af2606bb40.png) + ## General Workflow To demonstrate the work flow of BytePS, below we use a common data-parallel training scenario as an example. Say we have multiple worker machines (we refer them as "**workers**"), and each machine (worker) has multiple GPUs. We also have some CPU machines that serve as PS (we refer them as "**servers**"). diff --git a/docs/cross-barrier.md b/docs/cross-barrier.md index c8baba2e6..09e2dba51 100644 --- a/docs/cross-barrier.md +++ b/docs/cross-barrier.md @@ -19,14 +19,15 @@ completion of all parameters. Fig.1 shows the dependency graph with global barrier. Machine learning frameworks such as PyTorch and TensorFlow have similar dependencies when using BytePS for push and pull. -![](images/dag_barrier.png) +![dag_barrier](https://user-images.githubusercontent.com/13852819/69863244-4b5ee400-12d7-11ea-9356-2dd41dff95ab.png) *Fig.1: Dependency Graph With Global Barrier* Fig. 2 shows the dependency graph after removing global barrier. What we do here is to change the dependency graph from Fig. 1 to Fig. 2 by removing the barrier, building layer-wise dependencies while guaranteeing computation correctness. -![](images/dag_without_barrier.png) + +![dag_without_barrier](https://user-images.githubusercontent.com/13852819/69863268-5d408700-12d7-11ea-8b39-5e48e3d94c2b.png) *Fig.2: Dependency Graph After Removing Global Barrier* diff --git a/docs/images/byteps_architecture.png b/docs/images/byteps_architecture.png deleted file mode 100644 index 93bf14bb9..000000000 Binary files a/docs/images/byteps_architecture.png and /dev/null differ diff --git a/docs/images/dag_barrier.png b/docs/images/dag_barrier.png deleted file mode 100644 index 6137cb8c8..000000000 Binary files a/docs/images/dag_barrier.png and /dev/null differ diff --git a/docs/images/dag_without_barrier.png b/docs/images/dag_without_barrier.png deleted file mode 100644 index 04df725b3..000000000 Binary files a/docs/images/dag_without_barrier.png and /dev/null differ diff --git a/docs/images/perf_tcp_resnet50.png b/docs/images/perf_tcp_resnet50.png deleted file mode 100644 index 9fe181a0d..000000000 Binary files a/docs/images/perf_tcp_resnet50.png and /dev/null differ diff --git a/docs/images/perf_tcp_vgg16.png b/docs/images/perf_tcp_vgg16.png deleted file mode 100644 index 7812a74a3..000000000 Binary files a/docs/images/perf_tcp_vgg16.png and /dev/null differ diff --git a/docs/performance.md b/docs/performance.md index f8f0a62c0..124525592 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -1,15 +1,19 @@ # BytePS Performance with 100Gbps RDMA -## NVLink + RDMA +## NVLink + TCP -We show our experiment on BERT-large training, which is based on GluonNLP toolkit. The model uses mixed precision. +We test two models: VGG16 (communication-intensive) and Resnet50 (computation-intensive) on a popular public cloud. Both models are trained using fp32. -We use Tesla V100 32GB GPUs and set batch size equal to 64 per GPU. Each machine has 8 V100 GPUs (32GB memory) with NVLink-enabled. -Machines are inter-connected with 100 Gbps RoCEv2 network. +We use Tesla V100 16GB GPUs and set batch size equal to 64 *per GPU*. The machines are in fact VMs on the cloud. Each machine has 8 V100 GPUs with NVLink-enabled. Machines are inter-connected with 20 Gbps TCP/IP network. -BytePS outperforms Horovod (after carefully tuned) by 16% in this case, both with RDMA enabled. 
+BytePS outperforms Horovod (NCCL) by 44% for Resnet50, and 100% for VGG16. -![perf_rdma_nvlink](https://user-images.githubusercontent.com/13852819/68922123-cb545c80-07b5-11ea-884b-7d541a848031.png) + + +![vgg16_tcp](https://user-images.githubusercontent.com/13852819/69873424-41e37500-12f3-11ea-93b8-705215e3e901.png) +![resnet50_tcp](https://user-images.githubusercontent.com/13852819/69873419-40b24800-12f3-11ea-9ff3-0f11347c089e.png) + +You can reproduce the results using the Dockerfiles and example scripts we provide. ## PCIe + RDMA diff --git a/docs/step-by-step-tutorial.md b/docs/step-by-step-tutorial.md index b24733404..67e5d3100 100644 --- a/docs/step-by-step-tutorial.md +++ b/docs/step-by-step-tutorial.md @@ -8,89 +8,85 @@ When you have successfully run through these examples, read the [best practice]( ### TensorFlow ``` -docker pull bytepsimage/worker_tensorflow +docker pull bytepsimage/tensorflow -nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/worker_tensorflow bash +nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/tensorflow bash # now you are in docker environment -export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # gpus list export DMLC_WORKER_ID=0 # your worker id -export DMLC_NUM_WORKER=1 # you only have one worker -export DMLC_ROLE=worker # your role is worker +export DMLC_NUM_WORKER=1 # one worker +export DMLC_ROLE=worker # the following value does not matter for non-distributed jobs export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 export DMLC_PS_ROOT_PORT=1234 -# can also try: export EVAL_TYPE=mnist -export EVAL_TYPE=benchmark -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \ - --model ResNet50 --num-iters 1000000 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/tensorflow/synthetic_benchmark.py \ + --model ResNet50 --num-iters 1000000 ``` ### PyTorch ``` -docker pull bytepsimage/worker_pytorch +docker pull bytepsimage/pytorch -nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/worker_pytorch bash +nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/pytorch bash # now you are in docker environment -export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # gpus list export DMLC_WORKER_ID=0 # your worker id -export DMLC_NUM_WORKER=1 # you only have one worker -export DMLC_ROLE=worker # your role is worker +export DMLC_NUM_WORKER=1 # one worker +export DMLC_ROLE=worker # the following value does not matter for non-distributed jobs export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 export DMLC_PS_ROOT_PORT=1234 -export EVAL_TYPE=benchmark -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \ - --model resnet50 --num-iters 1000000 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/pytorch/benchmark_byteps.py \ + --model resnet50 --num-iters 1000000 ``` ### MXNet ``` -docker pull bytepsimage/worker_mxnet +docker pull bytepsimage/mxnet -nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/worker_mxnet bash +nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/mxnet bash # now you are in docker environment -export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # gpus list export DMLC_WORKER_ID=0 # your worker id -export DMLC_NUM_WORKER=1 # you only have one worker -export 
DMLC_ROLE=worker # your role is worker +export DMLC_NUM_WORKER=1 # one worker +export DMLC_ROLE=worker # the following value does not matter for non-distributed jobs export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 export DMLC_PS_ROOT_PORT=1234 -export EVAL_TYPE=benchmark -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \ - --benchmark 1 --batch-size=32 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/mxnet/train_imagenet_byteps.py \ + --benchmark 1 --batch-size=32 ``` ## Distributed Training (TCP) -Let's say you have two workers, and each one with 4 GPUs. For simplicity we use one server. +Let's say you have two workers, and each one with 4 GPUs. For simplicity we use one server. In practice, you need more servers (at least equal to the number of workers) to achieve high performance. -The way to launch the scheduler and the server are the same for any framework. + +For the workers, you need to pay attention to `DMLC_WORKER_ID`. This is the main difference compared to single machine jobs. Let's say the 2 workers are using TensorFlow. For the scheduler: ``` -# scheduler can use the same image as servers -docker pull bytepsimage/byteps_server +docker pull bytepsimage/tensorflow -docker run -it --net=host bytepsimage/byteps_server bash +docker run -it --net=host bytepsimage/tensorflow bash # now you are in docker environment export DMLC_NUM_WORKER=2 @@ -99,14 +95,14 @@ export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP export DMLC_PS_ROOT_PORT=1234 # the scheduler port -python /usr/local/byteps/launcher/launch.py +python3 /usr/local/byteps/launcher/launch.py ``` For the server: ``` -docker pull bytepsimage/byteps_server +docker pull bytepsimage/tensorflow -docker run -it --net=host bytepsimage/byteps_server bash +docker run -it --net=host bytepsimage/tensorflow bash # now you are in docker environment export DMLC_NUM_WORKER=2 @@ -115,85 +111,78 @@ export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP export DMLC_PS_ROOT_PORT=1234 # the scheduler port -# 4 threads should be enough for a server -export MXNET_OMP_MAX_THREADS=4 - -python /usr/local/byteps/launcher/launch.py +python3 /usr/local/byteps/launcher/launch.py ``` -For the workers, you need to pay attention to `DMLC_WORKER_ID`. This is the main difference compared to single machine jobs. Let's say the 2 workers are using MXNet. 
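The scheduler, server, and worker snippets below share the same cluster-wide settings; only `DMLC_ROLE` (and `DMLC_WORKER_ID` on the workers) differ between roles. As a minimal sketch, one could keep the shared values in a small env file and source it on every node before running the launcher (the file name `byteps_env.sh` is illustrative; the IP and port are the example scheduler address used below):

```
# byteps_env.sh -- shared BytePS/DMLC settings for the 2-worker, 1-server example.
# Source this on every node, then set DMLC_ROLE (and DMLC_WORKER_ID on workers).
export DMLC_NUM_WORKER=2
export DMLC_NUM_SERVER=1
export DMLC_PS_ROOT_URI=10.0.0.1   # the scheduler IP
export DMLC_PS_ROOT_PORT=1234      # the scheduler port
```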
For worker-0: ``` -docker pull bytepsimage/worker_mxnet +docker pull bytepsimage/tensorflow -nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/worker_mxnet bash +nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/tensorflow bash -# now you are in docker environment -export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs -export DMLC_WORKER_ID=0 # worker-0 -export DMLC_NUM_WORKER=2 # 2 workers -export DMLC_ROLE=worker # your role is worker +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 +export DMLC_WORKER_ID=0 +export DMLC_NUM_WORKER=2 +export DMLC_ROLE=worker export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP export DMLC_PS_ROOT_PORT=1234 # the scheduler port -export EVAL_TYPE=benchmark -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \ - --benchmark 1 --batch-size=32 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/tensorflow/synthetic_benchmark.py \ + --model ResNet50 --num-iters 1000000 ``` For worker-1: ``` -docker pull bytepsimage/worker_mxnet +docker pull bytepsimage/tensorflow -nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/worker_mxnet bash +nvidia-docker run -it --net=host --shm-size=32768m bytepsimage/tensorflow bash -# now you are in docker environment -export NVIDIA_VISIBLE_DEVICES=0,1,2,3 # say you have 4 GPUs -export DMLC_WORKER_ID=1 # worker-1 -export DMLC_NUM_WORKER=2 # 2 workers -export DMLC_ROLE=worker # your role is worker +export NVIDIA_VISIBLE_DEVICES=0,1,2,3 +export DMLC_WORKER_ID=1 +export DMLC_NUM_WORKER=2 +export DMLC_ROLE=worker export DMLC_NUM_SERVER=1 export DMLC_PS_ROOT_URI=10.0.0.1 # the scheduler IP export DMLC_PS_ROOT_PORT=1234 # the scheduler port -export EVAL_TYPE=benchmark -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \ - --benchmark 1 --batch-size=32 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/tensorflow/synthetic_benchmark.py \ + --model ResNet50 --num-iters 1000000 ``` -If your workers use TensorFlow, you need to change the image name to `bytepsimage/worker_tensorflow`, and replace the python script with + +If your workers use PyTorch, you need to change the image name to `bytepsimage/pytorch`, and replace the python script of the workers with + ``` -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \ - --model ResNet50 --num-iters 1000000 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/pytorch/benchmark_byteps.py \ + --model resnet50 --num-iters 1000000 ``` -If your workers use PyTorch, you need to change the image name to `bytepsimage/worker_pytorch`, and replace the python script with +If your workers use MXNet, you need to change the image name to `bytepsimage/mxnet`, and replace the python script of the workers with ``` -python /usr/local/byteps/launcher/launch.py \ - /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \ - --model resnet50 --num-iters 1000000 +python3 /usr/local/byteps/launcher/launch.py \ + python3 /usr/local/byteps/example/mxnet/train_imagenet_byteps.py \ + --benchmark 1 --batch-size=32 ``` ## Distributed Training with RDMA -The steps to launch RDMA tasks are basically similar to the above. The main differences are that (1) you need to specify your RDMA devices when running a docker, and (2) you need to set `DMLC_ENABLE_RDMA=1`. 
To run this example, your `nvidia-docker` needs to support CUDA 10.
+The steps to launch RDMA tasks are similar to the above. The main differences are that (1) you need to specify your RDMA devices when running the docker container, and (2) you need to set `DMLC_ENABLE_RDMA=1`.

-In the following, let's continue to use the example: you have two workers and one server, and the workers are using MXNet.
+In the following, let's continue to use the example: you have two workers and one server, and the workers are using TensorFlow.

For the scheduler:
```
-# the scheduler may use the same image as servers
-docker pull bytepsimage/byteps_server_rdma
+docker pull bytepsimage/tensorflow

# specify your RDMA device (usually under /dev/infiniband, though this depends on your system configuration)
-docker run -it --net=host --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/byteps_server_rdma bash
+docker run -it --net=host --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/tensorflow bash

# now you are in docker environment
export DMLC_ENABLE_RDMA=1
@@ -209,15 +198,15 @@ export DMLC_PS_ROOT_URI=10.0.0.100
export DMLC_PS_ROOT_PORT=9000

# launch the job
-python /usr/local/byteps/launcher/launch.py
+python3 /usr/local/byteps/launcher/launch.py
```

For the server:
```
-docker pull bytepsimage/byteps_server_rdma
+docker pull bytepsimage/tensorflow

# specify your RDMA device (usually under /dev/infiniband, though this depends on your system configuration)
-docker run -it --net=host --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/byteps_server_rdma bash
+docker run -it --net=host --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/tensorflow bash

# now you are in docker environment
export DMLC_ENABLE_RDMA=1
@@ -228,24 +217,21 @@ export DMLC_NUM_SERVER=1

# the RDMA interface name of the server
export DMLC_INTERFACE=eth5

-# 4 threads should be enough for a server
-export MXNET_OMP_MAX_THREADS=4
-
# your scheduler's RDMA NIC information (IP, port)
export DMLC_PS_ROOT_URI=10.0.0.100
export DMLC_PS_ROOT_PORT=9000

# launch the job
-python /usr/local/byteps/launcher/launch.py
+python3 /usr/local/byteps/launcher/launch.py
```

For worker-0:
```
-docker pull bytepsimage/worker_mxnet_rdma
+docker pull bytepsimage/tensorflow

# specify your RDMA device (usually under /dev/infiniband, though this depends on your system configuration)
-nvidia-docker run -it --net=host --shm-size=32768m --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/worker_mxnet_rdma bash
+nvidia-docker run -it --net=host --shm-size=32768m --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/tensorflow bash

# now you are in docker environment
export NVIDIA_VISIBLE_DEVICES=0,1,2,3
@@ -264,20 +250,19 @@ export DMLC_PS_ROOT_URI=10.0.0.100
export DMLC_PS_ROOT_PORT=9000

# launch the job
-export EVAL_TYPE=benchmark
-python /usr/local/byteps/launcher/launch.py \
-    /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \
-    --benchmark 1 --batch-size=32
+python3 /usr/local/byteps/launcher/launch.py \
+    python3 /usr/local/byteps/example/tensorflow/synthetic_benchmark.py \
+    --model ResNet50 --num-iters 1000000
```
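Before starting the remaining workers, it may be worth verifying that the RDMA devices passed with `--device` are actually usable inside the container. A quick hedged check (`ibv_devinfo` comes from the `ibverbs-utils` package that these images install; the reported device name varies by system):

```
# inside the container: the device files passed via --device should be listed here
ls /dev/infiniband/

# query the verbs devices; at least one port should report PORT_ACTIVE
ibv_devinfo
```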
 For worker-0:
 ```
-docker pull bytepsimage/worker_mxnet_rdma
+docker pull bytepsimage/tensorflow
 # specify your RDMA device (usually under /dev/infiniband, but this depends on your system configuration)
-nvidia-docker run -it --net=host --shm-size=32768m --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/worker_mxnet_rdma bash
+nvidia-docker run -it --net=host --shm-size=32768m --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/tensorflow bash
 # now you are in the docker environment
 export NVIDIA_VISIBLE_DEVICES=0,1,2,3
@@ -264,20 +250,19 @@ export DMLC_PS_ROOT_URI=10.0.0.100
 export DMLC_PS_ROOT_PORT=9000
 # launch the job
-export EVAL_TYPE=benchmark
-python /usr/local/byteps/launcher/launch.py \
-    /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \
-    --benchmark 1 --batch-size=32
+python3 /usr/local/byteps/launcher/launch.py \
+    python3 /usr/local/byteps/example/tensorflow/synthetic_benchmark.py \
+    --model ResNet50 --num-iters 1000000
 ```
 For worker-1:
 ```
-docker pull bytepsimage/worker_mxnet_rdma
+docker pull bytepsimage/tensorflow
 # specify your RDMA device (usually under /dev/infiniband, but this depends on your system configuration)
-nvidia-docker run -it --net=host --shm-size=32768m --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/worker_mxnet_rdma bash
+nvidia-docker run -it --net=host --shm-size=32768m --device /dev/infiniband/rdma_cm --device /dev/infiniband/issm0 --device /dev/infiniband/ucm0 --device /dev/infiniband/umad0 --device /dev/infiniband/uverbs0 --cap-add IPC_LOCK bytepsimage/tensorflow bash
 # now you are in the docker environment
 export NVIDIA_VISIBLE_DEVICES=0,1,2,3
@@ -296,24 +281,25 @@ export DMLC_PS_ROOT_URI=10.0.0.100
 export DMLC_PS_ROOT_PORT=9000
 # launch the job
-export EVAL_TYPE=benchmark
-python /usr/local/byteps/launcher/launch.py \
-    /usr/local/byteps/example/mxnet/start_mxnet_byteps.sh \
-    --benchmark 1 --batch-size=32
+python3 /usr/local/byteps/launcher/launch.py \
+    python3 /usr/local/byteps/example/tensorflow/synthetic_benchmark.py \
+    --model ResNet50 --num-iters 1000000
 ```
-If your workers use TensorFlow, you need to change the image name to `bytepsimage/worker_tensorflow_rdma`, and replace the python script with
+
+If your workers use PyTorch, change the image name to `bytepsimage/pytorch` and replace the Python script on the workers with
+
 ```
-python /usr/local/byteps/launcher/launch.py \
-    /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \
-    --model ResNet50 --num-iters 1000000
+python3 /usr/local/byteps/launcher/launch.py \
+    python3 /usr/local/byteps/example/pytorch/benchmark_byteps.py \
+    --model resnet50 --num-iters 1000000
 ```
-If your workers use PyTorch, you need to change the image name to `bytepsimage/worker_pytorch_rdma`, and replace the python script with
+If your workers use MXNet, change the image name to `bytepsimage/mxnet` and replace the Python script on the workers with
 ```
-python /usr/local/byteps/launcher/launch.py \
-    /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \
-    --model resnet50 --num-iters 1000000
+python3 /usr/local/byteps/launcher/launch.py \
+    python3 /usr/local/byteps/example/mxnet/train_imagenet_byteps.py \
+    --benchmark 1 --batch-size=32
 ```
\ No newline at end of file
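Across all three frameworks the pattern in the updated tutorial is identical: `launch.py` wraps an arbitrary training command, and only the image name and example script change. The sketch below makes that explicit; the `EXAMPLE_CMDS` mapping is illustrative and not part of this PR, but each command is copied from the tutorial above.

```python
# Illustrative only: map each framework to the example command used in
# the tutorial, and hand it to the BytePS launcher unchanged.
import subprocess

EXAMPLE_CMDS = {
    "tensorflow": ["python3", "/usr/local/byteps/example/tensorflow/synthetic_benchmark.py",
                   "--model", "ResNet50", "--num-iters", "1000000"],
    "pytorch":    ["python3", "/usr/local/byteps/example/pytorch/benchmark_byteps.py",
                   "--model", "resnet50", "--num-iters", "1000000"],
    "mxnet":      ["python3", "/usr/local/byteps/example/mxnet/train_imagenet_byteps.py",
                   "--benchmark", "1", "--batch-size=32"],
}

def launch(framework):
    """Run the chosen framework's benchmark through the BytePS launcher."""
    cmd = ["python3", "/usr/local/byteps/launcher/launch.py"] + EXAMPLE_CMDS[framework]
    subprocess.check_call(cmd)

launch("tensorflow")
```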
diff --git a/example/keras/run_keras.sh b/example/keras/run_keras.sh
deleted file mode 100755
index 3d4ca705f..000000000
--- a/example/keras/run_keras.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-path="`dirname $0`"
-
-if [ "$EVAL_TYPE" == "imagenet" ]; then
-    echo "Run Keras ImageNet ..."
-    python $path/keras_imagenet_resnet50.py $@
-elif [ "$EVAL_TYPE" == "mnist" ]; then
-    echo "Run Keras MNIST ..."
-    python $path/keras_mnist.py $@
-elif [ "$EVAL_TYPE" == "mnist_advanced" ]; then
-    echo "Run Keras MNIST-advanced ..."
-    python $path/keras_mnist_advanced.py $@
-else
-    echo "Error: unsupported $EVAL_TYPE"
-    exit 1
-fi
diff --git a/example/mxnet-gluon/run_mnist_gluon.sh b/example/mxnet-gluon/run_mnist_gluon.sh
deleted file mode 100644
index 2632e358c..000000000
--- a/example/mxnet-gluon/run_mnist_gluon.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-export NVIDIA_VISIBLE_DEVICES=0,1
-export DMLC_WORKER_ID=0
-export DMLC_NUM_WORKER=1
-export DMLC_ROLE=worker
-
-# the following value does not matter for non-distributed jobs
-export DMLC_NUM_SERVER=1
-export DMLC_PS_ROOT_URI=127.0.0.1
-export DMLC_PS_ROOT_PORT=9000
-
-path="`dirname $0`"
-echo $path
-
-python $path/../../launcher/launch.py \
-    python $path/train_mnist_byteps.py
\ No newline at end of file
diff --git a/example/mxnet/start_mxnet_byteps.sh b/example/mxnet/start_mxnet_byteps.sh
deleted file mode 100755
index ff1c537d3..000000000
--- a/example/mxnet/start_mxnet_byteps.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-path="`dirname $0`"
-
-python $path/train_imagenet_byteps.py $@
diff --git a/example/mxnet-gluon/train_mnist_byteps.py b/example/mxnet/train_gluon_mnist_byteps.py
similarity index 100%
rename from example/mxnet-gluon/train_mnist_byteps.py
rename to example/mxnet/train_gluon_mnist_byteps.py
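With `run_mnist_gluon.sh` removed and the Gluon example renamed as above, a single-machine run is the same environment plus a direct launcher call. A sketch of the equivalent, assuming it runs from the repository root; the renamed example path is taken from the rename hunk above, the env values from the deleted script.

```python
# Illustrative single-machine launch, mirroring the deleted
# run_mnist_gluon.sh: with one worker, the scheduler values are dummies.
import os
import subprocess

env = dict(os.environ)
env.update({
    "NVIDIA_VISIBLE_DEVICES": "0,1",
    "DMLC_WORKER_ID": "0",
    "DMLC_NUM_WORKER": "1",
    "DMLC_ROLE": "worker",
    # the following values do not matter for non-distributed jobs
    "DMLC_NUM_SERVER": "1",
    "DMLC_PS_ROOT_URI": "127.0.0.1",
    "DMLC_PS_ROOT_PORT": "9000",
})

subprocess.check_call(
    ["python3", "launcher/launch.py",
     "python3", "example/mxnet/train_gluon_mnist_byteps.py"],
    env=env,
)
```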
diff --git a/example/pytorch/microbenchmark-byteps.py b/example/pytorch/microbenchmark-byteps.py
deleted file mode 100644
index 1a1ae05a1..000000000
--- a/example/pytorch/microbenchmark-byteps.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from __future__ import print_function
-
-import torch
-import argparse
-import torch.backends.cudnn as cudnn
-from byteps.torch.ops import push_pull_async_inplace, poll, synchronize
-import byteps.torch as bps
-import time
-import numpy as np
-
-
-parser = argparse.ArgumentParser(description='PyTorch BytePS Synthetic Benchmark',
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--num-warmup', type=int, default=10,
-                    help='number of warm-up steps that don\'t count towards benchmark')
-parser.add_argument('--num-iters', type=int, default=1000,
-                    help='number of benchmark iterations')
-parser.add_argument('--no-cuda', action='store_true', default=False,
-                    help='disables CUDA')
-parser.add_argument('--no-wait', type=bool, default=True,
-                    help='wait for other worker request first')
-parser.add_argument('--gpu', type=int, default=-1,
-                    help='use a specified gpu')
-
-args = parser.parse_args()
-args.cuda = not args.no_cuda and torch.cuda.is_available()
-
-bps.init()
-
-# BytePS: pin GPU to local rank.
-if args.gpu >= 0:
-    torch.cuda.set_device(args.gpu)
-else:
-    torch.cuda.set_device(bps.local_rank())
-
-cudnn.benchmark = True
-
-
-def log(s, nl=True):
-    if bps.rank() != 0:
-        return
-    print(s, end='\n' if nl else '')
-
-
-def benchmark(tensor, average, name):
-    if not args.no_wait and bps.rank() == 0:
-        time.sleep(0.01)
-    start = time.time()
-    handle = push_pull_async_inplace(tensor, average, name)
-    while True:
-        if poll(handle):
-            synchronize(handle)
-            break
-    end = time.time()
-    return (end - start) * 1000
-
-
-log('Number of GPUs: %d' % (bps.size()))
-
-# Benchmark
-log('Running benchmark...')
-
-log('size (Byte) \t avg. time (ms) \t std.dev (ms)')
-for i in range(8):
-    size = 10**i
-    data = torch.rand(size, dtype=torch.float32)
-    if args.cuda:
-        data = data.cuda()
-    # warm up
-    for j in range(args.num_warmup):
-        benchmark(tensor=data, average=True, name=str(i))
-    # timeit
-    durations = []
-    for j in range(args.num_iters):
-        t = benchmark(tensor=data, average=True, name=str(i))
-        durations.append(t)
-    avg = np.mean(durations)
-    std = np.std(durations)
-
-    log('%d \t %s \t %s' % (4*size, '%.3f'%avg, '%.3f'%std))
-
-log('End benchmark.')
diff --git a/example/pytorch/start_pytorch_byteps.sh b/example/pytorch/start_pytorch_byteps.sh
deleted file mode 100755
index f82a279cc..000000000
--- a/example/pytorch/start_pytorch_byteps.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-path="`dirname $0`"
-
-if [ "$EVAL_TYPE" == "mnist" ]; then
-    echo "training mnist..."
-    python $path/train_mnist_byteps.py $@
-elif [ "$EVAL_TYPE" == "imagenet" ]; then
-    echo "training imagenet..."
-    python $path/train_imagenet_resnet50_byteps.py $@
-elif [ "$EVAL_TYPE" == "benchmark" ]; then
-    echo "running benchmark..."
-    python $path/benchmark_byteps.py $@
-elif [ "$EVAL_TYPE" == "nobarrierbenchmark" ]; then
-    echo "running benchmark without global barrier..."
-    python $path/benchmark_cross_barrier_byteps.py $@
-elif [ "$EVAL_TYPE" == "microbenchmark" ]; then
-    echo "running microbenchmark"
-    python $path/microbenchmark-byteps.py $@
-else
-    echo "Error: unsupported $EVAL_TYPE"
-    exit 1
-fi
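For reference, the measurement pattern at the core of the deleted microbenchmark condenses to a few lines: launch an in-place push-pull asynchronously, poll the handle until it completes, and report the elapsed time. A sketch of that pattern, using `byteps.torch.ops` with the same positional arguments as the deleted file:

```python
# Condensed from the deleted microbenchmark-byteps.py: time a single
# in-place push_pull by polling its async handle until completion.
import time

import torch
import byteps.torch as bps
from byteps.torch.ops import push_pull_async_inplace, poll, synchronize

bps.init()  # must run before any push_pull, as in the deleted script

def time_push_pull_ms(tensor, name):
    """Return the latency of one in-place push-pull in milliseconds."""
    start = time.time()
    handle = push_pull_async_inplace(tensor, True, name)  # average=True
    while not poll(handle):
        pass  # spin until the operation completes
    synchronize(handle)
    return (time.time() - start) * 1000

print(time_push_pull_ms(torch.rand(1000), "probe"))
```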
diff --git a/example/tensorflow/run_tensorflow_byteps.sh b/example/tensorflow/run_tensorflow_byteps.sh
deleted file mode 100755
index 68ff57cd4..000000000
--- a/example/tensorflow/run_tensorflow_byteps.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-path="`dirname $0`"
-
-if [ "$EVAL_TYPE" == "benchmark" ]; then
-    echo "Run synthetic benchmark..."
-    python3 $path/synthetic_benchmark.py $@
-elif [ "$EVAL_TYPE" == "mnist" ]; then
-    echo "Run MNIST ..."
-    python3 $path/tensorflow_mnist.py $@
-else
-    echo "Error: unsupported $EVAL_TYPE"
-    exit 1
-fi
diff --git a/launcher/launch.py b/launcher/launch.py
index 51541d019..927c78c3d 100644
--- a/launcher/launch.py
+++ b/launcher/launch.py
@@ -10,7 +10,6 @@
 COMMON_REQUIRED_ENVS = ["DMLC_ROLE", "DMLC_NUM_WORKER", "DMLC_NUM_SERVER",
                         "DMLC_PS_ROOT_URI", "DMLC_PS_ROOT_PORT"]
 WORKER_REQUIRED_ENVS = ["DMLC_WORKER_ID"]
-SERVER_REQUIRED_ENVS = ["BYTEPS_SERVER_MXNET_PATH"]
 
 def check_env():
     assert "DMLC_ROLE" in os.environ and \
@@ -23,8 +22,6 @@ def check_env():
         if num_worker == 1:
             required_envs = []
         required_envs += WORKER_REQUIRED_ENVS
-    else:
-        required_envs += SERVER_REQUIRED_ENVS
     for env in required_envs:
         if env not in os.environ:
             print("The env " + env + " is missing")
@@ -60,5 +57,4 @@ def worker(local_rank, local_size, command):
             t[i].join()
 
     else:
-        sys.path.insert(0, os.getenv("BYTEPS_SERVER_MXNET_PATH")+"/python")
-        import mxnet
+        import byteps.server
diff --git a/setup.py b/setup.py
index 9df9176da..8319c45ff 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@
 from distutils.version import LooseVersion
 import traceback
 
+server_lib = Extension('byteps.server.c_lib', [])
 tensorflow_lib = Extension('byteps.tensorflow.c_lib', [])
 mxnet_lib = Extension('byteps.mxnet.c_lib', [])
 pytorch_lib = Extension('byteps.torch.c_lib', [])
@@ -280,6 +281,25 @@ def get_common_options(build_ext):
                 EXTRA_OBJECTS=EXTRA_OBJECTS)
 
 
+def build_server(build_ext, options):
+    server_lib.define_macros = options['MACROS']
+    server_lib.include_dirs = options['INCLUDES']
+    server_lib.sources = ['byteps/server/server.cc',
+                          'byteps/common/cpu_reducer.cc',
+                          'byteps/common/logging.cc']
+    server_lib.extra_compile_args = options['COMPILE_FLAGS'] + \
+        ['-DBYTEPS_BUILDING_SERVER']
+    server_lib.extra_link_args = options['LINK_FLAGS']
+    server_lib.extra_objects = options['EXTRA_OBJECTS']
+    server_lib.library_dirs = options['LIBRARY_DIRS']
+    if int(os.environ.get('BYTEPS_USE_RDMA', 0)):
+        server_lib.libraries = ['rdmacm', 'ibverbs']
+    else:
+        server_lib.libraries = []
+
+    build_ext.build_extension(server_lib)
+
+
 def check_tf_version():
     try:
         import tensorflow as tf
@@ -775,6 +795,12 @@ def build_extensions(self):
         options = get_common_options(self)
         built_plugins = []
 
+        try:
+            build_server(self, options)
+        except:
+            raise DistutilsSetupError('An ERROR occurred while building the server module.\n\n'
+                                      '%s' % traceback.format_exc())
+
         # If PyTorch is installed, it must be imported before others, otherwise
         # we may get an error: dlopen: cannot load any more object with static TLS
         if not int(os.environ.get('BYTEPS_WITHOUT_PYTORCH', 0)):
@@ -818,8 +844,9 @@ def build_extensions(self):
                     raise
 
         if not built_plugins:
-            raise DistutilsError(
-                'TensorFlow, MXNet, PyTorch plugins were excluded from build. Aborting.')
+            print('INFO: Only server module is built.')
+            return
+
         if not any(built_plugins):
             raise DistutilsError(
                 'None of TensorFlow, MXNet, PyTorch plugins were built. See errors above.')
@@ -851,7 +878,7 @@ def build_extensions(self):
         'Programming Language :: Python :: Implementation :: CPython',
         'Programming Language :: Python :: Implementation :: PyPy'
     ],
-    ext_modules=[tensorflow_lib, mxnet_lib, pytorch_lib],
+    ext_modules=[server_lib, tensorflow_lib, mxnet_lib, pytorch_lib],
     # $ setup.py publish support.
     cmdclass={
        'upload': UploadCommand,
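With `byteps.server` built as its own extension, a server or scheduler process no longer needs an MXNet installation or `BYTEPS_SERVER_MXNET_PATH`: as the `launch.py` hunk shows, the launcher now simply imports the module for non-worker roles. A minimal sketch of starting a server by hand, reusing the DMLC values from the tutorial above:

```python
# Minimal sketch: start a BytePS server process directly. launch.py does
# the same thing for non-worker roles via `import byteps.server`.
import os

os.environ.update({
    "DMLC_ROLE": "server",            # or "scheduler"
    "DMLC_NUM_WORKER": "2",
    "DMLC_NUM_SERVER": "1",
    "DMLC_PS_ROOT_URI": "10.0.0.1",   # the scheduler IP
    "DMLC_PS_ROOT_PORT": "1234",      # the scheduler port
})

import byteps.server  # importing the module starts the server
```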