more build updates:
(1) nccl submodule, cnmem submodule
(2) mpi ops fallback test
(3) a bit more blob interface
(4) fixed tests
(5) caffe2.python.io -> caffe2.python.dataio to avoid name conflicts with Python's built-in io module
(6) In the build system, autogenerate __init__.py instead of having manual
rules just to copy over an empty __init__.py.
Yangqing committed Aug 3, 2016
1 parent b2c2d0b commit 1ede7a7
Showing 29 changed files with 156 additions and 1,629 deletions.
6 changes: 6 additions & 0 deletions .gitmodules
@@ -1,3 +1,9 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl.git
[submodule "third_party/cnmem"]
path = third_party/cnmem
url = https://github.com/nvidia/cnmem.git
2 changes: 1 addition & 1 deletion Makefile
@@ -16,6 +16,6 @@ lint:
@find caffe2 -type f -exec python brewtool/cpplint.py {} \;

linecount:
-@cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
+@cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
echo "Cloc is not available on the machine. You can install cloc with " && \
echo " sudo apt-get install cloc"
1 change: 1 addition & 0 deletions build.py
@@ -108,6 +108,7 @@ class Config(object):
'arch=compute_30,code=sm_30',
'arch=compute_35,code=sm_35',
'arch=compute_50,code=sm_50',
'arch=compute_61,code=sm_61',
]
# additional CUDA cflags to pass to nvcc.
CUDA_CFLAGS = []
4 changes: 0 additions & 4 deletions caffe/BREW

This file was deleted.

Empty file removed caffe/__init__.py
8 changes: 0 additions & 8 deletions caffe/proto/BREW
@@ -4,11 +4,3 @@ proto_library(
name = 'caffe_proto',
srcs = ['caffe.proto'],
)

-filegroup(
-name = "caffe_proto_py",
-srcs = ["__init__.py"],
-deps = [
-"//caffe:caffe_python",
-]
-)
Empty file removed caffe/proto/__init__.py
9 changes: 3 additions & 6 deletions caffe2/BREW
@@ -26,7 +26,7 @@ cc_library(
deps = [
":core",
":core_gpu_cu",
"//third_party/cnmem:cnmem",
"//third_party:cnmem",
"//third_party:cuda",
],
whole_archive = True,
@@ -48,6 +48,7 @@ cc_test(
excludes=["*gpu_test*"]),
deps = [
":core",
"//caffe2/operators:core_ops",
"//third_party:gtest",
"//caffe2/test:caffe2_gtest_main",
],
@@ -63,11 +64,6 @@ cc_test(
],
)

-filegroup(
-name = "caffe2_python",
-srcs = ["__init__.py"],
-)

cc_library(
name = "all_available_ops",
srcs = [],
@@ -79,6 +75,7 @@ cc_library(
optional_deps = [
"//caffe2/operators:core_ops_gpu",
"//caffe2/operators:core_ops_cudnn",
"//caffe2/contrib/nccl:nccl_ops",
"//caffe2/cuda_rtc:rtc_ops",
"//caffe2/db:db_gpu",
"//caffe2/image:image_ops",
5 changes: 0 additions & 5 deletions caffe2/__init__.py

This file was deleted.

10 changes: 10 additions & 0 deletions caffe2/contrib/nccl/BREW
@@ -0,0 +1,10 @@
cc_library(
name = "nccl_ops",
srcs = Glob(["*.cc"]),
hdrs = Glob(["*.h"]),
deps = [
"//caffe2:core_gpu",
"//third_party:nccl",
],
whole_archive = True,
)
45 changes: 37 additions & 8 deletions caffe2/core/blob.h
@@ -58,6 +58,9 @@ class Blob {
return *static_cast<const T*>(pointer_);
}

const void* GetRaw() const { return pointer_; }
void* GetRaw() { return pointer_; }

/**
* @brief Gets a mutable pointer to the stored object.
*
@@ -73,6 +76,7 @@
return static_cast<T*>(pointer_);
} else {
if (is_new_object) *is_new_object = true;
VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
return Reset<T>(new T());
}
}
@@ -87,28 +91,53 @@
*/
template <class T>
T* Reset(T* allocated) {
-if (pointer_) {
-CHECK_NOTNULL(destroy_)(pointer_);
+if (pointer_ && destroy_) {
+destroy_(pointer_);
}
VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
meta_ = TypeMeta::Make<T>();
pointer_ = static_cast<void*>(allocated);
destroy_ = &Destroy<T>;
return allocated;
}

/**
* Sets the underlying object to the allocated one. If there is already an
* object in the Blob, the old object is freed first.
*
* Unlike Reset, this does not take ownership of the passed-in pointer: the
* caller must ensure that the allocated object outlives every access made
* through this blob, until another Reset call is made or the blob is
* destructed.
*/
template <class T>
typename std::remove_const<T>::type* ShareExternal(
typename std::remove_const<T>::type* allocated) {
return static_cast<T*>(
ShareExternal(static_cast<void*>(allocated),
TypeMeta::Make<typename std::remove_const<T>::type>()));
}

void* ShareExternal(void* allocated, const TypeMeta& meta) {
if (pointer_ && destroy_) {
destroy_(pointer_);
}
meta_ = meta;
pointer_ = static_cast<void*>(allocated);
destroy_ = nullptr;
return allocated;
}

/**
* Resets the Blob to an empty one.
*/
inline void Reset() {
-if (pointer_) {
-CHECK_NOTNULL(destroy_)(pointer_);
-pointer_ = nullptr;
-meta_ = TypeMeta();
-destroy_ = nullptr;
-}
+if (pointer_ && destroy_) {
+destroy_(pointer_);
+}
+pointer_ = nullptr;
+meta_ = TypeMeta();
+destroy_ = nullptr;
}

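Taken together, the GetRaw accessors, ShareExternal, and the destroy_-aware Reset give a Blob two storage modes: owning (Reset) and non-owning (ShareExternal). A minimal C++ sketch of the contract, based only on the diff above and the tests in blob_test.cc; the Foo type and main harness are illustrative:

```cpp
#include "caffe2/core/blob.h"

using caffe2::Blob;

struct Foo {};  // any default-constructible type works here

int main() {
  Blob blob;

  // Reset(T*) takes ownership: the blob records Destroy<Foo> as its
  // destroy_ callback and frees the object on the next Reset/destruction.
  blob.Reset(new Foo());

  // ShareExternal(T*) does not take ownership: destroy_ is set to
  // nullptr, so `external` must outlive every access through the blob.
  Foo external;
  blob.ShareExternal<Foo>(&external);

  // Reset() only invokes destroy_ when it is non-null, so clearing a
  // blob that shares an external object does not free that object.
  blob.Reset();
  return 0;
}
```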
27 changes: 27 additions & 0 deletions caffe2/core/blob_test.cc
@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
}

TEST(BlobTest, BlobReset) {
Blob blob;
std::unique_ptr<Foo> foo(new Foo());
EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
// Also test that Reset works.
blob.Reset();
}

TEST(BlobTest, BlobShareExternalPointer) {
Blob blob;
std::unique_ptr<Foo> foo(new Foo());
EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
EXPECT_TRUE(blob.IsType<Foo>());
// Also test that Reset works.
blob.Reset();
}

TEST(BlobTest, BlobShareExternalObject) {
Blob blob;
Foo foo;
EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
EXPECT_TRUE(blob.IsType<Foo>());
// Also test that Reset works.
blob.Reset();
}

TEST(BlobTest, StringSerialization) {
const std::string kTestString = "Hello world?";
Blob blob;
@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
"DUMMY_ENGINE");
Workspace ws;
auto load_op = CreateOperator(op_def, &ws);
EXPECT_TRUE(load_op != nullptr);
LOG(INFO) << "Running operator";

load_op->Run();
3 changes: 3 additions & 0 deletions caffe2/core/context_gpu_test.cc
@@ -1,3 +1,4 @@
#include <chrono>
#include <future>
#include <random>
#include <thread>
@@ -55,6 +56,8 @@ namespace {
void TEST_GetStreamAddress(cudaStream_t* ptr) {
CUDAContext context(0);
*ptr = context.cuda_stream();
// Sleep for a while so we have concurrent thread executions
std::this_thread::sleep_for(std::chrono::seconds(1));
}
} // namespace

6 changes: 5 additions & 1 deletion caffe2/mpi/mpi_ops.h
@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {

bool RunOnDevice() override {
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
+CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
+"Output is of wrong type.");
auto* output = Output(0);
// Make sure that output is already allocated.
-CHECK_GT(output->size(), 0);
+CAFFE_ENFORCE(output->size() > 0,
+"Broadcast op uses in-place operation so the output "
+"should be already allocated.");
MPI_CHECK(MPI_Bcast(
output->raw_mutable_data(),
output->nbytes(),
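For context, the enforcement above assumes the broadcast runs in place. A hedged sketch of an OperatorDef that satisfies it — the two-input/one-output shape and the blob names are assumptions drawn from the Input(0) usage and the error message, not spelled out in this diff:

```cpp
#include "caffe2/proto/caffe2.pb.h"

// Output reuses the input blob, so when MPIBroadcastOp::RunOnDevice fires
// the output tensor already exists with nonzero size, as enforced above.
caffe2::OperatorDef MakeBroadcastDef() {
  caffe2::OperatorDef def;
  def.set_type("MPIBroadcast");
  def.add_input("comm_world");  // MPICommonWorldWrapper, read as Input(0)
  def.add_input("X");           // pre-allocated tensor to broadcast
  def.add_output("X");          // in-place: same blob name as the input
  return def;
}
```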
17 changes: 15 additions & 2 deletions caffe2/operators/operator_fallback_gpu.h
@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {

bool RunOnDevice() override {
for (int i = 0; i < InputSize(); ++i) {
-local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
-Input(i), &context_);
+if (OperatorBase::InputIsType<TensorCUDA>(i)) {
+local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
+Input(i), &context_);
+} else {
+VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
+// Note(jiayq): This removes a const but conceptually
+// local_input_blobs will only be used as const blob input for the
+// base op so we are still fine.
+local_input_blobs_[i]->ShareExternal(
+const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
+OperatorBase::Inputs()[i]->meta());
+}
}
// Sync to make sure copies are done.
context_.FinishDeviceComputation();
@@ -65,6 +75,9 @@
return false;
}
for (int i = 0; i < OutputSize(); ++i) {
+CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
+"GPU fallback op currently does not support non-TensorCPU "
+"output type.");
Output(i)->CopyFrom(
local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
}
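GPUFallbackOp is meant to be registered as the CUDA implementation of an op that only has a CPU kernel. A sketch of the usual registration pattern, assuming a hypothetical CPU-only operator (MyOp/my_op.h are placeholders, not files in this commit):

```cpp
#include "caffe2/core/operator.h"
#include "caffe2/operators/operator_fallback_gpu.h"
#include "caffe2/operators/my_op.h"  // hypothetical CPU-only MyOpCPUImpl

namespace caffe2 {

// TensorCUDA inputs are copied into local CPU blobs; after this change,
// non-tensor inputs are shared by raw pointer (Blob::ShareExternal)
// instead of copied, and non-TensorCPU outputs fail with CAFFE_ENFORCE.
REGISTER_CUDA_OPERATOR(MyOp, GPUFallbackOp<MyOpCPUImpl>);

}  // namespace caffe2
```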
8 changes: 0 additions & 8 deletions caffe2/proto/BREW
@@ -4,11 +4,3 @@ proto_library(
name = 'caffe2_proto',
srcs = Glob(['*.proto']),
)

-filegroup(
-name = "caffe2_proto_py",
-srcs = ["__init__.py"],
-deps = [
-"//caffe2:caffe2_python",
-]
-)
Empty file removed caffe2/proto/__init__.py
4 changes: 2 additions & 2 deletions caffe2/python/BREW
@@ -39,8 +39,8 @@ py_library(
srcs=Glob(["*.py"], excludes=["*_test.py"]),
deps=[
":caffe2_python_cpu",
"//caffe/proto:caffe_proto_py",
"//caffe2/proto:caffe2_proto_py",
"//caffe/proto:caffe_proto",
"//caffe2/proto:caffe2_proto",
"//caffe2/python/mint:mint",
],
optional_deps=[
4 changes: 0 additions & 4 deletions caffe2/python/__init__.py

This file was deleted.

File renamed without changes: caffe2/python/io.py → caffe2/python/dataio.py
2 changes: 1 addition & 1 deletion caffe2/python/dataset.py
@@ -14,7 +14,7 @@
from __future__ import unicode_literals

from caffe2.python import core, workspace
-from caffe2.python.io import Reader, Writer
+from caffe2.python.dataio import Reader, Writer
from caffe2.python.schema import Struct
import numpy as np

1 change: 0 additions & 1 deletion caffe2/python/mint/BREW
@@ -1,7 +1,6 @@
py_library(
name = "mint",
srcs = [
"__init__.py",
"app.py",
"static/css/simple-sidebar.css",
"templates/index.html",
Empty file removed caffe2/python/mint/__init__.py
47 changes: 43 additions & 4 deletions third_party/BREW
@@ -92,10 +92,49 @@ cc_thirdparty_target(
],
)

-cc_thirdparty_target(
-name="cnmen",
-deps=["//third_party/cnmem:cnmem"],
-cc_obj_files = [],
shell_script(
name = "cnmem_header",
srcs = ["cnmem/include/cnmem.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
],
)

cc_library(
name = "cnmem",
srcs = [
"cnmem/src/cnmem.cpp",
],
deps = [
":cnmem_header",
":cuda",
]
)

shell_script(
name = "nccl_header",
srcs = ["nccl/src/nccl.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
],
)

cuda_library(
name = "nccl",
srcs = Glob(["nccl/src/*.cu"]),
deps = [
":nccl_header",
":cuda",
],
compiler_flags=[
"-Wno-switch", # NCCL does not follow strict switch enum check.
"-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
"-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
],
)

###############################################################################
1 change: 1 addition & 0 deletions third_party/cnmem
Submodule cnmem added at 28a182