
Dev nd nccl send recv boxing #8467

Merged
Changes from all commits · 40 commits
d428484
nd nccl_send_recv_boxing
guo-ran Mar 31, 2022
c1b5498
rm print
guo-ran Mar 31, 2022
816f45a
support num_axes > 2
guo-ran Mar 31, 2022
2817305
Add distributed optional run (#8372)
jackalcooper Jun 7, 2022
c07f587
autoprof supports bandwidth (#8367)
daquexian Jun 7, 2022
f237503
remove tmp buffer of cumprod cpu backward kernel (#8369)
liufengwei0103 Jun 7, 2022
7feedc4
Move tensor api to cpython part3 (#8342)
marigoold Jun 7, 2022
e224be4
Add nn_functor Check (#7910)
MARD1NO Jun 8, 2022
c83fcaa
Add FusedMatmulBiasAddReluDropout [OneEmbedding] (#8222)
MARD1NO Jun 8, 2022
0fd888c
fix gather 0-dim tensor bug (#8376)
BBuf Jun 8, 2022
33634a2
add api to apply external job pass (#8370)
hjchen2 Jun 8, 2022
a8576ad
Add condition to find-test-cache-distributed (#8387)
jackalcooper Jun 8, 2022
b6abc62
warp dim util (#8382)
Flowingsun007 Jun 9, 2022
c10a30c
fix_bug_in_broadcast_min_max_grad_and_broadcast_like (#8379)
clackhan Jun 9, 2022
2e17cc3
fix bug about index (#8388)
liufengwei0103 Jun 9, 2022
469f72d
LogicalSliceAssign support full slice sbp (#8344)
wyg1997 Jun 9, 2022
d98132e
fix_tensor_from_numpy_mem_leak_bug (#8391)
clackhan Jun 9, 2022
82f4af5
Make of_pyext_obj static only to make sure only a python ext so has p…
jackalcooper Jun 9, 2022
813ffa7
Adjust tolerance setting in embedding_renorm unit test (#8394)
EsdeathYZH Jun 10, 2022
6aa2416
support front end compile for job to iree (#8249)
howin98 Jun 10, 2022
95240c2
Feat/zero mix with mp (#8036)
strint Jun 10, 2022
b3be1da
Revert embedding normal path and fix amp list (#8374)
EsdeathYZH Jun 10, 2022
5686581
replace fixed_vector with small_vector and make Shape inherit from it…
daquexian Jun 10, 2022
994c0f8
Light plan for debug (#8396)
chengtbf Jun 11, 2022
39a2d45
disable terminfo to fix missing terminfo symbols (#8400)
daquexian Jun 11, 2022
15eef18
fix bug of ZeRO MP in complex case (#8404)
chengtbf Jun 12, 2022
384b5a2
merge master
guo-ran Jun 13, 2022
e6e864c
Remove redundant output_lbns in ir (#8409)
jackalcooper Jun 13, 2022
3b42b2f
Dev FusedCrossInteraction[OneEmbedding] (#8335)
MARD1NO Jun 13, 2022
ba56c84
add exe graph physical shape check msg (#8002)
strint Jun 13, 2022
f704b49
add batch_matmul sbp (#8385)
guo-ran Jun 13, 2022
c289645
suppress gcc11 false positive warning (#8401)
daquexian Jun 13, 2022
e8547b4
fix variable op conversion to tosa error in ninja c1 (#8412)
howin98 Jun 13, 2022
bcde424
nccl send/recv support different placement
guo-ran Jun 13, 2022
8ea068c
refine
guo-ran Jun 14, 2022
f873826
Merge branch 'master' into dev_nd_nccl_send_recv_boxing
guo-ran Jun 14, 2022
69f9a42
auto format by CI
oneflow-ci-bot Jun 14, 2022
384c621
rm out ctrl
guo-ran Jun 14, 2022
491e69c
Merge branch 'dev_nd_nccl_send_recv_boxing' of https://github.com/One…
guo-ran Jun 14, 2022
66f11ba
auto format by CI
oneflow-ci-bot Jun 14, 2022
28 changes: 25 additions & 3 deletions .github/workflows/test.yml
@@ -347,7 +347,7 @@ jobs:

find-test-cache-distributed:
name: "Find test cache (distributed)"
if: github.event.pull_request.draft == false && github.base_ref == 'master' && false
if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.labels.*.name, 'need-test-distributed')
runs-on: ubuntu-latest
needs: [build-oneflow]
env:
@@ -411,10 +411,10 @@ jobs:

test-distributed:
name: Distributed test suite
needs: [wait_for_gpu_slot, find-test-cache-distributed]
needs: [wait_for_gpu_slot, find-test-cache-distributed, test]
runs-on: ${{ matrix.runs-on }}
timeout-minutes: 120
if: github.event.pull_request.draft == false && github.base_ref == 'master' && false
if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.labels.*.name, 'need-test-distributed')
concurrency:
group: distributed-test-${{ matrix.entry }}-rank-${{ matrix.rank }}
cancel-in-progress: false
@@ -439,6 +439,22 @@ jobs:
with:
ref: ${{ github.event.pull_request.head.sha }}
repository: ${{github.event.pull_request.head.repo.full_name}}
- name: Checkout Oneflow-Inc/vision
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
uses: actions/checkout@v2
with:
repository: Oneflow-Inc/vision
# please use a commit here
ref: ${{ env.FLOW_VISION_COMMIT}}
path: ${{ env.FLOW_VISION_SRC}}
- name: Checkout Oneflow-Inc/libai
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
uses: actions/checkout@v2
with:
repository: Oneflow-Inc/libai
# please use a commit here
ref: ${{ env.LIBAI_COMMIT}}
path: ${{ env.LIBAI_SRC}}
- name: Remove container
timeout-minutes: 45
if: ${{ contains(matrix.runs-on, 'self-hosted') }}
@@ -537,6 +553,12 @@ jobs:
ls ${ONEFLOW_WHEEL_PATH}
docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
docker exec ${TEST_CONTAINER_NAME} python3 -m pip install --find-links=${ONEFLOW_WHEEL_PATH} oneflow
- name: Install downstream libs
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
run: |
docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}}
docker exec ${TEST_CONTAINER_NAME} python3 -m pip install pybind11 --user
docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.LIBAI_SRC}}
- name: Module API test (distributed)
timeout-minutes: 90
if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'module' && matrix.device == 'cuda' && fromJson(matrix.is-distributed) }}
24 changes: 21 additions & 3 deletions cmake/oneflow.cmake
@@ -197,7 +197,7 @@ generate_functional_api_and_pybind11_cpp(FUNCTIONAL_GENERATED_SRCS FUNCTIONAL_GE
FUNCTIONAL_PYBIND11_SRCS ${PROJECT_SOURCE_DIR})
oneflow_add_library(of_functional_obj STATIC ${FUNCTIONAL_GENERATED_SRCS}
${FUNCTIONAL_GENERATED_HRCS})
target_link_libraries(of_functional_obj glog::glog)
target_link_libraries(of_functional_obj LLVMSupportWithHeader glog::glog)
add_dependencies(of_functional_obj prepare_oneflow_third_party)

if(BUILD_PYTHON)
@@ -214,7 +214,7 @@ if(BUILD_PYTHON)
of_functional_tensor_obj STATIC ${FUNCTIONAL_TENSOR_GENERATED_SRCS}
${FUNCTIONAL_TENSOR_GENERATED_HRCS} ${FUNCTIONAL_OPS_GENERATED_SRCS}
${FUNCTIONAL_OPS_GENERATED_HRCS})
target_link_libraries(of_functional_tensor_obj glog::glog)
target_link_libraries(of_functional_tensor_obj LLVMSupportWithHeader glog::glog)
add_dependencies(of_functional_tensor_obj prepare_oneflow_third_party)
target_include_directories(of_functional_tensor_obj PRIVATE ${Python_INCLUDE_DIRS}
${Python_NumPy_INCLUDE_DIRS})
@@ -274,6 +274,22 @@ if(WITH_MLIR)
set(ONEFLOW_MLIR_LIBS -Wl,--no-as-needed MLIROneFlowExtension -Wl,--as-needed)
endif()

if("${LLVM_PROVIDER}" STREQUAL "install")
get_property(LLVM_INSTALL_DIR GLOBAL PROPERTY LLVM_INSTALL_DIR)
check_variable_defined(LLVM_INSTALL_DIR)
find_library(LLVMSupportLib LLVMSupport PATHS ${LLVM_INSTALL_DIR}/lib REQUIRED)
add_library(LLVMSupportWithHeader UNKNOWN IMPORTED)
set_property(TARGET LLVMSupportWithHeader PROPERTY IMPORTED_LOCATION ${LLVMSupportLib})
else()
add_library(LLVMSupportWithHeader INTERFACE IMPORTED)
target_link_libraries(LLVMSupportWithHeader INTERFACE LLVMSupport)
endif()
check_variable_defined(LLVM_INCLUDE_DIRS)
set_property(TARGET LLVMSupportWithHeader PROPERTY INTERFACE_INCLUDE_DIRECTORIES
${LLVM_INCLUDE_DIRS})

list(APPEND oneflow_third_party_libs LLVMSupportWithHeader)

include(op_schema)

get_property(EXTERNAL_INCLUDE_DIRS GLOBAL PROPERTY EXTERNAL_INCLUDE_DIRS)
@@ -317,7 +333,9 @@ endif()
if(BUILD_PYTHON)

# py ext lib
oneflow_add_library(of_pyext_obj SHARED ${of_pyext_obj_cc})
# This library should be static to make sure all python symbols are included in the final ext shared lib,
# so that it is safe to do wheel audits of multiple pythons version in parallel.
oneflow_add_library(of_pyext_obj STATIC ${of_pyext_obj_cc})
target_include_directories(of_pyext_obj PRIVATE ${Python_INCLUDE_DIRS}
${Python_NumPy_INCLUDE_DIRS})
target_link_libraries(of_pyext_obj oneflow pybind11::headers)
2 changes: 1 addition & 1 deletion cmake/op_schema.cmake
@@ -81,5 +81,5 @@ set_source_files_properties(${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP}
TRUE)

oneflow_add_library(of_op_schema OBJECT ${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP})
target_link_libraries(of_op_schema glog::glog)
target_link_libraries(of_op_schema LLVMSupportWithHeader glog::glog)
add_dependencies(of_op_schema prepare_oneflow_third_party)
6 changes: 6 additions & 0 deletions cmake/util.cmake
@@ -269,6 +269,12 @@ function(set_compile_options_to_oneflow_target target)
endif()
endfunction()

function(check_variable_defined variable)
if(NOT DEFINED ${variable})
message(FATAL_ERROR "Variable ${variable} is not defined")
endif()
endfunction()

function(checkDirAndAppendSlash)
set(singleValues DIR;OUTPUT)
set(prefix ARG)
3 changes: 1 addition & 2 deletions docs/source/graph.rst
@@ -20,12 +20,11 @@ Base class for running neural networks in Static Graph Mode.

.. autoclass:: oneflow.nn.graph.graph_config.GraphConfig
:members: enable_amp,
enable_zero,
Contributor: The diff is pretty big; could you merge master into the base branch first?

Contributor (author): Sure, that works.

Contributor (author): Hmm, let me merge them first. One branch is for the transfer functionality, and the other is for the corresponding cost part.
allow_fuse_model_update_ops,
allow_fuse_add_to_output,
allow_fuse_cast_scale,
set_gradient_accumulation_steps,
set_zero_redundancy_optimizer_mode,
set_zero_redundancy_optimizer_min_size_after_split,
enable_cudnn_conv_heuristic_search_algo,
:member-order: bysource

4 changes: 4 additions & 0 deletions oneflow/api/common/variable_tensor_mgr.h
@@ -28,6 +28,10 @@ inline Maybe<void> FillVariableTensorMgr(
auto mgr = Global<VariableTensorMgr>::Get();
return mgr->Fill(variable_op_names, variable_tensors);
}
inline void ClearVariableTensorMgr() {
auto mgr = Global<VariableTensorMgr>::Get();
mgr->Clear();
}

inline std::tuple<std::vector<std::string>, std::vector<std::shared_ptr<one::Tensor>>>
DumpVariableTensorMgr() {
36 changes: 35 additions & 1 deletion oneflow/api/cpp/framework/graph.cpp
@@ -127,6 +127,9 @@ class Graph::GraphImpl final {
std::vector<Tensor> Forward(const std::vector<Tensor>& inputs);
void set_batch_size(int batch_size) { batch_size_ = batch_size; }

of::Maybe<void> RegisterJobPass(
const std::function<std::string(const std::string& job)>& pass_fn);

private:
of::Maybe<void> CollectInputOutputInfos();
of::Maybe<void> Compile(const std::vector<Tensor>& inputs);
@@ -135,6 +138,7 @@
of::Maybe<void> BuildGraph();
of::Maybe<void> LoadCheckpoint();
of::Maybe<void> RegisterTensors(const std::vector<Tensor>& inputs);
of::Maybe<of::Job> ApplyJobPasses(const of::Job& job);

std::shared_ptr<of::NNGraph> graph_ = nullptr;
std::string model_path_;
@@ -149,6 +153,7 @@
of::HashMap<std::string, std::shared_ptr<of::one::Tensor>> variable_op_name_to_tensor_;
std::shared_ptr<of::one::TensorTuple> output_tensor_tuple_;
std::shared_ptr<of::one::TensorTuple> parameter_tensor_tuple_;
std::vector<std::function<std::string(const std::string&)>> registered_job_passes_;
};

Graph::Graph(const std::string& model_path, const Device& device)
@@ -168,6 +173,10 @@ InputOutputInfos Graph::GetInputInfos() { return graph_->GetInputInfos(); }

InputOutputInfos Graph::GetOutputInfos() { return graph_->GetOutputInfos(); }

void Graph::RegisterJobPass(const std::function<std::string(const std::string& job)>& pass_fn) {
CHECK_JUST(graph_->RegisterJobPass(pass_fn));
}

IValue Graph::Forward(const IValue& inputs) {
std::vector<Tensor> input_tensors;
if (inputs.IsNone()) {
@@ -234,6 +243,28 @@ of::Maybe<void> Graph::GraphImpl::CollectInputOutputInfos() {
return of::Maybe<void>::Ok();
}

of::Maybe<void> Graph::GraphImpl::RegisterJobPass(
const std::function<std::string(const std::string& job)>& pass_fn) {
if (is_compiled_) {
return of::Error::RuntimeError() << "job pass should be registered before compile and forward";
}
registered_job_passes_.emplace_back(pass_fn);
return of::Maybe<void>::Ok();
}

of::Maybe<of::Job> Graph::GraphImpl::ApplyJobPasses(const of::Job& job) {
auto current_job = std::make_shared<of::Job>(job);
for (const auto& pass_fn : registered_job_passes_) {
std::string new_serialized_job = pass_fn(current_job->SerializeAsString());
of::Job new_job;
if (!new_job.ParseFromString(new_serialized_job)) {
return of::Error::RuntimeError() << "invalid serialized job after pass applied";
}
current_job->Swap(&new_job);
}
return current_job;
}

std::vector<Tensor> Graph::GraphImpl::Forward(const std::vector<Tensor>& inputs) {
if (!is_compiled_) {
static std::mutex mtx;
@@ -299,9 +330,12 @@ of::Maybe<void> Graph::GraphImpl::BuildGraph() {
}
JUST(LoadCheckpoint());
JUST(of::CurJobBuildAndInferCtx_Complete());
const std::shared_ptr<of::Job> complete_job = JUST(of::GetCurrentJob());
std::shared_ptr<of::Job> complete_job = JUST(of::GetCurrentJob());
int64_t job_id = JUST(of::JobBuildAndInferCtx_GetCurrentJobId());
CHECK(of::Global<OneFlowEnv>::Get() != nullptr);

// apply custom job passes
complete_job = JUST(ApplyJobPasses(*complete_job));
graph_ = std::make_shared<of::NNGraph>(job_.job_conf().job_name(), *complete_job, job_id,
of::Global<OneFlowEnv>::Get()->GetSessionCtx());
{
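The `RegisterJobPass`/`ApplyJobPasses` additions above implement a simple serialize/transform/reparse pipeline: each registered pass receives the job serialized as a string, must return a valid serialized job, and the result is parsed back (and rejected if invalid) before the next pass runs. A minimal Python sketch of that pattern, using `json` as a stand-in for protobuf serialization; none of these names are OneFlow API:

```python
import json

def apply_job_passes(job, passes):
    """Chain passes over a serialized job, re-validating after each one."""
    current = job
    for pass_fn in passes:
        new_serialized = pass_fn(json.dumps(current))
        try:
            # Analogous to Job::ParseFromString: reject invalid pass output.
            current = json.loads(new_serialized)
        except json.JSONDecodeError:
            raise RuntimeError("invalid serialized job after pass applied")
    return current

def rename_pass(serialized):
    # An example pass: parse, mutate, re-serialize.
    job = json.loads(serialized)
    job["job_name"] += "_optimized"
    return json.dumps(job)

print(apply_job_passes({"job_name": "train"}, [rename_pass]))
# → {'job_name': 'train_optimized'}
```

Note the design choice this mirrors: because passes exchange opaque serialized strings rather than in-memory objects, callers can plug in passes written in any language that can parse the job proto.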
3 changes: 3 additions & 0 deletions oneflow/api/cpp/framework/graph.h
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensor.h"
#include <cstddef>
#include <string>
#include <functional>
#include <unordered_map>

namespace oneflow {
@@ -64,6 +65,8 @@ class Graph {
IValue Forward(const IValue& inputs);
void set_batch_size(int batch_size);

void RegisterJobPass(const std::function<std::string(const std::string& job)>& pass_fn);

static Graph Load(const std::string& model_path, const Device& device = Device("cpu"));

private:
10 changes: 8 additions & 2 deletions oneflow/api/python/framework/nn_graph.cpp
@@ -80,12 +80,18 @@ ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) {
m.def("RunLazyNNGraph", &RunLazyNNGraph);
m.def("SoftSyncNNGraphBuffers", &SoftSyncNNGraphBuffers);
m.def("AddTensorAsGraphLoss", &AddTensorAsGraphLoss);
m.def("ConvertJobToTosaIR", [](const std::string& serialized_job) -> Maybe<std::string> {
Job job;
CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job))
<< "serialized job conversion failed.";
return ConvertJobToTosaIR(&job);
});
m.def("SaveJobToIR",
[](const std::string& serialized_job, const std::string& path) -> Maybe<void> {
Job job;
CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job));
CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job))
<< "serialized job conversion failed.";
return SaveJobToIR(&job, path);
;
});
m.def("LoadSerializedJobFromIR", [](const std::string& path) -> Maybe<py::bytes> {
Job job;
1 change: 1 addition & 0 deletions oneflow/api/python/framework/tensor.cpp
@@ -337,6 +337,7 @@ static PyObject* PyTensorObject_type(PyObject* self, PyObject* args, PyObject* k
ASSERT(CopyBetweenMirroredTensorAndNumpy<T>(PyTensor_Unpack(self), copied, \
BlobNumpyCopyUtil<T>::From, "mut", \
/*block_host_until_done=*/false)); \
Py_DECREF(copied); \
Py_RETURN_NONE; \
END_HANDLE_ERRORS \
}