diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 178b8fedbd3..205224541a2 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -197,7 +197,7 @@ generate_functional_api_and_pybind11_cpp(FUNCTIONAL_GENERATED_SRCS FUNCTIONAL_GE FUNCTIONAL_PYBIND11_SRCS ${PROJECT_SOURCE_DIR}) oneflow_add_library(of_functional_obj STATIC ${FUNCTIONAL_GENERATED_SRCS} ${FUNCTIONAL_GENERATED_HRCS}) -target_link_libraries(of_functional_obj glog::glog) +target_link_libraries(of_functional_obj LLVMSupportWithHeader glog::glog) add_dependencies(of_functional_obj prepare_oneflow_third_party) if(BUILD_PYTHON) @@ -214,7 +214,7 @@ if(BUILD_PYTHON) of_functional_tensor_obj STATIC ${FUNCTIONAL_TENSOR_GENERATED_SRCS} ${FUNCTIONAL_TENSOR_GENERATED_HRCS} ${FUNCTIONAL_OPS_GENERATED_SRCS} ${FUNCTIONAL_OPS_GENERATED_HRCS}) - target_link_libraries(of_functional_tensor_obj glog::glog) + target_link_libraries(of_functional_tensor_obj LLVMSupportWithHeader glog::glog) add_dependencies(of_functional_tensor_obj prepare_oneflow_third_party) target_include_directories(of_functional_tensor_obj PRIVATE ${Python_INCLUDE_DIRS} ${Python_NumPy_INCLUDE_DIRS}) @@ -274,6 +274,22 @@ if(WITH_MLIR) set(ONEFLOW_MLIR_LIBS -Wl,--no-as-needed MLIROneFlowExtension -Wl,--as-needed) endif() +if("${LLVM_PROVIDER}" STREQUAL "install") + get_property(LLVM_INSTALL_DIR GLOBAL PROPERTY LLVM_INSTALL_DIR) + check_variable_defined(LLVM_INSTALL_DIR) + find_library(LLVMSupportLib LLVMSupport PATHS ${LLVM_INSTALL_DIR}/lib REQUIRED) + add_library(LLVMSupportWithHeader UNKNOWN IMPORTED) + set_property(TARGET LLVMSupportWithHeader PROPERTY IMPORTED_LOCATION ${LLVMSupportLib}) +else() + add_library(LLVMSupportWithHeader INTERFACE IMPORTED) + target_link_libraries(LLVMSupportWithHeader INTERFACE LLVMSupport) +endif() +check_variable_defined(LLVM_INCLUDE_DIRS) +set_property(TARGET LLVMSupportWithHeader PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${LLVM_INCLUDE_DIRS}) + +list(APPEND oneflow_third_party_libs LLVMSupportWithHeader) + 
include(op_schema) get_property(EXTERNAL_INCLUDE_DIRS GLOBAL PROPERTY EXTERNAL_INCLUDE_DIRS) diff --git a/cmake/op_schema.cmake b/cmake/op_schema.cmake index ce790c1918b..5017fab574e 100644 --- a/cmake/op_schema.cmake +++ b/cmake/op_schema.cmake @@ -81,5 +81,5 @@ set_source_files_properties(${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP} TRUE) oneflow_add_library(of_op_schema OBJECT ${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP}) -target_link_libraries(of_op_schema glog::glog) +target_link_libraries(of_op_schema LLVMSupportWithHeader glog::glog) add_dependencies(of_op_schema prepare_oneflow_third_party) diff --git a/cmake/util.cmake b/cmake/util.cmake index 3aaae830e12..a69128f416e 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -269,6 +269,12 @@ function(set_compile_options_to_oneflow_target target) endif() endfunction() +function(check_variable_defined variable) + if(NOT DEFINED ${variable}) + message(FATAL_ERROR "Variable ${variable} is not defined") + endif() +endfunction() + function(checkDirAndAppendSlash) set(singleValues DIR;OUTPUT) set(prefix ARG) diff --git a/docs/source/graph.rst b/docs/source/graph.rst index 5ec08061a8a..270e5a01cf0 100644 --- a/docs/source/graph.rst +++ b/docs/source/graph.rst @@ -20,12 +20,11 @@ Base class for running neural networks in Static Graph Mode. .. 
autoclass:: oneflow.nn.graph.graph_config.GraphConfig :members: enable_amp, + enable_zero, allow_fuse_model_update_ops, allow_fuse_add_to_output, allow_fuse_cast_scale, set_gradient_accumulation_steps, - set_zero_redundancy_optimizer_mode, - set_zero_redundancy_optimizer_min_size_after_split, enable_cudnn_conv_heuristic_search_algo, :member-order: bysource diff --git a/oneflow/api/python/framework/nn_graph.cpp b/oneflow/api/python/framework/nn_graph.cpp index e02d86e9eb1..9e0c939b3e2 100644 --- a/oneflow/api/python/framework/nn_graph.cpp +++ b/oneflow/api/python/framework/nn_graph.cpp @@ -80,12 +80,18 @@ ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) { m.def("RunLazyNNGraph", &RunLazyNNGraph); m.def("SoftSyncNNGraphBuffers", &SoftSyncNNGraphBuffers); m.def("AddTensorAsGraphLoss", &AddTensorAsGraphLoss); + m.def("ConvertJobToTosaIR", [](const std::string& serialized_job) -> Maybe { + Job job; + CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)) + << "serialized job conversion failed."; + return ConvertJobToTosaIR(&job); + }); m.def("SaveJobToIR", [](const std::string& serialized_job, const std::string& path) -> Maybe { Job job; - CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)); + CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)) + << "serialized job conversion failed."; return SaveJobToIR(&job, path); - ; }); m.def("LoadSerializedJobFromIR", [](const std::string& path) -> Maybe { Job job; diff --git a/oneflow/core/autograd/gradient_funcs/normalization.cpp b/oneflow/core/autograd/gradient_funcs/normalization.cpp index e336edf519d..c12fcb60442 100644 --- a/oneflow/core/autograd/gradient_funcs/normalization.cpp +++ b/oneflow/core/autograd/gradient_funcs/normalization.cpp @@ -136,15 +136,14 @@ class NormalizationGrad : public OpExprGradFunction::Ok(); } - DimVector dim_vec; + Shape shape; for (int i = 0; i < x->shape()->NumAxes(); ++i) { if (i != ctx->axis) { - dim_vec.emplace_back(1); + shape.emplace_back(1); } else { - 
dim_vec.emplace_back(x->shape()->At(ctx->axis)); + shape.emplace_back(x->shape()->At(ctx->axis)); } } - Shape shape(dim_vec); const auto& reshaped_gamma = JUST(functional::Reshape(gamma, shape)); const auto& reshaped_inv_variance = JUST(functional::Reshape(inv_variance, shape)); diff --git a/oneflow/core/common/fixed_vector.h b/oneflow/core/common/fixed_vector.h deleted file mode 100644 index b3d1c98c827..00000000000 --- a/oneflow/core/common/fixed_vector.h +++ /dev/null @@ -1,277 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ -#define ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ - -#include -#include -#include -#include - -namespace oneflow { - -template -using RequireInputIter = typename std::enable_if< - std::is_convertible::iterator_category, - std::input_iterator_tag>::value>::type; - -template -class fixed_vector final { - public: - using value_type = T; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = T*; - using const_pointer = const T*; - using iterator = T*; - using const_iterator = const T*; - using reverse_iterator = std::reverse_iterator; - using const_reverse_iterator = std::reverse_iterator; - - fixed_vector() : size_(0) {} - explicit fixed_vector(size_t size) { assign(size, T()); } - explicit fixed_vector(size_t size, const T& val) { assign(size, val); } - template> - fixed_vector(InputIt first, InputIt last) { - assign(first, last); - } - fixed_vector(const fixed_vector& rhs) { *this = rhs; } - fixed_vector(fixed_vector&& rhs) { *this = std::move(rhs); } - fixed_vector(std::initializer_list rhs) { assign(rhs); } - ~fixed_vector() = default; - - fixed_vector& operator=(const fixed_vector& rhs) { - size_ = rhs.size(); - CheckSize(); - std::copy(rhs.begin(), rhs.end(), begin()); - return *this; - } - fixed_vector& operator=(fixed_vector&& rhs) noexcept { - size_ = rhs.size(); - CheckSize(); - std::copy(rhs.begin(), rhs.end(), begin()); - return *this; - } - fixed_vector& operator=(std::initializer_list ilist) { - size_ = ilist.size(); - assign(ilist); - return *this; - } - void assign(size_type count, const value_type& value) { - size_ = count; - CheckSize(); - std::fill(begin(), begin() + size_, value); - } - template> - void assign(InputIt first, InputIt last) { - size_ = last - first; - CheckSize(); - std::copy(first, last, begin()); - } - void assign(std::initializer_list ilist) { - size_ = ilist.size(); - 
CheckSize(); - std::copy(ilist.begin(), ilist.end(), begin()); - } - - reference at(size_type pos) { - CheckPos(pos); - return data_.at(pos); - } - const_reference at(size_type pos) const { - CheckPos(pos); - return data_.at(pos); - } - - reference operator[](size_type pos) { - CheckPos(pos); - return data_[pos]; - } - const_reference operator[](size_type pos) const { - CheckPos(pos); - return data_[pos]; - } - - reference front() { - CheckPos(0); - return data_.at(0); - } - const_reference front() const { - CheckPos(0); - return data_.at(0); - } - - reference back() { - CheckPos(0); - return data_.at(size_ - 1); - } - const_reference back() const { - CheckPos(0); - return data_.at(size_ - 1); - } - - T* data() noexcept { return data_.data(); } - const T* data() const noexcept { return data_.data(); } - - iterator begin() noexcept { return data_.data(); } - const_iterator begin() const noexcept { return data_.data(); } - const_iterator cbegin() const noexcept { return data_.data(); } - - iterator end() noexcept { return data_.data() + size_; } - const_iterator end() const noexcept { return data_.data() + size_; } - const_iterator cend() const noexcept { return data_.data() + size_; } - - reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); } - - reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } - const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); } - - bool empty() const noexcept { return size_ == 0; } - - size_type size() const noexcept { return size_; } - - size_type max_size() const noexcept { return kMaxSize; } - - size_type capacity() const noexcept { return kMaxSize; } - - void clear() noexcept { size_ = 0; } - - 
iterator insert(iterator pos, const T& value) { - MoveNToEnd(pos, 1); - *pos = value; - return pos; - } - iterator insert(iterator pos, T&& value) { - MoveNToEnd(pos, 1); - *pos = std::move(value); - return pos; - } - iterator insert(iterator pos, size_type count, const T& value) { - MoveNToEnd(pos, count); - std::fill(pos, pos + count, value); - return pos; - } - template> - void insert(iterator pos, InputIt first, InputIt last) { - MoveNToEnd(pos, last - first); - std::copy(first, last, pos); - } - iterator insert(iterator pos, std::initializer_list ilist) { - MoveNToEnd(pos, ilist.size()); - std::copy(ilist.begin(), ilist.end(), pos); - return pos; - } - - template - iterator emplace(iterator pos, Args&&... args) { - MoveNToEnd(pos, 1); - new (&*pos) T(std::forward(args)...); - return pos; - } - - iterator erase(iterator pos) { - MoveNToBegin(pos + 1, 1); - return pos; - } - iterator erase(iterator first, iterator last) { - if (first >= last) { return last; } - MoveNToBegin(last, last - first); - return first; - } - - void push_back(const T& value) { insert(end(), value); } - void push_back(T&& value) { insert(end(), std::move(value)); } - void emplace_back(const T& value) { insert(end(), value); } - template - void emplace_back(Args&&... 
args) { - insert(end(), std::forward(args)...); - } - - void pop_back() { --size_; } - - void resize(size_type count) { resize(count, T()); } - void resize(size_type count, const value_type& value) { - if (count == size_) { return; } - if (count < size_) { - erase(begin() + count, end()); - return; - } - insert(end(), count - size_, value); - } - - void swap(fixed_vector& rhs) noexcept { - fixed_vector tmp; - tmp = rhs; - rhs = *this; - *this = tmp; - } - - bool operator==(const fixed_vector& rhs) const { - if (size() != rhs.size()) { return false; } - return std::equal(begin(), end(), rhs.begin()); - } - - bool operator!=(const fixed_vector& rhs) const { return !(*this == rhs); } - - bool operator>=(const fixed_vector& rhs) const { return !(*this < rhs); } - - bool operator<=(const fixed_vector& rhs) const { return !(*this > rhs); } - - bool operator>(const fixed_vector& rhs) const { - return std::lexicographical_compare(rhs.begin(), rhs.end(), begin(), end()); - } - - bool operator<(const fixed_vector& rhs) const { - return std::lexicographical_compare(begin(), end(), rhs.begin(), rhs.end()); - } - - private: - void CheckSize() const { CheckSize(size_); } - void CheckSize(size_t size) const { CHECK_LE(size, kMaxSize); } - void CheckPos(size_t pos) const { CHECK_LE(pos, size_); } - void MoveNToEnd(iterator first, size_t N) { - CheckSize(size_ + N); - iterator old_end = end(); - size_ += N; - iterator new_end = end(); - std::copy_backward(first, old_end, new_end); - } - void MoveNToBegin(iterator last, size_t N) { - CheckPos(last - N - begin()); - iterator old_end = end(); - size_ -= N; - std::copy(last, old_end, last - N); - } - - size_t size_; - std::array data_; -}; - -template -void swap(fixed_vector& lhs, fixed_vector& rhs) { - return lhs.swap(rhs); -} - -#define SHAPE_MAX_AXIS_SIZE 20 - -} // namespace oneflow - -#endif // ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ diff --git a/oneflow/core/common/fixed_vector_test.cpp b/oneflow/core/common/fixed_vector_test.cpp 
deleted file mode 100644 index cb79b7510e0..00000000000 --- a/oneflow/core/common/fixed_vector_test.cpp +++ /dev/null @@ -1,419 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/fixed_vector.h" -#include "gtest/gtest.h" -#include -#include - -namespace oneflow { - -namespace test { - -using FixedVec = fixed_vector; - -TEST(fixed_vector, constructor_0) { - FixedVec a(8); - ASSERT_EQ(a.size(), 8); -} - -TEST(fixed_vector, constructor_1) { - int value = 30; - FixedVec a(8, value); - ASSERT_TRUE(std::all_of(a.begin(), a.end(), [value](const int x) { return x == value; })); -} - -TEST(fixed_vector, constructor_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a(vec.begin(), vec.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_3) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a(b); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_4) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a(std::move(b)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_5) { - std::vector vec{1, 2, 3, 4}; - FixedVec a{1, 2, 3, 4}; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_0) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - 
FixedVec a; - a = b; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_1) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a; - a = std::move(b); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a = {1, 2, 3, 4}; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, assign_0) { - int value = 30; - FixedVec a; - a.assign(8, value); - ASSERT_TRUE(std::all_of(a.begin(), a.end(), [value](const int x) { return x == value; })); -} - -TEST(fixed_vector, assign_1) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a.assign(vec.begin(), vec.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, assign_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a.assign({1, 2, 3, 4}); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, const_at) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(a.at(0), value); -} - -TEST(fixed_vector, at) { - int value = 33; - FixedVec a{0}; - a.at(0) = value; - ASSERT_EQ(a.at(0), value); -} - -TEST(fixed_vector, const_front) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(a.front(), value); -} - -TEST(fixed_vector, front) { - int value = 33; - FixedVec a{0}; - a.front() = value; - ASSERT_EQ(a.front(), value); -} - -TEST(fixed_vector, const_back) { - int value = 33; - const FixedVec a{1, value}; - ASSERT_EQ(a.back(), value); -} - -TEST(fixed_vector, back) { - int value = 33; - FixedVec a{1, 0}; - a.back() = value; - ASSERT_EQ(a.back(), value); -} - -TEST(fixed_vector, const_data) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(*a.data(), value); -} - -TEST(fixed_vector, data) { - int value = 33; - FixedVec a{0}; - *a.data() = value; - ASSERT_EQ(*a.data(), value); -} - -TEST(fixed_vector, const_begin) { - int value = 33; - const 
FixedVec a{value}; - ASSERT_EQ(*a.begin(), value); -} - -TEST(fixed_vector, begin) { - int value = 33; - FixedVec a{0}; - *a.begin() = value; - ASSERT_EQ(*a.begin(), value); -} - -TEST(fixed_vector, cbegin) { - int value = 33; - FixedVec a{value}; - ASSERT_EQ(*a.cbegin(), value); -} - -TEST(fixed_vector, const_end) { - const FixedVec a{0, 1, 2}; - ASSERT_EQ(a.begin() + a.size(), a.end()); -} - -TEST(fixed_vector, end) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.begin() + a.size(), a.end()); -} - -TEST(fixed_vector, cend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.cbegin() + a.size(), a.cend()); -} - -TEST(fixed_vector, const_rbegin) { - int value = 33; - const FixedVec a{0, value}; - ASSERT_EQ(*a.rbegin(), value); -} - -TEST(fixed_vector, rbegin) { - int value = 33; - FixedVec a{0, 0}; - *a.rbegin() = value; - ASSERT_EQ(*a.rbegin(), value); -} - -TEST(fixed_vector, crbegin) { - int value = 33; - FixedVec a{0, value}; - ASSERT_EQ(*a.crbegin(), value); -} - -TEST(fixed_vector, const_rend) { - const FixedVec a{0, 1, 2}; - ASSERT_EQ(a.rbegin() + a.size(), a.rend()); -} - -TEST(fixed_vector, rend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.rbegin() + a.size(), a.rend()); -} - -TEST(fixed_vector, crend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.crbegin() + a.size(), a.crend()); -} - -TEST(fixed_vector, insert_0) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), 0); - a.insert(a.end(), 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_1) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - int zero = 0; - int three = 3; - a.insert(a.begin(), std::move(zero)); - a.insert(a.end(), std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_2) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), 2, 0); - a.insert(a.end(), 2, 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_3) { - std::vector 
vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - int zero = 0; - int three = 3; - a.insert(a.begin(), 2, std::move(zero)); - a.insert(a.end(), 2, std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_4) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - std::vector zeros{0, 0}; - std::vector threes{3, 3}; - a.insert(a.begin(), zeros.begin(), zeros.end()); - a.insert(a.end(), threes.begin(), threes.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_5) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), {0, 0}); - a.insert(a.end(), {3, 3}); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, emplace) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - a.emplace(a.begin(), 0); - a.emplace(a.end(), 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, erase_0) { - std::vector vec{1, 2}; - FixedVec a{0, 1, 2, 3}; - a.erase(a.begin()); - a.erase(a.end() - 1); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, erase_1) { - std::vector vec{1, 2}; - FixedVec a{0, 0, 1, 2, 3, 3}; - a.erase(a.begin(), a.begin() + 2); - a.erase(a.end() - 2, a.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, push_back_0) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - a.emplace_back(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, push_back_1) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - int three = 3; - a.emplace_back(std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, emplace_back) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - a.emplace_back(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, pop_back) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 
1, 2, 3}; - a.pop_back(); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_0) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2}; - a.resize(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_1) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2}; - a.resize(3, 9527); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_2) { - std::vector vec{0}; - FixedVec a{0, 1, 2}; - a.resize(1); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_3) { - std::vector vec{0}; - FixedVec a{0, 1, 2}; - a.resize(1, 9527); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_4) { - std::vector vec{0, 1, 2, 0, 0}; - FixedVec a{0, 1, 2}; - a.resize(5); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_5) { - std::vector vec{0, 1, 2, 3, 3}; - FixedVec a{0, 1, 2}; - a.resize(5, 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, swap) { - std::vector vec_0{0, 1, 2, 0, 0}; - std::vector vec_1{0, 1, 2, 3, 3}; - FixedVec a_0(vec_1.begin(), vec_1.end()); - FixedVec a_1(vec_0.begin(), vec_0.end()); - a_0.swap(a_1); - ASSERT_TRUE(std::equal(a_0.begin(), a_0.end(), vec_0.begin())); - ASSERT_TRUE(std::equal(a_1.begin(), a_1.end(), vec_1.begin())); -} - -void WithTwoVector(std::function&, const std::vector&)> Handler) { - std::vector a{0, 1, 2, 3, 4}; - std::vector b{0, 1, 2, 3}; - std::vector c{4, 3, 2}; - Handler(a, a); - Handler(a, b); - Handler(a, c); - Handler(b, a); - Handler(b, b); - Handler(b, c); - Handler(c, a); - Handler(c, b); - Handler(c, c); -} - -#define TEST_LOGICAL_OPERATOR(test_name, logical_op) \ - TEST(fixed_vector, test_name) { \ - WithTwoVector([](const std::vector& lhs, const std::vector& rhs) { \ - ASSERT_EQ((lhs logical_op rhs), \ - (FixedVec(lhs.begin(), lhs.end()) logical_op 
FixedVec(rhs.begin(), rhs.end()))); \ - }); \ - } - -TEST_LOGICAL_OPERATOR(eq, ==); -TEST_LOGICAL_OPERATOR(ne, !=); -TEST_LOGICAL_OPERATOR(gt, >); -TEST_LOGICAL_OPERATOR(ge, >=); -TEST_LOGICAL_OPERATOR(lt, <); -TEST_LOGICAL_OPERATOR(le, <=); - -} // namespace test - -} // namespace oneflow diff --git a/oneflow/core/common/shape.cpp b/oneflow/core/common/shape.cpp index 2286cac3290..5d9b5a96a35 100644 --- a/oneflow/core/common/shape.cpp +++ b/oneflow/core/common/shape.cpp @@ -59,50 +59,32 @@ int64_t ShiftNegativeAxis(int64_t axis, const int64_t num_axes) { return axis; } -Shape::Shape(const std::initializer_list& dim_vec) - : dim_vec_(dim_vec), is_initialized_(true) {} -Shape::Shape(const DimVector& dim_vec) : dim_vec_(dim_vec), is_initialized_(true) {} -Shape::Shape(DimVector&& dim_vec) : dim_vec_(std::move(dim_vec)), is_initialized_(true) {} -Shape::Shape(const ShapeProto& shape_proto) : is_initialized_(true) { - dim_vec_.assign(shape_proto.dim().begin(), shape_proto.dim().end()); -} - -Shape& Shape::operator=(const Shape& shape) { - dim_vec_ = shape.dim_vec_; - is_initialized_ = shape.is_initialized_; - return *this; -} - -Shape& Shape::assign(const DimVector& dim_vec) { - dim_vec_ = dim_vec; - is_initialized_ = true; - return *this; -} +Shape::Shape(const DimVector& dim_vec) : DimVector(dim_vec), is_initialized_(true) {} +Shape::Shape(DimVector&& dim_vec) : DimVector(std::move(dim_vec)), is_initialized_(true) {} +Shape::Shape(const ShapeProto& shape_proto) + : DimVector(shape_proto.dim().begin(), shape_proto.dim().end()), is_initialized_(true) {} Shape& Shape::CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view) { CHECK_EQ(NumAxes(), shape_view.NumAxes()); - std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), dim_vec_.data()); + std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data()); return *this; } Shape& Shape::LeftOnesExtendedAssign(const ShapeView& shape_view) { CHECK_GE(NumAxes(), shape_view.NumAxes()); size_t 
left_ones_size = NumAxes() - shape_view.NumAxes(); - FOR_RANGE(int, i, 0, left_ones_size) { dim_vec_.at(i) = 1LL; } - std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), - dim_vec_.data() + left_ones_size); + FOR_RANGE(int, i, 0, left_ones_size) { (*this)[i] = 1LL; } + std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data() + left_ones_size); return *this; } -bool Shape::operator==(const Shape& rhs) const { return dim_vec_ == rhs.dim_vec_; } - std::string Shape::ToString() const { std::stringstream ss; int32_t idx = 0; ss << "("; - for (int64_t dim : dim_vec_) { + for (int64_t dim : *this) { ss << dim; - if (++idx != dim_vec_.size() || dim_vec_.size() == 1) { ss << ","; } + if (++idx != size() || size() == 1) { ss << ","; } } ss << ")"; return ss.str(); @@ -111,21 +93,21 @@ std::string Shape::ToString() const { std::string Shape::DebugStr() const { return ToString(); } void Shape::ToProto(ShapeProto* ret) const { - *(ret->mutable_dim()) = PbRf(dim_vec_.begin(), dim_vec_.end()); + *(ret->mutable_dim()) = PbRf(begin(), end()); } int64_t Shape::At(int64_t index) const { CHECK_GE(index, 0); CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index << " > num_axes: " << this->NumAxes(); - return dim_vec_.at(index); + return (*this)[index]; } void Shape::Set(int64_t index, int64_t val) { CHECK_GE(index, 0); CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index << " > num_axes: " << this->NumAxes(); - dim_vec_.at(index) = val; + (*this)[index] = val; } int64_t Shape::Count(int64_t begin_axis, int64_t end_axis) const { @@ -206,9 +188,9 @@ Maybe Shape::Slice(int64_t start_dim, int64_t end_dim) const { int64_t ndims = this->NumAxes(); if (start_dim > ndims) { start_dim = ndims; } if (end_dim > ndims) { end_dim = ndims; } - DimVector dim_vec; - for (int64_t i = start_dim; i < end_dim && i < ndims; ++i) { dim_vec.emplace_back(this->At(i)); } - return 
std::make_shared(dim_vec); + std::shared_ptr shape = std::make_shared(); + for (int64_t i = start_dim; i < end_dim && i < ndims; ++i) { shape->emplace_back(this->At(i)); } + return shape; } } // namespace oneflow diff --git a/oneflow/core/common/shape.h b/oneflow/core/common/shape.h index 408a46c57ca..7a94ad85a6d 100644 --- a/oneflow/core/common/shape.h +++ b/oneflow/core/common/shape.h @@ -32,22 +32,37 @@ namespace cfg { class ShapeProto; } // namespace cfg -class Shape final { +class Shape final : public DimVector { public: // OF_DISALLOW_COPY_AND_MOVE(Shape); + using DimVector::DimVector; Shape() : is_initialized_(false) {} explicit Shape(const DimVector& dim_vec); explicit Shape(DimVector&& dim_vec); explicit Shape(const ShapeProto& shape_proto); - Shape(const std::initializer_list& dim_vec); + // explicit constructor from ShapeView + explicit Shape(ShapeView shape_view); ~Shape() = default; - Shape& operator=(const Shape& shape); - Shape& assign(const DimVector& dim_vec); + +#define OVERRIDE_ADD_DATA_FUNC(func) \ + template \ + void func(Args... 
args) { \ + DimVector::func(std::forward(args)...); \ + is_initialized_ = true; \ + } + + OVERRIDE_ADD_DATA_FUNC(assign) + OVERRIDE_ADD_DATA_FUNC(push_back) + OVERRIDE_ADD_DATA_FUNC(emplace_back) + OVERRIDE_ADD_DATA_FUNC(append) + OVERRIDE_ADD_DATA_FUNC(insert) + OVERRIDE_ADD_DATA_FUNC(resize) + +#undef OVERRIDE_ADD_DATA_FUNC + Shape& CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view); Shape& LeftOnesExtendedAssign(const ShapeView& shape_view); - bool operator==(const Shape& rhs) const; - bool operator!=(const Shape& rhs) const { return !(*this == rhs); } std::string DebugStr() const; std::string ToString() const; @@ -58,16 +73,16 @@ class Shape final { // Getters and Setters bool is_initialized() const { return is_initialized_; } - const DimVector& dim_vec() const { return dim_vec_; } - DimVector& dim_vec() { return dim_vec_; } + const DimVector& dim_vec() const { return *this; } + DimVector& dim_vec() { return *this; } int64_t elem_cnt() const { - return std::accumulate(dim_vec_.begin(), dim_vec_.end(), int64_t(1), std::multiplies<>()); + return std::accumulate(begin(), end(), int64_t(1), std::multiplies<>()); } int64_t At(int64_t index) const; void Set(int64_t index, int64_t val); int64_t NumAxes() const { CHECK(is_initialized()); - return dim_vec_.size(); + return size(); } int64_t Count(int64_t begin_axis, int64_t end_axis) const; int64_t Count(int64_t begin_axis) const; @@ -82,13 +97,14 @@ class Shape final { Maybe Slice(int64_t start_dim, int64_t end_dim) const; - ShapeView ToShapeView() const { return ShapeView(dim_vec_.data(), dim_vec_.size()); } + ShapeView ToShapeView() const { return ShapeView(data(), size()); } - MutShapeView ToMutShapeView() { return MutShapeView(dim_vec_.data(), dim_vec_.size()); } + MutShapeView ToMutShapeView() { return MutShapeView(data(), size()); } private: - DimVector dim_vec_; - bool is_initialized_; + // Set default value here because some constructors are inherited from DimVector + // TODO(daquexian): remove this 
field and make it initializied by construction + bool is_initialized_ = true; }; int64_t ShiftNegativeAxis(int64_t axis, const int64_t num_axes); @@ -99,7 +115,7 @@ Shape ZeroDimCompatiableShape(const Shape& shape); Shape CreateReducedShapeOrOnesShape(const ShapeView& shape, const AxisVector& axis_vec); template void Shape::SerializeWithTextFormat(StreamT& out_stream) const { - for (int64_t dim : dim_vec_) { out_stream << std::to_string(dim) << ' '; } + for (int64_t dim : *this) { out_stream << std::to_string(dim) << ' '; } } std::ostream& operator<<(std::ostream& out, const Shape& shape); diff --git a/oneflow/core/common/shape_vec.h b/oneflow/core/common/shape_vec.h index c97870a4832..18a34ed7741 100644 --- a/oneflow/core/common/shape_vec.h +++ b/oneflow/core/common/shape_vec.h @@ -16,24 +16,15 @@ limitations under the License. #ifndef ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ #define ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" namespace oneflow { -//#define DISABLE_FIXED_SHAPE_VEC #define SHAPE_MAX_AXIS_SIZE 20 -#if defined(DISABLE_FIXED_SHAPE_VEC) +typedef small_vector DimVector; +typedef small_vector AxisVector; -typedef std::vector DimVector; -typedef std::vector AxisVector; - -#else - -typedef fixed_vector DimVector; -typedef fixed_vector AxisVector; - -#endif } // namespace oneflow #endif // ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ diff --git a/oneflow/core/common/shape_view.cpp b/oneflow/core/common/shape_view.cpp index cd0bb3d1370..648034665fe 100644 --- a/oneflow/core/common/shape_view.cpp +++ b/oneflow/core/common/shape_view.cpp @@ -77,7 +77,7 @@ template void ShapeViewBase::ToShape(Shape* shape) const { DimVector dim_vec; this->ToDimVector(&dim_vec); - shape->assign(dim_vec); + *shape = Shape(dim_vec); } template class ShapeViewBase; diff --git a/oneflow/core/common/small_vector.h b/oneflow/core/common/small_vector.h new file mode 100644 index 00000000000..6aee5359f2b --- /dev/null +++ 
b/oneflow/core/common/small_vector.h @@ -0,0 +1,53 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ +#define ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ + +#include "llvm/ADT/SmallVector.h" + +namespace oneflow { + +template +class small_vector : public llvm::SmallVector { + using Base = llvm::SmallVector; + + public: + // https://stackoverflow.com/questions/27954940/a-using-statement-compiles-with-g-fails-compilation-with-clang + using Base::Base; + + typename Base::reference at(typename Base::size_type idx) { + CHECK_LT(idx, Base::size()); + return (*this)[idx]; + } + typename Base::const_reference at(typename Base::size_type idx) const { + CHECK_LT(idx, Base::size()); + return (*this)[idx]; + } + typename Base::const_iterator cbegin() const { + return (typename Base::const_iterator)this->BeginX; + } + typename Base::const_iterator cend() const { + return (typename Base::const_iterator)(this->BeginX) + Base::size(); + } + typename Base::const_iterator cbegin() { return (typename Base::const_iterator)this->BeginX; } + typename Base::const_iterator cend() { + return (typename Base::const_iterator)(this->BeginX) + Base::size(); + } +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ diff --git a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp index d119c3eeaf3..a8be6b054ed 100644 
--- a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp +++ b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp @@ -72,9 +72,9 @@ class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, size_t num_src1_dims, const int64_t* src1_dims, const void* src1, void* dst) override { - DimVector src0_dim_vec; - DimVector src1_dim_vec; - DimVector dst_dim_vec; + Shape src0_shape; + Shape src1_shape; + Shape dst_shape; size_t num_dims = 0; int64_t simplified_src0_dims[kMaxNumDims]; int64_t simplified_src1_dims[kMaxNumDims]; @@ -85,15 +85,13 @@ class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { CheckInplace(num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, simplified_dst_dims, dst); for (int64_t i = 0; i < num_dims; ++i) { - src0_dim_vec.push_back(simplified_src0_dims[i]); - src1_dim_vec.push_back(simplified_src1_dims[i]); - dst_dim_vec.push_back(simplified_dst_dims[i]); + src0_shape.push_back(simplified_src0_dims[i]); + src1_shape.push_back(simplified_src1_dims[i]); + dst_shape.push_back(simplified_dst_dims[i]); } - binary_func( - stream, XpuVarNdarray(Shape(dst_dim_vec), reinterpret_cast(dst), num_dims), - XpuVarNdarray(Shape(src0_dim_vec), reinterpret_cast(src0), num_dims), - XpuVarNdarray(Shape(src1_dim_vec), reinterpret_cast(src1), - num_dims)); + binary_func(stream, XpuVarNdarray(dst_shape, reinterpret_cast(dst), num_dims), + XpuVarNdarray(src0_shape, reinterpret_cast(src0), num_dims), + XpuVarNdarray(src1_shape, reinterpret_cast(src1), num_dims)); } }; diff --git a/oneflow/core/framework/tensor_methods.cpp b/oneflow/core/framework/tensor_methods.cpp index 7ed41a652f8..6f6cf271660 100644 --- a/oneflow/core/framework/tensor_methods.cpp +++ b/oneflow/core/framework/tensor_methods.cpp @@ -188,7 +188,7 @@ Maybe Unsqueeze(const std::shared_ptr& input, const int32_t& exp cnt++; } 
target_dim_vec[expand_dim] = 1; - target_stride_vec[expand_dim] = strides->At(expand_dim); + target_stride_vec[expand_dim] = expand_dim < ndim ? strides->At(expand_dim) : 1; } int64_t storage_offset = JUST(JUST(input->AsMirroredTensor())->storage_offset()); diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index d8b77cdbb72..959a7837010 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -71,6 +71,9 @@ void CreateNcclComm(ncclComm_t* comm, const int dev, const std::string& key, << ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank << ", key = {" << key << "}\n"; OF_NCCL_CHECK(ncclCommInitRank(comm, device_vec.size(), nccl_unique_id, rank)); + VLOG(2) << " EagerNcclCommMgr::ncclCommInitRank succeed device_vec.size() = " << device_vec.size() + << ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank + << ", key = {" << key << "}\n"; } } // namespace diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 5afc24192da..8ae659fd541 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -997,13 +997,19 @@ Maybe LazyJobBuildAndInferCtx::Complete() { } }; int32_t pass_cnt = 0; + const int64_t prev_v = FLAGS_v; auto DoPass = [&](const std::string& pass_name, int32_t cnt = 0) -> Maybe { + VLOG(1) << job_name << " is compiling with pass" + << " pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + << (cnt > 0 ? std::to_string(cnt) : ""); if (unlikely(NeedLogJob(pass_name))) { std::string cnt_str = cnt > 0 ? std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-before"); + FLAGS_v = 3; } JUST(JobPass4Name(pass_name)(mut_job(), &job_pass_ctx)); if (unlikely(NeedLogJob(pass_name))) { + FLAGS_v = prev_v; std::string cnt_str = cnt > 0 ? 
std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-after"); } diff --git a/oneflow/core/job/job_builder.cpp b/oneflow/core/job/job_builder.cpp index a7f81384376..b13bd8a67fd 100644 --- a/oneflow/core/job/job_builder.cpp +++ b/oneflow/core/job/job_builder.cpp @@ -18,6 +18,7 @@ limitations under the License. #include "oneflow/core/common/util.h" #include "oneflow/core/common/container_util.h" #include "oneflow/core/job/job.pb.h" +#include "oneflow/core/job/sbp_parallel.pb.h" #include "oneflow/core/operator/operator.h" namespace oneflow { diff --git a/oneflow/core/job/job_conf.proto b/oneflow/core/job/job_conf.proto index 69aa7ad29f0..03638feec30 100644 --- a/oneflow/core/job/job_conf.proto +++ b/oneflow/core/job/job_conf.proto @@ -211,6 +211,7 @@ message JobConfigProto { optional bool enable_gradients_stats_aggregation = 106 [default = true]; optional string optimizer_placement_optimization_mode = 107; optional int64 optimizer_placement_optimization_threshold = 108 [default = 1024]; + optional int64 optimizer_placement_optimization_shard_restore_level = 110 [default = 2]; optional QatConfig qat_config = 109; diff --git a/oneflow/core/job/job_ir.cpp b/oneflow/core/job/job_ir.cpp index 792735a0354..f5552b92514 100644 --- a/oneflow/core/job/job_ir.cpp +++ b/oneflow/core/job/job_ir.cpp @@ -19,6 +19,10 @@ namespace oneflow { #ifndef WITH_MLIR +Maybe ConvertJobToTosaIR(Job* job) { + UNIMPLEMENTED_THEN_RETURN() << "ConvertJobToTosaIR is only supported WITH_MLIR"; +} + Maybe SaveJobToIR(Job* job, const std::string& path) { UNIMPLEMENTED_THEN_RETURN() << "SaveJobToIR is only supported WITH_MLIR"; } diff --git a/oneflow/core/job/job_ir.h b/oneflow/core/job/job_ir.h index c57d0eebeb8..7dbd8da0c31 100644 --- a/oneflow/core/job/job_ir.h +++ b/oneflow/core/job/job_ir.h @@ -21,6 +21,7 @@ limitations under the License. 
namespace oneflow { +Maybe ConvertJobToTosaIR(Job* job); Maybe SaveJobToIR(Job* job, const std::string& path); Maybe LoadJobFromIR(Job* job, const std::string& path); diff --git a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp index 15ca99eab73..d51c171df19 100644 --- a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp +++ b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp @@ -90,6 +90,7 @@ const AMPList& AutoMixedPrecisionLists::ClearList() { "identity", "flatten", "squeeze", + "embedding", "expand_dims", "cast_to_static_shape", "parallel_cast", diff --git a/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp b/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp index 1ca857fd11f..7aaf2e75426 100644 --- a/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp +++ b/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp @@ -13,10 +13,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include "oneflow/core/common/util.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/framework/user_op_conf.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/core/job/sbp_parallel.h" +#include "oneflow/core/job/sbp_parallel.pb.h" #include "oneflow/core/job_rewriter/job_pass.h" #include "oneflow/core/graph/op_graph.h" #include "oneflow/core/job/job_desc.h" +#include "oneflow/core/operator/op_conf.pb.h" +#include "oneflow/core/operator/operator.h" namespace oneflow { @@ -31,7 +40,7 @@ int64_t GetSoleOutBlobSize(const OpNode* node) { class DataParallelNodeSequence final { public: DataParallelNodeSequence(std::vector nodes, int64_t order) - : nodes_(std::move(nodes)), order_(order) { + : nodes_(std::move(nodes)), order_(order), len_(nodes_.size()) { const OpNode* var_node = nodes_.front(); CHECK(var_node->op().op_conf().has_variable_conf()); model_size_ = GetSoleOutBlobSize(var_node); @@ -50,13 +59,23 @@ class DataParallelNodeSequence final { int64_t model_size() const { return model_size_; } + int64_t len() const { return len_; } + + void resize(const int64_t size) { + CHECK(size <= len_); + CHECK(size > 1); + nodes_.resize(size); + len_ = nodes().size(); + } + private: std::vector nodes_; int64_t order_; int64_t model_size_; + int64_t len_; }; -using SequencePtr = std::shared_ptr; +using SequencePtr = std::shared_ptr; ParallelConf NonDistributedParallelConf4ParallelId(const ParallelDesc& pd, const int64_t parallel_id) { @@ -76,7 +95,6 @@ Maybe GetDataParallelVariableAndNaiveSuccNode( // Find sequence like: vairable -> cast_fp32_to_fp16 if (!start->op().op_conf().has_variable_conf()) { return Maybe::Ok(); } const ParallelDesc& pd = start->parallel_desc(); - if (pd.device_type() != DeviceType::kCUDA) { return Maybe::Ok(); } if (pd.parallel_num() == 1) { return Maybe::Ok(); } const OpNode* cur_node = start; while (cur_node != nullptr) { @@ -85,12 +103,21 @@ Maybe GetDataParallelVariableAndNaiveSuccNode( if 
(cur_node->in_edges().size() > 1) { break; } if (cur_node->op().input_bns().size() != 1) { break; } const std::string& sole_ibn = cur_node->op().SoleIbn(); - if (!cur_node->SbpParallel4BnInOp(sole_ibn).has_broadcast_parallel()) { break; } + const NdSbp& ibn_nd_sbp = cur_node->NdSbp4BnInOp(sole_ibn); + bool has_broadcast = false; + FOR_RANGE(int, i, 0, ibn_nd_sbp.sbp_parallel_size()) { + if (ibn_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { has_broadcast = true; }; + } + if (!has_broadcast) { break; } } - if (!IsAllowed(cur_node)) { break; } if (cur_node->op().output_bns().size() != 1) { break; } const std::string& sole_obn = cur_node->op().SoleObn(); - if (!cur_node->SbpParallel4BnInOp(sole_obn).has_broadcast_parallel()) { break; } + const NdSbp& obn_nd_sbp = cur_node->NdSbp4BnInOp(sole_obn); + bool has_broadcast = false; + FOR_RANGE(int, i, 0, obn_nd_sbp.sbp_parallel_size()) { + if (obn_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { has_broadcast = true; }; + } + if (!has_broadcast) { break; } out->emplace_back(cur_node); if (cur_node->out_edges().size() == 1) { cur_node = cur_node->SoleOutEdge()->dst_node(); @@ -123,6 +150,79 @@ void SetBroadcastParallel4Consumers(JobBuilder* builder, const SequencePtr& sequ }); } +void SetNdSbp4OpNodeIbn(JobBuilder* builder, const OpNode* node, const std::string& ibn, + const NdSbp& nd_sbp) { + OpBlobArg op_blob_arg; + op_blob_arg.set_op_name(node->op().op_name()); + op_blob_arg.set_bn_in_op(ibn); + builder->SetNdSbp4Oba(op_blob_arg, nd_sbp); +} + +void SetNdSbp4Consumers(JobBuilder* builder, const SequencePtr& sequence, const NdSbp& nd_sbp) { + const OpNode* node = sequence->GetLastNode(); + const LogicalBlobId& lbi = node->op().BnInOp2Lbi(node->op().SoleObn()); + const int64_t shard_restore_level = + builder->job().job_conf().optimizer_placement_optimization_shard_restore_level(); + // If shard_restore_level == 0, no limit on consumer + if (shard_restore_level == 1) { + // Input lbn for parallel cast op + std::string 
parallel_cast_input_lbn = GenLogicalBlobName(lbi); + // Add indentity to enable mem reuse of boxing op when there is no op between var op and boxing. + if (sequence->len() == 1) { + VLOG(3) << "ZeRO find a data-parallel sequence only has one variable " + << sequence->GetVariableNode()->op().op_name(); + const auto var_identity_op = + user_op::UserOpConfWrapperBuilder("System-ZeRO-Identity-" + node->op().op_name() + "-" + + NewUniqueId()) + .Op("identity") + .Input("in", GenLogicalBlobName(lbi)) + .Output("out") + .ScopeSymbolId(node->op().op_conf().scope_symbol_id()) + .Build(); + builder->AddOps(node->parallel_desc().parallel_conf(), {var_identity_op.op_conf()}); + parallel_cast_input_lbn = var_identity_op.output("out", 0); + } + // Add parallel cast op to make soft limt on consumer to consume weight with Broadcast SBP. + const auto parallel_cast_op = + user_op::UserOpConfWrapperBuilder("System-ZeRO-ParallelCast-" + node->op().op_name() + "-" + + NewUniqueId()) + .Op("hierarchical_parallel_cast") + .Input("in", parallel_cast_input_lbn) + .Output("out") + .Attr>("nd_sbp", NdSbpToStringList(nd_sbp)) + .Attr("grad_mode", "identity") // don't do ndsbp cast at backward + .Attr>("grad_nd_sbp", std::vector()) + .ScopeSymbolId(node->op().op_conf().scope_symbol_id()) + .Build(); + builder->AddOps(node->parallel_desc().parallel_conf(), {parallel_cast_op.op_conf()}); + + // Make consumers to consume parallel cast op + auto out_lbn = parallel_cast_op.output("out", 0); + node->ForEachNodeOnOutEdge([&](const OpNode* out_node) { + for (const std::string& ibn : out_node->op().input_bns()) { + if (out_node->op().BnInOp2Lbi(ibn) == lbi) { + if (!CHECK_JUST(builder->IsInMutOpTransaction(out_node->op().op_name()))) { + CHECK_JUST(builder->MutOpTransactionMut(out_node->op().op_conf())); + } + OperatorConf& mut_consumer_op = + CHECK_JUST(builder->MutOpTransactionGet(out_node->op().op_name())); + const auto& old_lbn = ReplaceInputLbnInOpCustomizedConf(&mut_consumer_op, ibn, out_lbn); + 
CHECK_EQ(old_lbn, GenLogicalBlobName(lbi)); + } + } + }); + } else if (shard_restore_level == 2) { + // Hard limt consumer to consume weight as Broadcast. + node->ForEachNodeOnOutEdge([&](const OpNode* out_node) { + for (const std::string& ibn : out_node->op().input_bns()) { + if (out_node->op().BnInOp2Lbi(ibn) == lbi) { + SetNdSbp4OpNodeIbn(builder, out_node, ibn, nd_sbp); + } + } + }); + } +} + std::function MakeGetterOpNode2TopoOrder(const OpGraph& op_graph) { HashMap op_node2topo_order; int64_t node_cnt = 0; @@ -152,7 +252,7 @@ void ForEachDataParallelNodeSequence(const OpGraph& op_graph, CHECK_JUST(GetDataParallelVariableAndNaiveSuccNode(node, IsAllowed, &nodes)); if (nodes.empty()) { return; } const int64_t order = GetMinConsumerOrder(op_graph, nodes.back(), OpNode2Order); - Handler(std::make_shared(std::move(nodes), order)); + Handler(std::make_shared(std::move(nodes), order)); }); } @@ -188,6 +288,24 @@ bool IsS0Parallel(const SbpSignature& signature, const std::string& bn) { return IsS0Parallel(signature.bn_in_op2sbp_parallel().at(bn)); } +bool IsNdSbpMatch(const NdSbpSignature& signature, const std::string& bn, const NdSbp& nd_sbp) { + return signature.bn_in_op2nd_sbp().at(bn) == nd_sbp; +} + +bool IsNdSbpSupported4Op(const OpNode* node, const NdSbp& nd_sbp) { + if (node->op().input_bns().size() != 1 || node->op().output_bns().size() != 1) { return false; } + std::vector list; + auto LogicalBlobDesc4Ibn = [&](const std::string& bn) -> Maybe { + return Maybe(node->LogicalBlobDesc4Lbi(node->op().BnInOp2Lbi(bn))); + }; + CHECK_JUST(node->op().GetNdSbpSignatureList(LogicalBlobDesc4Ibn, node->parallel_desc(), &list)); + const auto IsInAndOutMatch = [&](const NdSbpSignature& signature) { + return IsNdSbpMatch(signature, node->op().SoleIbn(), nd_sbp) + && IsNdSbpMatch(signature, node->op().SoleObn(), nd_sbp); + }; + return std::any_of(list.cbegin(), list.cend(), IsInAndOutMatch); +} + bool IsS0SignatureSupported(const OpNode* node) { if 
(node->op().input_bns().size() != 1 || node->op().output_bns().size() != 1) { return false; } SbpSignatureList list; @@ -222,42 +340,141 @@ void ForEachModelSizeBalancedPartition( } } -Maybe RewriteDistributedSplit(const OpGraph& op_graph, JobBuilder* builder) { - const int64_t threshold = builder->job().job_conf().optimizer_placement_optimization_threshold(); - const auto IsAllowed = [threshold](const OpNode* n) -> bool { - if (n->op().op_conf().has_variable_conf()) { - const Shape shape(n->op().op_conf().variable_conf().shape()); - const int64_t parallel_num = n->parallel_desc().parallel_num(); - // Parameter needs to be able to evenly splited and one slice size >= threshold - return shape.At(0) % parallel_num == 0 && shape.elem_cnt() >= threshold * parallel_num; +namespace { +bool IsSplitValid(const Shape& shape, const NdSbp& nd_sbp, const Shape& hierachy, + int64_t min_size) { + if (shape.NumAxes() < 1 || shape.elem_cnt() < 1) { return false; } + CHECK_EQ(nd_sbp.sbp_parallel_size(), hierachy.NumAxes()); + Shape cur_shape = shape; + if (cur_shape.elem_cnt() < min_size) { return false; } + FOR_RANGE(int64_t, i, 0, hierachy.NumAxes()) { + const auto& sbp = nd_sbp.sbp_parallel(i); + if (sbp.has_split_parallel()) { + const int64_t dim = sbp.split_parallel().axis(); + if (dim >= cur_shape.NumAxes()) { return false; } + // Evenly split. + if (cur_shape.At(dim) % hierachy.At(i) != 0) { return false; } + cur_shape.Set(dim, cur_shape.At(dim) / hierachy.At(i)); + // Larger then min size. 
+ if (cur_shape.elem_cnt() < min_size) { return false; } + } + } + return true; +} + +void GenerateSplitSignature(const NdSbp& var_nd_sbp, const OperatorConf& new_var_op_conf, + std::string& new_split_signature, int64_t& split_dim) { + if (new_var_op_conf.variable_conf().nd_sbp_size() > 0 && NdSbpIsAllBroadcast(var_nd_sbp)) { + // split last dim + split_dim = new_var_op_conf.variable_conf().nd_sbp_size() - 1; + // All B, B -> S0 + new_split_signature = "S(0)"; + } else { + // ND sbp, (*, B, S, *) -> (*, S, S, *) + // ND sbp, (*, S, B, *) -> (*, S, S, *) + FOR_RANGE(int64_t, j, 0, new_var_op_conf.variable_conf().nd_sbp_size()) { + if (new_var_op_conf.variable_conf().nd_sbp(j) == "B") { + std::vector adjacent_dim{j - 1, j + 1}; + for (auto const& dim_to_try : adjacent_dim) { + if (dim_to_try >= 0 && dim_to_try < new_var_op_conf.variable_conf().nd_sbp_size()) { + SbpParallel sbp; + if (ParseSbpParallelFromString(new_var_op_conf.variable_conf().nd_sbp(dim_to_try), &sbp) + && sbp.has_split_parallel()) { + new_split_signature = new_var_op_conf.variable_conf().nd_sbp(dim_to_try); + split_dim = j; + } + } + if (new_split_signature != "") break; + } + } + // Only split one more dim. + if (new_split_signature != "") break; + } + } +} +void ShardSequence(JobBuilder* builder, const int64_t threshold, const ParallelDesc& pd, + std::vector&& sorted_sequences) { + // For all sorted sequnence, set the variable op in the sequence to S + // and add ctrl edge to control the exectuion order between variable ops. + // A sequence is a variable op and its cast(fp32 to fp16) op. This is because the forward pass + // consume the fp16 variable and the optimizer consume the fp32 variable. 
+ std::string prev_allowed_op_name = ""; + for (int64_t i = 0; i < sorted_sequences.size(); ++i) { + const OpNode* var_node = sorted_sequences.at(i)->GetVariableNode(); + OperatorConf new_var_op_conf = var_node->op().op_conf(); + const std::string& sole_obn = var_node->op().SoleObn(); + const NdSbp& var_nd_sbp = var_node->NdSbp4BnInOp(sole_obn); + const Shape& logical_shape = Shape(new_var_op_conf.variable_conf().shape()); + + std::string new_split_signature = ""; + int64_t split_dim = 0; + GenerateSplitSignature(var_nd_sbp, new_var_op_conf, new_split_signature, split_dim); + if (new_split_signature != "") { + *new_var_op_conf.mutable_variable_conf()->mutable_nd_sbp(split_dim) = new_split_signature; } else { - return IsS0SignatureSupported(n); + continue; + } + + bool split_is_allowed = true; + if (split_is_allowed) { + NdSbp new_nd_sbp; + std::vector nd_sbp_str_vec; + for (const auto& sbp_str : new_var_op_conf.variable_conf().nd_sbp()) { + nd_sbp_str_vec.push_back(sbp_str); + } + ParseNdSbpFromStringList(nd_sbp_str_vec, &new_nd_sbp); + // check allowed by min shard size and evenly split + if (split_is_allowed) { + split_is_allowed = IsSplitValid(logical_shape, new_nd_sbp, *pd.hierarchy(), threshold); + } + if (split_is_allowed) { + // resize sequence by new nd sbp limit + auto& cur_seq = sorted_sequences.at(i); + int64_t max_len = 1; + if (cur_seq->len() > 1) { + FOR_RANGE(int64_t, node_idx, 1, cur_seq->len()) { + if (IsNdSbpSupported4Op(cur_seq->nodes().at(node_idx), new_nd_sbp)) { + ++max_len; + } else { + break; + } + } + } + if (max_len < cur_seq->len()) { cur_seq->resize(max_len); } + } } + if (!split_is_allowed) { + VLOG(3) << var_node->op().op_name() << " failed to change form B to S " + << " with op conf " << new_var_op_conf.variable_conf().DebugString(); + continue; + } + if (i != 0) { new_var_op_conf.add_ctrl_in_op_name(prev_allowed_op_name); } + builder->MutOpsOnlyOnce({new_var_op_conf}); + // Set consumers to consum this variable op's cast op's output 
as Broadcast. + if (new_split_signature != "") { + SetNdSbp4Consumers(builder, sorted_sequences.at(i), var_nd_sbp); + } + prev_allowed_op_name = var_node->op().op_name(); + VLOG(3) << var_node->op().op_name() << " succeed to change form B to " << new_split_signature + << " on ranks dim " << split_dim << " with op conf " + << new_var_op_conf.variable_conf().DebugString(); + } +} +} // namespace + +Maybe RewriteDistributedSplit(const OpGraph& op_graph, JobBuilder* builder) { + const int64_t threshold = builder->job().job_conf().optimizer_placement_optimization_threshold(); + const auto IsAllowed = [](const OpNode* n) -> bool { + // No need to limit here. + return true; }; const auto PlacementSequencesAsSplitParallel = [&](const ParallelDesc& pd, std::vector&& sorted_sequences) { - // For all sorted sequnence, set the variable op in the sequence to S(0) - // and add ctrl edge to control the exectuion order between variable ops. - // A sequence is a variable op and its cast(fp32 to fp16) op. This is because the forward pass - // consume the fp16 variable and the optimizer consume the fp32 variable. - for (int64_t i = 0; i < sorted_sequences.size(); ++i) { - const OpNode* var_node = sorted_sequences.at(i)->GetVariableNode(); - OperatorConf new_var_op_conf = var_node->op().op_conf(); - CHECK_EQ(pd.hierarchy()->NumAxes(), 1); - new_var_op_conf.mutable_variable_conf()->clear_nd_sbp(); - *new_var_op_conf.mutable_variable_conf()->add_nd_sbp() = "S(0)"; - if (i != 0) { - const std::string& prev_op_name = - sorted_sequences.at(i - 1)->GetVariableNode()->op().op_name(); - new_var_op_conf.add_ctrl_in_op_name(prev_op_name); - } - builder->MutOpsOnlyOnce({new_var_op_conf}); - // Set consumers to consum this variable op's cast op's output as Broadcast. 
- SetBroadcastParallel4Consumers(builder, sorted_sequences.at(i)); - } + ShardSequence(builder, threshold, pd, std::forward>(sorted_sequences)); }; ForEachParallelSortedNodeSequence(op_graph, IsAllowed, SequenceCompSortedByOrderAsc, PlacementSequencesAsSplitParallel); + JUST(builder->MutOpTransactionCommit()); return Maybe::Ok(); } @@ -313,7 +530,8 @@ class OptimizerPlacementOptimizationPass final : public JobPass { Maybe Apply(Job* job, JobPassCtx* ctx) const override { if (!(ctx->job_desc().IsTrain() - && ctx->job_desc().job_conf().has_optimizer_placement_optimization_mode())) { + && ctx->job_desc().job_conf().has_optimizer_placement_optimization_mode() + && ctx->job_desc().job_conf().optimizer_placement_optimization_mode() != "none")) { return Maybe::Ok(); } const std::string& mode = ctx->job_desc().job_conf().optimizer_placement_optimization_mode(); diff --git a/oneflow/extension/python/numpy_internal.h b/oneflow/extension/python/numpy_internal.h index fd3e9594034..84590a38990 100644 --- a/oneflow/extension/python/numpy_internal.h +++ b/oneflow/extension/python/numpy_internal.h @@ -22,7 +22,7 @@ limitations under the License. 
// ************************ #include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/common/shape_vec.h" // PyArrayObject cannot be forward declared, or a compile error will occur diff --git a/oneflow/ir/install-llvm.cmake b/oneflow/ir/install-llvm.cmake index e7c09ba1aae..1b7226dccab 100644 --- a/oneflow/ir/install-llvm.cmake +++ b/oneflow/ir/install-llvm.cmake @@ -49,6 +49,8 @@ if(NOT llvm_monorepo_POPULATED) endif() endif() +set(LLVM_INCLUDE_DIRS ${llvm_monorepo_SOURCE_DIR}/llvm/include;${llvm_monorepo_BINARY_DIR}/include) + if(WITH_MLIR) set(LLVM_DIR ${LLVM_INSTALL_DIR}/lib/cmake/llvm) set(MLIR_DIR ${LLVM_INSTALL_DIR}/lib/cmake/mlir) diff --git a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp index 488198828ac..80836da5786 100644 --- a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp +++ b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp @@ -14,21 +14,26 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "OneFlow/OneFlowOps.h" +#include #include #include #include "OneFlow/OneFlowDialect.h" #include "OneFlow/Passes.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/Passes.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Pass/Pass.h" @@ -36,11 +41,46 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/variable_tensor_mgr.h" + +#include namespace mlir { namespace oneflow { +Value CreateTranspose(Location& loc, ConversionPatternRewriter& rewriter, Value input, + ArrayRef perms) { + int perms_size = perms.size(); + auto transpose_perms = rewriter.create( + loc, RankedTensorType::get({perms_size}, rewriter.getI32Type()), + rewriter.getI32TensorAttr(perms)); + const auto shape_type = input.getType().cast(); + std::vector ranked_type; + for (const auto& index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return rewriter.create( + loc, RankedTensorType::get(ranked_type, shape_type.getElementType()), input, transpose_perms); +}; + +Value CreateBNOp(Location loc, ConversionPatternRewriter& rewriter, Value output, Value x, + Value mean, Value variance, Value epsilon, Value gamma, Value beta) { + const auto output_type = output.getType(); + // sub_op = sub(input, mean) + auto 
sub_op0 = rewriter.create(loc, output_type, x, mean); + // add_op0 = add(var, epsilon) + auto add_op0 = rewriter.create(loc, variance.getType(), variance, epsilon); + // rsqrt_op = rsqrt(add_op0) + auto rsqrt_op = rewriter.create(loc, variance.getType(), add_op0); + // op4 = mul(sub_op, rsqrt_op) + auto mul_op0 = rewriter.create(loc, output_type, sub_op0, rsqrt_op, 0); + // op5 = mul(mul_op0, gamma) + auto mul_op1 = rewriter.create(loc, output_type, mul_op0, gamma, 0); + // op6 = add(mul_op1, beta) + auto batch_norm = rewriter.create(loc, output_type, mul_op1, beta); + return batch_norm; +}; + struct ScalarMulByTensorOpLowering final : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -72,18 +112,421 @@ struct ScalarMulByTensorOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Job op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto func = + rewriter.create(op.getLoc(), op.getName(), op.getFunctionType()); + rewriter.inlineRegionBefore(op.getRegion(), func.getBody(), func.end()); + rewriter.eraseOp(op); + return success(); + } +}; + +struct ReturnOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReturnOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp(op, + /* operands */ op.operands()); + return success(); + } +}; + +struct InputOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(InputOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more choices to passing data between tosa and oneflow + const auto newValues = op.input(); + const auto is_block_arg = newValues.dyn_cast() != nullptr; + if (!is_block_arg) op->emitError("input is not 
block arg"); + rewriter.replaceOp(op, newValues); + return success(); + } +}; + +struct OutputOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(OutputOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more choices to passing data between tosa and oneflow + const auto newValues = op.input(); + rewriter.replaceOp(op, newValues); + return success(); + } +}; + +struct VariableOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(VariableOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto mgr = ::oneflow::Global<::oneflow::VariableTensorMgr>::Get(); + // decide whether call by python or not + if (!mgr) op->emitError("oneflow variable op doesn't support pure mlir file conversion"); + + const auto tensor = mgr->Get(op.op_name().str()); + const auto value = support::TensorToDenseElementsAttr(tensor, rewriter.getContext()); + const auto output = op.output().getType(); + + rewriter.replaceOpWithNewOp(op, output, value); + return success(); + } +}; + struct CastOpLowering final : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(CastOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const override { - rewriter.replaceOpWithNewOp(op, - /* output */ op.out().getType(), - /* input */ op.in()); + auto output = op.out().getType(); + auto input = op.in(); + rewriter.replaceOpWithNewOp(op, output, input); + return success(); + } +}; + +struct ReluOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReluOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto floatMax = std::numeric_limits::max(); + const auto intMax = 
std::numeric_limits::max(); + + const auto output = op.y().getType(); + auto input = op.x(); + auto max_int = static_cast(intMax); + auto max_fp = static_cast<::llvm::APFloat>(floatMax); + + rewriter.replaceOpWithNewOp(op, output, input, max_int, max_fp); + return success(); + } +}; + +struct BroadcastAddOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(BroadcastAddOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.z().getType(); + auto input1 = op.x(); + auto input2 = op.y(); + + rewriter.replaceOpWithNewOp(op, output, input1, input2); + return success(); + } +}; + +struct Add2OpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Add2Op op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.out().getType(); + auto input1 = op.in0(); + auto input2 = op.in1(); + + rewriter.replaceOpWithNewOp(op, output, input1, input2); + return success(); + } +}; + +struct AvgPool2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(AvgPool2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + + auto stride_pairs = get_pair_int64_from_array(op.stride()); + auto pad_pairs = get_pair_int64_from_array(op.padding()); + auto kernel_pairs = 
get_pair_int64_from_array(op.kernel_size()); + + auto loc = op.getLoc(); + auto perms = {0, 2, 3, 1}; + + const auto kernel = rewriter.getI64ArrayAttr({kernel_pairs.first, kernel_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + + auto input = CreateTranspose(loc, rewriter, op.x(), perms); + auto output = reshape_type(op.y().getType().cast(), perms); + + auto avg_pool2d = rewriter.create(loc, output, input, kernel, stride, pad); + + auto out = CreateTranspose(loc, rewriter, avg_pool2d, {0, 3, 1, 2}); + rewriter.replaceOp(op, {out}); + return success(); + } +}; + +struct MaxPool2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(MaxPool2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + // TODO: support return indice + if (op.return_indices()) op->emitError("not support return indices now"); + auto stride_pairs = get_pair_int64_from_array(op.stride()); + auto kernel_pairs = get_pair_int64_from_array(op.kernel_size()); + auto pad_pairs = get_pair_int64_from_array(op.padding()); + + auto loc = op.getLoc(); + auto perms = {0, 2, 3, 1}; + + const auto kernel = rewriter.getI64ArrayAttr({kernel_pairs.first, kernel_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto pad = 
rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + + auto input = CreateTranspose(loc, rewriter, op.x(), perms); + auto output = reshape_type(op.y().getType().cast(), perms); + + auto max_pool2d = rewriter.create(loc, output, input, kernel, stride, pad); + + auto y = CreateTranspose(loc, rewriter, max_pool2d, {0, 3, 1, 2}); + + auto indice_output = op.indice().getType(); + auto value = DenseElementsAttr::get(indice_output, rewriter.getZeroAttr(rewriter.getI64Type())); + + auto indice = rewriter.create(loc, indice_output, value); + rewriter.replaceOp(op, {y, indice}); + return success(); + } +}; + +struct FlattenOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(FlattenOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto start_dim = op.start_dim(); + const auto end_dim = op.end_dim(); + const auto in_type = op.in().getType(); + + const auto in_shape = in_type.cast(); + const auto rank = in_type.dyn_cast().getRank(); + + // calculate reshape_vec + std::vector reshape_vec; + for (auto dim = 0; dim < start_dim; ++dim) { reshape_vec.push_back(in_shape.getDimSize(dim)); } + auto last_dim = end_dim < 0 ? 
rank : end_dim + 1; + int flatten_size = 1; + for (auto dim = start_dim; dim < last_dim; ++dim) { flatten_size *= in_shape.getDimSize(dim); } + reshape_vec.push_back(flatten_size); + if (end_dim > 0) { + for (auto dim = end_dim + 1; dim < rank; ++dim) { + reshape_vec.push_back(in_shape.getDimSize(dim)); + } + } + // generate reshape op + const auto output = RankedTensorType::get(reshape_vec, in_shape.getElementType()); + auto input1 = op.in(); + auto new_shape = rewriter.getI64ArrayAttr(reshape_vec); + + rewriter.replaceOpWithNewOp(op, output, input1, new_shape); return success(); } }; +struct MatmulOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(MatmulOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more throw for robust in matmul shape rank + auto loc = op.getLoc(); + + auto preprocess = [&](Value matrix, bool transpose) -> Value { + auto shape_type = matrix.getType().cast(); + if (transpose) { matrix = CreateTranspose(loc, rewriter, matrix, {1, 0}); } + + shape_type = matrix.getType().cast(); + auto reshape_type = RankedTensorType::get( + {1, shape_type.getDimSize(0), shape_type.getDimSize(1)}, shape_type.getElementType()); + + return rewriter.create( + op.getLoc(), reshape_type, matrix, + rewriter.getI64ArrayAttr({1, shape_type.getDimSize(0), shape_type.getDimSize(1)})); + }; + + auto a = preprocess(op.a(), op.transpose_a()); + auto b = preprocess(op.b(), op.transpose_b()); + + const auto out_shape_type = op.out().getType().cast(); + const auto out_reshape_type = + RankedTensorType::get({1, out_shape_type.getDimSize(0), out_shape_type.getDimSize(1)}, + out_shape_type.getElementType()); + + auto matmul = rewriter.create(loc, out_reshape_type, a, b); + const auto new_shape = + rewriter.getI64ArrayAttr({out_shape_type.getDimSize(0), out_shape_type.getDimSize(1)}); + + rewriter.replaceOpWithNewOp(op, out_shape_type, matmul, 
new_shape); + return success(); + } +}; + +struct NormalizationInferenceOpLowering final + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(NormalizationInferenceOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto reshape_dim = [&](Type type, Value value) -> Value { + RankedTensorType in_type = value.getType().dyn_cast(); + RankedTensorType out_type = type.cast(); + SmallVector new_shape = {in_type.getShape()[0]}; + for (auto i = 2; i < out_type.getRank(); ++i) new_shape.push_back(1); + auto new_type = RankedTensorType::get(new_shape, out_type.getElementType()); + return rewriter.create(op->getLoc(), new_type, value, + rewriter.getI64ArrayAttr(new_shape)); + }; + + auto loc = op->getLoc(); + const auto out_type = op.y().getType(); + + const auto epsilon_type = RankedTensorType::get({}, rewriter.getF32Type()); + // epsilon = reshape(epsilon, shape_1) + auto epsilon = rewriter.create( + loc, epsilon_type, DenseElementsAttr::get(epsilon_type, op.epsilon())); + // mean = reshape(mean, shape_0) + auto mean = reshape_dim(out_type, adaptor.moving_mean()); + // variance= reshape(variance, shape_0) + auto variance = reshape_dim(out_type, adaptor.moving_variance()); + // scale = reshape(scale, shape_0) + auto gamma = reshape_dim(out_type, adaptor.gamma()); + // beta = reshape(beta, shape_0) + auto beta = reshape_dim(out_type, adaptor.beta()); + auto output = op.y(); + auto x = op.x(); + + auto batch_norm = + oneflow::CreateBNOp(loc, rewriter, output, x, mean, variance, epsilon, gamma, beta); + rewriter.replaceOp(op, {batch_norm}); + return success(); + } +}; + +struct NormalizationOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(NormalizationOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto reshape_dim = [&](Type type, Value value) -> 
Value { + const RankedTensorType in_type = value.getType().dyn_cast(); + const RankedTensorType out_type = type.cast(); + SmallVector new_shape = {in_type.getShape()[0]}; + for (auto i = 2; i < out_type.getRank(); ++i) new_shape.push_back(1); + const auto new_type = RankedTensorType::get(new_shape, out_type.getElementType()); + return rewriter.create(op->getLoc(), new_type, value, + rewriter.getI64ArrayAttr(new_shape)); + }; + + auto loc = op->getLoc(); + const auto out_type = op.y().getType(); + + const auto epsilon_type = RankedTensorType::get({}, rewriter.getF32Type()); + // epsilon = reshape(epsilon, shape_1) + auto epsilon = rewriter.create( + loc, epsilon_type, DenseElementsAttr::get(epsilon_type, op.epsilon())); + // mean = reshape(mean, shape_0) + auto mean = reshape_dim(out_type, adaptor.moving_mean()); + // variance= reshape(variance, shape_0) + auto variance = reshape_dim(out_type, adaptor.moving_variance()); + // scale = reshape(scale, shape_0) + auto gamma = reshape_dim(out_type, adaptor.gamma()); + // beta = reshape(beta, shape_0) + auto beta = reshape_dim(out_type, adaptor.beta()); + auto output = op.y(); + auto x = op.x(); + + auto batch_norm = + oneflow::CreateBNOp(loc, rewriter, output, x, mean, variance, epsilon, gamma, beta); + auto moving_mean = op.moving_mean(); + auto moving_variance = op.moving_variance(); + + rewriter.replaceOp(op, {batch_norm, moving_mean, moving_variance}); + return success(); + } +}; + +struct Conv2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Conv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) 
ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + + auto stride_pairs = get_pair_int64_from_array(op.strides()); + auto pad_pairs = get_pair_int64_from_array(op.padding_beforeAttr()); + auto dilation_pairs = get_pair_int64_from_array(op.dilation_rate()); + + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto dilation = rewriter.getI64ArrayAttr({dilation_pairs.first, dilation_pairs.second}); + + auto bias = op.bias(); + auto loc = op.getLoc(); + if (!bias) { + const auto output_shape = op.out().getType().cast(); + const auto output_channels = output_shape.getDimSize(1); + const auto bias_elem_type = output_shape.getElementType(); + const auto type = RankedTensorType::get(output_channels, bias_elem_type); + bias = rewriter.create( + op.getLoc(), type, DenseElementsAttr::get(type, rewriter.getZeroAttr(bias_elem_type))); + } + + auto perms = {0, 2, 3, 1}; + auto in = CreateTranspose(loc, rewriter, op.in(), perms); + auto weight = CreateTranspose(loc, rewriter, op.weight(), perms); + const auto output = reshape_type(op.out().getType().cast(), perms); + + auto conv2d = + rewriter.create(loc, output, in, weight, bias, pad, stride, dilation); + + auto res = CreateTranspose(loc, rewriter, conv2d, {0, 3, 1, 2}); + rewriter.replaceOp(op, {res}); + return success(); + getTypeConverter(); + } +}; + namespace { struct OneFlowLoweringToTosaPass : public LowerOneFlowToTosaPassBase { void runOnOperation() override; @@ -95,11 +538,20 @@ std::unique_ptr createLowerOneFlowToTosaPass() { } void OneFlowLoweringToTosaPass::runOnOperation() { - ConversionTarget target(getContext()); - target.addLegalDialect(); + MLIRContext* context = &getContext(); + ConversionTarget target(*context); + target.addLegalDialect(); 
target.addIllegalDialect(); - RewritePatternSet patterns(&getContext()); - patterns.insert(&getContext()); + + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) { return type; }); + RewritePatternSet patterns(context); + patterns.add(typeConverter, context); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { getOperation()->dump(); signalPassFailure(); diff --git a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp index d4093babaf9..6a4e3bb380b 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp @@ -69,9 +69,9 @@ ::oneflow::Symbol<::oneflow::Device> MakeDevice(const mlir::Attribute& device_ta return ::oneflow::Device::ParseAndNew(device_info).GetOrThrow(); } -template +template mlir::DenseElementsAttr __TensorToDenseElementsAttr( - const std::shared_ptr<::oneflow::one::Tensor>& tensor, const mlir::FloatType& float_type) { + const std::shared_ptr<::oneflow::one::Tensor>& tensor, const MLIR_T& mlir_type) { ::oneflow::LazyMode::Guard guard{false}; const auto tensor_ = ::oneflow::one::functional::ToContiguous(tensor).GetPtrOrThrow(); auto shape = tensor_->shape(); @@ -81,7 +81,7 @@ mlir::DenseElementsAttr __TensorToDenseElementsAttr( CHECK_JUST(::oneflow::BlobBufferCopyUtil::To(ofblob_ptr, data.data(), data.size())); }; ::oneflow::one::SyncAccessTensorWithTimeOut(tensor_, callback, "const").GetOrThrow(); - return mlir::DenseElementsAttr::get(mlir::RankedTensorType::get(shape_vec, float_type), + return mlir::DenseElementsAttr::get(mlir::RankedTensorType::get(shape_vec, mlir_type), llvm::makeArrayRef(data)); } @@ -115,7 +115,12 @@ mlir::DenseElementsAttr TensorToDenseElementsAttr( const std::shared_ptr<::oneflow::one::Tensor>& tensor, MLIRContext* ctx) { const auto dtype = tensor->dtype()->data_type(); if (dtype == ::oneflow::DataType::kFloat) { - return __TensorToDenseElementsAttr(tensor, mlir::FloatType::getF32(ctx)); + return 
__TensorToDenseElementsAttr(tensor, + mlir::FloatType::getF32(ctx)); + } else if (dtype == ::oneflow::DataType::kInt64) { + auto mlir_type = mlir::IntegerType::get( + ctx, 64, mlir::IntegerType::SignednessSemantics::Signed); + return __TensorToDenseElementsAttr(tensor, mlir_type); + } llvm::errs() << "Converting oneflow::Tensor to mlir::DenseElementsAttr only support float32 and int64 now." << "\n"; exit(EXIT_FAILURE); @@ -132,8 +137,9 @@ std::shared_ptr<::oneflow::one::Tensor> DenseElementsAttrToTensor( return __DenseElementsAttrToTensor(dense_attr_, device_tag_attr, device_name_attr, ::oneflow::DataType::kFloat); } - llvm::errs() << "Converting mlir::DenseElementsAttr to oneflow::Tensor only support float32 now." - << "\n"; + llvm::errs() + << "Converting mlir::DenseElementsAttr to oneflow::Tensor only support float32 and int64 now." + << "\n"; exit(EXIT_FAILURE); } diff --git a/oneflow/ir/oneflow-extension/ir_pass.cpp b/oneflow/ir/oneflow-extension/ir_pass.cpp index e339b95f077..038c9578f55 100644 --- a/oneflow/ir/oneflow-extension/ir_pass.cpp +++ b/oneflow/ir/oneflow-extension/ir_pass.cpp @@ -177,6 +177,11 @@ Maybe IRRoundTrip::Apply(Job* job, JobPassCtx* ctx) const { template class IRRoundTrip; template class IRRoundTrip; +Maybe ConvertJobToTosaIR(Job* job) { + RoundTripOneFlowJobWrapper job_wrapper(job); + return ::mlir::oneflow::ConvertJobToTosaIR(job_wrapper); +} + Maybe SaveJobToIR(Job* job, const std::string& path) { // TODO: check path is valid dir if (IsInDebugMode()) { TeePersistentLogStream::Create("saved_job")->Write(*job); } diff --git a/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h b/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h index 7d8baa77f68..b2afa4fbae1 100644 --- a/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h +++ b/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h @@ -150,6 +150,7 @@ void RoundTripOneFlowJob( void registerFromOneFlowJobTranslation(); +std::string 
ConvertJobToTosaIR(RoundTripOneFlowJobWrapperInterface& job_wrapper); void SaveJobToIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path); void LoadJobFromIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path); diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp index fb876048a99..9491b715593 100644 --- a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp +++ b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp @@ -850,6 +850,35 @@ void RoundTripOneFlowJob( } } +std::string ConvertJobToTosaIR(RoundTripOneFlowJobWrapperInterface& job_wrapper) { + const ::oneflow::Job* job = job_wrapper.job(); + mlir::MLIRContext context; + context.getOrLoadDialect(); + context.loadDialect(); + + OwningOpRef module( + ModuleOp::create(FileLineColLoc::get(&context, "", /*line=*/0, /*column=*/0))); + JobImporter imp(job_wrapper, &context, module.get()); + if (succeeded(imp.ProcessJob())) { + mlir::PassManager pm(&context); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createLowerOneFlowToTosaPass()); + if (mlir::failed(pm.run(*module))) { + module->emitError("Failed to run oneflow-to-tosa pass"); + exit(EXIT_FAILURE); + } + + std::string mlir; + llvm::raw_string_ostream os_mlir(mlir); + module->print(os_mlir); + return mlir; + } else { + const auto& job_name = job->job_conf().job_name(); + llvm::errs() << "fail to convert job to IR, job_name: " << job_name << "\n"; + exit(EXIT_FAILURE); + } +} + void SaveJobToIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path) { const ::oneflow::Job* job = job_wrapper.job(); mlir::MLIRContext context; diff --git a/oneflow/ir/test/Frontend/OneFlowToIree.mlir b/oneflow/ir/test/Frontend/OneFlowToIree.mlir new file mode 100644 index 00000000000..834063b7a71 --- /dev/null +++ b/oneflow/ir/test/Frontend/OneFlowToIree.mlir @@ -0,0 +1,266 @@ +// RUN: 
oneflow-opt %s \ +// RUN: -split-input-file \ +// RUN: -lower-oneflow-to-tosa \ +// RUN: -verify-diagnostics -o - \ +// RUN: | ireec \ +// RUN: --iree-input-type=tosa \ +// RUN: --iree-vm-bytecode-module-output-format=flatbuffer-binary \ +// RUN: -iree-hal-target-backends=dylib-llvm-aot \ +// RUN: -iree-mlir-to-vm-bytecode-module - + + +oneflow.job @test_func(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + oneflow.return %arg0 : tensor<1xf32> +} + + +oneflow.job @test_input(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.input"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +oneflow.job @test_output(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.output"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +oneflow.job @test_variable() -> tensor<64x3x7x7xf32> +{ + %res = "oneflow.variable"() { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + nd_sbp = ["B"], + op_name = "fw.model.conv1.weight", + output_lbns = ["fw.model.conv1.weight/out"], + scope_symbol_id = 4611686018427432959 : i64, + shape = [64 : si64, 3 : si64, 7 : si64, 7 : si64] + } : () -> tensor<64x3x7x7xf32> + oneflow.return %res : tensor<64x3x7x7xf32> +} + + +oneflow.job @test_add_n2(%arg0: tensor<1x7x7xf32>, %arg1: tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +{ + %res = "oneflow.add_n2"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + 
op_type_name = "add_n", + output_lbns = [""], + scope_symbol_id = 4611686018431205375 : i64 + } : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> + oneflow.return %res: tensor<1x7x7xf32> +} + + +oneflow.job @test_broadcast_add(%arg0: tensor<1x1000xf32>, %arg1: tensor<1000xf32>) -> tensor<1x1000xf32> +{ + %res = "oneflow.broadcast_add"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64 + } : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +oneflow.job @test_max_pool_2d(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> +{ + %y, %indice = "oneflow.max_pool_2d"(%arg0) + { + ceil_mode = false, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation = [1 : si32, 1 : si32], + hierarchy = [1], kernel_size = [3 : si32, 3 : si32], + op_name = "", + output_lbns = ["", ""], + padding = [1 : si32, 1 : si32], + return_indices = false, + scope_symbol_id = 4611686018427502591 : i64, + stride = [2 : si32, 2 : si32] + } : (tensor<1x64x112x112xf32>) -> (tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>) + oneflow.return %y : tensor<1x64x56x56xf32> +} + + +oneflow.job @test_avg_pool_2d(%arg0: tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> +{ + %res = "oneflow.avg_pool_2d"(%arg0) + { + ceil_mode = false, + count_include_pad = true, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + divisor_override = 0 : si32, + hierarchy = [1], + kernel_size = [7 : si32, 7 : si32], + op_name = "model.avgpool-avg_pool_2d-172", + output_lbns = ["model.avgpool-avg_pool_2d-172/y_0"], + padding = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018430775295 : i64, + stride = [7 : si32, 7 : si32] + } : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> + oneflow.return %res: tensor<1x2048x1x1xf32> +} + + +oneflow.job @test_conv2d(%arg0: 
tensor<1x3x224x224xf32>, %arg1: tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> +{ + %res = "oneflow.conv2d"(%arg0, %arg1) + { + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation_rate = [1 : si32, 1 : si32], + filters = 512 : si32, + groups = 1 : si32, + hierarchy = [1], + kernel_size = [1 : si32, 1 : si32], + op_name = "", + operand_segment_sizes = dense<[1, 1, 0, 0]> : vector<4xi32>, + output_lbns = [""], + padding_before = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018431012863 : i64, + strides = [1 : si32, 1 : si32] + } : (tensor<1x3x224x224xf32>, tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> + oneflow.return %res : tensor<1x5x224x224xf32> +} + + +oneflow.job @test_flatten(%arg0: tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +{ + %res = "oneflow.flatten"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + end_dim = 2 : si32, + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431217663 : i64, + start_dim = 1 : si32 + } : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> + oneflow.return %res : tensor<4x6x1xf32> +} + + +oneflow.job @test_matmul(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>) ->tensor<1x1000xf32> +{ + %res = "oneflow.matmul"(%arg0, %arg1) + { + alpha = 1.000000e+00 : f64, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64, + transpose_a = false, + transpose_b = true + } : (tensor<1x2048xf32>, tensor<1000x2048xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +oneflow.job @test_relu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %res = "oneflow.relu"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427424767 : i64 + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + +oneflow.job @test_bn( +%x: 
tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y, %mean, %inv_variance = "oneflow.normalization"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + oneflow.return %y: tensor<1x64x112x112xf32> +} + +oneflow.job @test_bn_infer( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y = "oneflow.normalization_infer"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x64x112x112xf32> + oneflow.return %y: tensor<1x64x112x112xf32> +} diff --git a/oneflow/ir/test/Frontend/lit.local.cfg b/oneflow/ir/test/Frontend/lit.local.cfg new file mode 100644 index 00000000000..a63a9b31aa9 --- /dev/null +++ b/oneflow/ir/test/Frontend/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.WITH_ONEFLOW_IREE: + 
config.unsupported = True diff --git a/oneflow/ir/test/Frontend/test_iree_resnet.py b/oneflow/ir/test/Frontend/test_iree_resnet.py new file mode 100644 index 00000000000..7ac4e226965 --- /dev/null +++ b/oneflow/ir/test/Frontend/test_iree_resnet.py @@ -0,0 +1,113 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +# RUN: python3 %s + +from oneflow_iree.compiler import Runner +from flowvision.models import resnet50 +import oneflow as flow +import oneflow.unittest +import unittest +import os +import numpy as np +import time + + +os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] = "1" + + +def _test_iree_resnet_cpu(test_case): + model = resnet50(pretrained=True) + model.eval() + + class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + func = Runner(GraphModule, return_numpy=True) + input = flow.ones([1, 3, 224, 224]) + f = GraphModule() + for iter in range(3): + print("======== in cpu iter" + str(iter + 1)) + iree_output = func(input) + start_time = time.time() + graph_output = f(input) + gap = time.time() - start_time + print("graph cost: " + str(gap)) + graph_output = graph_output.cpu().detach().numpy() + rtol = np.abs((graph_output - iree_output) / iree_output) + np.set_printoptions(threshold=np.inf) + print( + np.transpose( + np.concatenate((graph_output, iree_output, rtol), axis=0), [1, 0] + ) + ) + # the rtol accumulate 
layer by layer + test_case.assertTrue( + np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) + ) + + +def _test_iree_resnet_cuda(test_case): + model = resnet50(pretrained=True).cuda() + model.eval() + + class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + func = Runner(GraphModule, return_numpy=True).cuda() + input = flow.ones([1, 3, 224, 224]).cuda() + f = GraphModule() + for iter in range(3): + print("======== in cuda iter" + str(iter + 1)) + iree_output = func(input) + start_time = time.time() + graph_output = f(input) + gap = time.time() - start_time + print("graph cost: " + str(gap)) + graph_output = graph_output.cpu().detach().numpy() + rtol = np.abs((graph_output - iree_output) / iree_output) + np.set_printoptions(threshold=np.inf) + print( + np.transpose( + np.concatenate((graph_output, iree_output, rtol), axis=0), [1, 0] + ) + ) + # the rtol accumulate layer by layer + test_case.assertTrue( + np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) + ) + + +@flow.unittest.skip_unless_1n1d() +class TestIreeResnet(oneflow.unittest.TestCase): + def test_iree_resnet_cpu(test_case): + _test_iree_resnet_cpu(test_case) + + def test_iree_resnet_cuda(test_case): + _test_iree_resnet_cuda(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/oneflow/ir/test/Frontend/test_iree_runner.py b/oneflow/ir/test/Frontend/test_iree_runner.py new file mode 100644 index 00000000000..a0caa90fecd --- /dev/null +++ b/oneflow/ir/test/Frontend/test_iree_runner.py @@ -0,0 +1,71 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +# RUN: python3 %s + +from oneflow_iree.compiler import Runner +import oneflow as flow +import oneflow.unittest +import unittest +import numpy as np + + +class RELU(flow.nn.Module): + def __init__(self): + super().__init__() + self.relu = flow.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + +class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.fw = RELU() + + def build(self, x): + return self.fw(x) + + +def _test_check_iree_runner(test_case): + func = Runner(GraphModule, return_numpy=True).cuda() + # run on iree cuda backend + input = flow.Tensor([-1.0, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0])) + # change input shape + input = flow.Tensor([-1.0, 1.0, -1]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0, 0.0])) + # change on iree cpu backend + func = func.cpu() + input = flow.Tensor([-1.0, 0.0, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 0.0, 1.0])) + # change input shape + input = flow.Tensor([-1, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0])) + + +@flow.unittest.skip_unless_1n1d() +class TestCheckIreeRunner(oneflow.unittest.TestCase): + def test_check_iree_runner(test_case): + _test_check_iree_runner(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir new file mode 100644 index 00000000000..34ee5b499dc --- /dev/null +++ 
b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir @@ -0,0 +1,16 @@ +// RUN: oneflow-opt %s \ +// RUN: -pass-pipeline="func.func(tosa-to-linalg)" -cse \ +// RUN: --linalg-fuse-elementwise-ops -linalg-bufferize \ +// RUN: -tensor-bufferize -func-bufferize -buffer-results-to-out-params \ +// RUN: -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm \ +// RUN: -convert-func-to-llvm -convert-memref-to-llvm -reconcile-unrealized-casts --print-after-all \ +// RUN: | oneflow-translate -mlir-to-llvmir | clang -x ir - -c -o test.o + +builtin.module { + func.func @Graph_0(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tosa.cast"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "tosa.reluN"(%0) {max_fp = 3.40282347E+38 : f32, max_int = 9223372036854775807 : i64} : (tensor<2xf32>) -> tensor<2xf32> + %2 = "tosa.cast"(%1) : (tensor<2xf32>) -> tensor<2xf32> + func.return %2 : tensor<2xf32> + } +} diff --git a/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir b/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir new file mode 100644 index 00000000000..3028b3c04bf --- /dev/null +++ b/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir @@ -0,0 +1,342 @@ +// RUN: oneflow-opt %s \ +// RUN: -split-input-file \ +// RUN: -lower-oneflow-to-tosa \ +// RUN: -verify-diagnostics -o - \ +// RUN: | FileCheck %s + + +// CHECK-LABEL: test_func +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_func(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + oneflow.return %arg0 : tensor<1xf32> +} + + +// CHECK-LABEL: test_input +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_input(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.input"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : 
tensor<1xf32> +} + + +// CHECK-LABEL: test_output +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_output(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.output"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +// CHECK-LABEL: test_variable +// CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<64x3x7x7xf32>} : () -> tensor<64x3x7x7xf32> +// CHECK: return [[V0]] : tensor<64x3x7x7xf32> +oneflow.job @test_variable() -> tensor<64x3x7x7xf32> +{ + %res = "oneflow.variable"() { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + nd_sbp = ["B"], + op_name = "fw.model.conv1.weight", + output_lbns = ["fw.model.conv1.weight/out"], + scope_symbol_id = 4611686018427432959 : i64, + shape = [64 : si64, 3 : si64, 7 : si64, 7 : si64] + } : () -> tensor<64x3x7x7xf32> + oneflow.return %res : tensor<64x3x7x7xf32> +} + + +//CHECK-LABEL: test_add_n2 +//CHECK: [[V0:%.+]] = "tosa.add"(%arg0, %arg1) : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +//CHECK: return [[V0]] : tensor<1x7x7xf32> +oneflow.job @test_add_n2(%arg0: tensor<1x7x7xf32>, %arg1: tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +{ + %res = "oneflow.add_n2"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + op_type_name = "add_n", + output_lbns = [""], + scope_symbol_id = 4611686018431205375 : i64 + } : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> + oneflow.return %res: tensor<1x7x7xf32> +} + + +//CHECK-LABEL: test_broadcast_add +//CHECK: [[V0:%.+]] = "tosa.add"(%arg0, %arg1) : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> +//CHECK: return [[V0]] : tensor<1x1000xf32> 
+oneflow.job @test_broadcast_add(%arg0: tensor<1x1000xf32>, %arg1: tensor<1000xf32>) -> tensor<1x1000xf32> +{ + %res = "oneflow.broadcast_add"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64 + } : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +//CHECK-LABEL: test_max_pool_2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V1:%.+]] = "tosa.transpose"(%arg0, [[V0]]) : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> +//CHECK: [[V2:%.+]] = "tosa.max_pool2d"([[V1]]) {kernel = [3, 3], pad = [1, 1, 1, 1], stride = [2, 2]} : (tensor<1x112x112x64xf32>) -> tensor<1x56x56x64xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"([[V2]], [[V3]]) : (tensor<1x56x56x64xf32>, tensor<4xi32>) -> tensor<1x64x56x56xf32> +//CHECK: [[V5:%.+]] = "tosa.const"() {value = dense<0> : tensor<1x64x56x56xi64>} : () -> tensor<1x64x56x56xi64> +//CHECK: return [[V4]] : tensor<1x64x56x56xf32> +oneflow.job @test_max_pool_2d(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> +{ + %y, %indice = "oneflow.max_pool_2d"(%arg0) + { + ceil_mode = false, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation = [1 : si32, 1 : si32], + hierarchy = [1], kernel_size = [3 : si32, 3 : si32], + op_name = "", + output_lbns = ["", ""], + padding = [1 : si32, 1 : si32], + return_indices = false, + scope_symbol_id = 4611686018427502591 : i64, + stride = [2 : si32, 2 : si32] + } : (tensor<1x64x112x112xf32>) -> (tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>) + oneflow.return %y : tensor<1x64x56x56xf32> +} + + +//CHECK-LABEL: test_avg_pool_2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> 
: tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V1:%.+]] = "tosa.transpose"(%arg0, [[V0]]) : (tensor<1x2048x7x7xf32>, tensor<4xi32>) -> tensor<1x7x7x2048xf32> +//CHECK: [[V2:%.+]] = "tosa.avg_pool2d"([[V1]]) {kernel = [7, 7], pad = [0, 0, 0, 0], stride = [7, 7]} : (tensor<1x7x7x2048xf32>) -> tensor<1x1x1x2048xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"([[V2]], [[V3]]) : (tensor<1x1x1x2048xf32>, tensor<4xi32>) -> tensor<1x2048x1x1xf32> +//CHECK: return [[V4]] : tensor<1x2048x1x1xf32> +oneflow.job @test_avg_pool_2d(%arg0: tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> +{ + %res = "oneflow.avg_pool_2d"(%arg0) + { + ceil_mode = false, + count_include_pad = true, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + divisor_override = 0 : si32, + hierarchy = [1], + kernel_size = [7 : si32, 7 : si32], + op_name = "model.avgpool-avg_pool_2d-172", + output_lbns = ["model.avgpool-avg_pool_2d-172/y_0"], + padding = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018430775295 : i64, + stride = [7 : si32, 7 : si32] + } : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> + oneflow.return %res: tensor<1x2048x1x1xf32> +} + + +//CHECK-LABEL: test_conv2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<5xf32>} : () -> tensor<5xf32> +//CHECK: [[V1:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V2:%.+]] = "tosa.transpose"(%arg0, [[V1]]) : (tensor<1x3x224x224xf32>, tensor<4xi32>) -> tensor<1x224x224x3xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"(%arg1, [[V3]]) : (tensor<5x3x1x1xf32>, tensor<4xi32>) -> tensor<5x1x1x3xf32> +//CHECK: [[V5:%.+]] = "tosa.conv2d"([[V2]], [[V4]], [[V0]]) {dilation = [1, 1], pad = [0, 0, 0, 0], stride = [1, 1]} : 
(tensor<1x224x224x3xf32>, tensor<5x1x1x3xf32>, tensor<5xf32>) -> tensor<1x224x224x5xf32> +//CHECK: [[V6:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V7:%.+]] = "tosa.transpose"([[V5]], [[V6]]) : (tensor<1x224x224x5xf32>, tensor<4xi32>) -> tensor<1x5x224x224xf32> +//CHECK: return [[V7]] : tensor<1x5x224x224xf32> +oneflow.job @test_conv2d(%arg0: tensor<1x3x224x224xf32>, %arg1: tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> +{ + %res = "oneflow.conv2d"(%arg0, %arg1) + { + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation_rate = [1 : si32, 1 : si32], + filters = 512 : si32, + groups = 1 : si32, + hierarchy = [1], + kernel_size = [1 : si32, 1 : si32], + op_name = "", + operand_segment_sizes = dense<[1, 1, 0, 0]> : vector<4xi32>, + output_lbns = [""], + padding_before = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018431012863 : i64, + strides = [1 : si32, 1 : si32] + } : (tensor<1x3x224x224xf32>, tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> + oneflow.return %res : tensor<1x5x224x224xf32> +} + + +//CHECK-LABEL: test_flatten +//CHECK: [[V0:%.+]] = "tosa.reshape"(%arg0) {new_shape = [4, 6, 1]} : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +//CHECK: return [[V0]] : tensor<4x6x1xf32> +oneflow.job @test_flatten(%arg0: tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +{ + %res = "oneflow.flatten"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + end_dim = 2 : si32, + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431217663 : i64, + start_dim = 1 : si32 + } : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> + oneflow.return %res : tensor<4x6x1xf32> +} + + +//CHECK-LABEL: test_matmul +//CHECK: [[V0:%.+]] = "tosa.reshape"(%arg0) {new_shape = [1, 1, 2048]} : (tensor<1x2048xf32>) -> tensor<1x1x2048xf32> +//CHECK: [[V1:%.+]] = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> +//CHECK: [[V2:%.+]] = 
"tosa.transpose"(%arg1, [[V1]]) : (tensor<1000x2048xf32>, tensor<2xi32>) -> tensor<2048x1000xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"([[V2]]) {new_shape = [1, 2048, 1000]} : (tensor<2048x1000xf32>) -> tensor<1x2048x1000xf32> +//CHECK: [[V4:%.+]] = "tosa.matmul"([[V0]], [[V3]]) : (tensor<1x1x2048xf32>, tensor<1x2048x1000xf32>) -> tensor<1x1x1000xf32> +//CHECK: [[V5:%.+]] = "tosa.reshape"([[V4]]) {new_shape = [1, 1000]} : (tensor<1x1x1000xf32>) -> tensor<1x1000xf32> +//CHECK: return [[V5]] : tensor<1x1000xf32> +oneflow.job @test_matmul(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>) ->tensor<1x1000xf32> +{ + %res = "oneflow.matmul"(%arg0, %arg1) + { + alpha = 1.000000e+00 : f64, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64, + transpose_a = false, + transpose_b = true + } : (tensor<1x2048xf32>, tensor<1000x2048xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +//CHECK-LABEL: test_relu +//CHECK: [[V0:%.+]] = "tosa.reluN"(%arg0) {max_fp = 3.40282347E+38 : f32, max_int = 9223372036854775807 : i64} : (tensor<1xf32>) -> tensor<1xf32> +//CHECK: return [[V0]] : tensor<1xf32> +oneflow.job @test_relu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %res = "oneflow.relu"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427424767 : i64 + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + +//CHECK-LABEL: test_bn +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<9.99999974E-6> : tensor} : () -> tensor +//CHECK: [[V1:%.+]] = "tosa.reshape"(%arg1) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V2:%.+]] = "tosa.reshape"(%arg2) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"(%arg3) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> 
tensor<64x1x1xf32> +//CHECK: [[V4:%.+]] = "tosa.reshape"(%arg4) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V5:%.+]] = "tosa.sub"(%arg0, [[V1]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V6:%.+]] = "tosa.add"([[V2]], [[V0]]) : (tensor<64x1x1xf32>, tensor) -> tensor<64x1x1xf32> +//CHECK: [[V7:%.+]] = "tosa.rsqrt"([[V6]]) : (tensor<64x1x1xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V8:%.+]] = "tosa.mul"([[V5]], [[V7]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V9:%.+]] = "tosa.mul"([[V8]], [[V3]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V10:%.+]] = "tosa.add"([[V9]], [[V4]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: return [[V10]] : tensor<1x64x112x112xf32> +oneflow.job @test_bn( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y, %mean, %inv_variance = "oneflow.normalization"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + oneflow.return %y: tensor<1x64x112x112xf32> +} + +//CHECK-LABEL: test_bn_infer +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<9.99999974E-6> : tensor} : () -> tensor +//CHECK: [[V1:%.+]] = "tosa.reshape"(%arg1) {new_shape = 
[64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V2:%.+]] = "tosa.reshape"(%arg2) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"(%arg3) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V4:%.+]] = "tosa.reshape"(%arg4) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V5:%.+]] = "tosa.sub"(%arg0, [[V1]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V6:%.+]] = "tosa.add"([[V2]], [[V0]]) : (tensor<64x1x1xf32>, tensor) -> tensor<64x1x1xf32> +//CHECK: [[V7:%.+]] = "tosa.rsqrt"([[V6]]) : (tensor<64x1x1xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V8:%.+]] = "tosa.mul"([[V5]], [[V7]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V9:%.+]] = "tosa.mul"([[V8]], [[V3]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V10:%.+]] = "tosa.add"([[V9]], [[V4]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: return [[V10]] : tensor<1x64x112x112xf32> +oneflow.job @test_bn_infer( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y = "oneflow.normalization_infer"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x64x112x112xf32> + oneflow.return %y: 
tensor<1x64x112x112xf32> +} diff --git a/oneflow/ir/test/lit.cfg.py b/oneflow/ir/test/lit.cfg.py index af8af5b28af..275f16893d1 100644 --- a/oneflow/ir/test/lit.cfg.py +++ b/oneflow/ir/test/lit.cfg.py @@ -105,3 +105,10 @@ ] ) llvm_config.add_tool_substitutions(tools, tool_dirs) + +try: + import oneflow_iree.compiler + + config.WITH_ONEFLOW_IREE = True +except ImportError: + config.WITH_ONEFLOW_IREE = False diff --git a/oneflow/user/kernels/arg_where_kernel_util.cpp b/oneflow/user/kernels/arg_where_kernel_util.cpp index 25af71e776d..cc85cad8ad0 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.cpp +++ b/oneflow/user/kernels/arg_where_kernel_util.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "oneflow/user/kernels/arg_where_kernel_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/kernel/kernel_util.h" namespace oneflow { diff --git a/oneflow/user/kernels/arg_where_kernel_util.cu b/oneflow/user/kernels/arg_where_kernel_util.cu index 61e6de4f543..522078e42ab 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.cu +++ b/oneflow/user/kernels/arg_where_kernel_util.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "oneflow/user/kernels/arg_where_kernel_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/cuda/elementwise.cuh" #include "oneflow/core/kernel/kernel_util.h" #include "oneflow/core/ep/cuda/cuda_stream.h" diff --git a/oneflow/user/kernels/avg_pool_kernel_util.h b/oneflow/user/kernels/avg_pool_kernel_util.h index d6586bb70bf..d0b0ab8aeab 100644 --- a/oneflow/user/kernels/avg_pool_kernel_util.h +++ b/oneflow/user/kernels/avg_pool_kernel_util.h @@ -65,7 +65,7 @@ struct XPUAdd { OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) -typedef fixed_vector FixedDimVector; +typedef small_vector FixedDimVector; class AvgPoolParams3D { public: diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index 2efe0a937a3..efe197e4bc8 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -53,7 +53,7 @@ class DimGatherKernel final : public user_op::OpKernel { int dim_value = 0; if (ndim > 0) { dim_value = input_tensor->shape().At(dim); } - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int64_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 11ebbdc2e1c..df4721b6c3f 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -51,7 +51,7 @@ class DimScatterKernel final : public user_op::OpKernel { } const int ndim = src_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& 
tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index e67322b02d2..34fab14c90c 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -49,7 +49,7 @@ class DimScatterScalarKernel final : public user_op::OpKernel { } const int ndim = out_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/gather_kernel_util.cpp b/oneflow/user/kernels/gather_kernel_util.cpp index bec965bdf12..88705ca4bff 100644 --- a/oneflow/user/kernels/gather_kernel_util.cpp +++ b/oneflow/user/kernels/gather_kernel_util.cpp @@ -85,7 +85,7 @@ void GatherKernelUtilImpl::Forward(ep::Stream* stream, c const T* from = in + outer_idx * gather_dim_size * inner_dim_size + idx * inner_dim_size; std::copy(from, from + inner_dim_size, to); } else { - std::memset(reinterpret_cast(to), 0, inner_dim_size * sizeof(K)); + std::memset(reinterpret_cast(to), 0, inner_dim_size * sizeof(T)); } } } diff --git a/oneflow/user/kernels/image_preprocess_kernels.cu b/oneflow/user/kernels/image_preprocess_kernels.cu index 30fda3bd96d..2b2e287e69c 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.cu +++ b/oneflow/user/kernels/image_preprocess_kernels.cu @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/ep/cuda/cuda_stream.h" diff --git a/oneflow/user/kernels/max_pool_kernel_util.h b/oneflow/user/kernels/max_pool_kernel_util.h index c62bf3bd3cd..821aa2020e0 100644 --- a/oneflow/user/kernels/max_pool_kernel_util.h +++ b/oneflow/user/kernels/max_pool_kernel_util.h @@ -41,7 +41,7 @@ namespace oneflow { #define POOL_DATA_TYPE_CUDA_SEQ POOL_DATA_TYPE_SEQ -typedef fixed_vector FixedDimVector; +typedef small_vector FixedDimVector; template struct DeviceAdd { diff --git a/oneflow/user/ops/arange_op.cpp b/oneflow/user/ops/arange_op.cpp index 225f3fa37cf..73585347376 100644 --- a/oneflow/user/ops/arange_op.cpp +++ b/oneflow/user/ops/arange_op.cpp @@ -84,8 +84,9 @@ namespace oneflow { const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; diff --git a/oneflow/user/ops/constant_op.cpp b/oneflow/user/ops/constant_op.cpp index 8cf14f3b22f..62d9bdcc050 100644 --- a/oneflow/user/ops/constant_op.cpp +++ b/oneflow/user/ops/constant_op.cpp @@ -29,8 +29,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + 
GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/normal_op.cpp b/oneflow/user/ops/distributions/normal_op.cpp index 5af64e0d3bb..736a70e5d0b 100644 --- a/oneflow/user/ops/distributions/normal_op.cpp +++ b/oneflow/user/ops/distributions/normal_op.cpp @@ -32,8 +32,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/uniform_int_op.cpp b/oneflow/user/ops/distributions/uniform_int_op.cpp index 9e79e69c4e5..f01bb710f3c 100644 --- a/oneflow/user/ops/distributions/uniform_int_op.cpp +++ b/oneflow/user/ops/distributions/uniform_int_op.cpp @@ -35,8 +35,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/uniform_op.cpp 
b/oneflow/user/ops/distributions/uniform_op.cpp index 206a27426d8..b7d566aac49 100644 --- a/oneflow/user/ops/distributions/uniform_op.cpp +++ b/oneflow/user/ops/distributions/uniform_op.cpp @@ -35,8 +35,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/eager_nccl_ops.cpp b/oneflow/user/ops/eager_nccl_ops.cpp index 1399ea4d97a..bd4cdda1367 100644 --- a/oneflow/user/ops/eager_nccl_ops.cpp +++ b/oneflow/user/ops/eager_nccl_ops.cpp @@ -133,8 +133,9 @@ namespace oneflow { const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, in_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, in_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *out_shape = physical_shape; } else { *out_shape = in_shape; diff --git a/oneflow/user/ops/empty_op.cpp b/oneflow/user/ops/empty_op.cpp index f2060b49950..4489902d730 100644 --- a/oneflow/user/ops/empty_op.cpp +++ b/oneflow/user/ops/empty_op.cpp @@ -30,8 +30,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& 
physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; *ctx->OutputStride("out", 0) = Stride(physical_shape); diff --git a/oneflow/user/ops/image_preprocess_ops.cpp b/oneflow/user/ops/image_preprocess_ops.cpp index 6e72ccda1a5..00c6d419c8b 100644 --- a/oneflow/user/ops/image_preprocess_ops.cpp +++ b/oneflow/user/ops/image_preprocess_ops.cpp @@ -156,8 +156,9 @@ namespace oneflow { const Shape logical_shape = Shape({batch_size}); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); } diff --git a/oneflow/user/ops/randperm_op.cpp b/oneflow/user/ops/randperm_op.cpp index c7e83402b86..aa6103a2f0d 100644 --- a/oneflow/user/ops/randperm_op.cpp +++ b/oneflow/user/ops/randperm_op.cpp @@ -39,8 +39,9 @@ namespace oneflow { int32_t n = ctx->Attr("n"); const Shape& logical_shape = Shape({n}); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; diff --git a/oneflow/user/ops/stack_op.cpp b/oneflow/user/ops/stack_op.cpp index 254cbcd1743..1dd129081bd 100644 --- 
a/oneflow/user/ops/stack_op.cpp +++ b/oneflow/user/ops/stack_op.cpp @@ -144,8 +144,6 @@ Maybe GenGradOp(const user_op::UserOpWrapper& op, const user_op::AddOpFn& /*static*/ Maybe StackGradOp::GetSbp(user_op::SbpContext* ctx) { const auto axis = ctx->Attr("axis"); - const int64_t in_num_axes = - ctx->LogicalTensorDesc4InputArgNameAndIndex("in", 0).shape().NumAxes(); const int64_t like_num_axes = ctx->LogicalTensorDesc4InputArgNameAndIndex("like", 0).shape().NumAxes(); FOR_RANGE(int64_t, i, 0, like_num_axes) { diff --git a/oneflow/user/utils/pool_util.h b/oneflow/user/utils/pool_util.h index 9a21f8a9129..4deed023f1f 100644 --- a/oneflow/user/utils/pool_util.h +++ b/oneflow/user/utils/pool_util.h @@ -21,8 +21,8 @@ limitations under the License. namespace oneflow { -typedef fixed_vector FixedDimVector; -typedef fixed_vector FixedVector; +typedef small_vector FixedDimVector; +typedef small_vector FixedVector; class Params3D { public: diff --git a/python/oneflow/framework/multi_client_session.py b/python/oneflow/framework/multi_client_session.py index 72c6e093779..64a82c12b27 100644 --- a/python/oneflow/framework/multi_client_session.py +++ b/python/oneflow/framework/multi_client_session.py @@ -124,4 +124,7 @@ def update_resource_eagerly(self, resource_config): self._session_ctx.update_resource(config_proto_str) def __del__(self): + if self._env.is_shutting_down(): + # After python shutting down, it's not safe to call oneflow + return self._TryClose() diff --git a/python/oneflow/nn/graph/graph_config.py b/python/oneflow/nn/graph/graph_config.py index dfb3795b3ac..ea48ad8d957 100644 --- a/python/oneflow/nn/graph/graph_config.py +++ b/python/oneflow/nn/graph/graph_config.py @@ -17,6 +17,7 @@ from collections import OrderedDict +import oneflow.boxing.nccl as nccl_config from oneflow.nn.graph.optimizer import OptDict import oneflow.core.job.job_conf_pb2 as job_conf_pb @@ -45,24 +46,51 @@ def training(self): return False raise NotImplementedError - def 
set_outputs_buffer_size(self, value: int = 2): - r"""Set the outputs buffer size of ``nn.Graph``. + def enable_amp(self, mode: bool = True): + r"""If set to true, then graph will use mixed precision mode, it means use both float16 and float32 during model training. - When graph's outputs buffer size is greater than 2, multiple call on the graph can work like a pipeline. This makes multiple call takes less time. + For example: - The default outputs buffer size is 2. + .. code-block:: python - # TODO (lixiang): Explain the meaning of the size of buffer size and add sample code. - # The size of the buffer size indicates the maximum number of iterations that the output of the Graph and the Graph actually executed asynchronously can overlap. - # If the buffer size is 1, there is no pipeline. A size of 2 means that it can execute 1 iter ahead of time. A size of 3 means that two iters can be executed ahead of time. + import oneflow as flow + + class Graph(flow.nn.Graph): + def __init__(self): + super().__init__() + self.linear = flow.nn.Linear(3, 8, False) + self.config.enable_amp(True) # Use mixed precision mode. + def build(self, x): + return self.linear(x) + + graph = Graph() Args: - value (int): graph ouputs buffer size. + mode (bool, optional): The default vaule is True. + """ - self._outputs_buffer_size = value + assert type(mode) is bool + self.proto.enable_auto_mixed_precision = mode - def enable_amp(self, mode: bool = True): - r"""If set to true, then graph will use mixed precision mode, it means use both float16 and float32 during model training. + def set_zero_redundancy_optimizer_mode(self, mode: str = "distributed_split"): + raise RuntimeError( + "`set_zero_redundancy_optimizer_mode` has been changed to `enable_zero`, please use `enable_zero(True)` to activate ZeRO optimization." 
+ ) + + def enable_zero( + self, + mode: bool = True, + *, + stage: int = 2, + shard_min_size: int = 1024, + shard_restore_level: int = 1, + ): + r"""Enable ZeRO redundancy optimizer. + + This optimization will reduce optimizer states memory consumption as described + by ZeRO https://arxiv.org/abs/1910.02054 . + + The default zero stage is 2. For example: @@ -74,17 +102,36 @@ class Graph(flow.nn.Graph): def __init__(self): super().__init__() self.linear = flow.nn.Linear(3, 8, False) - self.config.enable_amp(True) # Use mixed precision mode. + self.config.enable_zero() def build(self, x): return self.linear(x) graph = Graph() Args: - mode (bool, optional): The default vaule is True. + mode (bool): if set to true, optimizer states of Data Parallel will be sharded across devices. + stage (int): optimization stage, range from 1 to 3. + shard_min_size (int): min size of a shard of an optimizer state. + shard_restore_level (int): level to restore sharded parameter to whole parameter for consumer operators, level 0 is no restore, level 1 is soft restore, level 2 is hard restore. Note that this parameter is at pre-alpha stage. """ - assert type(mode) is bool - self.proto.enable_auto_mixed_precision = mode + if not mode: + self.proto.optimizer_placement_optimization_mode = "none" + return + assert stage >= 1 and stage <= 3, "ZeRO stage must range form 1 to 3." + assert ( + shard_min_size > 0 + ), "ZeRO min size of a sharded optimizer state must > 0." + assert stage >= 1 and stage <= 3, "ZeRO stage must range form 1 to 3." 
+ if stage >= 1: + self.proto.optimizer_placement_optimization_mode = "distributed_split" + self.proto.optimizer_placement_optimization_threshold = shard_min_size + self.proto.optimizer_placement_optimization_shard_restore_level = ( + shard_restore_level + ) + if stage >= 2: + nccl_config.enable_use_compute_stream(True) + if stage >= 3: + nccl_config.disable_group_boxing_by_dst_parallel(True) def allow_fuse_model_update_ops(self, mode: bool = True): r"""If set to true, try to fuse cast + scale + l1_l2_regularize_gradient + model_update to one op to improve performance. @@ -188,61 +235,23 @@ def build(self, x): """ self.proto.num_gradient_accumulation_steps = value - def set_zero_redundancy_optimizer_mode(self, mode: str = "distributed_split"): - r"""Set mode to remove redundancy of optimizer states. - This optimzation will reduce optimizer states memory consumption as described - by ZeRO https://arxiv.org/abs/1910.02054 . - - For example: - - .. code-block:: python - - import oneflow as flow - - class Graph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.linear = flow.nn.Linear(3, 8, False) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - def build(self, x): - return self.linear(x) - - graph = Graph() - - Args: - mode (str): "distributed_split" or "non_distributed". "distributed_split" mode - will shard each optimizer state across devices. "non_distributed" mode - will place each optimizer state to only one device. - """ - assert mode in ("distributed_split", "non_distributed") - self.proto.optimizer_placement_optimization_mode = mode - - def set_zero_redundancy_optimizer_min_size_after_split(self, value): - r"""Set the min size of optimizer state/grad/parameter after split. - - For example: - - .. code-block:: python + def set_outputs_buffer_size(self, value: int = 2): + r"""Set the outputs buffer size of ``nn.Graph``. 
- import oneflow as flow + When graph's outputs buffer size is greater than 2, multiple call on the graph can work like a pipeline. This makes multiple call takes less time. - class Graph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.linear = flow.nn.Linear(3, 8, False) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - def build(self, x): - return self.linear(x) + The default outputs buffer size is 2. - graph = Graph() + # TODO (lixiang): Explain the meaning of the size of buffer size and add sample code. + # The size of the buffer size indicates the maximum number of iterations that the output of the Graph and the Graph actually executed asynchronously can overlap. + # If the buffer size is 1, there is no pipeline. A size of 2 means that it can execute 1 iter ahead of time. A size of 3 means that two iters can be executed ahead of time. Args: - value (int): min size value. + value (int): graph outputs buffer size. """ assert isinstance(value, int) assert value >= 1 - self.proto.optimizer_placement_optimization_threshold = value + self._outputs_buffer_size = value def enable_cudnn_conv_heuristic_search_algo(self, mode: bool = True): r""" Whether enable cudnn conv operatioin to use heuristic search algorithm. 
diff --git a/python/oneflow/nn/modules/sparse.py b/python/oneflow/nn/modules/sparse.py index 0731f7b21b9..b8eb4d50b9a 100644 --- a/python/oneflow/nn/modules/sparse.py +++ b/python/oneflow/nn/modules/sparse.py @@ -160,9 +160,12 @@ def forward(self, indices): flow._C.embedding_renorm_( self.weight, indices, self.max_norm, self.norm_type ) - return flow._C.embedding( - self.weight, indices, self.padding_idx, self.scale_grad_by_freq - ) + if self.padding_idx is None and not self.scale_grad_by_freq: + return flow._C.gather(self.weight, indices, axis=0) + else: + return flow._C.embedding( + self.weight, indices, self.padding_idx, self.scale_grad_by_freq + ) def embedding( @@ -232,7 +235,10 @@ def embedding( with flow.no_grad(): weight = flow._C.embedding_renorm_(weight, input, max_norm, norm_type) - return flow._C.embedding(weight, input, padding_idx, scale_grad_by_freq) + if padding_idx is None and not scale_grad_by_freq: + return flow._C.gather(weight, input, axis=0) + else: + return flow._C.embedding(weight, input, padding_idx, scale_grad_by_freq) if __name__ == "__main__": diff --git a/python/oneflow/test/graph/test_graph_zero.py b/python/oneflow/test/graph/test_graph_zero.py index 20fc7366bab..51fa38a8657 100644 --- a/python/oneflow/test/graph/test_graph_zero.py +++ b/python/oneflow/test/graph/test_graph_zero.py @@ -26,40 +26,42 @@ def train_with_graph(iter_num=1): P = flow.placement("cuda", ranks=[0, 1]) B = flow.sbp.broadcast S0 = flow.sbp.split(0) - linear = flow.nn.Linear(8, 4) - linear = linear.to_global(placement=P, sbp=B) - flow.nn.init.constant_(linear.weight, 2.068758) - flow.nn.init.constant_(linear.bias, 0.23) - of_sgd = flow.optim.SGD(linear.parameters(), lr=0.001, momentum=0.9) + + linear_dp = flow.nn.Linear(800, 400, bias=False) + linear_dp = linear_dp.to_global(placement=P, sbp=B) + flow.nn.init.constant_(linear_dp.weight, 2.068758) + + linear_mp = flow.nn.Linear(400, 500, bias=False) + linear_mp = linear_mp.to_global(placement=P, sbp=S0) + 
flow.nn.init.constant_(linear_mp.weight, 2.068758) + + of_sgd = flow.optim.SGD( + [{"params": linear_dp.parameters()}, {"params": linear_mp.parameters()}], + lr=0.001, + momentum=0.9, + ) grad_scaler = flow.amp.StaticGradScaler(200) - x = flow.randint(1, 100, (4, 8), dtype=flow.float32, placement=P, sbp=S0) + x = flow.randint(1, 100, (6, 800), dtype=flow.float32, placement=P, sbp=S0) class LinearTrainGraphWithZeRO(flow.nn.Graph): def __init__(self): super().__init__() - self.linear = linear + self.linear_dp = linear_dp + self.linear_mp = linear_mp self.add_optimizer(of_sgd) self.config.enable_amp(True) self.set_grad_scaler(grad_scaler) - if zero_stage == 1: - print("zero stage 1 optimization") - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - if zero_stage == 2: - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - flow.boxing.nccl.enable_use_compute_stream(True) - if zero_stage == 3: - print("zero stage 3 optimization") - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - flow.boxing.nccl.enable_use_compute_stream(True) - flow.boxing.nccl.disable_group_boxing_by_dst_parallel(True) + self.config.enable_zero( + True, stage=zero_stage, shard_min_size=1, shard_restore_level=0, + ) + self.debug(2) def build(self, x): - out = self.linear(x) + out = self.linear_dp(x) + out = out.to_global(placement=P, sbp=B) + out = self.linear_mp(out) loss = out.sum() loss.backward() return out @@ -67,19 +69,26 @@ def build(self, x): class LinearEvalGraphWithZeRO(flow.nn.Graph): def __init__(self): super().__init__() - self.linear = linear + self.linear_dp = linear_dp + self.linear_mp = linear_mp self.config.enable_amp(True) def build(self, x): - out = self.linear(x) + out = self.linear_dp(x) + out = out.to_global(placement=P, 
sbp=B) + out = self.linear_mp(out) return out linear_t_g = LinearTrainGraphWithZeRO() + linear_t_g.debug(1) linear_e_g = LinearEvalGraphWithZeRO() + linear_e_g.debug(1) def one_train_iter(): out = linear_t_g(x) + if flow.env.get_rank() == 0: + print(linear_t_g) def one_eval_iter(): out = linear_e_g(x) @@ -89,8 +98,116 @@ def one_eval_iter(): # After pass rewrite in training graph, parameters' sbp has been # changed from flow.sbp.broadcast to flow.sbp.split(0) - test_case.assertEqual(linear.weight.sbp[0], S0) - test_case.assertEqual(linear.bias.sbp[0], S0) + test_case.assertEqual(linear_dp.weight.sbp[0], S0) + test_case.assertEqual(linear_mp.weight.sbp[0], S0) + + # In evaluation graph, parameters' sbp are flow.sbp.split(0). + # But their consumer will consume them as flow.sbp.broadcast. + one_eval_iter() + + iter_num = 1 + graph_check_list = train_with_graph(iter_num) + + +def _test_linear_train_graph_2d_with_zero(test_case, zero_stage=1): + def train_with_graph(iter_num=1): + P = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) + B = flow.sbp.broadcast + S0 = flow.sbp.split(0) + S1 = flow.sbp.split(1) + + def get_mixed_linear(): + linear_dp_mp = flow.nn.Linear(800, 400, bias=False) + linear_dp_mp = linear_dp_mp.to_global(placement=P, sbp=[B, S0]) + flow.nn.init.constant_(linear_dp_mp.weight, 1.068758) + + linear_mp_dp = flow.nn.Linear(800, 400, bias=False) + linear_mp_dp = linear_mp_dp.to_global(placement=P, sbp=[S0, B]) + flow.nn.init.constant_(linear_mp_dp.weight, 1.068758) + + class MixedLinear(flow.nn.Module): + def __init__(self): + super().__init__() + self.dp_mp = linear_dp_mp + self.mp_dp = linear_mp_dp + + def forward(self, x): + x = self.dp_mp(x) + x = flow.relu(x) + x = self.mp_dp(x) + x = flow.relu(x) + return x + + return MixedLinear() + + mixed_linear0 = get_mixed_linear() + mixed_linear1 = get_mixed_linear() + + of_sgd = flow.optim.SGD( + [ + {"params": mixed_linear0.parameters()}, + {"params": mixed_linear1.parameters()}, + ], + lr=0.001, 
momentum=0.9, + ) + grad_scaler = flow.amp.StaticGradScaler(200) + + x = flow.rand((2, 800), dtype=flow.float32, placement=P, sbp=[S0, B]) + + class LinearTrainGraph2DWithZeRO(flow.nn.Graph): + def __init__(self): + super().__init__() + self.mixed_linear0 = mixed_linear0 + self.mixed_linear0.config.activation_checkpointing = True + self.mixed_linear1 = mixed_linear1 + self.mixed_linear1.config.activation_checkpointing = True + self.add_optimizer(of_sgd) + + self.config.enable_amp(True) + self.set_grad_scaler(grad_scaler) + self.config.enable_zero( + True, stage=zero_stage, shard_min_size=1, shard_restore_level=1, + ) + + def build(self, x): + out = self.mixed_linear0(x) + out = self.mixed_linear1(out) + loss = out.mean() + loss.backward() + return loss + + class LinearEvalGraph2DWithZeRO(flow.nn.Graph): + def __init__(self): + super().__init__() + self.mixed_linear0 = mixed_linear0 + self.mixed_linear1 = mixed_linear1 + + self.config.enable_amp(True) + + def build(self, x): + out = self.mixed_linear0(x) + out = self.mixed_linear1(out) + return out + + linear_t_g = LinearTrainGraph2DWithZeRO() + linear_e_g = LinearEvalGraph2DWithZeRO() + + def one_train_iter(): + out = linear_t_g(x) + # if flow.env.get_rank() == 0: + # print(linear_t_g) + + def one_eval_iter(): + out = linear_e_g(x) + + for i in range(iter_num): + one_train_iter() + + for state in linear_t_g._state(): + test_case.assertEqual( + state.origin.sbp, (oneflow.sbp.split(axis=0), oneflow.sbp.split(axis=0)) + ) # In evaluation graph, paramters's sbp are flow.sbp.split(0). # But their consumer will consum them as flow.sbp.broadcast. 
@@ -113,5 +230,18 @@ def test_linear_train_graph_with_zero_3(test_case): _test_linear_train_graph_with_zero(test_case, 3) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +@flow.unittest.skip_unless_1n4d() +class TestLinearTrainGraph2DWithZeRO(oneflow.unittest.TestCase): + def test_linear_train_graph_2d_with_zero_3(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 3) + + def test_linear_train_graph_2d_with_zero_2(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 2) + + def test_linear_train_graph_2d_with_zero_1(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 1) + + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/graph/test_optimization_conf.py b/python/oneflow/test/graph/test_optimization_conf.py index da6348b7033..a60d339be8b 100644 --- a/python/oneflow/test/graph/test_optimization_conf.py +++ b/python/oneflow/test/graph/test_optimization_conf.py @@ -66,7 +66,7 @@ def __init__(self): self.config.allow_fuse_add_to_output(True) self.config.allow_fuse_cast_scale(True) self.config.set_gradient_accumulation_steps(100) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") + self.config.enable_zero(True) self.config.enable_cudnn_conv_heuristic_search_algo(False) def build(self, x): diff --git a/python/oneflow/test/modules/test_sparse.py b/python/oneflow/test/modules/test_sparse.py index 2a1ed9812db..01df8da23bb 100644 --- a/python/oneflow/test/modules/test_sparse.py +++ b/python/oneflow/test/modules/test_sparse.py @@ -184,7 +184,7 @@ def test_embedding_functional(test_case): # NOTE(Yao Zihang): Set check_graph=False temporarily # Graph mode do not support inplace op with flow.no_grad() # See this issue: https://github.com/Oneflow-Inc/OneTeam/issues/1382 - @autotest(n=5, check_graph="ValidatedFlase") + @autotest(n=5, rtol=1e-03, atol=1e-03, check_graph="ValidatedFlase") def test_embedding_renorm(test_case): device = random_device() emb_size = random(low=2) * 16