diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 499cde368df..b0085586049 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -546,7 +546,7 @@ jobs: run: | docker exec ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow --doctor - name: Exe test - if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }} + if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' && matrix.device == 'cpu' }} timeout-minutes: 10 run: | chmod +x ${{ steps.download-digest.outputs.entry-dir }}/bin/oneflow_testexe diff --git a/cmake/caches/ci/cpu.cmake b/cmake/caches/ci/cpu.cmake index ac23ae140b3..fc416e58016 100644 --- a/cmake/caches/ci/cpu.cmake +++ b/cmake/caches/ci/cpu.cmake @@ -9,3 +9,4 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") set(CMAKE_GENERATOR Ninja CACHE STRING "") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF CACHE BOOL "") set(BUILD_CPP_API ON CACHE BOOL "") +set(WITH_MLIR ON CACHE BOOL "") diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 662dcc15e9f..17b5267f67d 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -103,11 +103,6 @@ foreach(oneflow_single_file ${oneflow_all_src}) set(group_this ON) endif() - if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/api/common/.*\\.(h|cpp)$") - list(APPEND of_all_obj_cc ${oneflow_single_file}) - set(group_this ON) - endif() - if(BUILD_PYTHON) if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/api/python/.*\\.(h|cpp)$") @@ -284,6 +279,18 @@ elseif(WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /WHOLEARCHIVE:oneflow") endif() +# oneflow api common +if (BUILD_PYTHON OR BUILD_CPP_API) + file(GLOB_RECURSE of_api_common_files + ${PROJECT_SOURCE_DIR}/oneflow/api/common/*.h + ${PROJECT_SOURCE_DIR}/oneflow/api/common/*.cpp) + oneflow_add_library(of_api_common OBJECT ${of_api_common_files}) + target_link_libraries(of_api_common oneflow) + if (WITH_MLIR) + target_link_libraries(of_api_common ${ONEFLOW_MLIR_LIBS}) + endif() +endif() + if(BUILD_PYTHON) # py ext lib @@ -304,7 +311,7 @@ if(BUILD_PYTHON) target_link_libraries(oneflow_internal PRIVATE ${of_libs} of_functional_tensor_obj - ${ONEFLOW_MLIR_LIBS} + of_api_common ${oneflow_third_party_libs} of_pyext_obj ${oneflow_exe_third_party_libs}) @@ -346,19 +353,19 @@ if (BUILD_CPP_API) oneflow_add_library(oneflow_cpp ${of_cpp_api_files}) endif() set_target_properties(oneflow_cpp PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${LIBONEFLOW_LIBRARY_DIR}" LIBRARY_OUTPUT_DIRECTORY "${LIBONEFLOW_LIBRARY_DIR}") - target_link_libraries(oneflow_cpp PRIVATE ${of_libs} ${ONEFLOW_MLIR_LIBS} ${oneflow_third_party_libs}) + target_link_libraries(oneflow_cpp PRIVATE ${of_libs} of_api_common ${oneflow_third_party_libs}) endif() file(RELATIVE_PATH PROJECT_BINARY_DIR_RELATIVE ${PROJECT_SOURCE_DIR} ${PROJECT_BINARY_DIR}) function(oneflow_add_test target_name) - cmake_parse_arguments(arg "" "TEST_NAME" "SRCS" ${ARGN}) + cmake_parse_arguments(arg "" "TEST_NAME;WORKING_DIRECTORY" "SRCS" ${ARGN}) oneflow_add_executable(${target_name} ${arg_SRCS}) if (BUILD_CUDA) target_link_libraries(${target_name} CUDA::cudart_static) endif() set_target_properties(${target_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") - add_test(NAME ${arg_TEST_NAME} COMMAND ${target_name}) + add_test(NAME ${arg_TEST_NAME} COMMAND ${target_name} WORKING_DIRECTORY ${arg_WORKING_DIRECTORY}) set_tests_properties( ${arg_TEST_NAME} PROPERTIES @@ -375,11 +382,12 @@ if(BUILD_TESTING) if (BUILD_CPP_API) file(GLOB_RECURSE 
cpp_api_test_files ${PROJECT_SOURCE_DIR}/oneflow/api/cpp/tests/*.cpp) - oneflow_add_test(oneflow_cpp_api_testexe SRCS ${cpp_api_test_files} TEST_NAME oneflow_cpp_api_test) + oneflow_add_test(oneflow_cpp_api_testexe SRCS ${cpp_api_test_files} TEST_NAME oneflow_cpp_api_test WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) target_link_libraries(oneflow_cpp_api_testexe oneflow_cpp ${oneflow_test_libs}) endif() endif() + # build include add_custom_target(of_include_copy ALL) @@ -420,10 +428,12 @@ endif(BUILD_PYTHON) if (BUILD_CPP_API) add_dependencies(of_include_copy oneflow_cpp) - set(OF_API_DIRS) file(GLOB_RECURSE api_h_files "${PROJECT_SOURCE_DIR}/oneflow/api/cpp/*.h") - list(APPEND OF_API_DIRS ${api_h_files}) - - copy_files("${OF_API_DIRS}" "${PROJECT_SOURCE_DIR}/oneflow/api/cpp" "${LIBONEFLOW_INCLUDE_DIR}" of_include_copy) + copy_files("${api_h_files}" "${PROJECT_SOURCE_DIR}/oneflow/api/cpp" "${LIBONEFLOW_INCLUDE_DIR}" of_include_copy) copy_files("${PROJECT_SOURCE_DIR}/cmake/oneflow-config.cmake" "${PROJECT_SOURCE_DIR}/cmake" "${LIBONEFLOW_SHARE_DIR}" of_include_copy) + + if(WITH_MLIR) + file(GLOB mlir_shared_libs "${PROJECT_BINARY_DIR}/oneflow/ir/llvm_monorepo-build/lib/*.14git") + copy_files("${mlir_shared_libs}" "${PROJECT_BINARY_DIR}/oneflow/ir/llvm_monorepo-build/lib" "${LIBONEFLOW_LIBRARY_DIR}" of_include_copy) + endif(WITH_MLIR) endif(BUILD_CPP_API) diff --git a/cmake/third_party/absl.cmake b/cmake/third_party/absl.cmake index a30f673df60..2a85ca963d5 100644 --- a/cmake/third_party/absl.cmake +++ b/cmake/third_party/absl.cmake @@ -12,14 +12,14 @@ SET(ABSL_LIBRARY_DIR ${THIRD_PARTY_DIR}/absl/${CMAKE_INSTALL_LIBDIR} CACHE PATH if(WIN32) set(ABSL_BUILD_LIBRARY_DIR ${ABSL_INSTALL}/${CMAKE_INSTALL_LIBDIR}) - set(ABSL_LIBRARY_NAMES absl_base.lib absl_spinlock_wait.lib absl_dynamic_annotations.lib + set(ABSL_LIBRARY_NAMES absl_spinlock_wait.lib absl_dynamic_annotations.lib absl_malloc_internal.lib absl_throw_delegate.lib absl_int128.lib absl_strings.lib absl_str_format_internal.lib - absl_time.lib absl_bad_optional_access.lib) + absl_time.lib absl_bad_optional_access.lib absl_base.lib) else() set(ABSL_BUILD_LIBRARY_DIR ${ABSL_INSTALL}/${CMAKE_INSTALL_LIBDIR}) - set(ABSL_LIBRARY_NAMES libabsl_base.a libabsl_spinlock_wait.a libabsl_dynamic_annotations.a + set(ABSL_LIBRARY_NAMES libabsl_spinlock_wait.a libabsl_dynamic_annotations.a libabsl_malloc_internal.a libabsl_throw_delegate.a libabsl_int128.a libabsl_strings.a libabsl_str_format_internal.a - libabsl_time.a libabsl_bad_optional_access.a) + libabsl_time.a libabsl_bad_optional_access.a libabsl_base.a) endif() foreach(LIBRARY_NAME ${ABSL_LIBRARY_NAMES}) diff --git a/oneflow/api/common/device.cpp b/oneflow/api/common/device.cpp deleted file mode 100644 index 566f9231066..00000000000 --- a/oneflow/api/common/device.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "oneflow/api/common/device.h" - -namespace oneflow { - -namespace { - -void CheckDeviceType(const std::string& type) { - if (Device::type_supported.find(type) == Device::type_supported.end()) { - std::string error_msg = - "Expected one of cpu, cuda device type at start of device string " + type; - throw std::runtime_error(error_msg); - } -} - -} // namespace - -/* static */ Maybe> DeviceExportUtil::ParseAndNew( - const std::string& type_or_type_with_device_id) { - std::string type; - int device_id = -1; - ParsingDeviceTag(type_or_type_with_device_id, &type, &device_id).GetOrThrow(); - CheckDeviceType(type); - if (device_id == -1) { - return Device::New(type); - } else { - return Device::New(type, device_id); - } -} - -/* static */ Maybe> DeviceExportUtil::New(const std::string& type, - int64_t device_id) { - CheckDeviceType(type); - return Device::New(type, device_id); -} - -} // namespace oneflow diff --git a/oneflow/api/common/device.h b/oneflow/api/common/ir_pass.cpp similarity index 63% rename from oneflow/api/common/device.h rename to oneflow/api/common/ir_pass.cpp index 8703666af7f..ca4111a9da0 100644 --- a/oneflow/api/common/device.h +++ b/oneflow/api/common/ir_pass.cpp @@ -14,17 +14,17 @@ See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_API_COMMON_DEVICE_H_ -#define ONEFLOW_API_COMMON_DEVICE_H_ +#ifdef WITH_MLIR -#include "oneflow/core/framework/device.h" +#include "oneflow/ir/include/OneFlow/Extension.h" +#include "oneflow/ir/oneflow-extension/include/OneFlow/OneFlowRoundTrip.h" +#include namespace oneflow { -struct DeviceExportUtil final { - static Maybe> ParseAndNew(const std::string& type_or_type_with_device_id); - static Maybe> New(const std::string& type, int64_t device_id); -}; +REGISTER_JOB_PASS("IRRoundTripBeforeAD", IRRoundTrip); +REGISTER_JOB_PASS("IRRoundTrip", IRRoundTrip); + } // namespace oneflow -#endif // !ONEFLOW_API_COMMON_DEVICE_H_ +#endif // WITH_MLIR diff --git a/oneflow/api/common/job_build_and_infer_ctx.h b/oneflow/api/common/job_build_and_infer_ctx.h new file mode 100644 index 00000000000..8b475f8a2db --- /dev/null +++ b/oneflow/api/common/job_build_and_infer_ctx.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef ONEFLOW_API_COMMON_JOB_BUILD_AND_INFER_CTX_H_ +#define ONEFLOW_API_COMMON_JOB_BUILD_AND_INFER_CTX_H_ + +#include "oneflow/core/job/job.pb.h" +#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" + +namespace oneflow { + +inline Maybe GetCurrentJob() { + auto* job_ctx_mgr = Global::Get(); + CHECK_NOTNULL_OR_RETURN(job_ctx_mgr); + auto* job_ctx = + JUST(job_ctx_mgr->FindJobBuildAndInferCtx(*JUST(job_ctx_mgr->GetCurrentJobName()))); + CHECK_NOTNULL_OR_RETURN(job_ctx); + return job_ctx->job(); +} + +} // namespace oneflow + +#endif // ONEFLOW_API_COMMON_JOB_BUILD_AND_INFER_CTX_H_ diff --git a/oneflow/api/common/scope.h b/oneflow/api/common/scope.h new file mode 100644 index 00000000000..f0626e3ada1 --- /dev/null +++ b/oneflow/api/common/scope.h @@ -0,0 +1,54 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef ONEFLOW_API_COMMON_SCOPE_H_ +#define ONEFLOW_API_COMMON_SCOPE_H_ + +#include +#include +#include "oneflow/core/common/just.h" +#include "oneflow/core/framework/device.h" +#include "oneflow/core/framework/instructions_builder.h" +#include "oneflow/core/framework/session_util.h" +#include "oneflow/core/job/job_conf.cfg.h" +#include "oneflow/core/job/job_conf.pb.h" +#include "oneflow/core/job/scope.h" + +namespace oneflow { + +inline Maybe MakeScope(const JobConfigProto& config_proto, const Device& device) { + std::shared_ptr scope; + std::shared_ptr cfg_config_proto = + std::make_shared(config_proto); + JUST(LogicalRun([&](InstructionsBuilder* builder) -> Maybe { + int64_t session_id = 0; + std::string device_tag = "cpu"; + std::string machine_ids = "0"; + std::string device_ids = "0"; + if (device.type() == "cuda") { + device_tag = "gpu"; + device_ids = std::to_string(device.device_id()); + } + scope = JUST(builder->BuildInitialScope(session_id, cfg_config_proto, device_tag, + {machine_ids + ":" + device_ids}, nullptr, false)); + return Maybe::Ok(); + })); + return scope; +} + +} // namespace oneflow + +#endif // ONEFLOW_API_COMMON_SCOPE_H_ diff --git a/oneflow/api/cpp/env.cpp b/oneflow/api/cpp/env.cpp index 6919948af02..ea31af15a69 100644 --- a/oneflow/api/cpp/env.cpp +++ b/oneflow/api/cpp/env.cpp @@ -24,14 +24,17 @@ limitations under the License. 
#include #include #include "oneflow/api/cpp/env.h" +#include "oneflow/core/common/global.h" #include "oneflow/core/common/just.h" #include "oneflow/core/common/multi_client.h" #include "oneflow/core/common/optional.h" +#include "oneflow/core/framework/multi_client_session_context.h" #include "oneflow/core/framework/shut_down_util.h" #include "oneflow/core/job/cluster_instruction.h" #include "oneflow/core/job/env.pb.h" #include "oneflow/core/job/env_global_objects_scope.h" #include "oneflow/core/control/ctrl_bootstrap.h" +#include "oneflow/core/job/session.h" #include "oneflow/core/rpc/include/base.h" #include "oneflow/core/vm/vm_util.h" #include "oneflow/core/thread/thread_consistent_id.h" @@ -114,6 +117,12 @@ of::Maybe initEnv() { CompleteEnvProto(env_proto); of::Global::SetAllocated(new of::EnvGlobalObjectsScope()); JUST(of::Global::Get()->Init(env_proto)); + + of::ConfigProto config_proto; + config_proto.mutable_resource()->set_cpu_device_num(1); // useless, will be set in TryInit + config_proto.set_session_id(of::NewSessionId()); + of::Global::New(); + of::Global::Get()->TryInit(config_proto).GetOrThrow(); return of::Maybe::Ok(); } @@ -129,6 +138,8 @@ void release() { if (IsEnvInited()) { // sync multi_client of::vm::ClusterSync().GetOrThrow(); + of::Global::Get()->TryClose().GetOrThrow(); + of::Global::Delete(); // destory env if (of::IsMultiClient().GetOrThrow()) { OF_ENV_BARRIER(); @@ -137,7 +148,6 @@ void release() { } of::Global::Delete(); } - // TODO close session of::SetShuttingDown(); of::ResetThisThreadUniqueConsistentId().GetOrThrow(); } diff --git a/oneflow/api/cpp/framework.h b/oneflow/api/cpp/framework.h index 214ac3d2f5e..5d05fb65442 100644 --- a/oneflow/api/cpp/framework.h +++ b/oneflow/api/cpp/framework.h @@ -22,5 +22,6 @@ limitations under the License. #include "framework/dtype.h" #include "framework/tensor.h" #include "framework/ivalue.h" +#include "framework/graph.h" #endif // ONEFLOW_API_CPP_FRAMEWORK_H_ diff --git a/oneflow/api/cpp/framework/device.cpp b/oneflow/api/cpp/framework/device.cpp index dd291de0353..3be4b3f5a46 100644 --- a/oneflow/api/cpp/framework/device.cpp +++ b/oneflow/api/cpp/framework/device.cpp @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "oneflow/api/cpp/framework/device.h" -#include "oneflow/api/common/device.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/symbol.h" +#include "oneflow/core/framework/device.h" namespace oneflow_api { @@ -25,11 +25,11 @@ namespace of = oneflow; Device::Device(const std::string& type_or_type_with_device_id) : device_(std::make_shared>( - of::DeviceExportUtil::ParseAndNew(type_or_type_with_device_id).GetOrThrow())) {} + of::Device::ParseAndNew(type_or_type_with_device_id).GetOrThrow())) {} Device::Device(const std::string& type, int64_t device_id) - : device_(std::make_shared>( - of::DeviceExportUtil::New(type, device_id).GetOrThrow())) {} + : device_( + std::make_shared>(of::Device::New(type, device_id).GetOrThrow())) {} const std::string& Device::type() const { return (*device_)->type(); } diff --git a/oneflow/api/cpp/framework/device.h b/oneflow/api/cpp/framework/device.h index e45c7efb0f7..2a7e79b2a23 100644 --- a/oneflow/api/cpp/framework/device.h +++ b/oneflow/api/cpp/framework/device.h @@ -32,6 +32,7 @@ namespace oneflow_api { class Device final { friend class Tensor; + friend class Graph; public: explicit Device(const std::string& type_or_type_with_device_id); diff --git a/oneflow/api/cpp/framework/graph.cpp b/oneflow/api/cpp/framework/graph.cpp new file mode 100644 index 00000000000..1a3f14d55d6 --- /dev/null +++ b/oneflow/api/cpp/framework/graph.cpp @@ -0,0 +1,395 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/api/common/ofblob.h" +#include "oneflow/api/common/scope.h" +#include "oneflow/api/cpp/framework/device.h" +#include "oneflow/api/cpp/framework/graph.h" +#include "oneflow/api/cpp/framework/ivalue.h" +#include "oneflow/api/cpp/framework/shape.h" +#include "oneflow/api/cpp/framework/tensor.h" +#include "oneflow/api/common/job_build_and_infer_ctx.h" +#include "oneflow/api/python/job_build/job_build_and_infer.h" +#include "oneflow/core/common/data_type.pb.h" +#include "oneflow/core/common/global.h" +#include "oneflow/core/common/hash_container.h" +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/shape.h" +#include "oneflow/core/common/symbol.h" +#include "oneflow/core/common/util.h" +#include "oneflow/core/framework/device.h" +#include "oneflow/core/framework/dtype.h" +#include "oneflow/core/framework/multi_client_session_context.h" +#include "oneflow/core/framework/nn_graph.h" +#include "oneflow/core/framework/scope_util.h" +#include "oneflow/core/framework/tensor.h" +#include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/tensor_util.h" +#include "oneflow/core/functional/functional_api.yaml.h" +#include "oneflow/core/graph/op_graph.h" +#include "oneflow/core/job/job.pb.h" +#include "oneflow/core/job/job_build_and_infer_ctx.h" +#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" +#include "oneflow/core/job/job_conf.cfg.h" +#include "oneflow/core/job/job_conf.pb.h" +#include "oneflow/core/job/job_ir.h" +#include "oneflow/core/job/job_set.pb.h" +#include "oneflow/core/job/lazy_mode.h" +#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/job/scope.h" +#include "oneflow/core/job/session.h" +#include "oneflow/core/operator/interface_blob_conf.pb.h" +#include "oneflow/core/operator/op_conf.pb.h" +#include "oneflow/core/register/logical_blob_id.pb.h" + +namespace oneflow_api { + +namespace of = oneflow; + +enum class XrtKind : int { kNone = 0, kTensorRT = 1 }; + +namespace { + +class CompileScope { + public: + CompileScope(const of::JobConfigProto& job_config, const of::Device& device, XrtKind kind) { + const std::shared_ptr scope = CHECK_JUST(of::MakeScope(job_config, device)); + CHECK_JUST(of::ThreadLocalScopeStackPush(scope)); + + of::cfg::JobConfigProto job_config_cfg(job_config); + ConfigXrt(job_config_cfg, kind); + CHECK_JUST(of::JobBuildAndInferCtx_Open(job_config.job_name())); + CHECK_JUST(of::CurJobBuildAndInferCtx_SetJobConf(job_config_cfg)); + } + + ~CompileScope() { + CHECK_JUST(of::JobBuildAndInferCtx_Close()); + CHECK_JUST(of::ThreadLocalScopeStackPop()); + } + + private: + of::LazyMode::Guard lazy_mode_enabled_guard{true}; + + void ConfigXrt(of::cfg::JobConfigProto& job_config_cfg, XrtKind kind) { + if (kind == XrtKind::kTensorRT) { +#ifdef WITH_TENSORRT + *(job_config_cfg.mutable_xrt_config()->mutable_use_tensorrt()) = true; +#else + LOG(WARNING) << "XRT TensorRT is unavailable while tensorrt is enabled"; +#endif + } + } +}; + +std::shared_ptr ConvertToTensorTuple( + const std::vector>& tensors) { + auto tensor_tuple = std::make_shared(); + for (const auto& tensor : tensors) { tensor_tuple->emplace_back(tensor); } + return tensor_tuple; +} + +std::string GetDeviceTag(const Device& device) { + if (device.type() == "cuda") { + return "gpu"; + } else { + return "cpu"; + } +} + +template +const std::pair, std::vector> Unzip(const of::HashMap& hash_map) { + std::vector vec1; + std::vector vec2; + for (const auto& entry : hash_map) { + vec1.emplace_back(entry.first); + 
vec2.emplace_back(entry.second); + } + return std::make_pair(vec1, vec2); +} + +} // namespace + +class Graph::GraphImpl final { + public: + explicit GraphImpl(const std::string& model_path, const Device& device = Device("cpu")); + + GraphImpl(const GraphImpl& graph) = delete; + GraphImpl(GraphImpl&& graph) noexcept; + + ~GraphImpl() = default; + + GraphImpl& operator=(const GraphImpl& graph) = delete; + GraphImpl& operator=(GraphImpl&& graph) noexcept; + + std::vector Forward(const std::vector& inputs); + void set_batch_size(int batch_size) { batch_size_ = batch_size; } + void enable_tensorrt() { xrt_kind_ = XrtKind::kTensorRT; } + + private: + oneflow::Maybe Compile(const std::vector& inputs); + oneflow::Maybe> Run(const std::vector& inputs) const; + oneflow::Maybe AddOp(oneflow::OperatorConf op_conf); + oneflow::Maybe BuildGraph(const std::vector& inputs); + oneflow::Maybe LoadCheckpoint(); + oneflow::Maybe RegisterTensors(const std::vector& inputs); + + std::shared_ptr graph_ = nullptr; + std::string model_path_; + bool is_compiled_ = false; + int batch_size_ = 0; + XrtKind xrt_kind_ = XrtKind::kNone; + Device device_; + oneflow::Job job_; + + oneflow::HashMap input_name_to_order_; + oneflow::HashMap> output_name_to_tensor_; + oneflow::HashMap> variable_op_name_to_tensor_; + std::shared_ptr output_tensor_tuple_; + std::shared_ptr parameter_tensor_tuple_; +}; + +Graph::Graph(const std::string& model_path, const Device& device) + : graph_(std::make_unique(model_path, device)) {} + +Graph::~Graph() = default; + +Graph::Graph(Graph&& graph) noexcept : graph_(std::move(graph.graph_)) {} + +Graph& Graph::operator=(Graph&& graph) noexcept { + if (&graph == this) { return *this; } + graph_ = std::move(graph.graph_); + return *this; +} + +IValue Graph::Forward(const IValue& inputs) { + std::vector input_tensors; + if (inputs.IsNone()) { + // do nothing + } else if (inputs.IsTensor()) { + input_tensors.emplace_back(inputs.ToTensor()); + } else if (inputs.IsTensorVector()) { + input_tensors = inputs.ToTensorVector(); + } else { + LOG(WARNING) << "Graph currently only support types: Tensor/vector(Tensor)/None"; + } + + std::vector output_tensors = graph_->Forward(input_tensors); + if (output_tensors.empty()) { + return IValue{}; + } else if (output_tensors.size() == 1) { + return IValue(output_tensors.at(0)); + } else { + return IValue(output_tensors); + } +} + +void Graph::set_batch_size(int batch_size) { graph_->set_batch_size(batch_size); } + +void Graph::enable_tensorrt() { graph_->enable_tensorrt(); } + +Graph Graph::Load(const std::string& model_path, const Device& device) { + Graph graph(model_path, device); + return graph; +} + +Graph::GraphImpl::GraphImpl(const std::string& model_path, const Device& device) + : model_path_(model_path), device_(device) { + CHECK_JUST(of::LoadJobFromIR(&job_, model_path + "/model.mlir")); + job_.mutable_job_conf()->mutable_predict_conf(); + job_.mutable_job_conf()->set_job_name(job_.mutable_job_conf()->job_name() + of::NewUniqueId()); + graph_ = std::make_shared(job_.job_conf().job_name()); + of::Global::Get()->AddCGraph(graph_).GetOrThrow(); +} + +Graph::GraphImpl::GraphImpl(GraphImpl&& graph) noexcept + : graph_(std::move(graph.graph_)), + model_path_(std::move(graph.model_path_)), + is_compiled_(graph.is_compiled_), + batch_size_(graph.batch_size_), + xrt_kind_(graph.xrt_kind_), + device_(std::move(graph.device_)), + job_(std::move(graph.job_)), + input_name_to_order_(std::move(graph.input_name_to_order_)), + 
output_name_to_tensor_(std::move(graph.output_name_to_tensor_)), + variable_op_name_to_tensor_(std::move(graph.variable_op_name_to_tensor_)), + output_tensor_tuple_(std::move(graph.output_tensor_tuple_)), + parameter_tensor_tuple_(std::move(graph.parameter_tensor_tuple_)) {} + +Graph::GraphImpl& Graph::GraphImpl::operator=(Graph::GraphImpl&& graph) noexcept { + if (&graph == this) { return *this; } + graph_ = std::move(graph.graph_); + model_path_ = std::move(graph.model_path_); + is_compiled_ = graph.is_compiled_; + batch_size_ = graph.batch_size_; + xrt_kind_ = graph.xrt_kind_; + device_ = std::move(graph.device_); + job_ = std::move(graph.job_); + input_name_to_order_ = std::move(graph.input_name_to_order_); + output_name_to_tensor_ = std::move(graph.output_name_to_tensor_); + variable_op_name_to_tensor_ = std::move(graph.variable_op_name_to_tensor_); + output_tensor_tuple_ = std::move(graph.output_tensor_tuple_); + parameter_tensor_tuple_ = std::move(graph.parameter_tensor_tuple_); + return *this; +} + +std::vector Graph::GraphImpl::Forward(const std::vector& inputs) { + if (!is_compiled_) { + static std::mutex mtx; + std::lock_guard lock(mtx); + Compile(inputs).GetOrThrow(); + is_compiled_ = true; + } + return Run(inputs).GetOrThrow(); +} + +of::Maybe Graph::GraphImpl::Compile(const std::vector& inputs) { + JUST(BuildGraph(inputs)); + JUST(LoadCheckpoint()); + JUST(RegisterTensors(inputs)); + JUST(graph_->CompileAndInitRuntime()); + return of::Maybe::Ok(); +} + +of::Maybe> Graph::GraphImpl::Run(const std::vector& inputs) const { + const auto input_tensor_tuple = std::make_shared(); + for (const auto& tensor : inputs) { input_tensor_tuple->emplace_back(tensor.tensor_); } + + JUST(of::RunLazyNNGraph(*input_tensor_tuple, *output_tensor_tuple_, *parameter_tensor_tuple_, + graph_)); + JUST(of::SoftSyncNNGraphBuffers(*output_tensor_tuple_, graph_)); + + std::vector outputs; + for (const auto& tensor : *output_tensor_tuple_) { outputs.emplace_back(Tensor(tensor)); } + return outputs; +} + +of::Maybe Graph::GraphImpl::AddOp(of::OperatorConf op_conf) { + { + const std::shared_ptr scope = JUST(of::GetCurrentScope()); + op_conf.set_scope_symbol_id(scope->symbol_id().value_or(0)); + } + op_conf.set_device_tag(GetDeviceTag(device_)); + if (batch_size_ > 0 && op_conf.has_input_conf()) { + op_conf.mutable_input_conf()->mutable_blob_conf()->mutable_shape()->mutable_dim()->Set( + 0, batch_size_); + } + auto* ctx = JUST(of::GetCurInferCtx()); + JUST(ctx->AddAndInferConsistentOp(op_conf)); + return of::Maybe::Ok(); +} + +of::Maybe Graph::GraphImpl::BuildGraph(const std::vector& inputs) { + CompileScope build_graph_scope(job_.job_conf(), *device_.device_->shared_from_symbol(), + xrt_kind_); + { + int input_tensor_order = 0; + const of::OpGraph op_graph(job_); + op_graph.TopoForEachNode([&](const of::OpNode* node) -> of::Maybe { + const of::OperatorConf& op_conf = node->op().op_conf(); + JUST(AddOp(op_conf)); + if (op_conf.has_input_conf()) { + input_name_to_order_[op_conf.name()] = input_tensor_order; + input_tensor_order += 1; + } else if (op_conf.has_variable_conf()) { + const of::LazyMode::Guard lazy_mode_disabled_guard{false}; + const of::VariableOpConf& variable_conf = op_conf.variable_conf(); + variable_op_name_to_tensor_[op_conf.name()] = JUST(of::one::functional::Empty( + of::Shape(variable_conf.shape()), + JUST(of::DType::Get(static_cast(variable_conf.data_type()))), + *device_.device_)); + } + return of::Maybe::Ok(); + }); + } + JUST(of::CurJobBuildAndInferCtx_Complete()); + { + const 
std::shared_ptr complete_job = JUST(of::GetCurrentJob()); + const of::OpGraph complete_graph(*complete_job); + complete_graph.TopoForEachNode([&](const of::OpNode* node) -> of::Maybe { + const of::LazyMode::Guard lazy_mode_disabled_guard{false}; + const of::OperatorConf& op_conf = node->op().op_conf(); + if (op_conf.has_output_conf()) { + of::InterfaceBlobConf blob_conf = op_conf.output_conf().blob_conf(); + if (batch_size_ > 0) { + const std::string input_lbi_str = op_conf.output_conf().in(); + const of::LogicalBlobId input_lbi = of::GenLogicalBlobId(input_lbi_str); + int64_t batch_size = node->LogicalBlobDesc4Lbi(input_lbi).shape().At(0); + blob_conf.mutable_shape()->set_dim(0, batch_size); + } + output_name_to_tensor_[op_conf.name()] = JUST(of::one::functional::Empty( + of::Shape(blob_conf.shape()), + JUST(of::DType::Get(static_cast(blob_conf.data_type()))), + *device_.device_)); + } + return of::Maybe::Ok(); + }); + } + return of::Maybe::Ok(); +} + +of::Maybe Graph::GraphImpl::LoadCheckpoint() { + for (const auto& variable_op_name_and_tensor : variable_op_name_to_tensor_) { + const auto& variable_op_name = variable_op_name_and_tensor.first; + const auto& variable_tensor = variable_op_name_and_tensor.second; + const std::string variable_filename = model_path_ + "/" + variable_op_name + "/out"; + const std::string buffer = [&]() { + std::ifstream variable_file(variable_filename, std::ios::binary); + CHECK(variable_file.is_open()); + std::stringstream ss; + ss << variable_file.rdbuf(); + return ss.str(); + }(); + const auto& callback = + std::make_shared>([&](uint64_t of_blob_ptr) { + CHECK_JUST(of::BlobBufferCopyUtil::From( + of_blob_ptr, buffer.data(), + variable_tensor->shape()->elem_cnt() + * of::GetSizeOfDataType(variable_tensor->dtype()->data_type()))); + }); + JUST(of::one::SyncAccessTensorWithTimeOut(variable_tensor, callback, "mut")); + } + + return of::Maybe::Ok(); +} + +of::Maybe Graph::GraphImpl::RegisterTensors(const std::vector& inputs) { + { + std::vector input_op_names(inputs.size()); + std::vector> input_tensors(inputs.size()); + for (const auto& name_order : input_name_to_order_) { + input_op_names[name_order.second] = name_order.first; + input_tensors[name_order.second] = inputs.at(name_order.second).tensor_; + } + JUST(graph_->RegisterInputOpNamesAndTensors(input_op_names, input_tensors)); + } + { + const auto& pair = Unzip(output_name_to_tensor_); + const std::vector& output_op_names = pair.first; + const std::vector>& output_tensors = pair.second; + JUST(graph_->RegisterOutputOpNamesAndTensors(output_op_names, output_tensors)); + output_tensor_tuple_ = ConvertToTensorTuple(output_tensors); + } + { + const auto& pair = Unzip(variable_op_name_to_tensor_); + const std::vector& variable_op_names = pair.first; + const std::vector>& variable_tensors = pair.second; + JUST(graph_->RegisterVariableOpNamesAndTensors(variable_op_names, variable_tensors)); + parameter_tensor_tuple_ = ConvertToTensorTuple(variable_tensors); + } + return of::Maybe::Ok(); +} + +} // namespace oneflow_api diff --git a/oneflow/api/cpp/framework/graph.h b/oneflow/api/cpp/framework/graph.h new file mode 100644 index 00000000000..c2f690b642d --- /dev/null +++ b/oneflow/api/cpp/framework/graph.h @@ -0,0 +1,56 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef ONEFLOW_API_CPP_GRAPH_H_ +#define ONEFLOW_API_CPP_GRAPH_H_ + +#include "device.h" +#include "ivalue.h" +#include "tensor.h" + +namespace oneflow { + +class NNGraph; + +} // namespace oneflow + +namespace oneflow_api { + +class Graph { + public: + explicit Graph(const std::string& model_path, const Device& device = Device("cpu")); + ~Graph(); + + Graph(const Graph& graph) = delete; + Graph(Graph&& graph) noexcept; + + Graph& operator=(const Graph& graph) = delete; + Graph& operator=(Graph&& graph) noexcept; + + IValue Forward(const IValue& inputs); + void set_batch_size(int batch_size); + void enable_tensorrt(); + + static Graph Load(const std::string& model_path, const Device& device = Device("cpu")); + + private: + class GraphImpl; + std::unique_ptr graph_; +}; + +} // namespace oneflow_api + +#endif // ONEFLOW_API_CPP_GRAPH_H_ diff --git a/oneflow/api/cpp/framework/tensor.cpp b/oneflow/api/cpp/framework/tensor.cpp index ab01924b280..27b95f3ddbd 100644 --- a/oneflow/api/cpp/framework/tensor.cpp +++ b/oneflow/api/cpp/framework/tensor.cpp @@ -100,7 +100,7 @@ Tensor Tensor::from_buffer(const void* buffer, const Shape& shape, const Device& } template -void Tensor::copy_to(T* buffer) { +void Tensor::copy_to(T* buffer) const { std::shared_ptr local_tensor = tensor_->AsMirroredTensor().GetPtrOrThrow(); const auto shape = this->shape(); @@ -130,7 +130,7 @@ void Tensor::copy_to(T* buffer) { const std::shared_ptr& Tensor::__internal_tensor() const { return tensor_; } #define REGISTER_TENSOR_COPY_TO(cpp_dtype) \ - template void Tensor::copy_to(cpp_dtype * buffer); + template void Tensor::copy_to(cpp_dtype * buffer) const; REGISTER_TENSOR_COPY_TO(float) REGISTER_TENSOR_COPY_TO(double) diff --git a/oneflow/api/cpp/framework/tensor.h b/oneflow/api/cpp/framework/tensor.h index 2a99bca8c0f..08ac8daf488 100644 --- a/oneflow/api/cpp/framework/tensor.h +++ b/oneflow/api/cpp/framework/tensor.h @@ -33,6 +33,8 @@ class Tensor; namespace oneflow_api { class Tensor final { + friend class Graph; + public: explicit Tensor(const Shape& shape = Shape(), const Device& device = Device("cpu"), const DType& dtype = DType::kFloat); @@ -56,7 +58,7 @@ class Tensor final { [[nodiscard]] const std::shared_ptr& __internal_tensor() const; template - void copy_to(T* buffer); + void copy_to(T* buffer) const; [[nodiscard]] static Tensor from_buffer(const void* buffer, const Shape& shape, const Device& device, const DType& dtype); diff --git a/oneflow/api/cpp/tests/api_test.cpp b/oneflow/api/cpp/tests/api_test.cpp index 88619f3617b..1fbc790bcc2 100644 --- a/oneflow/api/cpp/tests/api_test.cpp +++ b/oneflow/api/cpp/tests/api_test.cpp @@ -15,7 +15,18 @@ limitations under the License. 
*/ #include "oneflow/api/cpp/tests/api_test.h" +#include #include +#include +#ifdef __linux__ + +#include // readlink + +#elif defined(__APPLE__) + +#include // _NSGetExecutablePath + +#endif namespace oneflow_api { @@ -47,4 +58,27 @@ REGISTER_RANDOM_DATA(int8_t) REGISTER_RANDOM_DATA(int32_t) REGISTER_RANDOM_DATA(int64_t) +std::string GetExeDir() { + const size_t path_max_size = 4096; // PATH_MAX = 4096 on linux + char result[path_max_size]; + + const auto get_dir_from_path = [](char result[], size_t count) -> std::string { + std::string exe_path(result, (count > 0) ? count : 0); + + // string(path).rfind('/') will never be string::npos on linux or macos. + return exe_path.substr(0, exe_path.rfind('/')); + }; + +#ifdef __linux__ + ssize_t count = readlink("/proc/self/exe", result, path_max_size); + return get_dir_from_path(result, count); +#elif defined(__APPLE__) + uint32_t count = path_max_size; + CHECK_EQ(_NSGetExecutablePath(result, &count), 0) << "Fail to get executable file path."; + return get_dir_from_path(result, count); +#else +#error oneflow_api::GetExeDir() has not been supported on windows. +#endif +} + } // namespace oneflow_api diff --git a/oneflow/api/cpp/tests/api_test.h b/oneflow/api/cpp/tests/api_test.h index cec50969e69..c196bc90662 100644 --- a/oneflow/api/cpp/tests/api_test.h +++ b/oneflow/api/cpp/tests/api_test.h @@ -32,6 +32,8 @@ Shape RandomShape(); template std::vector RandomData(size_t size); +std::string GetExeDir(); + } // namespace oneflow_api #endif // !ONEFLOW_API_CPP_TESTS_API_TEST_H_ diff --git a/oneflow/api/cpp/tests/graph_test.cpp b/oneflow/api/cpp/tests/graph_test.cpp new file mode 100644 index 00000000000..497da6b1bbb --- /dev/null +++ b/oneflow/api/cpp/tests/graph_test.cpp @@ -0,0 +1,197 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "oneflow/api/cpp/framework.h" +#include "oneflow/api/cpp/tests/api_test.h" + +namespace oneflow_api { + +namespace { + +inline Graph LoadGraph(const Device& device) { + Graph graph = + Graph::Load("./oneflow/api/cpp/tests/graph_test_model/affine_with_parameter", device); + return graph; +} + +inline void Forward(Graph& graph, const Device& device, int expected_batch_dim = 1) { + std::vector data(expected_batch_dim * 3); + std::fill(data.begin(), data.end(), 1); + std::vector inputs; + inputs.emplace_back( + Tensor::from_buffer(data.data(), Shape({expected_batch_dim, 3}), device, DType::kFloat)); + const auto& value = graph.Forward(inputs); + ASSERT_TRUE(value.IsTensor()); + Tensor output = value.ToTensor(); + Shape shape = output.shape(); + ASSERT_EQ(shape.At(0), expected_batch_dim); + ASSERT_EQ(shape.At(1), 4); + std::vector buf(expected_batch_dim * 4); + output.copy_to(buf.data()); + for (const float& element : buf) { ASSERT_EQ(element, 4); } +} + +} // namespace + +TEST(Api, graph_cpu_test) { + EnvScope scope; + Device device("cpu"); + Graph graph = LoadGraph(device); + Forward(graph, device, 1); +} + +#ifdef WITH_CUDA +TEST(Api, graph_gpu_test) { + EnvScope scope; + Device device("cuda", 0); + Graph graph = LoadGraph(device); + Forward(graph, device); +} + +TEST(Api, graph_multi_gpu_test) { + EnvScope scope; + Device device("cuda", 0); + Graph graph = LoadGraph(device); + Forward(graph, device); + + Device device1("cuda", 1); + Graph graph1 = LoadGraph(device1); + Forward(graph1, device1); +} + +TEST(Api, graph_trt_test) { + EnvScope scope; + Device device("cuda:0"); + Graph graph = LoadGraph(device); + graph.enable_tensorrt(); + Forward(graph, device); +} +#endif + +TEST(Api, graph_cpu_batching_test) { + EnvScope scope; + Device device("cpu"); + Graph graph = LoadGraph(device); + graph.set_batch_size(10); + Forward(graph, device, 10); +} + +#ifdef WITH_CUDA +TEST(Api, graph_gpu_batching_test) { + EnvScope scope; + Device device("cuda", 0); + Graph graph = LoadGraph(device); + graph.set_batch_size(10); + Forward(graph, device, 10); +} + +TEST(Api, graph_multi_device_test) { + EnvScope scope; + Device device("cuda", 0); + Graph graph = LoadGraph(device); + Forward(graph, device, 1); + + Device device1("cuda", 1); + Graph graph1 = LoadGraph(device1); + Forward(graph1, device1, 1); + + Device device2("cpu"); + Graph graph2 = LoadGraph(device2); + Forward(graph2, device2, 1); +} + +TEST(Api, graph_unload_test) { + { + EnvScope scope; + + Device device("cuda", 0); + Graph graph = LoadGraph(device); + Forward(graph, device, 1); + + { + Device device1("cuda", 1); + Graph graph1 = LoadGraph(device1); + Forward(graph1, device1, 1); + } + + Device device2("cpu"); + Graph graph2 = LoadGraph(device2); + Forward(graph2, device2, 1); + } + + { + EnvScope scope; + + Device device("cpu"); + Graph graph = LoadGraph(device); + Forward(graph, device, 1); + } +} +#endif + +TEST(Api, graph_thread_test) { + EnvScope scope; + + Device device("cpu"); + std::vector graphs; + for (int i = 0; i < 10; i++) { graphs.emplace_back(LoadGraph(device)); } + + std::vector threads; + for (Graph& graph : graphs) { + threads.emplace_back(std::thread(std::bind(Forward, std::move(graph), device, 1))); + } + for (auto& thread : threads) { thread.join(); } +} + +TEST(Api, graph_input_order_test) { + EnvScope scope; + + Device device("cpu"); + Graph graph = 
Graph::Load("./oneflow/api/cpp/tests/graph_test_model/affine_no_parameter", device); + + std::vector inputs; + std::vector x(3); + std::fill(x.begin(), x.end(), 1); + inputs.emplace_back(Tensor::from_buffer(x.data(), Shape({1, 3}), device, DType::kFloat)); + std::vector a(3 * 2); + std::fill(a.begin(), a.end(), 1); + inputs.emplace_back(Tensor::from_buffer(a.data(), Shape({3, 2}), device, DType::kFloat)); + std::vector b(2); + std::fill(b.begin(), b.end(), 1); + inputs.emplace_back(Tensor::from_buffer(b.data(), Shape({2}), device, DType::kFloat)); + + const auto& value = graph.Forward(inputs); + ASSERT_TRUE(value.IsTensor()); + Tensor output = value.ToTensor(); + Shape shape = output.shape(); + ASSERT_EQ(shape.At(0), 1); + ASSERT_EQ(shape.At(1), 2); + std::array buf{}; + output.copy_to(buf.data()); + ASSERT_EQ(buf[0], 4); + ASSERT_EQ(buf[1], 4); +} + +} // namespace oneflow_api diff --git a/oneflow/api/cpp/tests/graph_test_model/affine_no_parameter/model.mlir b/oneflow/api/cpp/tests/graph_test_model/affine_no_parameter/model.mlir new file mode 100644 index 00000000000..30c09f7c841 --- /dev/null +++ b/oneflow/api/cpp/tests/graph_test_model/affine_no_parameter/model.mlir @@ -0,0 +1,11 @@ +module { + oneflow.job @MyGraph_1(%arg0: tensor<1x3xf32>, %arg1: tensor<3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x2xf32> { + %output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-input_0", output_lbns = ["_MyGraph_1-input_0/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [1 : si64, 3 : si64]} : (tensor<1x3xf32>) -> tensor<1x3xf32> + %output_0 = "oneflow.input"(%arg1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-input_1", output_lbns = ["_MyGraph_1-input_1/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [3 : si64, 2 : si64]} : (tensor<3x2xf32>) -> tensor<3x2xf32> + %output_1 = "oneflow.input"(%arg2) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-input_2", output_lbns = ["_MyGraph_1-input_2/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [2 : si64]} : (tensor<2xf32>) -> tensor<2xf32> + %0 = "oneflow.matmul"(%output, %output_0) {alpha = 1.000000e+00 : f64, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-matmul_0", output_lbns = ["model-matmul_0/out_0"], scope_symbol_id = 4611686018427535359 : i64, transpose_a = false, transpose_b = false} : (tensor<1x3xf32>, tensor<3x2xf32>) -> tensor<1x2xf32> + %1 = "oneflow.broadcast_add"(%0, %output_1) {device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-broadcast_add_1", output_lbns = ["model-broadcast_add_1/z_0"], scope_symbol_id = 4611686018427535359 : i64} : (tensor<1x2xf32>, tensor<2xf32>) -> tensor<1x2xf32> + %output_2 = "oneflow.output"(%1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-output_0", output_lbns = ["_MyGraph_1-output_0/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [1 : si64, 2 : si64]} : (tensor<1x2xf32>) -> tensor<1x2xf32> + oneflow.return %output_2 : tensor<1x2xf32> + } +} diff --git a/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.a/meta b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.a/meta new file mode 
100644 index 00000000000..421341fc956 --- /dev/null +++ b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.a/meta @@ -0,0 +1,5 @@ +shape { + dim: 3 + dim: 4 +} +data_type: kFloat diff --git a/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.a/out b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.a/out new file mode 100644 index 00000000000..be22e342567 Binary files /dev/null and b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.a/out differ diff --git a/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.b/meta b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.b/meta new file mode 100644 index 00000000000..166375025be --- /dev/null +++ b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.b/meta @@ -0,0 +1,4 @@ +shape { + dim: 4 +} +data_type: kFloat diff --git a/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.b/out b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.b/out new file mode 100644 index 00000000000..dcce8bfb97e Binary files /dev/null and b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.b/out differ diff --git a/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.mlir b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.mlir new file mode 100644 index 00000000000..15a53af1f48 --- /dev/null +++ b/oneflow/api/cpp/tests/graph_test_model/affine_with_parameter/model.mlir @@ -0,0 +1,11 @@ +module { + oneflow.job @MyGraph_0(%arg0: tensor<1x3xf32>) -> tensor<1x4xf32> { + %output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_0-input_0", output_lbns = ["_MyGraph_0-input_0/out"], scope_symbol_id = 4611686018427469823 : i64, shape = [1 : si64, 3 : si64]} : (tensor<1x3xf32>) -> tensor<1x3xf32> + %output_0 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], nd_sbp = ["B"], op_name = "model.a", output_lbns = ["model.a/out"], scope_symbol_id = 4611686018427482111 : i64, shape = [3 : si64, 4 : si64]} : () -> tensor<3x4xf32> + %output_1 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], nd_sbp = ["B"], op_name = "model.b", output_lbns = ["model.b/out"], scope_symbol_id = 4611686018427494399 : i64, shape = [4 : si64]} : () -> tensor<4xf32> + %0 = "oneflow.matmul"(%output, %output_0) {alpha = 1.000000e+00 : f64, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-matmul_0", output_lbns = ["model-matmul_0/out_0"], scope_symbol_id = 4611686018427486207 : i64, transpose_a = false, transpose_b = false} : (tensor<1x3xf32>, tensor<3x4xf32>) -> tensor<1x4xf32> + %1 = "oneflow.broadcast_add"(%0, %output_1) {device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-broadcast_add_1", output_lbns = ["model-broadcast_add_1/z_0"], scope_symbol_id = 4611686018427486207 : i64} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %output_2 = "oneflow.output"(%1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_0-output_0", output_lbns = ["_MyGraph_0-output_0/out"], scope_symbol_id = 4611686018427469823 : i64, shape = [1 : si64, 4 : si64]} : (tensor<1x4xf32>) -> tensor<1x4xf32> + oneflow.return %output_2 : tensor<1x4xf32> + } +} diff --git 
a/oneflow/api/cpp/tests/tensor_test.cpp b/oneflow/api/cpp/tests/tensor_test.cpp index 3241a220263..5960f961675 100644 --- a/oneflow/api/cpp/tests/tensor_test.cpp +++ b/oneflow/api/cpp/tests/tensor_test.cpp @@ -26,13 +26,13 @@ TEST(Api, device) { ASSERT_EQ(device.type(), "cpu"); #ifdef WITH_CUDA - device = Device("cuda", 1); + device = Device("cuda:0"); ASSERT_EQ(device.type(), "cuda"); - ASSERT_EQ(device.device_id(), 1); + ASSERT_EQ(device.device_id(), 0); - device = Device("cuda:2"); + device = Device("cuda", 1); ASSERT_EQ(device.type(), "cuda"); - ASSERT_EQ(device.device_id(), 2); + ASSERT_EQ(device.device_id(), 1); #endif } diff --git a/oneflow/api/python/framework/device.cpp b/oneflow/api/python/framework/device.cpp index f843d8ccb74..06d5a733bfb 100644 --- a/oneflow/api/python/framework/device.cpp +++ b/oneflow/api/python/framework/device.cpp @@ -16,7 +16,6 @@ limitations under the License. #include #include #include "oneflow/core/control/global_process_ctx.h" -#include "oneflow/api/common/device.h" #include "oneflow/api/python/of_api_registry.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/common/str_util.h" @@ -29,10 +28,10 @@ namespace oneflow { ONEFLOW_API_PYBIND11_MODULE("", m) { py::class_, std::shared_ptr>>(m, "device") .def(py::init([](const std::string& type_or_type_with_device_id) { - return DeviceExportUtil::ParseAndNew(type_or_type_with_device_id).GetOrThrow(); + return Device::ParseAndNew(type_or_type_with_device_id).GetOrThrow(); })) .def(py::init([](const std::string& type, int64_t device_id) { - return DeviceExportUtil::New(type, device_id).GetOrThrow(); + return Device::New(type, device_id).GetOrThrow(); })) .def_property_readonly("type", [](const Symbol& d) { return d->type(); }) .def_property_readonly("index", [](const Symbol& d) { return d->device_id(); }) diff --git a/oneflow/api/python/functional/python_arg.cpp b/oneflow/api/python/functional/python_arg.cpp index 15215102162..9eb31f214f0 100644 --- a/oneflow/api/python/functional/python_arg.cpp +++ b/oneflow/api/python/functional/python_arg.cpp @@ -15,8 +15,6 @@ limitations under the License. 
*/ #include "oneflow/api/python/functional/python_arg.h" - -#include "oneflow/api/common/device.h" #include "oneflow/api/python/functional/common.h" #include "oneflow/api/python/functional/indexing.h" #include "oneflow/core/common/scalar.h" @@ -126,7 +124,7 @@ template<> Maybe> PythonArg::ObjectAs>() const { if (PyStringCheck(object_)) { const char* device_str = JUST(PyStringAsString(object_)); - return DeviceExportUtil::ParseAndNew(device_str); + return Device::ParseAndNew(device_str); } return PyUnpackDevice(object_); } diff --git a/oneflow/api/python/ir.cpp b/oneflow/api/python/ir.cpp index 5840cb9d716..422242d37c4 100644 --- a/oneflow/api/python/ir.cpp +++ b/oneflow/api/python/ir.cpp @@ -28,9 +28,6 @@ ONEFLOW_API_PYBIND11_MODULE("ir", m) { [](const std::string& lib_path) { MutSharedLibPaths()->insert(lib_path); }); } -REGISTER_JOB_PASS("IRRoundTripBeforeAD", IRRoundTrip); -REGISTER_JOB_PASS("IRRoundTrip", IRRoundTrip); - } // namespace oneflow #endif // WITH_MLIR diff --git a/oneflow/core/framework/device.cpp b/oneflow/core/framework/device.cpp index 52bb9a8e5d2..2dd191ec638 100644 --- a/oneflow/core/framework/device.cpp +++ b/oneflow/core/framework/device.cpp @@ -36,6 +36,14 @@ inline size_t HashDevice(const std::string& type, int64_t device_id) { return std::hash()(type) ^ std::hash()(device_id); } +void CheckDeviceType(const std::string& type) { + if (Device::type_supported.find(type) == Device::type_supported.end()) { + std::string error_msg = + "Expected one of cpu, cuda device type at start of device string " + type; + throw std::runtime_error(error_msg); + } +} + } // namespace Device::Device(const std::string& type, int64_t device_id) @@ -83,6 +91,19 @@ Maybe Device::Init() { return New(type, GlobalProcessCtx::LocalRank()); } +/* static */ Maybe> Device::ParseAndNew( + const std::string& type_or_type_with_device_id) { + std::string type; + int device_id = -1; + JUST(ParsingDeviceTag(type_or_type_with_device_id, &type, &device_id)); + CheckDeviceType(type); + if (device_id == -1) { + return Device::New(type); + } else { + return Device::New(type, device_id); + } +} + Maybe Device::of_type() const { static const HashMap type2device_tag{ {"cpu", "cpu"}, diff --git a/oneflow/core/framework/device.h b/oneflow/core/framework/device.h index f667c6c476f..399dcb6bd3b 100644 --- a/oneflow/core/framework/device.h +++ b/oneflow/core/framework/device.h @@ -57,6 +57,7 @@ class Device final { static Maybe> ThreadLocalGetOrNew(const std::string& type, int64_t device_id); static Maybe> New(const std::string& type, int64_t device_id); static Maybe> New(const std::string& type); + static Maybe> ParseAndNew(const std::string& type_or_type_with_device_id); static Maybe> MakeDeviceByParallelDesc(const ParallelDesc& parallel_desc); static const std::unordered_set type_supported; diff --git a/oneflow/core/framework/random_generator_impl.cpp b/oneflow/core/framework/random_generator_impl.cpp index a99b93bf686..888cccaf5d6 100644 --- a/oneflow/core/framework/random_generator_impl.cpp +++ b/oneflow/core/framework/random_generator_impl.cpp @@ -18,6 +18,7 @@ limitations under the License. 
#include "oneflow/core/common/util.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/instructions_builder.h" +#include "oneflow/core/framework/tensor_util.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/job/env_global_objects_scope.h" #include "oneflow/core/register/ofblob.h" @@ -33,17 +34,6 @@ namespace one { namespace { -Maybe SyncAccessTensorWithTimeOut( - const std::shared_ptr& tensor, - const std::shared_ptr>& callback, const std::string& modifier) { - return SpinCounter::SpinWait(1, [&](const std::shared_ptr& sc) -> Maybe { - return PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { - return builder->SyncAccessBlobByCallback(JUST(tensor->AsMirroredTensor()), sc, callback, - modifier); - }); - }); -} - Maybe CPUSynchronize() { if (Global::Get() != nullptr) { return vm::CurrentRankSync(); } return Maybe::Ok(); diff --git a/oneflow/core/framework/tensor_util.cpp b/oneflow/core/framework/tensor_util.cpp new file mode 100644 index 00000000000..6a615e25173 --- /dev/null +++ b/oneflow/core/framework/tensor_util.cpp @@ -0,0 +1,36 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/tensor_util.h" + +#include "oneflow/core/common/spin_counter.h" +#include "oneflow/core/framework/instructions_builder.h" + +namespace oneflow { +namespace one { + +Maybe SyncAccessTensorWithTimeOut( + const std::shared_ptr& tensor, + const std::shared_ptr>& callback, const std::string& modifier) { + return SpinCounter::SpinWait(1, [&](const std::shared_ptr& sc) -> Maybe { + return PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { + return builder->SyncAccessBlobByCallback(JUST(tensor->AsMirroredTensor()), sc, callback, + modifier); + }); + }); +} + +} // namespace one +} // namespace oneflow diff --git a/oneflow/core/framework/tensor_util.h b/oneflow/core/framework/tensor_util.h new file mode 100644 index 00000000000..ec9502230aa --- /dev/null +++ b/oneflow/core/framework/tensor_util.h @@ -0,0 +1,29 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include + +#include "oneflow/core/common/maybe.h" + +namespace oneflow { +namespace one { + +class Tensor; + +Maybe SyncAccessTensorWithTimeOut( + const std::shared_ptr& tensor, + const std::shared_ptr>& callback, const std::string& modifier); +} // namespace one +} // namespace oneflow diff --git a/oneflow/core/functional/impl/eye_functor.cpp b/oneflow/core/functional/impl/eye_functor.cpp index 7e47e77a2fd..4de53d1be8f 100644 --- a/oneflow/core/functional/impl/eye_functor.cpp +++ b/oneflow/core/functional/impl/eye_functor.cpp @@ -19,6 +19,7 @@ limitations under the License. #include "oneflow/core/common/scalar.h" #include "oneflow/core/common/throw.h" #include "oneflow/core/common/util.h" +#include "oneflow/core/framework/device.h" #include "oneflow/core/framework/attr_map.h" #include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/op_builder.h" @@ -32,7 +33,6 @@ limitations under the License. #include "oneflow/core/functional/impl/common.h" #include "oneflow/core/job/lazy_mode.h" #include "oneflow/core/job/sbp_parallel.h" -#include "oneflow/api/common/device.h" namespace oneflow { namespace one { @@ -66,7 +66,7 @@ class EyeDeviceStrFunctor { Maybe operator()(const Scalar& rows, const Optional& cols, const Symbol& dtype, const std::string& device, const bool& requires_grad) const { - const Symbol& dev = JUST(DeviceExportUtil::ParseAndNew(device)); + const Symbol& dev = JUST(Device::ParseAndNew(device)); return JUST(functional::Eye(rows, cols, dtype, dev, requires_grad)); } }; diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 7e200b3b6f8..a8d63068c46 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -22,6 +22,7 @@ limitations under the License. #include "oneflow/core/framework/op_interpreter/op_interpreter_util.h" #include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/tensor_util.h" #include "oneflow/core/framework/op_interpreter.h" #include "oneflow/core/framework/random_generator.h" #include "oneflow/core/functional/functional.h" @@ -1597,17 +1598,6 @@ class FoldFunctor { std::shared_ptr fold_op_; }; -Maybe SyncAccessTensorWithTimeOut( - const std::shared_ptr& tensor, - const std::shared_ptr>& callback, const std::string& modifier) { - return SpinCounter::SpinWait(1, [&](const std::shared_ptr& sc) -> Maybe { - return PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { - return builder->SyncAccessBlobByCallback(JUST(tensor->AsMirroredTensor()), sc, callback, - modifier); - }); - }); -} - class OneHotFunctor { public: OneHotFunctor() { diff --git a/oneflow/core/functional/tensor_index.cpp b/oneflow/core/functional/tensor_index.cpp index c846ab992bf..dbc4d159998 100644 --- a/oneflow/core/functional/tensor_index.cpp +++ b/oneflow/core/functional/tensor_index.cpp @@ -19,6 +19,7 @@ limitations under the License. 
#include "oneflow/core/framework/device.h" #include "oneflow/core/framework/instructions_builder.h" #include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/framework/tensor_util.h" #include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/job/sbp_parallel.h" @@ -30,17 +31,6 @@ namespace functional { namespace { -Maybe SyncAccessTensorWithTimeOut( - const std::shared_ptr& tensor, - const std::shared_ptr>& callback, const std::string& modifier) { - return SpinCounter::SpinWait(1, [&](const std::shared_ptr& sc) -> Maybe { - return PhysicalRun([&](InstructionsBuilder* builder) -> Maybe { - return builder->SyncAccessBlobByCallback(JUST(tensor->AsMirroredTensor()), sc, callback, - modifier); - }); - }); -} - int64_t CountSpecifiedDims(const TensorIndex& index) { int64_t specified_ndims = 0; for (int i = 0; i < index.size(); ++i) { diff --git a/python/oneflow/framework/check_point_v2.py b/python/oneflow/framework/check_point_v2.py index 5966ac5ba1a..1bb32572b4e 100644 --- a/python/oneflow/framework/check_point_v2.py +++ b/python/oneflow/framework/check_point_v2.py @@ -29,6 +29,8 @@ import oneflow.core.framework.variable_meta_info_pb2 as variable_meta_info_pb import oneflow.framework.dtype as dtype_util import oneflow.framework.id_util as id_util +from oneflow.framework.tensor import Tensor +import oneflow.nn.graph.graph as graph_util import pickle SNAPSHOT_DONE_FILENAME = "snapshot_done" @@ -120,10 +122,6 @@ def _save_tensor_to_disk(tensor: "oneflow.Tensor", dir_name: Union[str, Path]) - ValueContainer = Union[FileBackendVariableBlob, np.ndarray, "oneflow.Tensor"] -def _ElemCnt(shape): - return np.prod(shape).astype(int).item() - - def _LoadSingleVariable( path: Optional[str], consistent_src_rank: Optional[int] = None ) -> "flow.Tensor": @@ -205,6 +203,11 @@ def tensor_setstate(self, pickle_dict): ) +def RegisterMethods(): + Tensor.__setstate__ = tensor_setstate + Tensor.__getstate__ = tensor_getstate + + def legacy_load( path: Union[str, Path], consistent_src_rank: Optional[int] = None, ) -> Dict[str, "flow.Tensor"]: @@ -303,12 +306,12 @@ def save( """ path: Path = Path(path) - if isinstance(obj, oneflow.nn.Graph): - graph: oneflow.nn.Graph = obj + if isinstance(obj, graph_util.Graph): + graph: graph_util.Graph = obj if not graph._is_compiled: raise RuntimeError("graph must be compiled first.") - os.makedirs(path, exist_ok=True) + path.mkdir(exist_ok=True) serialized_job = str(text_format.MessageToString(graph._forward_job_proto)) oneflow._oneflow_internal.nn.graph.SaveJobToIR(serialized_job, str(path)) @@ -328,11 +331,5 @@ def save( pickle_path.write_bytes(pickled_bytes) -def generate_values_by_initializer(initializer, shape, dtype): - np_dtype = np.dtype(dtype_util.convert_oneflow_dtype_to_numpy_dtype(dtype)) - length = _ElemCnt(shape) - return np.array(initializer(length)).astype(np_dtype).reshape(shape) - - save_load_path = None consistent_src_dsk_rank = None diff --git a/python/oneflow/framework/register_class_method_util.py b/python/oneflow/framework/register_class_method_util.py index fce74ed0ca5..3bc4f78d00e 100644 --- a/python/oneflow/framework/register_class_method_util.py +++ b/python/oneflow/framework/register_class_method_util.py @@ -14,6 +14,7 @@ limitations under the License. 
""" import oneflow._oneflow_internal +import oneflow.framework.check_point_v2 as check_point_v2 import oneflow.framework.generator as generator import oneflow.framework.op_expr_util as op_expr_util import oneflow.framework.tensor as tensor_util @@ -21,4 +22,5 @@ def RegisterMethod4Class(): tensor_util.RegisterMethods() + check_point_v2.RegisterMethods() op_expr_util.RegisterMethod4UserOpExpr() diff --git a/python/oneflow/framework/tensor.py b/python/oneflow/framework/tensor.py index 8fd569ac385..75afe06267e 100644 --- a/python/oneflow/framework/tensor.py +++ b/python/oneflow/framework/tensor.py @@ -15,7 +15,6 @@ """ import oneflow as flow from oneflow._oneflow_internal.exception import IndexException -import oneflow.framework.check_point_v2 as check_point_v2 import oneflow.framework.tensor_str as tensor_str_util import oneflow.ops.initializer_util as initializer_util import oneflow._oneflow_internal.lazy_mode as lazy_mode @@ -694,7 +693,7 @@ def _init_by_initializer_conf(tensor, initializer_conf, random_seed=None): shape = tuple(tensor.shape) initializer = initializer_util.GetInitializer(initializer_conf, random_seed, shape) - np_arr = check_point_v2.generate_values_by_initializer( + np_arr = initializer_util.generate_values_by_initializer( initializer, shape, tensor.dtype ) if tensor.is_consistent: @@ -762,8 +761,6 @@ def RegisterMethods(): Tensor.backward = _backward Tensor.__getitem__ = _getitem Tensor.__setitem__ = _setitem - Tensor.__setstate__ = check_point_v2.tensor_setstate - Tensor.__getstate__ = check_point_v2.tensor_getstate Tensor.__str__ = _str Tensor.__repr__ = _repr Tensor.__eq__ = _eq diff --git a/python/oneflow/ops/initializer_util.py b/python/oneflow/ops/initializer_util.py index 36aaf7f4afc..2a21173eb95 100644 --- a/python/oneflow/ops/initializer_util.py +++ b/python/oneflow/ops/initializer_util.py @@ -22,6 +22,7 @@ import oneflow as flow import oneflow.core.job.initializer_conf_pb2 as initializer_conf_util import oneflow.core.operator.op_conf_pb2 as op_conf_util +import oneflow.framework.dtype as dtype_util def constant_initializer( @@ -1206,3 +1207,13 @@ def EmptyInitializerImpl( var_blob_shape: Sequence[int], ): return None + + +def _elem_cnt(shape): + return np.prod(shape).astype(int).item() + + +def generate_values_by_initializer(initializer, shape, dtype): + np_dtype = np.dtype(dtype_util.convert_oneflow_dtype_to_numpy_dtype(dtype)) + length = _elem_cnt(shape) + return np.array(initializer(length)).astype(np_dtype).reshape(shape)