From 1d5e196d8530ffd2b9bf781abcf168b94ff9ca41 Mon Sep 17 00:00:00 2001
From: Cijie Xia
Date: Tue, 26 Apr 2022 11:17:40 +0800
Subject: [PATCH] Revert "Merge branch 'master' into fea/graph_check_msg"

This reverts commit 28833b73a8041463e5e3d130784be386ee248bd8, reversing
changes made to baadf6045f2fce69c090e442a755229c1c949773.
---
 .github/workflows/canary.yml | 2 +-
 .github/workflows/on_merge.yml | 2 +-
 .github/workflows/release.yml | 6 +-
 .github/workflows/simple.yml | 10 +-
 .github/workflows/test.yml | 80 +-
 .gitignore | 2 -
 CMakeLists.txt | 3 +-
 README.md | 1 -
 ci/check/run_clang_tidy.py | 15 +-
 .../expensive_generic_test_multi_client.sh | 1 -
 ci/test/generic_test_multi_client.sh | 1 -
 ci/test/multi_client_exception_test.sh | 4 -
 ci/test/test_speed_multi_client.sh | 6 +-
 cmake/oneflow.cmake | 11 +-
 cmake/third_party/nccl.cmake | 4 +-
 cmake/util.cmake | 1 +
 docs/source/functional.rst | 4 -
 docs/source/nn.rst | 1 -
 docs/source/one_embedding.rst | 8 +-
 docs/source/oneflow.rst | 8 +-
 docs/source/tensor.rst | 8 -
 .../api/python/autograd/autograd_function.cpp | 18 +-
 oneflow/api/python/autograd/function_node.cpp | 6 +-
 oneflow/api/python/caster/tensor.h | 102 --
 oneflow/api/python/exception/exception.h | 54 -
 oneflow/api/python/framework/doc.cpp | 18 +-
 .../api/python/framework/one_embedding.cpp | 359 -------
 .../framework/one_embedding_handler.cpp | 79 ++
 oneflow/api/python/framework/tensor.cpp | 722 +++----------
 oneflow/api/python/framework/tensor.h | 52 -
 oneflow/api/python/functional/common.cpp | 95 +-
 oneflow/api/python/functional/common.h | 85 +-
 oneflow/api/python/functional/function_def.h | 9 +-
 oneflow/api/python/functional/indexing.cpp | 113 ++-
 oneflow/api/python/functional/indexing.h | 4 +-
 oneflow/api/python/functional/py_function.cpp | 124 +++
 oneflow/api/python/functional/py_function.h | 156 +++
 oneflow/api/python/functional/python_arg.cpp | 156 +--
 oneflow/api/python/functional/python_arg.h | 107 +-
 .../python/functional/python_arg_parser.cpp | 122 ---
 .../api/python/functional/python_arg_parser.h | 108 --
 oneflow/api/python/functional/python_frame.h | 90 --
 oneflow/api/python/functional/tensor_api.cpp | 16 +-
 oneflow/api/python/functional/unpack_call.h | 84 ++
 oneflow/api/python/functional/value_types.cpp | 5 +-
 oneflow/api/python/functional/value_types.h | 2 +-
 oneflow/api/python/of_api_registry.h | 1 -
 oneflow/api/python/utils/tensor_utils.cpp | 13 +-
 oneflow/api/python/utils/tensor_utils.h | 15 +-
 oneflow/core/autograd/autograd_engine.cpp | 241 +++--
 oneflow/core/autograd/autograd_engine.h | 101 +-
 .../autograd/gradient_funcs/activation.cpp | 43 -
 .../autograd/gradient_funcs/avg_pooling.cpp | 7 +-
 .../gradient_funcs/consistent_cast.cpp | 6 +-
 .../consistent_to_consistent.cpp | 4 +-
 .../gradient_funcs/partial_fc_sample.cpp | 2 +-
 .../core/autograd/gradient_funcs/pooling.cpp | 5 +-
 oneflow/core/common/check_level.cpp | 31 -
 oneflow/core/common/check_level.h | 19 +-
 oneflow/core/common/just.h | 2 +-
 oneflow/core/device/nccl_util.h | 1 -
 oneflow/core/eager/eager_blob_object.cpp | 3 +-
 .../core/embedding/cached_key_value_store.cu | 13 +-
 oneflow/core/embedding/embedding_manager.cpp | 6 +-
 oneflow/core/embedding/hash_functions.cuh | 30 +-
 .../core/embedding/key_value_store_options.h | 6 +-
 .../core/embedding/key_value_store_test.cpp | 17 -
 .../core/embedding/mock_key_value_store.cu | 249 -----
 oneflow/core/embedding/mock_key_value_store.h | 40 -
 oneflow/core/embedding/persistent_table.cpp | 98 +-
 oneflow/core/embedding/persistent_table.h | 1 -
oneflow/core/ep/cpu/cpu_device_manager.cpp | 2 +- oneflow/core/ep/cpu/cpu_device_manager.h | 4 +- oneflow/core/ep/cpu/cpu_stream.cpp | 2 +- oneflow/core/ep/cpu/cpu_stream.h | 8 +- oneflow/core/ep/cpu/primitive/add.cpp | 2 +- .../broadcast_elementwise_binary.cpp | 2 +- oneflow/core/ep/cpu/primitive/permute.cpp | 74 -- oneflow/core/ep/cuda/cuda_device_manager.cpp | 1 + oneflow/core/ep/cuda/cuda_device_manager.h | 5 +- oneflow/core/ep/cuda/cuda_stream.cpp | 2 +- oneflow/core/ep/cuda/cuda_stream.h | 3 +- oneflow/core/framework/consistency_check.cpp | 18 +- oneflow/core/framework/consistency_check.h | 8 +- .../multi_client_session_context.cpp | 1 - oneflow/core/framework/nn_graph.cpp | 8 +- .../core/framework/op_expr_grad_function.h | 2 - .../op_interpreter/dispatch_frame.cpp | 4 +- .../framework/op_interpreter/dispatch_frame.h | 4 +- .../eager_consistent_op_interpreter.cpp | 2 +- .../op_interpreter/lazy_op_interpreter.cpp | 2 +- .../op_interpreter/op_interpreter.cpp | 20 +- oneflow/core/framework/tensor.h | 11 +- oneflow/core/framework/tensor_methods.cpp | 105 +- oneflow/core/framework/tensor_rpc_util.cpp | 2 +- oneflow/core/framework/tensor_rpc_util.h | 8 +- oneflow/core/functional/functional_api.yaml | 102 +- .../functional/impl/activation_functor.cpp | 60 +- .../core/functional/impl/array_functor.cpp | 55 +- .../core/functional/impl/binary_functor.cpp | 37 - .../core/functional/impl/consistent_cast.cpp | 53 +- oneflow/core/functional/impl/math_functor.cpp | 99 +- oneflow/core/functional/impl/nn_functor.cpp | 317 ++---- .../core/functional/impl/nn_grad_functor.cpp | 11 +- .../core/functional/impl/unary_functor.cpp | 4 +- oneflow/core/functional/tensor_index.cpp | 6 +- oneflow/core/job/graph_scope_vars.cpp | 26 +- oneflow/core/job/graph_scope_vars.h | 9 +- oneflow/core/job/job_conf.proto | 4 +- oneflow/core/job/runtime.cpp | 23 +- oneflow/core/job/runtime.h | 4 - oneflow/core/job_rewriter/autograd.cpp | 402 ++------ .../insert_nccl_logical_op_pass.cpp | 7 +- .../replace_embedding_ops_pass.cpp | 177 ++-- ...t_sparse_softmax_cross_entropy_op_pass.cpp | 81 +- .../cuda_check_numerics_kernel_observer.cu | 1 - oneflow/core/operator/operator.cpp | 4 +- oneflow/core/profiler/kernel.h | 2 +- oneflow/core/thread/thread.cpp | 4 +- oneflow/core/thread/thread_manager.cpp | 37 +- oneflow/core/thread/thread_manager.h | 5 +- oneflow/core/vm/control_stream_type.h | 2 +- oneflow/core/vm/instruction.h | 4 +- oneflow/core/vm/instruction_type.h | 4 +- oneflow/core/vm/phy_instr_operand.h | 2 +- oneflow/core/vm/stream.h | 2 +- oneflow/core/vm/stream_runtime_desc.h | 2 +- oneflow/core/vm/stream_type.h | 6 +- oneflow/core/vm/virtual_machine_engine.h | 2 +- oneflow/core/vm/vm_object.h | 4 +- oneflow/core/vm/vm_util.h | 2 +- oneflow/ir/include/OneFlow/OneFlowBase.td | 2 + oneflow/ir/include/OneFlow/OneFlowUserOps.td | 214 +--- oneflow/user/kernels/activation_kernels.cpp | 1 - oneflow/user/kernels/activation_kernels.cu | 24 - oneflow/user/kernels/activation_kernels.h | 54 - oneflow/user/kernels/bias_add_kernel.h | 1 - .../kernels/binary_cross_entropy_kernel.cu | 4 +- ...binary_cross_entropy_with_logits_kernel.cu | 4 +- oneflow/user/kernels/conv_cudnn_kernels.cpp | 10 - ...cublas_bias_add_relu_matmul_grad_kernel.cu | 4 +- .../user/kernels/cublas_fused_mlp_kernel.cu | 3 +- oneflow/user/kernels/data_shuffle_kernel.cu | 953 +++--------------- oneflow/user/kernels/deconv_cudnn_kernel.cpp | 1 - .../user/kernels/dim_gather_kernel_util.cpp | 7 +- .../user/kernels/dim_gather_kernel_util.cu | 19 +- 
oneflow/user/kernels/dim_gather_kernel_util.h | 13 +- oneflow/user/kernels/dim_gather_kernels.cpp | 7 +- .../kernels/distributions/normal_kernel.h | 5 +- .../fused_dot_feature_interaction_kernel.cu | 7 +- oneflow/user/kernels/gather_kernel.cpp | 6 - oneflow/user/kernels/gather_kernel_util.cu | 4 +- oneflow/user/kernels/gather_kernel_util.h | 2 +- .../kernels/math_unary_elementwise_func.h | 21 - .../user/kernels/model_update_kernel_util.h | 3 +- .../user/kernels/multi_reduce_kernel_util.h | 102 -- oneflow/user/kernels/multi_reduce_kernels.cpp | 44 - oneflow/user/kernels/multi_reduce_kernels.cu | 141 --- oneflow/user/kernels/multi_reduce_kernels.h | 115 --- oneflow/user/kernels/one_embedding_kernels.cu | 298 ++---- .../kernels/one_embedding_update_kernels.cu | 236 +---- oneflow/user/kernels/pool_gpu_kernel.cpp | 2 +- oneflow/user/kernels/prelu_kernel.cu | 53 +- oneflow/user/kernels/roc_auc_score_kernel.cpp | 116 --- oneflow/user/kernels/scalar_math_kernels.cu | 11 +- .../kernels/upsample_bicubic2d_kernel.cpp | 2 +- .../user/kernels/upsample_bicubic2d_kernel.cu | 10 +- oneflow/user/kernels/upsample_kernel.h | 5 +- oneflow/user/ops/data_shuffle_op.cpp | 32 +- oneflow/user/ops/deconv_op.cpp | 12 +- oneflow/user/ops/dim_scatter_ops.cpp | 51 +- oneflow/user/ops/distributions/normal_op.cpp | 11 +- oneflow/user/ops/hardshrink_op.cpp | 92 -- oneflow/user/ops/math_unary_elementwise_seq.h | 6 +- oneflow/user/ops/model_update_ops.cpp | 4 +- oneflow/user/ops/multi_reduce_ops.cpp | 125 --- oneflow/user/ops/one_embedding_ops.cpp | 66 +- oneflow/user/ops/pooling_op.cpp | 4 + oneflow/user/ops/roc_auc_score_op.cpp | 51 - oneflow/user/ops/upsample_op.cpp | 48 +- oneflow/user/ops/where_op.cpp | 216 ++-- python/oneflow/__init__.py | 17 +- python/oneflow/autograd/__init__.py | 6 +- python/oneflow/autograd/autograd_mode.py | 55 +- python/oneflow/framework/check_point_v2.py | 17 +- python/oneflow/framework/docstr/__init__.py | 4 - python/oneflow/framework/docstr/amax.py | 54 - python/oneflow/framework/docstr/conv.py | 55 +- python/oneflow/framework/docstr/deconv.py | 128 --- .../oneflow/framework/docstr/index_select.py | 53 +- python/oneflow/framework/docstr/math_ops.py | 39 - .../oneflow/framework/docstr/roc_auc_score.py | 52 - python/oneflow/framework/docstr/swapaxes.py | 6 +- python/oneflow/framework/docstr/swapdims.py | 53 - python/oneflow/framework/docstr/tensor.py | 167 +-- python/oneflow/framework/docstr/unbind.py | 44 - python/oneflow/framework/env_util.py | 4 +- python/oneflow/framework/graph_build_util.py | 2 + python/oneflow/framework/tensor.py | 106 +- python/oneflow/nn/__init__.py | 1 - python/oneflow/nn/functional/__init__.py | 6 +- python/oneflow/nn/graph/block.py | 4 +- python/oneflow/nn/graph/graph.py | 6 +- python/oneflow/nn/module.py | 13 +- python/oneflow/nn/modules/activation.py | 55 +- python/oneflow/nn/modules/batchnorm.py | 18 +- python/oneflow/nn/modules/conv.py | 71 +- python/oneflow/nn/modules/fused_mlp.py | 18 - python/oneflow/nn/modules/global_cast.py | 102 +- python/oneflow/nn/modules/index_select.py | 53 + python/oneflow/nn/modules/masked_select.py | 3 + python/oneflow/nn/modules/normalization.py | 129 +-- python/oneflow/nn/modules/reduce_ops.py | 4 +- python/oneflow/nn/optimizer/adamw.py | 3 - python/oneflow/nn/optimizer/lambda_lr.py | 3 - python/oneflow/nn/optimizer/optimizer.py | 37 +- .../nn/optimizer/reduce_lr_on_plateau.py | 8 +- python/oneflow/nn/optimizer/sgd.py | 9 - python/oneflow/nn/utils/clip_grad.py | 9 +- python/oneflow/one_embedding.py | 580 ++++------- 
.../oneflow/test/exceptions/test_error_msg.py | 4 +- .../oneflow/test/exceptions/test_reshape.py | 2 +- .../oneflow/test/expensive/pytorch_alexnet.py | 9 + .../test/expensive/pytorch_convmixer.py | 66 -- .../test/expensive/pytorch_convnext.py | 188 ---- .../test/expensive/pytorch_crossformer.py | 818 --------------- .../test/expensive/pytorch_densenet.py | 279 ----- .../test/expensive/pytorch_efficientnet.py | 617 ------------ .../test/expensive/pytorch_ghostnet.py | 225 ----- .../test/expensive/pytorch_googlenet.py | 276 ----- .../test/expensive/pytorch_inception_v3.py | 437 -------- .../oneflow/test/expensive/pytorch_levit.py | 535 ---------- .../oneflow/test/expensive/pytorch_mnasnet.py | 219 ---- .../test/expensive/pytorch_poolformer.py | 437 -------- python/oneflow/test/expensive/pytorch_pvt.py | 365 ------- .../oneflow/test/expensive/pytorch_res2net.py | 201 ---- .../oneflow/test/expensive/pytorch_resmlp.py | 239 ----- .../oneflow/test/expensive/pytorch_resnet.py | 17 + .../oneflow/test/expensive/pytorch_rexnet.py | 244 ----- .../test/expensive/pytorch_rexnetv1_lite.py | 259 ----- .../oneflow/test/expensive/pytorch_senet.py | 361 ------- .../test/expensive/pytorch_shufflenetv2.py | 205 ---- .../test/expensive/pytorch_squeezenet.py | 140 --- .../expensive/pytorch_swin_transformer.py | 783 -------------- .../test/expensive/pytorch_uniformer.py | 536 ---------- .../test/expensive/pytroch_mlp_mixer.py | 431 -------- .../test/expensive/test_compatibility.py | 154 --- .../test_compatiblity.py} | 23 +- .../test/expensive/test_convtranspose.py | 27 - .../oneflow/test/expensive/test_id_shuffle.py | 243 ++--- python/oneflow/test/expensive/test_permute.py | 4 +- .../test/graph/test_graph_clip_grad_norm.py | 444 -------- python/oneflow/test/graph/test_graph_debug.py | 9 +- ...test_graph_sparse_softmax_cross_entropy.py | 156 --- .../test_tvm_frontend_dependency_on_graph.py | 30 +- .../oneflow/test/modules/test_activation.py | 45 - python/oneflow/test/modules/test_addcmul.py | 65 -- python/oneflow/test/modules/test_amax.py | 136 --- python/oneflow/test/modules/test_autograd.py | 15 - .../test/modules/test_autograd_mode.py | 31 +- .../modules/test_consistent_activation.py | 7 - .../test/modules/test_consistent_addcmul.py | 45 - .../test/modules/test_consistent_argsort.py | 1 - .../test/modules/test_consistent_math_ops.py | 6 +- .../test/modules/test_consistent_randn.py | 124 --- .../modules/test_consistent_scatter_nd.py | 63 -- .../modules/test_consistent_scatter_ops.py | 84 -- .../test/modules/test_consistent_unbind.py | 43 - .../test/modules/test_consistent_upsample.py | 113 --- .../test/modules/test_consistent_view.py | 55 - .../modules/test_consistent_weight_norm.py | 42 - .../test/modules/test_consistent_where.py | 371 ------- .../test/modules/test_consistent_zeropad2d.py | 47 - python/oneflow/test/modules/test_conv2d.py | 20 - python/oneflow/test/modules/test_conv3d.py | 15 +- python/oneflow/test/modules/test_cum_ops.py | 17 +- python/oneflow/test/modules/test_deconv2d.py | 20 - python/oneflow/test/modules/test_generator.py | 12 - .../test/modules/test_id_shuffle_global.py | 186 ++-- .../oneflow/test/modules/test_index_select.py | 10 +- .../test/modules/test_masked_select.py | 7 - python/oneflow/test/modules/test_math_ops.py | 4 +- python/oneflow/test/modules/test_max.py | 19 - python/oneflow/test/modules/test_min.py | 19 - python/oneflow/test/modules/test_norm.py | 9 - .../modules/test_one_embedding_adagrad.py | 155 --- .../test/modules/test_one_embedding_adam.py | 200 ---- 
.../test/modules/test_one_embedding_sgd.py | 146 --- .../oneflow/test/modules/test_parital_fc.py | 7 +- python/oneflow/test/modules/test_prelu.py | 6 +- python/oneflow/test/modules/test_prod.py | 10 - ...st_resnet_load_torch_weight_compatibile.py | 51 - .../test/modules/test_roc_auc_score.py | 69 -- python/oneflow/test/modules/test_swapdims.py | 47 - python/oneflow/test/modules/test_unbind.py | 68 -- python/oneflow/test/modules/test_upsample.py | 14 +- python/oneflow/test/tensor/test_new_tensor.py | 68 -- .../oneflow/test/tensor/test_tensor_part_1.py | 10 +- .../oneflow/test/tensor/test_tensor_part_2.py | 16 - .../torch_flow_dual_object.py | 45 +- .../__init__.py | 0 .../oneflow_pytorch_compatiblity_test.py | 18 +- .../generate_dispatch_stateful_ops.py | 27 +- tools/functional/generate_functional_api.py | 25 +- tools/functional/generate_tensor_api.py | 27 +- tools/functional/generator.py | 37 +- 306 files changed, 3086 insertions(+), 19468 deletions(-) delete mode 100644 oneflow/api/python/caster/tensor.h delete mode 100644 oneflow/api/python/exception/exception.h delete mode 100644 oneflow/api/python/framework/one_embedding.cpp create mode 100644 oneflow/api/python/framework/one_embedding_handler.cpp delete mode 100644 oneflow/api/python/framework/tensor.h create mode 100644 oneflow/api/python/functional/py_function.cpp delete mode 100644 oneflow/api/python/functional/python_arg_parser.cpp delete mode 100644 oneflow/api/python/functional/python_arg_parser.h delete mode 100644 oneflow/api/python/functional/python_frame.h create mode 100644 oneflow/api/python/functional/unpack_call.h delete mode 100644 oneflow/core/common/check_level.cpp delete mode 100644 oneflow/core/embedding/mock_key_value_store.cu delete mode 100644 oneflow/core/embedding/mock_key_value_store.h delete mode 100644 oneflow/user/kernels/multi_reduce_kernel_util.h delete mode 100644 oneflow/user/kernels/multi_reduce_kernels.cpp delete mode 100644 oneflow/user/kernels/multi_reduce_kernels.cu delete mode 100644 oneflow/user/kernels/multi_reduce_kernels.h delete mode 100644 oneflow/user/kernels/roc_auc_score_kernel.cpp delete mode 100644 oneflow/user/ops/hardshrink_op.cpp delete mode 100644 oneflow/user/ops/multi_reduce_ops.cpp delete mode 100644 oneflow/user/ops/roc_auc_score_op.cpp delete mode 100644 python/oneflow/framework/docstr/amax.py delete mode 100644 python/oneflow/framework/docstr/deconv.py delete mode 100644 python/oneflow/framework/docstr/roc_auc_score.py delete mode 100644 python/oneflow/framework/docstr/swapdims.py delete mode 100644 python/oneflow/framework/docstr/unbind.py delete mode 100644 python/oneflow/test/expensive/pytorch_convmixer.py delete mode 100644 python/oneflow/test/expensive/pytorch_convnext.py delete mode 100644 python/oneflow/test/expensive/pytorch_crossformer.py delete mode 100644 python/oneflow/test/expensive/pytorch_densenet.py delete mode 100644 python/oneflow/test/expensive/pytorch_efficientnet.py delete mode 100644 python/oneflow/test/expensive/pytorch_ghostnet.py delete mode 100644 python/oneflow/test/expensive/pytorch_googlenet.py delete mode 100644 python/oneflow/test/expensive/pytorch_inception_v3.py delete mode 100644 python/oneflow/test/expensive/pytorch_levit.py delete mode 100644 python/oneflow/test/expensive/pytorch_mnasnet.py delete mode 100644 python/oneflow/test/expensive/pytorch_poolformer.py delete mode 100644 python/oneflow/test/expensive/pytorch_pvt.py delete mode 100644 python/oneflow/test/expensive/pytorch_res2net.py delete mode 100644 
python/oneflow/test/expensive/pytorch_resmlp.py delete mode 100644 python/oneflow/test/expensive/pytorch_rexnet.py delete mode 100644 python/oneflow/test/expensive/pytorch_rexnetv1_lite.py delete mode 100644 python/oneflow/test/expensive/pytorch_senet.py delete mode 100644 python/oneflow/test/expensive/pytorch_shufflenetv2.py delete mode 100644 python/oneflow/test/expensive/pytorch_squeezenet.py delete mode 100644 python/oneflow/test/expensive/pytorch_swin_transformer.py delete mode 100644 python/oneflow/test/expensive/pytorch_uniformer.py delete mode 100644 python/oneflow/test/expensive/pytroch_mlp_mixer.py delete mode 100644 python/oneflow/test/expensive/test_compatibility.py rename python/oneflow/test/{modules/test_module_to_half.py => expensive/test_compatiblity.py} (56%) delete mode 100644 python/oneflow/test/graph/test_graph_clip_grad_norm.py delete mode 100644 python/oneflow/test/graph/test_graph_sparse_softmax_cross_entropy.py delete mode 100644 python/oneflow/test/modules/test_addcmul.py delete mode 100644 python/oneflow/test/modules/test_amax.py delete mode 100644 python/oneflow/test/modules/test_consistent_addcmul.py delete mode 100644 python/oneflow/test/modules/test_consistent_randn.py delete mode 100644 python/oneflow/test/modules/test_consistent_scatter_nd.py delete mode 100644 python/oneflow/test/modules/test_consistent_scatter_ops.py delete mode 100644 python/oneflow/test/modules/test_consistent_unbind.py delete mode 100644 python/oneflow/test/modules/test_consistent_upsample.py delete mode 100644 python/oneflow/test/modules/test_consistent_view.py delete mode 100644 python/oneflow/test/modules/test_consistent_weight_norm.py delete mode 100644 python/oneflow/test/modules/test_consistent_where.py delete mode 100644 python/oneflow/test/modules/test_consistent_zeropad2d.py delete mode 100644 python/oneflow/test/modules/test_one_embedding_adagrad.py delete mode 100644 python/oneflow/test/modules/test_one_embedding_adam.py delete mode 100644 python/oneflow/test/modules/test_one_embedding_sgd.py delete mode 100644 python/oneflow/test/modules/test_resnet_load_torch_weight_compatibile.py delete mode 100644 python/oneflow/test/modules/test_roc_auc_score.py delete mode 100644 python/oneflow/test/modules/test_swapdims.py delete mode 100644 python/oneflow/test/modules/test_unbind.py delete mode 100644 python/oneflow/test/tensor/test_new_tensor.py rename python/oneflow/test_utils/{oneflow_pytorch_compatibility => oneflow_pytorch_compatiblity}/__init__.py (100%) rename python/oneflow/test_utils/{oneflow_pytorch_compatibility => oneflow_pytorch_compatiblity}/oneflow_pytorch_compatiblity_test.py (93%) diff --git a/.github/workflows/canary.yml b/.github/workflows/canary.yml index ded8e4cc20b..e06378f017e 100644 --- a/.github/workflows/canary.yml +++ b/.github/workflows/canary.yml @@ -55,7 +55,7 @@ jobs: - name: Checkout Oneflow-Inc/oneflow if: ${{ github.event.inputs.oneflow-ref == '' }} uses: actions/checkout@v2 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build manylinux id: build-cuda with: diff --git a/.github/workflows/on_merge.yml b/.github/workflows/on_merge.yml index 65d939253d2..92a4ea499ed 100644 --- a/.github/workflows/on_merge.yml +++ b/.github/workflows/on_merge.yml @@ -15,6 +15,6 @@ jobs: if: github.event.pull_request.merged == true runs-on: ubuntu-latest steps: - - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@fix-benchmark-typo + - uses: 
Oneflow-Inc/get-oneflow/update-benchmark-history@support-auto-benchmark name: Update benchmark history timeout-minutes: 10 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 33fd6acf9f0..2079196b920 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,7 +33,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-auto-benchmark name: find cache id: find-cache timeout-minutes: 5 @@ -74,7 +74,7 @@ jobs: python3 -m pip install -U pip setuptools wheel --user python3 -m pip install oss2 --user - uses: actions/checkout@v2 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build ${{ matrix.entry }} if: ${{ matrix.entry !='cpu' }} with: @@ -98,7 +98,7 @@ jobs: 3.8 3.9 3.10 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build ${{ matrix.entry }} if: ${{ matrix.entry =='cpu' }} with: diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml index d075b6987e4..4e73cff50bb 100644 --- a/.github/workflows/simple.yml +++ b/.github/workflows/simple.yml @@ -32,9 +32,9 @@ jobs: sudo apt-get install -y libopenblas-dev nasm python3-pip ninja-build - name: Download OneFlow custom clang-tidy run: | - wget https://github.com/Oneflow-Inc/llvm-project/releases/download/llvmorg-13.0.0-maybe/clang-tidy-13.AppImage + wget https://github.com/Oneflow-Inc/llvm-project/releases/download/llvmorg-13.0.0-maybe/clang-tidy-b836e4d-x86_64.AppImage wget https://raw.githubusercontent.com/oneflow-inc/llvm-project/maybe/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py - chmod +x clang-tidy-13.AppImage run-clang-tidy.py + chmod +x clang-tidy-b836e4d-x86_64.AppImage run-clang-tidy.py - name: Build third party libs and generate files run: | mkdir build @@ -55,7 +55,7 @@ jobs: -DBUILD_TESTING=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cd .. 
- ./run-clang-tidy.py -clang-tidy-binary ./clang-tidy-13.AppImage -p build -quiet -allow-enabling-alpha-checkers -extra-arg="-Xclang" -extra-arg="-analyzer-config" -extra-arg="-Xclang" -extra-arg="aggressive-binary-operation-simplification=true" "^(?!$(pwd)/build)" + ./run-clang-tidy.py -clang-tidy-binary ./clang-tidy-b836e4d-x86_64.AppImage -p build -quiet -allow-enabling-alpha-checkers -extra-arg="-Xclang" -extra-arg="-analyzer-config" -extra-arg="-Xclang" -extra-arg="aggressive-binary-operation-simplification=true" "^(?!$(pwd)/build)" hosted: name: CPU-only @@ -241,7 +241,7 @@ jobs: repository: Oneflow-Inc/conda-env ref: 30a7f00eb48ee9009d85a848e720823e5054c66b path: conda-env - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build with gcc7 if: ${{ matrix.build-type == 'gcc7'}} with: @@ -250,7 +250,7 @@ jobs: oneflow-build-env: conda conda-env-file: conda-env/dev/gcc7/environment-v2.yml conda-env-name: oneflow-dev-gcc7-v2 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build with clang10 if: ${{ matrix.build-type == 'clang10'}} with: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d3f30a757f7..275e0f4ed5b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,10 +13,10 @@ env: ONEFLOW_TIMEOUT_SECONDS: 90 ONEFLOW_THRAED_LOCAL_CACHED_SIZE: 16384 FLOW_VISION_SRC: flow_vision - FLOW_VISION_COMMIT: ca8ebc663b58667cf8cd1b6ef0c861522780b7bb + FLOW_VISION_COMMIT: 90acf53c77f69d6bbc26d805079d7765a3cb2608 LIBAI_SRC: libai LIBAI_COMMIT: 7d31d9781e5f2d559dc0820f599e0bed798488ca - TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.10.0-cuda11.3-cudnn8-runtime:06001b5bdd1a4f89f888e18d96c26f9ed2cd25d3 + TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.9.0-cuda10.2-cudnn7-runtime:70729b0680b5a32daba6f50b56e0c169cd1636fa jobs: check-priority-pr: @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot') steps: - - uses: Oneflow-Inc/get-oneflow/priority-pr@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/priority-pr@support-auto-benchmark name: Check priority PR closed id: save-cache timeout-minutes: 5 @@ -158,13 +158,13 @@ jobs: fi echo "is_secrets_accessible=1" >> $GITHUB_ENV - name: Wait for GPU slot - uses: Oneflow-Inc/get-oneflow/wait-for-gpu@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/wait-for-gpu@support-auto-benchmark if: env.is_secrets_accessible == '1' - timeout-minutes: 120 + timeout-minutes: 90 continue-on-error: true with: token: ${{ secrets.CI_PERSONAL_ACCESS_TOKEN }} - timeout-minutes: 4 + timeout-minutes: 3 max-try-times: 30 find-build-cache: @@ -182,7 +182,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-auto-benchmark name: find cache id: find-cache timeout-minutes: 5 @@ -231,7 +231,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-auto-benchmark name: 
Save cache if successful id: save-cache timeout-minutes: 5 @@ -245,7 +245,7 @@ jobs: run: | echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit" exit 1 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build manylinux ${{ matrix.entry }} id: build-cpu if: ${{ matrix.entry =='cpu' && !matrix.cache-hit }} @@ -266,7 +266,7 @@ jobs: python-versions: | 3.6 3.7 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build manylinux ${{ matrix.entry }} id: build-openvino if: ${{ matrix.entry =='openvino' && !matrix.cache-hit }} @@ -287,7 +287,7 @@ jobs: clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }} python-versions: | 3.6 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build manylinux ${{ matrix.entry }} id: build-cuda if: ${{ matrix.entry =='cu102' && !matrix.cache-hit }} @@ -307,7 +307,7 @@ jobs: clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }} python-versions: | 3.7 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build manylinux ${{ matrix.entry }} id: build-xla if: ${{ matrix.entry =='cu102_xla' && !matrix.cache-hit }} @@ -327,7 +327,7 @@ jobs: clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }} python-versions: | 3.6 - - uses: Oneflow-Inc/get-oneflow@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow@support-auto-benchmark name: Build ${{ matrix.entry }} if: ${{ matrix.entry == 'llvm13' && !matrix.cache-hit }} with: @@ -366,7 +366,7 @@ jobs: }) - name: Upload packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }} - uses: Oneflow-Inc/get-oneflow/digest/upload@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/digest/upload@support-auto-benchmark timeout-minutes: 10 with: digest: ${{ steps.save-cache.outputs.build-digest }} @@ -377,7 +377,7 @@ jobs: dst-dir: cpack - name: Upload whl if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }} - uses: Oneflow-Inc/get-oneflow/digest/upload@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/digest/upload@support-auto-benchmark timeout-minutes: 10 with: digest: ${{ steps.save-cache.outputs.build-digest }} @@ -389,7 +389,7 @@ jobs: find-test-cache-distributed: name: "Find test cache (distributed)" - if: github.event.pull_request.draft == false && github.base_ref == 'master' && false + if: github.event.pull_request.draft == false && github.base_ref == 'master' runs-on: ubuntu-latest needs: [build-oneflow] env: @@ -402,7 +402,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-auto-benchmark name: find cache id: find-cache timeout-minutes: 5 @@ -433,7 +433,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-auto-benchmark name: find cache id: find-cache timeout-minutes: 5 @@ -465,7 
+465,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} - - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-auto-benchmark name: find cache id: find-cache timeout-minutes: 5 @@ -518,7 +518,7 @@ jobs: if: ${{ contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-auto-benchmark name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -534,7 +534,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/digest/download@support-auto-benchmark id: download-digest timeout-minutes: 10 with: @@ -602,11 +602,11 @@ jobs: - name: Benchmark Test timeout-minutes: 100 if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }} - uses: Oneflow-Inc/get-oneflow/pytest-benchmark@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-auto-benchmark with: collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark container-name: ${{ env.TEST_CONTAINER_NAME }} - unknown-threshold: 15 + unkown-threshold: 10 error-threshold: 20 # end pytest benchmark @@ -643,7 +643,7 @@ jobs: name: Distributed test suite needs: [wait_for_gpu_slot, find-test-cache-distributed] runs-on: ${{ matrix.runs-on }} - if: github.event.pull_request.draft == false && github.base_ref == 'master' && false + if: github.event.pull_request.draft == false && github.base_ref == 'master' concurrency: group: distributed-test-${{ matrix.entry }}-rank-${{ matrix.rank }} cancel-in-progress: false @@ -673,7 +673,7 @@ jobs: if: ${{ contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-auto-benchmark name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -689,7 +689,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/digest/download@support-auto-benchmark id: download-digest timeout-minutes: 10 with: @@ -699,7 +699,7 @@ jobs: ssh-tank-path: ${{ env.SSH_TANK_PATH }} - name: Get primary node if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/master-address@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/master-address@support-auto-benchmark id: get-primary-node with: rank: ${{ matrix.rank }} @@ -733,7 +733,7 @@ jobs: working-directory: ${{ env.ONEFLOW_SRC }} run: | docker run -d --rm --privileged --shm-size=8g \ - --pids-limit 1000 \ + --pids-limit -1 \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --runtime=nvidia \ -v /dataset:/dataset:ro -v /model_zoo:/model_zoo:ro \ @@ -812,7 +812,7 @@ jobs: test: name: Test suite - needs: [wait_for_gpu_slot, find-test-cache] + needs: [wait_for_gpu_slot, find-test-cache, test-distributed] runs-on: ${{ matrix.runs-on }} if: github.event.pull_request.draft == false && github.base_ref == 
'master' strategy: @@ -859,7 +859,7 @@ jobs: if: ${{ contains(matrix.runs-on, 'self-hosted') }} run: | docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true - - uses: Oneflow-Inc/get-oneflow/cache-complete@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-auto-benchmark name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -875,7 +875,7 @@ jobs: exit 1 - name: Download wheel and packed liboneflow if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }} - uses: Oneflow-Inc/get-oneflow/digest/download@fix-benchmark-typo + uses: Oneflow-Inc/get-oneflow/digest/download@support-auto-benchmark id: download-digest timeout-minutes: 10 with: @@ -913,14 +913,6 @@ jobs: EXTRA_DOCKER_ARGS+=" --env ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1" EXTRA_DOCKER_ARGS+=" --env ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1" echo "EXTRA_DOCKER_ARGS=${EXTRA_DOCKER_ARGS}" >> $GITHUB_ENV - - name: Set Thread Limit (CPU) - if: ${{ !fromJson(matrix.cache-hit) && matrix.device == 'cpu' }} - run: | - echo "THREAD_LIMIT=10000" >> $GITHUB_ENV - - name: Set Thread Limit (CUDA) - if: ${{ !fromJson(matrix.cache-hit) && matrix.device == 'cuda' }} - run: | - echo "THREAD_LIMIT=10000" >> $GITHUB_ENV - name: Enable ONEFLOW_TEST_VERBOSE if: ${{ contains(github.event.pull_request.labels.*.name, 'need-test-verbose') }} run: | @@ -940,7 +932,6 @@ jobs: working-directory: ${{ env.ONEFLOW_SRC }} run: | docker run -d --rm --privileged --shm-size=8g \ - --pids-limit ${{ env.THREAD_LIMIT }} \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --runtime=nvidia \ -v /dataset:/dataset:ro -v /model_zoo:/model_zoo:ro \ @@ -1028,7 +1019,7 @@ jobs: uses: actions/checkout@v2 with: repository: Oneflow-Inc/models - ref: d6b2b8260e87541726ed87361171438d258e6a4d + ref: 51933864c0f8305db27d47db1955fe1647620b73 path: oneflow-models - name: ResNet50 Graph DDP test id: models-resnet50 @@ -1039,7 +1030,6 @@ jobs: - name: Speed test id: speed timeout-minutes: 20 - continue-on-error: ${{ !contains(github.event.pull_request.labels.*.name, 'need-pass-speed-test') }} if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'speed-test' && matrix.device == 'cuda' }} run: | docker exec -e ONEFLOW_MODELS_DIR=$PWD/oneflow-models ${{ env.TEST_CONTAINER_NAME }} bash ci/test/test_speed_multi_client.sh @@ -1157,7 +1147,7 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} repository: ${{github.event.pull_request.head.repo.full_name}} fetch-depth: 0 - - uses: Oneflow-Inc/get-oneflow/cache-complete@fix-benchmark-typo + - uses: Oneflow-Inc/get-oneflow/cache-complete@support-auto-benchmark name: Save cache if successful id: save-cache timeout-minutes: 5 @@ -1218,10 +1208,6 @@ jobs: -DCMAKE_EXPORT_COMPILE_COMMANDS=ON cd .. 
git diff -U0 ${{ github.event.pull_request.base.sha }} | ./clang-tidy-diff.py -clang-tidy-binary ./clang-tidy-13.AppImage -path build -allow-enabling-alpha-checkers -j $(nproc) -p1 -extra-arg="-Xclang" -extra-arg="-analyzer-config" -extra-arg="-Xclang" -extra-arg="aggressive-binary-operation-simplification=true" -warnings-as-errors="$(cat ./ci/check/clang_tidy_warnings_as_errors_on_diff)" - - name: Check error message absence in changed files - if: ${{ !fromJSON(steps.save-cache.outputs.cache-hit) && contains(github.event.pull_request.labels.*.name, 'need-check-error-message') }} - run: | - git diff -U0 ${{ github.event.pull_request.base.sha }} | ./clang-tidy-diff.py -clang-tidy-binary ./clang-tidy-13.AppImage -path build -allow-enabling-alpha-checkers -j $(nproc) -p1 -extra-arg="-Xclang" -extra-arg="-analyzer-config" -extra-arg="-Xclang" -extra-arg="aggressive-binary-operation-simplification=true" -checks=-*,maybe-need-error-msg -warnings-as-errors=* -skip-line-filter - name: Remove automerge if: ${{ !fromJSON(steps.save-cache.outputs.cache-hit) && failure() && cancelled() == false && contains(github.event.pull_request.labels.*.name, 'automerge') }} uses: actions/github-script@v4 diff --git a/.gitignore b/.gitignore index 9e1a36c7d7d..e2c788dc055 100644 --- a/.gitignore +++ b/.gitignore @@ -40,5 +40,3 @@ unittest-log-* /data-test /tmp /python/oneflow/test/dataloader/data-test/ - -/target diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b1138cf9de..694f451935e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,6 @@ option(WITH_MLIR_CUDA_CODEGEN "" OFF) option(OF_SOFTMAX_USE_FAST_MATH "" ON) option(OF_LAYER_NORM_USE_FAST_MATH "" ON) option(TREAT_WARNINGS_AS_ERRORS "" ON) -option(MAYBE_NEED_ERROR_MSG_CHECK "" OFF) # Reference: # https://medium.com/@alasher/colored-c-compiler-output-with-ninja-clang-gcc-10bfe7f2b949 option(OF_FORCE_COLORED_DIAGNOSTICS "Always produce ANSI-colored diagnostics (GNU/Clang only)." ON) @@ -242,7 +241,7 @@ if(WIN32) else() set(EXTRA_CXX_FLAGS "-std=c++14 -Wall -Wno-sign-compare -Wno-unused-function -fPIC") if(APPLE) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-deprecated-declarations") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-deprecated-declarations -Wno-mismatched-tags") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${EXTRA_CXX_FLAGS}") diff --git a/README.md b/README.md index fee1c755fe5..43ab9e67e9a 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,6 @@ ### System Requirements -- Linux. As for now, there is no pre-built release for macOS, Windows. 
- Python 3.6, 3.7, 3.8, 3.9, 3.10 - (**Highly recommended**) Upgrade pip diff --git a/ci/check/run_clang_tidy.py b/ci/check/run_clang_tidy.py index 6807aaf9504..9b2e115f6c2 100644 --- a/ci/check/run_clang_tidy.py +++ b/ci/check/run_clang_tidy.py @@ -86,9 +86,6 @@ def download(build_dir, dry=False) -> Optional[List[str]]: parser.add_argument( "--build_dir", required=True, ) - parser.add_argument( - "--check-error-msg", action="store_true", default=False, - ) args = parser.parse_args() loop = asyncio.get_event_loop() downloaded = download(args.build_dir, dry=True) @@ -100,11 +97,9 @@ def download(build_dir, dry=False) -> Optional[List[str]]: .read_text() .strip() ) - cmd = f"git diff -U0 master | {downloaded[1]} -clang-tidy-binary {downloaded[0]} -path {args.build_dir} -j $(nproc) -p1 -allow-enabling-alpha-checkers -extra-arg=-Xclang -extra-arg=-analyzer-config -extra-arg=-Xclang -extra-arg=aggressive-binary-operation-simplification=true" - if args.check_error_msg: - command = f" cd .. && {cmd} -warnings-as-errors='{warnings_as_errors}' && {cmd} -checks=-*,maybe-need-error-msg -warnings-as-errors=* -skip-line-filter" - else: - command = f"cd .. && {cmd} -warnings-as-errors='{warnings_as_errors}'" - - ret_code = loop.run_until_complete(run_command(command)) + ret_code = loop.run_until_complete( + run_command( + f"cd .. && git diff -U0 master | {downloaded[1]} -clang-tidy-binary {downloaded[0]} -path {args.build_dir} -j $(nproc) -p1 -allow-enabling-alpha-checkers -extra-arg=-Xclang -extra-arg=-analyzer-config -extra-arg=-Xclang -extra-arg=aggressive-binary-operation-simplification=true -warnings-as-errors='{warnings_as_errors}'" + ) + ) exit(ret_code) diff --git a/ci/test/expensive_generic_test_multi_client.sh b/ci/test/expensive_generic_test_multi_client.sh index 341db7bbe56..86f1208dc1c 100644 --- a/ci/test/expensive_generic_test_multi_client.sh +++ b/ci/test/expensive_generic_test_multi_client.sh @@ -27,7 +27,6 @@ export ONEFLOW_TEST_DEVICE_NUM=1 COMMON_PYTEST_ARGS="--max-worker-restart=0 -x --durations=50 --capture=sys" python3 -m pytest ${COMMON_PYTEST_ARGS} --failed-first --dist loadfile ${parallel_spec} ${PWD} -exit 0 if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then export ONEFLOW_TEST_DEVICE_NUM=2 python3 -m oneflow.distributed.launch --nproc_per_node 2 -m pytest ${COMMON_PYTEST_ARGS} ${PWD} diff --git a/ci/test/generic_test_multi_client.sh b/ci/test/generic_test_multi_client.sh index 7c756ef10aa..c01873284cf 100644 --- a/ci/test/generic_test_multi_client.sh +++ b/ci/test/generic_test_multi_client.sh @@ -30,7 +30,6 @@ export ONEFLOW_TEST_DEVICE_NUM=1 COMMON_PYTEST_ARGS="-p no:randomly -p no:cacheprovider --max-worker-restart=0 -x --durations=50 --capture=sys" time python3 -m pytest ${COMMON_PYTEST_ARGS} --dist loadfile ${parallel_spec} ${ONEFLOW_TEST_DIR} -exit 0 if [[ "$(python3 -c 'import oneflow.sysconfig;print(oneflow.sysconfig.has_rpc_backend_grpc())')" == *"True"* ]]; then export ONEFLOW_TEST_DEVICE_NUM=2 time python3 ${src_dir}/ci/test/multi_launch.py \ diff --git a/ci/test/multi_client_exception_test.sh b/ci/test/multi_client_exception_test.sh index 872347468b8..c7dcaa27fd6 100644 --- a/ci/test/multi_client_exception_test.sh +++ b/ci/test/multi_client_exception_test.sh @@ -14,8 +14,6 @@ mkdir -p $test_tmp_dir cp -r $test_dir $test_tmp_dir cd ${test_tmp_dir}/$(basename $test_dir) -export ONEFLOW_DEBUG_MODE=1 - for file in $(ls ${PWD}/test_*.py) do if test -f $file @@ -33,5 +31,3 @@ do fi fi done - -unset 
ONEFLOW_DEBUG_MODE diff --git a/ci/test/test_speed_multi_client.sh b/ci/test/test_speed_multi_client.sh index ed94c1d6bf8..3188685761d 100755 --- a/ci/test/test_speed_multi_client.sh +++ b/ci/test/test_speed_multi_client.sh @@ -31,9 +31,9 @@ python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 2x3x224x224 --no-show-memory --times 200 | check_relative_speed 0.99 | write_to_file_and_print python3 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 1x3x224x224 --no-show-memory --times 200 | check_relative_speed 0.95 | write_to_file_and_print -python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 1 | write_to_file_and_print -python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 4 | write_to_file_and_print -python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 8 | write_to_file_and_print +python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 1 | check_relative_speed 0.91 | write_to_file_and_print +python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 4 | check_relative_speed 0.91 | write_to_file_and_print +python3 scripts/swin_dataloader_compare_speed_with_pytorch.py --batch_size 32 --num_workers 8 | check_relative_speed 0.93 | write_to_file_and_print export OMP_NUM_THREADS=1 python3 -m oneflow.distributed.launch --nproc_per_node 2 scripts/compare_speed_with_pytorch.py Vision/classification/image/resnet50/models/resnet50.py resnet50 16x3x224x224 --no-show-memory --times 100 --ddp | check_relative_speed 1.12 | check_millisecond_time 136.3 2 | write_to_file_and_print diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 4862c6cd9df..9c7ecd27d7d 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -168,15 +168,10 @@ add_custom_target( COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_clang_format.py --source_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools/oneflow-tblgen --fix --quiet) # clang tidy -set(RUN_CLANG_TIDY_ARGS --build_dir ${CMAKE_BINARY_DIR}) -if(MAYBE_NEED_ERROR_MSG_CHECK) - list(APPEND RUN_CLANG_TIDY_ARGS --check-error-msg) -endif() -message(STATUS "RUN_CLANG_TIDY_ARGS: ${RUN_CLANG_TIDY_ARGS}") add_custom_target( - of_tidy COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/ci/check/run_clang_tidy.py - ${RUN_CLANG_TIDY_ARGS} DEPENDS of_git_version oneflow_deps of_cfgobj - of_functional_obj of_functional_tensor_obj) + of_tidy COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/ci/check/run_clang_tidy.py --build_dir + ${CMAKE_BINARY_DIR} DEPENDS of_git_version oneflow_deps of_cfgobj + of_functional_obj of_functional_tensor_obj) # generate version set(OF_GIT_VERSION_DIR ${CMAKE_CURRENT_BINARY_DIR}/of_git_version) set(OF_GIT_VERSION_FILE ${OF_GIT_VERSION_DIR}/version.cpp) diff --git a/cmake/third_party/nccl.cmake b/cmake/third_party/nccl.cmake index 29e89f8b4c0..0f32478dc3b 100644 --- a/cmake/third_party/nccl.cmake +++ b/cmake/third_party/nccl.cmake @@ -34,7 +34,7 @@ else() set(NCCL_INCLUDE_DIR ${NCCL_INSTALL_DIR}/include) set(NCCL_LIBRARY_DIR ${NCCL_INSTALL_DIR}/lib) - set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz) + set(NCCL_URL https://github.com/NVIDIA/nccl/archive/30ca3fcacf8a73c48d7b8f7aaa54ae8bff89e884.zip) use_mirror(VARIABLE NCCL_URL URL ${NCCL_URL}) 
list(APPEND NCCL_LIBRARIES ${NCCL_LIBRARY_DIR}/${NCCL_LIBRARY_NAME}) @@ -47,7 +47,7 @@ else() nccl PREFIX nccl URL ${NCCL_URL} - URL_MD5 bdb91f80b78c99831f09ca8bb28a1032 + URL_MD5 84d390b56922332486bb92f4e7895d1d UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/cmake/util.cmake b/cmake/util.cmake index 3aaae830e12..2aea33aae2c 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -230,6 +230,7 @@ function(set_compile_options_to_oneflow_target target) target_treat_warnings_as_errors(${target}) target_compile_options(${target} PRIVATE $<$:-Werror=return-type>) # the mangled name between `struct X` and `class X` is different in MSVC ABI, remove it while windows is supported (in MSVC/cl or clang-cl) + target_try_compile_options(${target} -Wno-mismatched-tags) target_try_compile_options(${target} -Wno-covered-switch-default) if(OMP_FLAGS) diff --git a/docs/source/functional.rst b/docs/source/functional.rst index 5976d2e3107..fba145b6950 100644 --- a/docs/source/functional.rst +++ b/docs/source/functional.rst @@ -6,15 +6,11 @@ Functional operations for neural networks .. autofunction:: conv1d .. autofunction:: conv2d .. autofunction:: conv3d -.. autofunction:: conv_transpose1d -.. autofunction:: conv_transpose2d -.. autofunction:: conv_transpose3d .. autofunction:: adaptive_avg_pool1d .. autofunction:: adaptive_avg_pool2d .. autofunction:: adaptive_avg_pool3d .. autofunction:: relu .. autofunction:: hardsigmoid -.. autofunction:: hardshrink .. autofunction:: hardswish .. autofunction:: hardtanh .. autofunction:: normalize diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 4168124b791..bf3001df45c 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -39,7 +39,6 @@ Operators for neural networks GLU, GroupNorm, Hardsigmoid, - Hardshrink, Hardswish, Hardtanh, Identity, diff --git a/docs/source/one_embedding.rst b/docs/source/one_embedding.rst index bc1e2ad91ae..9a86a93ca97 100644 --- a/docs/source/one_embedding.rst +++ b/docs/source/one_embedding.rst @@ -7,12 +7,8 @@ OneFlow one_embedding operations. :members: forward, save_snapshot, load_snapshot, + .. autofunction:: oneflow.one_embedding.MultiTableEmbedding.forward -.. autoclass:: MultiTableMultiColumnEmbedding - :members: forward, - save_snapshot, - load_snapshot, -.. autofunction:: oneflow.one_embedding.MultiTableMultiColumnEmbedding.forward .. autofunction:: oneflow.one_embedding.make_device_mem_store_options .. autofunction:: oneflow.one_embedding.make_cached_ssd_store_options .. autofunction:: oneflow.one_embedding.make_cached_host_mem_store_options @@ -20,5 +16,3 @@ OneFlow one_embedding operations. .. autofunction:: oneflow.one_embedding.make_normal_initializer .. autofunction:: oneflow.one_embedding.make_table_options .. autofunction:: oneflow.one_embedding.make_table -.. autofunction:: oneflow.one_embedding.make_persistent_table_reader -.. 
autofunction:: oneflow.one_embedding.make_persistent_table_writer diff --git a/docs/source/oneflow.rst b/docs/source/oneflow.rst index 35c4c4d5bab..518f7092214 100644 --- a/docs/source/oneflow.rst +++ b/docs/source/oneflow.rst @@ -11,9 +11,7 @@ oneflow acos, acosh, add, - addcmul, addmm, - amax, arccos, arcsin, arcsinh, @@ -114,7 +112,6 @@ oneflow randint, randperm, reciprocal, - roc_auc_score, roll, round, rsqrt, @@ -146,7 +143,6 @@ oneflow sqrt, square, swapaxes, - swapdims, tan, tanh, tensor, @@ -155,7 +151,6 @@ oneflow t, tril, unsqueeze, - unbind, permute, var, where, @@ -164,8 +159,7 @@ oneflow is_nonzero, is_tensor, no_grad, - set_grad_enabled, - enable_grad, + grad_enable, inference_mode, is_grad_enabled, is_floating_point, diff --git a/docs/source/tensor.rst b/docs/source/tensor.rst index 706d3509825..e30f09f7edd 100644 --- a/docs/source/tensor.rst +++ b/docs/source/tensor.rst @@ -9,10 +9,7 @@ OneFlow Tensor Class acosh, add, add_, - addcmul, - addcmul_, addmm, - amax, arccos, arccosh, arcsin, @@ -79,7 +76,6 @@ OneFlow Tensor Class grad, grad_fn, gt, - half, in_top_k, index_select, int, @@ -157,15 +153,12 @@ OneFlow Tensor Class stride, sum, swapaxes, - swapdims, sub, sub_, tan, tanh, tile, to, - local_to_global, - global_to_global, to_global, to_local, to_consistent, @@ -177,7 +170,6 @@ OneFlow Tensor Class type_as, t, T, - unbind, unfold, uniform_, unsqueeze, diff --git a/oneflow/api/python/autograd/autograd_function.cpp b/oneflow/api/python/autograd/autograd_function.cpp index 5c279a71d4b..4592e351d7c 100644 --- a/oneflow/api/python/autograd/autograd_function.cpp +++ b/oneflow/api/python/autograd/autograd_function.cpp @@ -20,7 +20,7 @@ limitations under the License. #include #include "oneflow/api/python/of_api_registry.h" -#include "oneflow/api/python/functional/common.h" +#include "oneflow/core/framework/tensor_tuple.h" #include "oneflow/core/autograd/autograd_function.h" #include "oneflow/core/framework/op_expr_grad_function.h" #include "oneflow/core/framework/tensor_tuple.h" @@ -34,21 +34,15 @@ namespace { // Transform input to TensorTuple Maybe UnpackTensorTuple(const py::object& input) { one::TensorTuple tp; - if (one::PyTensor_Check(input.ptr())) { + if (py::isinstance(input)) { tp.emplace_back(input.cast>()); } else if (py::isinstance(input)) { - auto tuple = input.cast(); - for (int i = 0; i < tuple.size(); ++i) { - PyObject* obj = tuple[i].ptr(); - if (!one::PyTensor_Check(obj)) { - return Error::RuntimeError() - << "expected Tensor as element " << i << ", but got " - << one::functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(obj))); - } - tp.emplace_back(one::PyTensor_Unpack(obj)); + for (const auto& tensor : input.cast()) { + CHECK_OR_RETURN(py::isinstance(tensor)); + tp.emplace_back(tensor.cast>()); } } else { - return Error::RuntimeError() << "Only support tensor or list of tensors"; + throw std::runtime_error("Only support tensor or list of tensors"); } return tp; } diff --git a/oneflow/api/python/autograd/function_node.cpp b/oneflow/api/python/autograd/function_node.cpp index 617dde54df3..16afa84b983 100644 --- a/oneflow/api/python/autograd/function_node.cpp +++ b/oneflow/api/python/autograd/function_node.cpp @@ -29,7 +29,7 @@ struct FunctionNodeUtil final { static std::string ToString(const one::FunctionNode& func_node) { std::stringstream ss; ss << "<"; - ss << func_node.name(); + ss << func_node.GetOpTypeName(); ss << " at " << &func_node; ss << ">"; return ss.str(); @@ -45,11 +45,11 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { .def("_register_hook_dict", 
[]() { TODO(); }) .def_property_readonly( "next_functions", - [](const one::FunctionNode& func_node) { return func_node.next_functions(); }) + [](const one::FunctionNode& func_node) { return func_node.GetNextFunctions(); }) .def_property_readonly("metadata", []() { TODO(); }) .def_property_readonly("requires_grad", []() { TODO(); }) .def("register_hook", []() { TODO(); }) - .def("name", [](const one::FunctionNode& func_node) { return func_node.name(); }); + .def("name", [](const one::FunctionNode& func_node) { return func_node.GetOpTypeName(); }); } } // namespace oneflow diff --git a/oneflow/api/python/caster/tensor.h b/oneflow/api/python/caster/tensor.h deleted file mode 100644 index 06bb85477ee..00000000000 --- a/oneflow/api/python/caster/tensor.h +++ /dev/null @@ -1,102 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include - -#include "oneflow/api/python/caster/common.h" -#include "oneflow/api/python/framework/tensor.h" - -namespace pybind11 { -namespace detail { - -template -struct tensor_type_caster { - public: - bool load(handle src, bool convert) { - using namespace oneflow::one; - value_ = nullptr; - if (!src) { return false; } - if (src.is_none()) { return true; } - if (!PyTensor_Check(src.ptr())) { return false; } - value_ = PyTensor_Unpack(src.ptr()); - return true; - } - - template - static handle cast(U&& src, return_value_policy policy, handle parent) { - using namespace oneflow::one; - return reinterpret_steal(PyTensor_New(std::const_pointer_cast(src))).release(); - } - - operator std::shared_ptr*() { return &value_; } - operator std::shared_ptr&() { return value_; } - operator std::shared_ptr&&() && { return std::move(value_); } - - static constexpr auto name = _("tensor"); - template - using cast_op_type = pybind11::detail::cast_op_type>; - - protected: - std::shared_ptr value_; -}; - -template -struct parameter_type_caster { - public: - bool load(handle src, bool convert) { - using namespace oneflow::one; - value_ = nullptr; - if (!src) { return false; } - if (src.is_none()) { return true; } - if (!PyTensor_Check(src.ptr())) { return false; } - value_ = PyTensor_Unpack(src.ptr()); - return true; - } - - template - static handle cast(U&& src, return_value_policy policy, handle parent) { - using namespace oneflow::one; - return reinterpret_steal(PyParameter_New(std::const_pointer_cast(src))) - .release(); - } - - operator std::shared_ptr*() { return &value_; } - operator std::shared_ptr&() { return value_; } - operator std::shared_ptr&&() && { return std::move(value_); } - - static constexpr auto name = _("parameter"); - template - using cast_op_type = pybind11::detail::cast_op_type>; - - protected: - std::shared_ptr value_; -}; - -template<> -struct type_caster> - : public tensor_type_caster {}; -template<> -struct type_caster> - : public tensor_type_caster {}; - -template<> -struct type_caster> - : public parameter_type_caster {}; -template<> -struct type_caster> - : public parameter_type_caster {}; - -} // namespace 
detail -} // namespace pybind11 diff --git a/oneflow/api/python/exception/exception.h b/oneflow/api/python/exception/exception.h deleted file mode 100644 index c27e074a339..00000000000 --- a/oneflow/api/python/exception/exception.h +++ /dev/null @@ -1,54 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_API_PYTHON_COMMON_EXCEPTION_H_ -#define ONEFLOW_API_PYTHON_COMMON_EXCEPTION_H_ - -#include -#include - -#include "oneflow/core/common/exception.h" - -namespace py = pybind11; - -#define HANDLE_ERRORS try { -#define END_HANDLE_ERRORS_RETSTMT(retstmt) \ - } \ - catch (py::error_already_set & e) { \ - e.restore(); \ - retstmt; \ - } \ - catch (const oneflow::RuntimeException& e) { \ - PyErr_SetString(PyExc_RuntimeError, e.what()); \ - retstmt; \ - } \ - catch (const oneflow::IndexException& e) { \ - PyErr_SetString(PyExc_IndexError, e.what()); \ - retstmt; \ - } \ - catch (const oneflow::NotImplementedException& e) { \ - PyErr_SetString(PyExc_NotImplementedError, e.what()); \ - retstmt; \ - } \ - catch (const std::exception& e) { \ - PyErr_SetString(PyExc_RuntimeError, e.what()); \ - retstmt; \ - } - -#define END_HANDLE_ERRORS END_HANDLE_ERRORS_RETSTMT(return NULL) -#define END_HANDLE_ERRORS_RET(retval) END_HANDLE_ERRORS_RETSTMT(return retval) -#define END_HANDLE_ERRORS_NORET END_HANDLE_ERRORS_RETSTMT(void) - -#endif // ONEFLOW_API_PYTHON_COMMON_EXCEPTION_H_ diff --git a/oneflow/api/python/framework/doc.cpp b/oneflow/api/python/framework/doc.cpp index cac9f834f86..ed30a93c240 100644 --- a/oneflow/api/python/framework/doc.cpp +++ b/oneflow/api/python/framework/doc.cpp @@ -40,26 +40,14 @@ py::object AddFunctionDoc(py::object f, const std::string& doc_string) { THROW(RuntimeError) << "function " << PyBytes_AsString( PyUnicode_AsEncodedString(f->func_name, "utf-8", "~E~")) - << " already has a docstring"; + << " already has a docstring."; } f->func_doc = PyUnicode_FromString(doc_str); - } else if (strcmp(Py_TYPE(obj)->tp_name, "method_descriptor") == 0) { - PyMethodDescrObject* f = (PyMethodDescrObject*)obj; - if (f->d_method->ml_doc) { - THROW(RuntimeError) << "function " << f->d_method->ml_name << "already has a docstring"; - } - f->d_method->ml_doc = doc_str; - } else if (strcmp(Py_TYPE(obj)->tp_name, "getset_descriptor") == 0) { - PyMethodDescrObject* f = (PyMethodDescrObject*)obj; - if (f->d_method->ml_doc) { - THROW(RuntimeError) << "function " << f->d_method->ml_name << "already has a docstring"; - } - f->d_method->ml_doc = doc_str; } else if (py::isinstance(f)) { if (py::hasattr(f, "__doc__")) { auto doc = py::getattr(f, "__doc__"); if (!doc.is(py::none())) { - THROW(RuntimeError) << Py_TYPE(obj)->tp_name << " already has a docstring"; + THROW(RuntimeError) << Py_TYPE(obj)->tp_name << " already has a docstring."; } } py::setattr(f, "__doc__", py::reinterpret_steal(PyUnicode_FromString(doc_str))); @@ -69,7 +57,7 @@ py::object AddFunctionDoc(py::object f, const std::string& doc_string) { auto* f = 
(PyCFunctionObject*)(PyInstanceMethod_Function(obj)); f->m_ml->ml_doc = doc_str; } else { - THROW(RuntimeError) << "function is " << Py_TYPE(obj)->tp_name << ", not a valid function"; + THROW(RuntimeError) << "function is " << Py_TYPE(obj)->tp_name << ", not a valid function."; } f.inc_ref(); return f; diff --git a/oneflow/api/python/framework/one_embedding.cpp b/oneflow/api/python/framework/one_embedding.cpp deleted file mode 100644 index 68568b72cbd..00000000000 --- a/oneflow/api/python/framework/one_embedding.cpp +++ /dev/null @@ -1,359 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include -#include -#include -#include -#include "oneflow/api/python/of_api_registry.h" -#include "oneflow/core/embedding/embedding_manager.h" -#include "oneflow/core/embedding/persistent_table.h" -#include "oneflow/core/embedding/hash_functions.cuh" -#include "oneflow/core/framework/dtype.h" - -namespace py = pybind11; - -namespace oneflow { - -class OneEmbeddingHandler final { - public: - OneEmbeddingHandler(const std::string& key_value_store_option_string, int64_t local_rank_id, - int64_t rank_id, int64_t world_size) - : local_rank_id_(local_rank_id), rank_id_(rank_id), world_size_(world_size) { - embedding::KeyValueStoreOptions key_value_store_options(key_value_store_option_string); - embedding_name_ = key_value_store_options.Name(); - CreateKeyValueStore(key_value_store_options); - } - - void LoadSnapshot(const std::string& snapshot_name) { -#ifdef WITH_CUDA - Global::Get()->LoadSnapshot(embedding_name_, local_rank_id_, - rank_id_, snapshot_name); -#else - UNIMPLEMENTED() << "Only Support with CUDA"; -#endif - } - - void SaveSnapshot(const std::string& snapshot_name) { -#ifdef WITH_CUDA - Global::Get()->SaveSnapshot(embedding_name_, local_rank_id_, - rank_id_, snapshot_name); -#else - UNIMPLEMENTED() << "Only Support with CUDA"; -#endif - } - - private: - void CreateKeyValueStore(const embedding::KeyValueStoreOptions& key_value_store_options) { -#ifdef WITH_CUDA - Global::Get()->CreateKeyValueStore( - key_value_store_options, local_rank_id_, rank_id_, world_size_); -#else - UNIMPLEMENTED() << "Only Support with CUDA"; -#endif - } - - std::string embedding_name_; - int64_t local_rank_id_; - int64_t rank_id_; - int64_t world_size_; -}; - -namespace embedding { - -class PersistentTableWriter { - public: - OF_DISALLOW_COPY_AND_MOVE(PersistentTableWriter); - PersistentTableWriter() = default; - virtual ~PersistentTableWriter() = default; - - virtual void Write(const py::array& keys, const py::array& values) = 0; - virtual void Close() = 0; -}; - -template -class PersistentTableWriterImpl : public PersistentTableWriter { - public: - OF_DISALLOW_COPY_AND_MOVE(PersistentTableWriterImpl); - PersistentTableWriterImpl(const std::vector& paths, const std::string& snapshot_name, - uint32_t storage_dim, uint64_t target_chunk_size_mb, - uint16_t physical_block_size) - : closed_(false), snapshot_name_(snapshot_name), storage_dim_(storage_dim) { - 
tables_.resize(paths.size()); - for (size_t i = 0; i < paths.size(); ++i) { - PersistentTableOptions options; - options.path = paths[i]; - options.key_size = sizeof(Key); - options.value_size = storage_dim * sizeof(Value); - options.target_chunk_size_mb = target_chunk_size_mb; - options.physical_block_size = physical_block_size; - tables_[i] = NewPersistentTable(options); - } - } - ~PersistentTableWriterImpl() override { CloseImpl(); } - - void Write(const py::array& keys, const py::array& values) override { - pybind11::dtype::of().equal(pybind11::dtype::of()); - CHECK(!closed_) << "Write on closed table"; - CHECK_EQ(keys.ndim(), 1); - CHECK_EQ(values.ndim(), 2); - CHECK_EQ(keys.shape(0), values.shape(0)); - CHECK_EQ(values.shape(1), storage_dim_); - CHECK(keys.dtype().equal(py::dtype::of())); - CHECK(values.dtype().equal(py::dtype::of())); - const size_t n = keys.size(); - std::vector> keys_buffers(tables_.size()); - std::vector> values_buffers(tables_.size()); - for (size_t i = 0; i < n; ++i) { - const Key key = *(reinterpret_cast(keys.template data(i))); - const uint32_t shard = ShardingHash()(key) % tables_.size(); - keys_buffers[shard].push_back(key); - const size_t values_offset = values_buffers[shard].size(); - values_buffers[shard].resize(values_offset + storage_dim_ * sizeof(Value)); - for (size_t j = 0; j < values.shape(1); ++j) { - std::memcpy(values_buffers[shard].data() + values_offset + j * values.itemsize(), - values.template data(i, j), values.itemsize()); - } - } - for (size_t shard = 0; shard < tables_.size(); ++shard) { - tables_[shard]->Put(keys_buffers[shard].size(), keys_buffers[shard].data(), - values_buffers[shard].data()); - } - } - - void Close() override { CloseImpl(); } - - private: - void CloseImpl() { - if (!closed_) { - for (auto& table : tables_) { - table->SaveSnapshot(snapshot_name_); - table.reset(); - } - } - closed_ = true; - } - - bool closed_; - std::string snapshot_name_; - std::vector> tables_; - uint32_t storage_dim_; -}; - -template -std::shared_ptr NewPersistentTableWriter( - const std::vector& paths, const std::string& snapshot_name, - const Symbol& key_type, const Symbol& value_type, uint32_t storage_dim, - uint64_t target_chunk_size_mb, uint16_t physical_block_size) { - if (value_type->data_type() == DataType::kFloat) { - return std::shared_ptr(new PersistentTableWriterImpl( - paths, snapshot_name, storage_dim, target_chunk_size_mb, physical_block_size)); - } else { - UNIMPLEMENTED(); - } -} - -std::shared_ptr NewPersistentTableWriter( - const std::vector& paths, const std::string& snapshot_name, - const Symbol& key_type, const Symbol& value_type, uint32_t storage_dim, - uint64_t target_chunk_size_mb, uint16_t physical_block_size) { - if (key_type->data_type() == DataType::kInt32) { - return NewPersistentTableWriter(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else if (key_type->data_type() == DataType::kUInt32) { - return NewPersistentTableWriter(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else if (key_type->data_type() == DataType::kInt64) { - return NewPersistentTableWriter(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else if (key_type->data_type() == DataType::kUInt64) { - return NewPersistentTableWriter(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else { - UNIMPLEMENTED(); - return 
std::shared_ptr(nullptr); - } -} - -class PersistentTableReader { - public: - OF_DISALLOW_COPY_AND_MOVE(PersistentTableReader); - PersistentTableReader() = default; - virtual ~PersistentTableReader() = default; - - virtual std::tuple Next() = 0; - virtual void Close() = 0; -}; - -template -class PersistentTableReaderImpl : public PersistentTableReader { - public: - constexpr static uint32_t kBatchSize = 65536; - OF_DISALLOW_COPY_AND_MOVE(PersistentTableReaderImpl); - PersistentTableReaderImpl(const std::vector& paths, const std::string& snapshot_name, - uint32_t storage_dim, uint64_t target_chunk_size_mb, - uint16_t physical_block_size) - : closed_(false), - snapshot_name_(snapshot_name), - storage_dim_(storage_dim), - current_table_(0) { - tables_.resize(paths.size()); - iterators_.resize(paths.size()); - for (size_t i = 0; i < paths.size(); ++i) { - PersistentTableOptions options; - options.path = paths[i]; - options.key_size = sizeof(Key); - options.value_size = storage_dim * sizeof(Value); - options.target_chunk_size_mb = target_chunk_size_mb; - options.physical_block_size = physical_block_size; - tables_[i] = NewPersistentTable(options); - iterators_[i] = - std::unique_ptr(tables_[i]->ReadSnapshot(snapshot_name)); - } - keys_buffer_.resize(kBatchSize); - values_buffer_.resize(kBatchSize * storage_dim_); - } - ~PersistentTableReaderImpl() override { CloseImpl(); } - - std::tuple Next() override { - while (current_table_ < tables_.size()) { - uint32_t n_result = 0; - iterators_[current_table_]->Next(kBatchSize, &n_result, keys_buffer_.data(), - values_buffer_.data()); - if (n_result != 0) { - py::array_t keys_arr(py::array::ShapeContainer({n_result})); - py::array_t values_arr(py::array::ShapeContainer({n_result, storage_dim_})); - std::memcpy(keys_arr.mutable_data(), keys_buffer_.data(), n_result * sizeof(Key)); - std::memcpy(values_arr.mutable_data(), values_buffer_.data(), - n_result * storage_dim_ * sizeof(Value)); - return std::make_tuple(keys_arr, values_arr); - } else { - current_table_ += 1; - continue; - } - } - throw py::stop_iteration(); - } - - void Close() override { CloseImpl(); } - - private: - void CloseImpl() { - if (!closed_) { - for (auto& table : tables_) { table.reset(); } - } - closed_ = true; - } - - bool closed_; - std::string snapshot_name_; - std::vector> tables_; - std::vector> iterators_; - uint32_t storage_dim_; - size_t current_table_; - std::vector keys_buffer_; - std::vector values_buffer_; -}; - -template -std::shared_ptr NewPersistentTableReader( - const std::vector& paths, const std::string& snapshot_name, - const Symbol& key_type, const Symbol& value_type, uint32_t storage_dim, - uint64_t target_chunk_size_mb, uint16_t physical_block_size) { - if (value_type->data_type() == DataType::kFloat) { - return std::shared_ptr(new PersistentTableReaderImpl( - paths, snapshot_name, storage_dim, target_chunk_size_mb, physical_block_size)); - } else { - UNIMPLEMENTED(); - } -} - -std::shared_ptr NewPersistentTableReader( - const std::vector& paths, const std::string& snapshot_name, - const Symbol& key_type, const Symbol& value_type, uint32_t storage_dim, - uint64_t target_chunk_size_mb, uint16_t physical_block_size) { - if (key_type->data_type() == DataType::kInt32) { - return NewPersistentTableReader(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else if (key_type->data_type() == DataType::kUInt32) { - return NewPersistentTableReader(paths, snapshot_name, key_type, value_type, - storage_dim, 
target_chunk_size_mb, - physical_block_size); - } else if (key_type->data_type() == DataType::kInt64) { - return NewPersistentTableReader(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else if (key_type->data_type() == DataType::kUInt64) { - return NewPersistentTableReader(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - } else { - UNIMPLEMENTED(); - return std::shared_ptr(nullptr); - } -} - -} // namespace embedding - -ONEFLOW_API_PYBIND11_MODULE("", m) { - py::class_>(m, "OneEmbeddingHandler") - .def(py::init([](const std::string& key_value_store_option_str, const int64_t local_rank_id, - const int64_t rank_id, const int64_t world_size) { - return std::make_shared(key_value_store_option_str, local_rank_id, - rank_id, world_size); - })) - .def("SaveSnapshot", &OneEmbeddingHandler::SaveSnapshot) - .def("LoadSnapshot", &OneEmbeddingHandler::LoadSnapshot); - - py::class_>( - m, "PersistentTableWriter") - .def(py::init([](const std::vector& paths, const std::string& snapshot_name, - const Symbol& key_type, const Symbol& value_type, - uint32_t storage_dim, uint64_t target_chunk_size_mb, - uint16_t physical_block_size) { - return embedding::NewPersistentTableWriter(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - })) - .def("__enter__", [](embedding::PersistentTableWriter* writer) { return writer; }) - .def("__exit__", [](embedding::PersistentTableWriter* writer, const py::object& exc_type, - const py::object& exc_val, const py::object& exc_tb) { writer->Close(); }) - .def("write", &embedding::PersistentTableWriter::Write) - .def("close", &embedding::PersistentTableWriter::Close); - - py::class_>( - m, "PersistentTableReader") - .def(py::init([](const std::vector& paths, const std::string& snapshot_name, - const Symbol& key_type, const Symbol& value_type, - uint32_t storage_dim, uint64_t target_chunk_size_mb, - uint16_t physical_block_size) { - return embedding::NewPersistentTableReader(paths, snapshot_name, key_type, value_type, - storage_dim, target_chunk_size_mb, - physical_block_size); - })) - .def("__next__", &embedding::PersistentTableReader::Next) - .def("__iter__", [](embedding::PersistentTableReader* reader) { return reader; }) - .def("__enter__", [](embedding::PersistentTableReader* reader) { return reader; }) - .def("__exit__", [](embedding::PersistentTableReader* reader, const py::object& exc_type, - const py::object& exc_val, const py::object& exc_tb) { reader->Close(); }) - .def("close", &embedding::PersistentTableReader::Close); -} - -} // namespace oneflow diff --git a/oneflow/api/python/framework/one_embedding_handler.cpp b/oneflow/api/python/framework/one_embedding_handler.cpp new file mode 100644 index 00000000000..cbad01244fc --- /dev/null +++ b/oneflow/api/python/framework/one_embedding_handler.cpp @@ -0,0 +1,79 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include +#include +#include "oneflow/api/python/of_api_registry.h" +#include "oneflow/core/embedding/embedding_manager.h" +namespace py = pybind11; + +namespace oneflow { + +class OneEmbeddingHandler final { + public: + OneEmbeddingHandler(const std::string& key_value_store_option_string, int64_t local_rank_id, + int64_t rank_id, int64_t world_size) + : local_rank_id_(local_rank_id), rank_id_(rank_id), world_size_(world_size) { + embedding::KeyValueStoreOptions key_value_store_options(key_value_store_option_string); + embedding_name_ = key_value_store_options.Name(); + CreateKeyValueStore(key_value_store_options); + } + + void LoadSnapshot(const std::string& snapshot_name) { +#ifdef WITH_CUDA + Global::Get()->LoadSnapshot(embedding_name_, local_rank_id_, + rank_id_, snapshot_name); +#else + UNIMPLEMENTED() << "Only Support with CUDA"; +#endif + } + + void SaveSnapshot(const std::string& snapshot_name) { +#ifdef WITH_CUDA + Global::Get()->SaveSnapshot(embedding_name_, local_rank_id_, + rank_id_, snapshot_name); +#else + UNIMPLEMENTED() << "Only Support with CUDA"; +#endif + } + + private: + void CreateKeyValueStore(const embedding::KeyValueStoreOptions& key_value_store_options) { +#ifdef WITH_CUDA + Global::Get()->CreateKeyValueStore( + key_value_store_options, local_rank_id_, rank_id_, world_size_); +#else + UNIMPLEMENTED() << "Only Support with CUDA"; +#endif + } + + std::string embedding_name_; + int64_t local_rank_id_; + int64_t rank_id_; + int64_t world_size_; +}; + +ONEFLOW_API_PYBIND11_MODULE("", m) { + py::class_>(m, "OneEmbeddingHandler") + .def(py::init([](const std::string& key_value_store_option_str, const int64_t local_rank_id, + const int64_t rank_id, const int64_t world_size) { + return std::make_shared(key_value_store_option_str, local_rank_id, + rank_id, world_size); + })) + .def("SaveSnapshot", &OneEmbeddingHandler::SaveSnapshot) + .def("LoadSnapshot", &OneEmbeddingHandler::LoadSnapshot); +} + +} // namespace oneflow diff --git a/oneflow/api/python/framework/tensor.cpp b/oneflow/api/python/framework/tensor.cpp index 50fc1dc967b..54568acb099 100644 --- a/oneflow/api/python/framework/tensor.cpp +++ b/oneflow/api/python/framework/tensor.cpp @@ -13,620 +13,196 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "oneflow/api/python/framework/tensor.h" - #include -#include -#include "oneflow/api/python/exception/exception.h" +#include +#include +#include + +#include "oneflow/core/common/throw.h" #include "oneflow/api/python/framework/size.h" -#include "oneflow/api/python/functional/common.h" -#include "oneflow/api/python/functional/python_arg.h" -#include "oneflow/api/python/functional/functional_api.yaml.pybind.h" -#include "oneflow/api/python/functional/tensor_api.yaml.pybind.h" #include "oneflow/api/python/of_api_registry.h" #include "oneflow/api/python/ofblob/ofblob.e.h" #include "oneflow/api/python/utils/tensor_utils.h" -#include "oneflow/core/autograd/autograd_engine.h" +#include "oneflow/api/python/functional/tensor_api.yaml.pybind.h" #include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor_rpc_util.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/stride.h" +#include "oneflow/core/framework/py_distribute.h" +#include "oneflow/core/job/placement.cfg.h" +#include "oneflow/core/job/global_for.h" #include "oneflow/core/framework/dtype.h" #include "oneflow/core/framework/placement_utils.h" -#include "oneflow/core/functional/functional.h" -#include "oneflow/core/functional/tensor_index.h" +#include "oneflow/core/autograd/autograd_engine.h" +#include "oneflow/core/common/decorator.h" namespace py = pybind11; namespace oneflow { -namespace one { - -#define ASSERT(x) (x).GetOrThrow() -#define ASSERT_PTR(x) (x).GetPtrOrThrow() -#define PY_XINCREF(p) (({ Py_XINCREF(p); }), (p)) - -#if PY_VERSION_HEX < 0x03070000 -#define PYGETSET_NAME(name) const_cast(name) -#else -#define PYGETSET_NAME(name) (name) -#endif - -PyTypeObject* PyTensorObject_Type = NULL; -PyTypeObject* PyParameterObject_Type = NULL; - -static int PyTensorObject_init(PyObject* self, PyObject* args, PyObject* kwargs) { - HANDLE_ERRORS - auto* temp = functional::_legacy_tensor_ctor(NULL, args, kwargs); - if (PyErr_Occurred()) { throw py::error_already_set(); } - auto* _self = (PyTensorObject*)self; - _self->data = PyTensor_Unpack(temp); - _self->data->set_pyobject(self); - - // reset temp data to prevent clearing the pyobject - // when the temp is deallocated - ((PyTensorObject*)temp)->data.reset(); - Py_XDECREF(temp); - return 0; - END_HANDLE_ERRORS_RET(-1) -} - -static void PyTensorObject_dealloc(PyObject* self) { - auto* _self = (PyTensorObject*)self; - // clear pyobject - if (_self->data) { - _self->data->set_pyobject(NULL); - _self->data.reset(); - } - // clear __dict__ - PyObject** dict_ptr = _PyObject_GetDictPtr(self); - if (dict_ptr) { Py_CLEAR(*dict_ptr); } - auto* type = Py_TYPE(self); - type->tp_free(self); - Py_DECREF(type); -} - -static int PyParameterObject_init(PyObject* self, PyObject* args, PyObject* kwargs) { - HANDLE_ERRORS - PyObject* data = NULL; - int requires_grad = 1; - static const char* keywords[3] = {"data", "requires_grad", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|p:__init__", const_cast(keywords), - &data, &requires_grad)) { - return -1; - } - if (self) { - auto* _self = (PyTensorObject*)self; - _self->data = ASSERT_PTR(Parameter::MakeTensor(PyTensor_Unpack(data), requires_grad)); - _self->data->set_pyobject(self); - } - return 0; - END_HANDLE_ERRORS_RET(-1) -} - -static Py_ssize_t PyTensorObject_length(PyTensorObject* self) { - if (self->data->ndim() == 0) { return 0; } - return self->data->dim(0); -} - -static PyObject* PyTensorObject_getitem(PyObject* self, Py_ssize_t item) { - HANDLE_ERRORS - const auto& p = 
PyTensor_Unpack(self); - return PyTensor_New( - ASSERT_PTR(functional::TensorGetItem(p, {functional::detail::IndexItem(item)}))); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_subscript(PyObject* self, PyObject* item) { - HANDLE_ERRORS - const auto& p = PyTensor_Unpack(self); - functional::PythonArg arg(item); - return PyTensor_New(ASSERT_PTR(functional::TensorGetItem(p, arg.As()))); - END_HANDLE_ERRORS -} - -static int PyTensorObject_ass_subscript(PyObject* self, PyObject* item, PyObject* value) { - HANDLE_ERRORS - const auto& p = PyTensor_Unpack(self); - const auto& v = PyTensor_Unpack(value); - functional::PythonArg arg(item); - ASSERT(functional::TensorSetItem(p, arg.As(), v)); - return 0; - END_HANDLE_ERRORS_RET(-1) -} - -static PySequenceMethods PyTensorObject_as_sequence = { - (lenfunc)PyTensorObject_length, NULL, /*sq_concat*/ - NULL, /*sq_repeat*/ - (ssizeargfunc)PyTensorObject_getitem, /*sq_item*/ -}; - -static PyMappingMethods PyTensorObject_as_mapping = { - (lenfunc)PyTensorObject_length, - (binaryfunc)PyTensorObject_subscript, - (objobjargproc)PyTensorObject_ass_subscript, -}; - -static PyObject* PyTensorObject_storage_offset(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - return functional::CastToPyObject(PyTensor_Unpack(self)->storage_offset()); - END_HANDLE_ERRORS -} -static PyObject* PyTensorObject_stride(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - const auto& stride = ASSERT_PTR(PyTensor_Unpack(self)->stride()); - PyObject* tup = PyTuple_New(stride->NumAxes()); - for (int i = 0; i < stride->NumAxes(); ++i) { - PyTuple_SetItem(tup, i, PyLong_FromUnsignedLong(stride->At(i))); - } - return tup; - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_is_contiguous(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - return functional::CastToPyObject(PyTensor_Unpack(self)->is_contiguous()); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_contiguous(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - return PyTensor_New(PyTensor_Unpack(self)->contiguous()); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_requires_grad_(PyObject* self, PyObject* args, PyObject* kwargs) { - HANDLE_ERRORS - int requires_grad = 1; - static const char* keywords[2] = {"requires_grad", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|p:requires_grad_", const_cast(keywords), - &requires_grad)) { - return NULL; - } - ASSERT(PyTensor_Unpack(self)->set_requires_grad(requires_grad)); - Py_XINCREF(self); - return self; - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_retain_grad(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - const auto& t = PyTensor_Unpack(self); - if (!t->requires_grad()) { - return PyErr_Format(PyExc_RuntimeError, - "can't retain_grad on Tensor that has requires_grad=False"); - } - ASSERT(t->set_retain_grad(true)); - Py_RETURN_NONE; - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_detach(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - return PyTensor_New(ASSERT_PTR(PyTensor_Unpack(self)->detach())); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_clone(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - return PyTensor_New(ASSERT_PTR(PyTensor_Unpack(self)->clone())); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_zero_(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - ASSERT(EagerMirroredTensorZeros(PyTensor_Unpack(self))); - Py_XINCREF(self); - return self; - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_register_hook(PyObject* self, PyObject* hook) { - 
HANDLE_ERRORS - const auto& _hook = py::cast(py::reinterpret_borrow(hook)); - ASSERT(RegisterTensorHook(PyTensor_Unpack(self), _hook)); - Py_RETURN_NONE; - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject__register_post_grad_accumulation_hook(PyObject* self, - PyObject* hook) { - HANDLE_ERRORS - const auto& _hook = py::cast(py::reinterpret_borrow(hook)); - ASSERT(RegisterTensorPostGradAccumulationHook(PyTensor_Unpack(self), _hook)); - Py_RETURN_NONE; - END_HANDLE_ERRORS -} +namespace one { -static PyObject* PyTensorObject_global_id(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - uint64_t global_id = static_cast(ASSERT(PyTensor_Unpack(self)->transport_token())); - return functional::CastToPyObject(global_id); - END_HANDLE_ERRORS -} +namespace { -static PyObject* PyTensorObject_check_meta_consistency(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - ASSERT(CheckMetaConsistency(PyTensor_Unpack(self))); - Py_RETURN_NONE; - END_HANDLE_ERRORS +const Symbol* GetTensorDType(const Tensor& tensor) { + return &CHECK_JUST(DType::Get(tensor.dtype()->data_type())); } -static PyObject* PyTensorObject_to_numpy(PyObject* self, PyObject* unused) { - HANDLE_ERRORS - const auto& t = PyTensor_Unpack(self); - DataType data_type = t->dtype()->data_type(); +Maybe ApiEagerMirroredTensorToNumpy(const py::handle& py_tensor) { + const std::shared_ptr tensor = py::cast>(py_tensor); + DataType data_type = tensor->dtype()->data_type(); switch (data_type) { #define SWITCH_EAGER_TENSOR_TO_NUMPY(cpp_type, of_type) \ - case of_type: return ASSERT(EagerMirroredTensorToNumpy(self)); + case of_type: return EagerMirroredTensorToNumpy(py_tensor); OF_PP_FOR_EACH_TUPLE(SWITCH_EAGER_TENSOR_TO_NUMPY, POD_DATA_TYPE_SEQ) - case DataType::kFloat16: return ASSERT(EagerMirroredTensorToNumpy(self)); - default: { - return PyErr_Format(PyExc_RuntimeError, "Invalid datatype"); - } - } -#undef SWITCH_EAGER_TENSOR_TO_NUMPY - END_HANDLE_ERRORS -} - -#define DEFINE_TENSOR_METHOD(T, type_proto) \ - static PyObject* PyTensorObject__copy_to_numpy_##T(PyObject* self, PyObject* array) { \ - HANDLE_ERRORS \ - ASSERT(CopyBetweenMirroredTensorAndNumpy(PyTensor_Unpack(self), array, \ - BlobNumpyCopyUtil::To, "const", \ - /*block_host_until_done=*/true)); \ - Py_RETURN_NONE; \ - END_HANDLE_ERRORS \ - } \ - static PyObject* PyTensorObject__copy_from_numpy_##T(PyObject* self, PyObject* array) { \ - HANDLE_ERRORS \ - auto* copied = PyArray_NewCopy((PyArrayObject*)array, NPY_CORDER); \ - ASSERT(CopyBetweenMirroredTensorAndNumpy(PyTensor_Unpack(self), copied, \ - BlobNumpyCopyUtil::From, "mut", \ - /*block_host_until_done=*/false)); \ - Py_RETURN_NONE; \ - END_HANDLE_ERRORS \ - } -OF_PP_FOR_EACH_TUPLE(DEFINE_TENSOR_METHOD, POD_DATA_TYPE_SEQ) -#undef DEFINE_TENSOR_METHOD - -static PyObject* PyTensorObject__get_copy_mirrored_tensor_to_numpy_func_name(PyObject* self, - PyObject* unused) { - HANDLE_ERRORS - return functional::CastToPyObject( - GetCopyMirroredTensorToNumpyFuncName(PyTensor_Unpack(self)->dtype()->data_type())); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject__get_copy_mirrored_tensor_from_numpy_func_name(PyObject* self, - PyObject* unused) { - HANDLE_ERRORS - return functional::CastToPyObject( - GetCopyMirroredTensorFromNumpyFuncName(PyTensor_Unpack(self)->dtype()->data_type())); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject__register_storage_delete_hook(PyObject* self, PyObject* hook) { - HANDLE_ERRORS - auto _hook = py::cast>(py::reinterpret_borrow(hook)); - 
ASSERT(PyTensor_Unpack(self)->RegisterStorageDeleteHook(_hook)); - Py_RETURN_NONE; - END_HANDLE_ERRORS -} - -static PyMethodDef PyTensorObject_methods[] = { - {"storage_offset", PyTensorObject_storage_offset, METH_NOARGS, NULL}, - {"stride", PyTensorObject_stride, METH_NOARGS, NULL}, - {"is_contiguous", PyTensorObject_is_contiguous, METH_NOARGS, NULL}, - {"contiguous", PyTensorObject_contiguous, METH_NOARGS, NULL}, - {"requires_grad_", (PyCFunction)PyTensorObject_requires_grad_, METH_VARARGS | METH_KEYWORDS, - NULL}, - {"retain_grad", PyTensorObject_retain_grad, METH_NOARGS, NULL}, - {"detach", PyTensorObject_detach, METH_NOARGS, NULL}, - {"clone", PyTensorObject_clone, METH_NOARGS, NULL}, - {"zero_", PyTensorObject_zero_, METH_NOARGS, NULL}, - {"register_hook", PyTensorObject_register_hook, METH_O, NULL}, - {"_register_post_grad_accumulation_hook", PyTensorObject__register_post_grad_accumulation_hook, - METH_O, NULL}, - {"global_id", PyTensorObject_global_id, METH_NOARGS, NULL}, - {"check_meta_consistency", PyTensorObject_check_meta_consistency, METH_NOARGS, NULL}, - {"to_numpy", PyTensorObject_to_numpy, METH_NOARGS, NULL}, -#define DEFINE_TENSOR_METHOD(T, type_proto) \ - {"_copy_to_numpy_" #T, PyTensorObject__copy_to_numpy_##T, METH_O, NULL}, \ - {"_copy_from_numpy_" #T, PyTensorObject__copy_from_numpy_##T, METH_O, NULL}, - OF_PP_FOR_EACH_TUPLE(DEFINE_TENSOR_METHOD, POD_DATA_TYPE_SEQ) -#undef DEFINE_TENSOR_METHOD - {"_get_copy_mirrored_tensor_to_numpy_func_name", - PyTensorObject__get_copy_mirrored_tensor_to_numpy_func_name, METH_NOARGS, NULL}, - {"_get_copy_mirrored_tensor_from_numpy_func_name", - PyTensorObject__get_copy_mirrored_tensor_from_numpy_func_name, METH_NOARGS, NULL}, - {"_register_storage_delete_hook", PyTensorObject__register_storage_delete_hook, METH_O, NULL}, - {NULL}}; - -static PyObject* PyTensorObject_ndim(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->ndim()); -} - -static PyObject* PyTensorObject_shape(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->shape()); -} - -static PyObject* PyTensorObject_dtype(PyObject* self, void* unused) { - HANDLE_ERRORS - const Symbol* dtype = &ASSERT(DType::Get(PyTensor_Unpack(self)->dtype()->data_type())); - return functional::CastToPyObject(dtype); - END_HANDLE_ERRORS -} - -static PyObject* PyTensorObject_is_cuda(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->is_cuda()); -} - -static PyObject* PyTensorObject_grad(PyObject* self, void* unused) { - HANDLE_ERRORS - return PyTensor_New(ASSERT_PTR(PyTensor_Unpack(self)->acc_grad())); - END_HANDLE_ERRORS -} - -static int PyTensorObject_set_grad(PyObject* self, PyObject* grad, void* unused) { - HANDLE_ERRORS - const auto& t = PyTensor_Unpack(self); - if (self == grad) { PyErr_Format(PyExc_RuntimeError, "can't assign Tensor as its own grad"); } - if (grad && grad != Py_None) { - ASSERT(t->set_acc_grad(ASSERT_PTR(PyTensor_Unpack(grad)->detach()))); - } else { - ASSERT(t->set_acc_grad(NULL)); + case DataType::kFloat16: return EagerMirroredTensorToNumpy(py_tensor); + default: return Maybe(Error::UnimplementedError() << "Invalid datatype"); } - return 0; - END_HANDLE_ERRORS_RET(-1) -} - -static PyObject* PyTensorObject__is_grad_acc_inplace(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->autograd_meta()->is_grad_acc_inplace()); -} - -static int PyTensorObject_set__is_grad_acc_inplace(PyObject* self, PyObject* is_inplace, - 
void* unused) { - PyTensor_Unpack(self)->mut_autograd_meta()->set_is_grad_acc_inplace(is_inplace); - return 0; -} - -static PyObject* PyTensorObject_data(PyObject* self, void* unused) { - HANDLE_ERRORS - return PyTensor_New(ASSERT_PTR(PyTensor_Unpack(self)->data())); - END_HANDLE_ERRORS -} - -static int PyTensorObject_set_data(PyObject* self, PyObject* data, void* unused) { - HANDLE_ERRORS - const auto& t = PyTensor_Unpack(self); - auto hooks = t->autograd_meta()->hooks(); - ASSERT(t->set_data(PyTensor_Unpack(data))); - // Re-register hooks - for (const auto& hook : hooks) { ASSERT(RegisterTensorHook(t, hook)); } - return 0; - END_HANDLE_ERRORS_RET(-1) -} - -static PyObject* PyTensorObject_grad_fn(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->grad_fn_node()); -} - -static PyObject* PyTensorObject_is_leaf(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->is_leaf()); -} - -static PyObject* PyTensorObject_requires_grad(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->requires_grad()); -} - -static int PyTensorObject_set_requires_grad(PyObject* self, PyObject* requires_grad, void* unused) { - HANDLE_ERRORS - const auto& t = PyTensor_Unpack(self); - CHECK_OR_THROW(t->is_leaf()) << Error::RuntimeError() - << "You can only change requires_grad flags of leaf tensors."; - ASSERT(t->set_requires_grad(requires_grad == Py_True)); - return 0; - END_HANDLE_ERRORS_RET(-1) -} - -static PyObject* PyTensorObject_is_lazy(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->is_lazy()); } -static PyObject* PyTensorObject_is_eager(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->is_eager()); +template +Maybe CopyMirroredTensorToNumpy(const std::shared_ptr& tensor, py::array_t array) { + return CopyBetweenMirroredTensorAndNumpy(tensor, array.ptr(), BlobNumpyCopyUtil::To, + "const", + /*block_host_until_done=*/true); } -static PyObject* PyTensorObject_is_global(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->is_consistent()); -} - -static PyObject* PyTensorObject_is_local(PyObject* self, void* unused) { - return functional::CastToPyObject(PyTensor_Unpack(self)->is_local()); -} +template +Maybe CopyMirroredTensorFromNumpy(const std::shared_ptr& tensor, + py::array_t array) { + // When asynchronously copying array data to tensor, we need to back up the + // array at the same time. + // Only NPY_CORDER is supported, and it makes sure that the array is C-style contiguous. 
+ auto* copied_array = PyArray_NewCopy((PyArrayObject*)array.ptr(), NPY_CORDER); + JUST(CopyBetweenMirroredTensorAndNumpy(tensor, copied_array, BlobNumpyCopyUtil::From, "mut", + /*block_host_until_done=*/false)); -static PyObject* PyTensorObject__tensor_buffer_shapes_and_dtypes(PyObject* self, void* unused) { - HANDLE_ERRORS - return functional::CastToPyObject(MaybeGetTensorBufferShapesAndDTypes(PyTensor_Unpack(self))); - END_HANDLE_ERRORS + Py_DECREF(copied_array); + return Maybe::Ok(); } -static PyObject* PyTensorObject_device(PyObject* self, void* unused) { - HANDLE_ERRORS - return functional::CastToPyObject(PyTensor_Unpack(self)->device()); - END_HANDLE_ERRORS +std::shared_ptr ApiNewTensor(py::args args, py::kwargs kwargs) { + return py::cast>(functional::_legacy_tensor_ctor(args, kwargs)); } -static PyObject* PyTensorObject_placement(PyObject* self, void* unused) { - HANDLE_ERRORS - return functional::CastToPyObject(PyTensor_Unpack(self)->parallel_desc()); - END_HANDLE_ERRORS +Maybe ApiSetRequiresGrad(Tensor& tensor, bool requires_grad) { + CHECK_OR_RETURN(tensor.is_leaf()) + << Error::RuntimeError() << "You can only change requires_grad flags of leaf tensors."; + JUST(tensor.set_requires_grad(requires_grad)); + return Maybe::Ok(); } -static PyObject* PyTensorObject_sbp(PyObject* self, void* unused) { - HANDLE_ERRORS - return functional::CastToPyObject(TensorGetPyTupleOfSbp(*PyTensor_Unpack(self))); - END_HANDLE_ERRORS +std::shared_ptr ApiNewParameter(const std::shared_ptr& data, + bool requires_grad) { + return Parameter::MakeTensor(data, requires_grad).GetPtrOrThrow(); } -// NOLINTNEXTLINE -static PyGetSetDef PyTensorObject_properties[] = { - {PYGETSET_NAME("ndim"), (getter)PyTensorObject_ndim, NULL, NULL, NULL}, - {PYGETSET_NAME("shape"), (getter)PyTensorObject_shape, NULL, NULL, NULL}, - {PYGETSET_NAME("dtype"), (getter)PyTensorObject_dtype, NULL, NULL, NULL}, - {PYGETSET_NAME("is_cuda"), (getter)PyTensorObject_is_cuda, NULL, NULL, NULL}, - {PYGETSET_NAME("grad"), (getter)PyTensorObject_grad, (setter)PyTensorObject_set_grad, NULL, - NULL}, - {PYGETSET_NAME("_is_grad_acc_inplace"), (getter)PyTensorObject__is_grad_acc_inplace, - (setter)PyTensorObject_set__is_grad_acc_inplace, NULL, NULL}, - {PYGETSET_NAME("data"), (getter)PyTensorObject_data, (setter)PyTensorObject_set_data, NULL, - NULL}, - {PYGETSET_NAME("grad_fn"), (getter)PyTensorObject_grad_fn, NULL, NULL, NULL}, - {PYGETSET_NAME("is_leaf"), (getter)PyTensorObject_is_leaf, NULL, NULL, NULL}, - {PYGETSET_NAME("requires_grad"), (getter)PyTensorObject_requires_grad, - (setter)PyTensorObject_set_requires_grad, NULL, NULL}, - {PYGETSET_NAME("is_lazy"), (getter)PyTensorObject_is_lazy, NULL, NULL, NULL}, - {PYGETSET_NAME("is_eager"), (getter)PyTensorObject_is_eager, NULL, NULL, NULL}, - {PYGETSET_NAME("is_global"), (getter)PyTensorObject_is_global, NULL, NULL, NULL}, - {PYGETSET_NAME("is_local"), (getter)PyTensorObject_is_local, NULL, NULL, NULL}, - {PYGETSET_NAME("_tensor_buffer_shapes_and_dtypes"), - (getter)PyTensorObject__tensor_buffer_shapes_and_dtypes, NULL, NULL, NULL}, - {PYGETSET_NAME("device"), (getter)PyTensorObject_device, NULL, NULL, NULL}, - {PYGETSET_NAME("placement"), (getter)PyTensorObject_placement, NULL, NULL, NULL}, - {PYGETSET_NAME("sbp"), (getter)PyTensorObject_sbp, NULL, NULL, NULL}, - {NULL}}; - -// create a Tensor instance -static PyObject* TensorMetaCls_call(PyObject* type, PyObject* args, PyObject* kwargs) { - return PyType_Type.tp_call(type, args, kwargs); -} - -static void TensorMetaCls_dealloc(PyObject* 
type) { PyType_Type.tp_dealloc(type); } - -static PyHeapTypeObject* MakeTensorMetaclass() { - PyObject* name = PyUnicode_FromString("_TensorMeta"); - - auto* heap_type = (PyHeapTypeObject*)PyType_Type.tp_alloc(&PyType_Type, 0); - heap_type->ht_name = name; - heap_type->ht_qualname = PY_XINCREF(name); - - auto* type = &heap_type->ht_type; - type->tp_name = "_TensorMeta"; - type->tp_base = PY_XINCREF(&PyType_Type); - type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; +} // namespace - type->tp_call = TensorMetaCls_call; - type->tp_dealloc = TensorMetaCls_dealloc; +using namespace pybind11::literals; - if (PyType_Ready(type) < 0) { return NULL; } - PyObject_SetAttrString((PyObject*)type, "__module__", PyUnicode_FromString("oneflow._C")); - return heap_type; -} - -static PyHeapTypeObject* TensorMetaclass_Type = MakeTensorMetaclass(); - -static PyTypeObject* MakeTensorType() { - PyObject* name = PyUnicode_FromString("Tensor"); - - auto* metaclass = &TensorMetaclass_Type->ht_type; - auto* heap_type = (PyHeapTypeObject*)metaclass->tp_alloc(metaclass, 0); - if (!heap_type) { return NULL; } - heap_type->ht_name = name; - heap_type->ht_qualname = PY_XINCREF(name); - auto* type = &heap_type->ht_type; - type->tp_name = "Tensor"; - type->tp_basicsize = sizeof(PyTensorObject); - - type->tp_init = PyTensorObject_init; - type->tp_dealloc = PyTensorObject_dealloc; - type->tp_getset = PyTensorObject_properties; - type->tp_methods = PyTensorObject_methods; - - type->tp_as_number = &heap_type->as_number; - type->tp_as_sequence = &PyTensorObject_as_sequence; - type->tp_as_mapping = &PyTensorObject_as_mapping; - - type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; - - if (PyType_Ready(type) < 0) { return NULL; } - PyObject_SetAttrString((PyObject*)type, "__module__", PyUnicode_FromString("oneflow")); - return type; -} - -static PyTypeObject* MakeParameterType() { - PyObject* name = PyUnicode_FromString("Parameter"); - - auto* metaclass = &TensorMetaclass_Type->ht_type; - auto* heap_type = (PyHeapTypeObject*)metaclass->tp_alloc(metaclass, 0); - if (!heap_type) { return NULL; } - heap_type->ht_name = name; - heap_type->ht_qualname = PY_XINCREF(name); - auto* type = &heap_type->ht_type; - type->tp_name = "Parameter"; - type->tp_basicsize = sizeof(PyTensorObject); - - type->tp_init = PyParameterObject_init; - - type->tp_base = PY_XINCREF(PyTensorObject_Type); - - type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; - - if (PyType_Ready(type) < 0) { return NULL; } - PyObject_SetAttrString((PyObject*)type, "__module__", PyUnicode_FromString("oneflow.nn")); - return type; -} - -PyObject* PyTensor_New(const std::shared_ptr& data) { - if (!data) { Py_RETURN_NONE; } - if (data->pyobject()) { return PY_XINCREF((PyObject*)(data->pyobject())); } - auto* self = (PyTensorObject*)PyTensorObject_Type->tp_alloc(PyTensorObject_Type, 0); - if (self) { - self->data = data; - self->data->set_pyobject(self); - } - return (PyObject*)self; -} - -PyObject* PyParameter_New(const std::shared_ptr& data) { - if (!data) { Py_RETURN_NONE; } - if (data->pyobject()) { return PY_XINCREF((PyObject*)(data->pyobject())); } - auto* self = (PyTensorObject*)PyTensorObject_Type->tp_alloc(PyParameterObject_Type, 0); - if (self) { - self->data = data; - self->data->set_pyobject(self); - } - return (PyObject*)self; -} +ONEFLOW_API_PYBIND11_MODULE("", m) { + py::class_>(m, "Tensor") + .def(py::init(&ApiNewTensor)) + // Properties of pytorch + 
.def_property_readonly("ndim", &Tensor::ndim) + .def_property_readonly("shape", &Tensor::shape) + .def_property_readonly("dtype", &GetTensorDType) + .def_property_readonly("is_cuda", &Tensor::is_cuda) + .def_property("grad", &Tensor::acc_grad, + [](Tensor& t, const std::shared_ptr& grad) -> Maybe { + CHECK_OR_RETURN(t.is_leaf()) + << Error::RuntimeError() + << "You can only change gradient of leaf tensors."; + if (grad != nullptr) { + JUST(t.set_acc_grad(JUST(grad->detach()))); + } else { + JUST(t.set_acc_grad(nullptr)); + } + return Maybe::Ok(); + }) + .def_property( + "_is_grad_acc_inplace", + [](const Tensor& t) -> bool { return t.autograd_meta()->is_grad_acc_inplace(); }, + [](Tensor& t, bool is_inplace) { + t.mut_autograd_meta()->set_is_grad_acc_inplace(is_inplace); + }) + .def_property("data", &Tensor::data, + [](const std::shared_ptr& t, + const std::shared_ptr& other) -> Maybe { + auto hooks = t->autograd_meta()->hooks(); + JUST(t->set_data(other)); + // Re-register hooks + for (const auto& hook : hooks) { JUST(RegisterTensorHook(t, hook)); } + return Maybe::Ok(); + }) + .def("storage_offset", &Tensor::storage_offset) + .def("stride", + [](const Tensor& t) -> Maybe { + const auto& stride = JUST(t.stride())->StrideVec(); + return py::tuple(py::make_iterator(stride.begin(), stride.end())); + }) + .def("is_contiguous", &Tensor::is_contiguous) + .def("contiguous", &Tensor::contiguous) + .def_property_readonly("grad_fn", &Tensor::grad_fn_node) + .def_property_readonly("is_leaf", &Tensor::is_leaf) + .def_property("requires_grad", &Tensor::requires_grad, &ApiSetRequiresGrad) + // Methods of pytorch + .def( + "requires_grad_", + [](Tensor& t, bool requires_grad) -> Maybe { + JUST(ApiSetRequiresGrad(t, requires_grad)); + return t; + }, + "requires_grad"_a = true) + .def("retain_grad", + [](Tensor& t) -> Maybe { + if (!t.is_leaf()) { JUST(t.set_retain_grad(true)); } + return Maybe::Ok(); + }) + .def("detach", &Tensor::detach) + .def("clone", &Tensor::clone) + // OneFlow tensor properties other than pytorch tensor + .def_property_readonly("is_lazy", &Tensor::is_lazy) + .def_property_readonly("is_eager", &Tensor::is_eager) + .def_property_readonly("is_global", &Tensor::is_consistent) + .def_property_readonly("is_local", &Tensor::is_local) + .def("zeros_", &EagerMirroredTensorZeros) + .def("register_hook", &RegisterTensorHook) + .def("_register_post_grad_accumulation_hook", &RegisterTensorPostGradAccumulationHook) + // local tensor only + .def_property_readonly("_tensor_buffer_shapes_and_dtypes", + &MaybeGetTensorBufferShapesAndDTypes) + .def_property_readonly("device", &Tensor::device) + .def("global_id", + [](const one::Tensor& tensor) -> Maybe { + return static_cast(JUST(tensor.transport_token())); + }) + .def("check_meta_consistency", CheckMetaConsistency) + .def("to_numpy", &ApiEagerMirroredTensorToNumpy, py::return_value_policy::move) +#define DEFINE_TENSOR_METHOD(T, type_proto) \ + .def("_copy_to_numpy_" #T, &CopyMirroredTensorToNumpy) \ + .def("_copy_from_numpy_" #T, &CopyMirroredTensorFromNumpy) + OF_PP_FOR_EACH_TUPLE(DEFINE_TENSOR_METHOD, POD_DATA_TYPE_SEQ) +#undef DEFINE_TENSOR_METHOD + .def("_get_copy_mirrored_tensor_to_numpy_func_name", + [](const Tensor& tensor) { + return GetCopyMirroredTensorToNumpyFuncName(tensor.dtype()->data_type()); + }) + .def("_get_copy_mirrored_tensor_from_numpy_func_name", + [](const Tensor& tensor) { + return GetCopyMirroredTensorFromNumpyFuncName(tensor.dtype()->data_type()); + }) + .def("_register_storage_delete_hook", 
&Tensor::RegisterStorageDeleteHook) + // consistent tensor only + .def_property_readonly("placement", &Tensor::parallel_desc) + .def_property_readonly("sbp", &TensorGetPyTupleOfSbp); -PyObject* PyParameter_New(const std::shared_ptr& data, bool requires_grad) { - if (!data) { Py_RETURN_NONE; } - auto* self = (PyTensorObject*)PyTensorObject_Type->tp_alloc(PyParameterObject_Type, 0); - if (self) { - self->data = ASSERT_PTR(Parameter::MakeTensor(data, requires_grad)); - self->data->set_pyobject(self); - } - return (PyObject*)self; + auto nn = m.def_submodule("nn"); + py::class_, Tensor>(nn, "Parameter") + .def(py::init(&ApiNewParameter), "data"_a, "requires_grad"_a = true); } } // namespace one -} // namespace oneflow -#undef ASSERT -#undef ASSERT_PTR - -using namespace oneflow::one; - -ONEFLOW_API_PYBIND11_MODULE("", m) { - PyTensorObject_Type = MakeTensorType(); - PyParameterObject_Type = MakeParameterType(); - if (PyTensorObject_Type - && PyModule_AddObject(m.ptr(), "Tensor", (PyObject*)PyTensorObject_Type) < 0) { - return; - } - auto nn = m.def_submodule("nn"); - if (PyParameterObject_Type - && PyModule_AddObject(nn.ptr(), "Parameter", (PyObject*)PyParameterObject_Type) < 0) { - return; - } -} +} // namespace oneflow diff --git a/oneflow/api/python/framework/tensor.h b/oneflow/api/python/framework/tensor.h deleted file mode 100644 index e919b8273b6..00000000000 --- a/oneflow/api/python/framework/tensor.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_API_PYTHON_FRAMEWORK_TENSOR_H_ -#define ONEFLOW_API_PYTHON_FRAMEWORK_TENSOR_H_ - -#include - -#include "oneflow/core/framework/tensor.h" - -namespace oneflow { -namespace one { - -typedef struct { - PyObject_HEAD; - std::shared_ptr data; -} PyTensorObject; - -extern PyTypeObject* PyTensorObject_Type; -extern PyTypeObject* PyParameterObject_Type; - -inline bool PyTensor_Check(PyObject* op) { return PyObject_TypeCheck(op, PyTensorObject_Type); } - -inline bool PyTensor_CheckExact(PyObject* op) { - return op->ob_type == PyTensorObject_Type || op->ob_type == PyParameterObject_Type; -} - -inline std::shared_ptr& PyTensor_Unpack(PyObject* op) { - assert(PyTensor_Check(op)); - return ((PyTensorObject*)op)->data; -} - -PyObject* PyTensor_New(const std::shared_ptr& data); -PyObject* PyParameter_New(const std::shared_ptr& data); -PyObject* PyParameter_New(const std::shared_ptr& data, bool requires_grad); - -} // namespace one -} // namespace oneflow - -#endif // ONEFLOW_API_PYTHON_FRAMEWORK_TENSOR_H_ diff --git a/oneflow/api/python/functional/common.cpp b/oneflow/api/python/functional/common.cpp index 96c5b165f7e..f525483789e 100644 --- a/oneflow/api/python/functional/common.cpp +++ b/oneflow/api/python/functional/common.cpp @@ -58,27 +58,37 @@ bool PyStringSequenceCheck(PyObject* obj) { return PySequenceCheck(obj, [](PyObject* item) { return PyStringCheck(item); }); } -std::string PyStringAsString(PyObject* obj) { - PyObject* bytes = PyUnicode_AsEncodedString(obj, "utf-8", "~E~"); +Maybe PyStringAsString(PyObject* str_obj) { + PyObject* bytes = PyUnicode_AsEncodedString(str_obj, "utf-8", "~E~"); std::string str = PyBytes_AS_STRING(bytes); Py_XDECREF(bytes); return str; } -std::string PyObjectToReprStr(PyObject* obj) { +Maybe PyObjectToReprStr(PyObject* obj) { PyObject* repr_obj = PyObject_Repr(obj); - std::string str = PyStringAsString(repr_obj); + std::string str = *JUST(PyStringAsString(repr_obj)); Py_XDECREF(repr_obj); return str; } +bool PyTensorCheck(PyObject* obj) { + auto handle = py::reinterpret_borrow(obj); + return py::isinstance(handle); +} + +Maybe PyUnpackTensor(PyObject* obj) { + auto handle = py::reinterpret_borrow(obj); + return py::cast>(handle); +} + // Tensor list bool PyTensorSequenceCheck(PyObject* obj) { - return PySequenceCheck(obj, [](PyObject* item) { return PyTensor_Check(item); }); + return PySequenceCheck(obj, [](PyObject* item) { return PyTensorCheck(item); }); } -std::vector> PyUnpackTensorSequence(PyObject* obj) { +Maybe>> PyUnpackTensorSequence(PyObject* obj) { return PyUnpackSequence>( - obj, [](PyObject* item) { return PyTensor_Unpack(item); }); + obj, [](PyObject* item) { return PyUnpackTensor(item); }); } // TensorTuple @@ -87,7 +97,7 @@ bool PyTensorTupleCheck(PyObject* obj) { return py::isinstance(handle); } -std::shared_ptr PyUnpackTensorTuple(PyObject* obj) { +Maybe PyUnpackTensorTuple(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::cast>(handle); } @@ -95,16 +105,15 @@ std::shared_ptr PyUnpackTensorTuple(PyObject* obj) { // Scalar bool PyScalarCheck(PyObject* obj) { return PyLong_Check(obj) || PyFloat_Check(obj); } -Scalar PyUnpackScalar(PyObject* obj) { +Maybe PyUnpackScalar(PyObject* obj) { if (PyBool_Check(obj)) { - return obj == Py_True; + return std::make_shared(obj == Py_True); } else if (PyLong_Check(obj)) { - return static_cast(PyLong_AsLongLong(obj)); + return std::make_shared(static_cast(PyLong_AsLongLong(obj))); } else if (PyFloat_Check(obj)) { - return PyFloat_AsDouble(obj); + return 
std::make_shared(PyFloat_AsDouble(obj)); } - THROW(RuntimeError) << "The object is not scalar, but is " << Py_TYPE(obj)->tp_name; - return 0; + UNIMPLEMENTED_THEN_RETURN() << "The object is not scalar, but is " << Py_TYPE(obj)->tp_name; } // DType @@ -112,7 +121,7 @@ bool PyDTypeCheck(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::isinstance>(handle); } -Symbol PyUnpackDType(PyObject* obj) { +Maybe> PyUnpackDType(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return *py::cast*>(handle); } @@ -121,7 +130,7 @@ Symbol PyUnpackDType(PyObject* obj) { bool PyDTypeSequenceCheck(PyObject* obj) { return PySequenceCheck(obj, [](PyObject* item) { return PyDTypeCheck(item); }); } -std::vector> PyUnpackDTypeSequence(PyObject* obj) { +Maybe>> PyUnpackDTypeSequence(PyObject* obj) { return PyUnpackSequence>(obj, [](PyObject* item) { return PyUnpackDType(item); }); } @@ -129,10 +138,10 @@ std::vector> PyUnpackDTypeSequence(PyObject* obj) { bool PyShapeSequenceCheck(PyObject* obj) { return PySequenceCheck(obj, [](PyObject* item) { return PyLongSequenceCheck(item); }); } -std::vector PyUnpackShapeSequence(PyObject* obj) { - return PyUnpackSequence(obj, [](PyObject* item) -> Shape { - const auto& shape = PyUnpackLongSequence(item); - return Shape(DimVector(shape.begin(), shape.end())); +Maybe> PyUnpackShapeSequence(PyObject* obj) { + return PyUnpackSequence(obj, [](PyObject* item) -> Maybe { + const auto& shape = JUST(PyUnpackLongSequence(item)); + return std::make_shared(DimVector(shape->begin(), shape->end())); }); } @@ -141,7 +150,7 @@ bool PyGeneratorCheck(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::isinstance(handle); } -std::shared_ptr PyUnpackGenerator(PyObject* obj) { +Maybe PyUnpackGenerator(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::cast>(handle); } @@ -151,7 +160,7 @@ bool PyDeviceCheck(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::isinstance>(handle); } -Symbol PyUnpackDevice(PyObject* obj) { +Maybe> PyUnpackDevice(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return *py::cast>>(handle); } @@ -161,7 +170,7 @@ bool PyParallelDescCheck(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::isinstance>(handle); } -Symbol PyUnpackParallelDesc(PyObject* obj) { +Maybe> PyUnpackParallelDesc(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return *py::cast>>(handle); } @@ -171,7 +180,7 @@ bool PySbpParallelCheck(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::isinstance>(handle); } -Symbol PyUnpackSbpParallel(PyObject* obj) { +Maybe> PyUnpackSbpParallel(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return *py::cast>>(handle); } @@ -180,29 +189,29 @@ Symbol PyUnpackSbpParallel(PyObject* obj) { bool PySbpParallelSequenceCheck(PyObject* obj) { return PySequenceCheck(obj, [](PyObject* item) { return PySbpParallelCheck(item); }); } -std::vector> PyUnpackSbpParallelSequence(PyObject* obj) { +Maybe>> PyUnpackSbpParallelSequence(PyObject* obj) { return PyUnpackSequence>( obj, [](PyObject* item) { return PyUnpackSbpParallel(item); }); } // Tensor index bool PyTensorIndexCheck(PyObject* obj) { - return PySlice_Check(obj) || PyLong_Check(obj) || obj == Py_Ellipsis || obj == Py_None - || PyTensor_Check(obj) || PySequence_Check(obj) || PyUnicode_Check(obj) - || numpy::PyArrayCheckLongScalar(obj); + return PySlice_Check(obj) || PyLong_Check(obj) || numpy::PyArrayCheckLongScalar(obj) + || obj == Py_Ellipsis || obj == Py_None || 
PyTensorCheck(obj) || PySequence_Check(obj) + || PyUnicode_Check(obj); } -TensorIndex PyUnpackTensorIndex(PyObject* obj) { - TensorIndex tensor_index; +Maybe PyUnpackTensorIndex(PyObject* obj) { + auto tensor_index = std::make_shared(); // Obvious single-entry cases. if (PySlice_Check(obj) // NOLINT || PyLong_Check(obj) // NOLINT + || numpy::PyArrayCheckLongScalar(obj) // NOLINT || obj == Py_Ellipsis // NOLINT || obj == Py_None // NOLINT - || PyTensor_Check(obj) // NOLINT + || PyTensorCheck(obj) // NOLINT || !PySequence_Check(obj) // NOLINT - || numpy::PyArrayCheckLongScalar(obj) // NOLINT || PyUnicode_Check(obj)) { - tensor_index.emplace_back(detail::UnpackIndexItem(obj)); + tensor_index->emplace_back(*JUST(detail::UnpackIndexItem(obj))); return tensor_index; } PyObject* tup = NULL; @@ -223,7 +232,7 @@ TensorIndex PyUnpackTensorIndex(PyObject* obj) { // Negative size indicates a Python error in the PySequence_Size call. if (n < 0) { PyErr_Clear(); - tensor_index.emplace_back(detail::UnpackIndexItem(obj)); + tensor_index->emplace_back(*JUST(detail::UnpackIndexItem(obj))); return tensor_index; } // The follow comments are from numpy: @@ -238,7 +247,7 @@ TensorIndex PyUnpackTensorIndex(PyObject* obj) { * tuple. (`a[[[1,2], [3,4]]] == a[[1,2], [3,4]]`) */ if (n >= /*NPY_MAXDIMS=*/32) { - tensor_index.emplace_back(detail::UnpackIndexItem(obj)); + tensor_index->emplace_back(*JUST(detail::UnpackIndexItem(obj))); return tensor_index; } // Check whether we should unpack the index like a tuple. @@ -246,15 +255,15 @@ TensorIndex PyUnpackTensorIndex(PyObject* obj) { for (Py_ssize_t i = 0; i < n; ++i) { PyObject* item = PySequence_GetItem(obj, i); if (commit_to_unpack) { - CHECK_OR_THROW(item) << "Sequence index is required."; + CHECK_OR_RETURN(item) << "Sequence index is required."; } else { if (!item) { PyErr_Clear(); break; } - if (PySequence_Check(item) // NOLINT - || PySlice_Check(item) // NOLINT - || PyTensor_Check(item) // NOLINT + if (PySequence_Check(item) // NOLINT + || PySlice_Check(item) // NOLINT + || PyTensorCheck(item) // NOLINT || item == Py_Ellipsis || item == Py_None) { commit_to_unpack = true; } @@ -264,15 +273,15 @@ TensorIndex PyUnpackTensorIndex(PyObject* obj) { if (commit_to_unpack) { tup = PySequence_Tuple(obj); } else { - tensor_index.emplace_back(detail::UnpackIndexItem(obj)); + tensor_index->emplace_back(*JUST(detail::UnpackIndexItem(obj))); return tensor_index; } } - tensor_index.resize(n); + tensor_index->resize(n); for (Py_ssize_t i = 0; i < n; ++i) { PyObject* item = PySequence_GetItem(tup, i); - tensor_index[i] = detail::UnpackIndexItem(item); + tensor_index->at(i) = *JUST(detail::UnpackIndexItem(item)); Py_DECREF(item); } Py_DECREF(tup); @@ -285,7 +294,7 @@ bool PyOpExprCheck(PyObject* obj) { return py::isinstance(handle); } -std::shared_ptr PyUnpackOpExpr(PyObject* obj) { +Maybe PyUnpackOpExpr(PyObject* obj) { auto handle = py::reinterpret_borrow(obj); return py::cast>(handle); } diff --git a/oneflow/api/python/functional/common.h b/oneflow/api/python/functional/common.h index 18c6674d31d..8749f5ab228 100644 --- a/oneflow/api/python/functional/common.h +++ b/oneflow/api/python/functional/common.h @@ -20,7 +20,6 @@ limitations under the License. 
#include #include -#include "oneflow/api/python/framework/tensor.h" #include "oneflow/core/common/throw.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/preprocessor.h" @@ -63,19 +62,29 @@ using PyObjectPtr = std::unique_ptr; OF_PP_MAKE_TUPLE_SEQ(float) \ OF_PP_MAKE_TUPLE_SEQ(double) +template +T dereference(T&& val) { + return std::forward(val); +} + +template +T dereference(std::shared_ptr&& val) { + return *val; +} + bool PySequenceCheck(PyObject* obj); bool PySequenceCheck(PyObject* obj, const std::function& item_check); template -inline std::vector PyUnpackSequence(PyObject* obj, UnpackItemFunc unpack_item) { +inline Maybe> PyUnpackSequence(PyObject* obj, UnpackItemFunc unpack_item) { bool is_tuple = PyTuple_Check(obj); - CHECK_OR_THROW(is_tuple || PyList_Check(obj)) + CHECK_OR_RETURN(is_tuple || PyList_Check(obj)) << "The object is not list or tuple, but is " << Py_TYPE(obj)->tp_name; size_t size = is_tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); - std::vector values(size); + auto values = std::make_shared>(size); for (int i = 0; i < size; ++i) { PyObject* item = is_tuple ? PyTuple_GET_ITEM(obj, i) : PyList_GET_ITEM(obj, i); - values[i] = unpack_item(item); + values->at(i) = dereference(JUST(unpack_item(item))); } return values; } @@ -85,100 +94,80 @@ bool PyLongSequenceCheck(PyObject* obj); bool PyFloatSquenceCheck(PyObject* obj); template -inline std::vector PyUnpackLongSequence(PyObject* obj) { +inline Maybe> PyUnpackLongSequence(PyObject* obj) { return PyUnpackSequence( - obj, [](PyObject* item) -> T { return static_cast(PyLong_AsLongLong(item)); }); + obj, [](PyObject* item) -> Maybe { return static_cast(PyLong_AsLongLong(item)); }); } template -inline std::vector PyUnpackFloatSequence(PyObject* obj) { +inline Maybe> PyUnpackFloatSequence(PyObject* obj) { return PyUnpackSequence( - obj, [](PyObject* item) -> T { return static_cast(PyFloat_AsDouble(item)); }); + obj, [](PyObject* item) -> Maybe { return static_cast(PyFloat_AsDouble(item)); }); } // String bool PyStringCheck(PyObject* obj); bool PyStringSequenceCheck(PyObject* obj); -std::string PyStringAsString(PyObject* obj); +Maybe PyStringAsString(PyObject* str_obj); -std::string PyObjectToReprStr(PyObject* obj); +Maybe PyObjectToReprStr(PyObject* obj); // Scalar bool PyScalarCheck(PyObject* obj); -Scalar PyUnpackScalar(PyObject* obj); +Maybe PyUnpackScalar(PyObject* obj); + +// Tensor +bool PyTensorCheck(PyObject* obj); +Maybe PyUnpackTensor(PyObject* obj); // Tensor list bool PyTensorSequenceCheck(PyObject* obj); -std::vector> PyUnpackTensorSequence(PyObject* obj); +Maybe>> PyUnpackTensorSequence(PyObject* obj); // TensorTuple bool PyTensorTupleCheck(PyObject* obj); -std::shared_ptr PyUnpackTensorTuple(PyObject* obj); +Maybe PyUnpackTensorTuple(PyObject* obj); // DType bool PyDTypeCheck(PyObject* obj); -Symbol PyUnpackDType(PyObject* obj); +Maybe> PyUnpackDType(PyObject* obj); // DType list bool PyDTypeSequenceCheck(PyObject* obj); -std::vector> PyUnpackDTypeSequence(PyObject* obj); +Maybe>> PyUnpackDTypeSequence(PyObject* obj); // Shape list bool PyShapeSequenceCheck(PyObject* obj); -std::vector PyUnpackShapeSequence(PyObject* obj); +Maybe> PyUnpackShapeSequence(PyObject* obj); // Generator bool PyGeneratorCheck(PyObject* obj); -std::shared_ptr PyUnpackGenerator(PyObject* obj); +Maybe PyUnpackGenerator(PyObject* obj); // Device bool PyDeviceCheck(PyObject* obj); -Symbol PyUnpackDevice(PyObject* obj); +Maybe> PyUnpackDevice(PyObject* obj); // Placement bool PyParallelDescCheck(PyObject* obj); 
-Symbol PyUnpackParallelDesc(PyObject* obj); +Maybe> PyUnpackParallelDesc(PyObject* obj); // SBP bool PySbpParallelCheck(PyObject* obj); -Symbol PyUnpackSbpParallel(PyObject* obj); +Maybe> PyUnpackSbpParallel(PyObject* obj); // SBP list bool PySbpParallelSequenceCheck(PyObject* obj); -std::vector> PyUnpackSbpParallelSequence(PyObject* obj); +Maybe>> PyUnpackSbpParallelSequence(PyObject* obj); // Tensor index bool PyTensorIndexCheck(PyObject* obj); -TensorIndex PyUnpackTensorIndex(PyObject* obj); +Maybe PyUnpackTensorIndex(PyObject* obj); // OpExpr bool PyOpExprCheck(PyObject* obj); -std::shared_ptr PyUnpackOpExpr(PyObject* obj); - -template -inline PyObject* CastToPyObject(T&& t) { - return py::cast(t).inc_ref().ptr(); -} - -template<> -inline PyObject* CastToPyObject>(Maybe&& t) { - return PyTensor_New(t.GetPtrOrThrow()); -} - -template<> -inline PyObject* CastToPyObject>(Maybe&& t) { - const auto& tensor_tuple = t.GetPtrOrThrow(); - py::tuple tup(tensor_tuple->size()); - for (int i = 0; i < tensor_tuple->size(); ++i) { tup[i] = py::cast(tensor_tuple->at(i)); } - return py::cast(tup).inc_ref().ptr(); -} - -template<> -inline PyObject* CastToPyObject>(Maybe&& t) { - t.GetOrThrow(); - Py_RETURN_NONE; -} +Maybe PyUnpackOpExpr(PyObject* obj); // int64_t Maybe PyUnpackLong(PyObject* py_obj); diff --git a/oneflow/api/python/functional/function_def.h b/oneflow/api/python/functional/function_def.h index 619ba5b6945..a1a4ab94a07 100644 --- a/oneflow/api/python/functional/function_def.h +++ b/oneflow/api/python/functional/function_def.h @@ -19,16 +19,19 @@ limitations under the License. #include #include #include +#include #include "oneflow/api/python/functional/python_arg.h" #include "oneflow/api/python/functional/value_types.h" +namespace py = pybind11; + namespace oneflow { namespace one { namespace functional { struct ReturnDef { - explicit ReturnDef(const ValueType& t) : type(t) {} + ReturnDef(const ValueType& t) : type(t) {} ValueType type; }; @@ -51,7 +54,7 @@ struct ArgumentDef { keyword_only(arg_keyword_only), optional(arg_optional), has_default_value(true) { - default_value = std::make_shared>(arg_val); + default_value = std::make_shared>(arg_val); } std::string name; @@ -61,7 +64,7 @@ struct ArgumentDef { bool keyword_only; bool optional; bool has_default_value; - std::shared_ptr default_value; + std::shared_ptr default_value; }; struct FunctionDef { diff --git a/oneflow/api/python/functional/indexing.cpp b/oneflow/api/python/functional/indexing.cpp index 37e39ae06a3..d92cb23f86d 100644 --- a/oneflow/api/python/functional/indexing.cpp +++ b/oneflow/api/python/functional/indexing.cpp @@ -33,48 +33,49 @@ namespace functional { namespace detail { -void PySliceUnpack(PyObject* object, Py_ssize_t* start, Py_ssize_t* stop, Py_ssize_t* step) { +Maybe PySliceUnpack(PyObject* object, Py_ssize_t* start, Py_ssize_t* stop, Py_ssize_t* step) { PySliceObject* obj = (PySliceObject*)object; if (obj->step == Py_None) { *step = 1; } else { - CHECK_OR_THROW(_PyEval_SliceIndex(obj->step, step)) - << "Invalid slice " << PyObjectToReprStr(object); - CHECK_NE_OR_THROW(*step, 0) << "slice step cannot be zero."; + CHECK_OR_RETURN(_PyEval_SliceIndex(obj->step, step)) + << "Invalid slice " << *JUST(PyObjectToReprStr(object)); + CHECK_NE_OR_RETURN(*step, 0) << "slice step cannot be zero."; if (*step < -PY_SSIZE_T_MAX) *step = -PY_SSIZE_T_MAX; } if (obj->start == Py_None) { *start = *step < 0 ? 
PY_SSIZE_T_MAX : 0; } else { - CHECK_OR_THROW(_PyEval_SliceIndex(obj->start, start)) - << "Invalid slice " << PyObjectToReprStr(object); + CHECK_OR_RETURN(_PyEval_SliceIndex(obj->start, start)) + << "Invalid slice " << *JUST(PyObjectToReprStr(object)); } if (obj->stop == Py_None) { *stop = *step < 0 ? PY_SSIZE_T_MIN : PY_SSIZE_T_MAX; } else { - CHECK_OR_THROW(_PyEval_SliceIndex(obj->stop, stop)) - << "Invalid slice " << PyObjectToReprStr(object); + CHECK_OR_RETURN(_PyEval_SliceIndex(obj->stop, stop)) + << "Invalid slice " << *JUST(PyObjectToReprStr(object)); } + return Maybe::Ok(); } -DataType InferScalarType(PyObject* object) { +Maybe InferScalarType(PyObject* object) { if (PyBool_Check(object)) { return DataType::kBool; } else if (PyLong_Check(object)) { return DataType::kInt64; } else if (PyArray_Check(object)) { - return numpy::GetOFDataTypeFromNpArray(reinterpret_cast(object)).GetOrThrow(); + return numpy::GetOFDataTypeFromNpArray(reinterpret_cast(object)); } else if (PyArray_CheckScalar(object)) { - return numpy::NumpyTypeToOFDataType(PyArray_DescrFromScalar(object)->type_num).GetOrThrow(); + return numpy::NumpyTypeToOFDataType(PyArray_DescrFromScalar(object)->type_num); } else if (PySequence_Check(object)) { int64_t length = PySequence_Length(object); - CHECK_GT_OR_THROW(length, 0) << "Index should not be empty."; + CHECK_GT_OR_RETURN(length, 0) << "Index should not be empty."; DataType scalar_type = DataType::kInvalidDataType; for (int64_t i = 0; i < length; ++i) { PyObjectPtr item(PySequence_GetItem(object, i)); - const auto& item_scalar_type = InferScalarType(item.get()); + const auto& item_scalar_type = JUST(InferScalarType(item.get())); if (scalar_type != DataType::kInvalidDataType) { - CHECK_EQ_OR_THROW(scalar_type, item_scalar_type) + CHECK_EQ_OR_RETURN(scalar_type, item_scalar_type) << "Different scalar types are not allowed."; } else { scalar_type = item_scalar_type; @@ -82,51 +83,53 @@ DataType InferScalarType(PyObject* object) { } return scalar_type; } - THROW(TypeError) << "Can't infer scalar type of " << Py_TYPE(object)->tp_name; - return DataType::kInvalidDataType; + UNIMPLEMENTED_THEN_RETURN() << "Can't infer scalar type of " << Py_TYPE(object)->tp_name; } -void ParseScalar(PyObject* object, char* data, const DataType& dtype) { +Maybe ParseScalar(PyObject* object, char* data, const DataType& dtype) { if (dtype == DataType::kInt64) { - CHECK_OR_THROW(PyLong_Check(object) || numpy::PyArrayCheckLongScalar(object)) + CHECK_OR_RETURN(PyLong_Check(object) || numpy::PyArrayCheckLongScalar(object)) << "Expected a long value."; *(reinterpret_cast(data)) = PyLong_AsLongLong(object); + return Maybe::Ok(); } else if (dtype == DataType::kInt32) { - CHECK_OR_THROW(PyLong_Check(object) || numpy::PyArrayCheckLongScalar(object)) + CHECK_OR_RETURN(PyLong_Check(object) || numpy::PyArrayCheckLongScalar(object)) << "Expected a long value."; *(reinterpret_cast(data)) = PyLong_AsLongLong(object); + return Maybe::Ok(); } else if (dtype == DataType::kUInt8 || dtype == DataType::kBool) { - CHECK_OR_THROW(PyBool_Check(object) || PyLong_Check(object) - || numpy::PyArrayCheckLongScalar(object)) + CHECK_OR_RETURN(PyBool_Check(object) || PyLong_Check(object) + || numpy::PyArrayCheckLongScalar(object)) << "Expected a boolean or long value."; if (PyBool_Check(object) || numpy::PyArrayCheckBoolScalar(object)) { *(reinterpret_cast(data)) = (object == Py_True); } else { int64_t value = PyLong_AsLongLong(object); - CHECK_OR_THROW(value >= 0 && value <= 255) << "Out of range 0-255."; - 
*(reinterpret_cast(data)) = static_cast(value); + CHECK_OR_RETURN(value >= 0 && value <= 255) << "Out of range 0-255."; + *(reinterpret_cast(data)) = value; } - } else { - THROW(TypeError) << "Can't parse scalar with data type " << dtype; + return Maybe::Ok(); } + UNIMPLEMENTED_THEN_RETURN() << "Can't parse scalar with data type " << dtype; } -void RecursiveParseAndAssign(PyObject* object, char* data, const int& ndims, const int& dim, - const ShapeView& shape, const DimVector& strides, - const DataType& dtype) { +Maybe RecursiveParseAndAssign(PyObject* object, char* data, const int& ndims, const int& dim, + const ShapeView& shape, const DimVector& strides, + const DataType& dtype) { if (dim == ndims) { return ParseScalar(object, data, dtype); } auto seq = PyObjectPtr(PySequence_Fast(object, "Expected a sequence.")); int64_t size = PySequence_Fast_GET_SIZE(seq.get()); - CHECK_EQ_OR_THROW(size, shape.At(dim)) << "Sequence size is " << size << " at dimemsion " << dim - << ", but expected " << shape.At(dim); + CHECK_EQ_OR_RETURN(size, shape.At(dim)) << "Sequence size is " << size << " at dimemsion " << dim + << ", but expected " << shape.At(dim); for (int64_t i = 0; i < size; ++i) { PyObject* item = PySequence_Fast_GET_ITEM(seq.get(), i); - RecursiveParseAndAssign(item, data, ndims, dim + 1, shape, strides, dtype); + JUST(RecursiveParseAndAssign(item, data, ndims, dim + 1, shape, strides, dtype)); data += strides.at(dim) * GetSizeOfDataType(dtype); } + return Maybe::Ok(); } -void ParseArrayToBlob(PyObject* object, Blob* blob) { +Maybe ParseArrayToBlob(PyObject* object, Blob* blob) { const DataType dtype = blob->data_type(); const int ndims = blob->shape().NumAxes(); DimVector strides(ndims); @@ -135,28 +138,30 @@ void ParseArrayToBlob(PyObject* object, Blob* blob) { strides[i] = size; size *= blob->shape().At(i); } - RecursiveParseAndAssign(object, blob->mut_dptr(), ndims, 0, blob->shape(), strides, dtype); + JUST(RecursiveParseAndAssign(object, blob->mut_dptr(), ndims, 0, blob->shape(), strides, + dtype)); + return Maybe::Ok(); } -Shape InferArraySizes(PyObject* object) { +Maybe InferArraySizes(PyObject* object) { DimVector sizes; PyObject* seq = object; PyObjectPtr handle; while (PySequence_Check(seq)) { int64_t length = PySequence_Length(seq); - CHECK_GT_OR_THROW(length, 0) << "Index should not be empty."; + CHECK_GT_OR_RETURN(length, 0) << "Index should not be empty."; sizes.emplace_back(length); - CHECK_LE_OR_THROW(sizes.size(), /*MAX_DIMS=*/128) + CHECK_LE_OR_RETURN(sizes.size(), /*MAX_DIMS=*/128) << "Too many dimensions " << Py_TYPE(seq)->tp_name; if (length == 0) break; handle = PyObjectPtr(PySequence_GetItem(seq, 0)); seq = handle.get(); } - return Shape(sizes); + return std::make_shared(sizes); } Maybe ConvertToIndexingTensor(PyObject* object) { - const DataType dtype = InferScalarType(object); + const DataType dtype = JUST(InferScalarType(object)); const auto& device = JUST(Device::New("cpu")); // index type must be integers @@ -167,8 +172,8 @@ Maybe ConvertToIndexingTensor(PyObject* object) { // In advanced indexing condition, index can be array object, need to handle it specially. 
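ParseArrayToBlob and RecursiveParseAndAssign above flatten a nested Python sequence into a blob buffer: strides are computed in row-major order, and the write pointer advances by strides[dim] elements after each child. The standalone sketch below reproduces the same traversal with a plain struct standing in for the Python sequence; Node, RowMajorStrides, and Fill are illustrative names only.

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for a nested Python sequence: a leaf carries a scalar, an inner node carries items.
struct Node {
  int64_t scalar;
  std::vector<Node> items;
};

// Row-major strides: strides[i] is the product of the dimensions after i (in elements).
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> strides(shape.size());
  int64_t size = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    strides[i] = size;
    size *= shape[i];
  }
  return strides;
}

// Walk the nested sequence and write each scalar into the flat buffer,
// advancing the write pointer by strides[dim] per element, as the code above does.
void Fill(const Node& node, int64_t* data, int dim, const std::vector<int64_t>& shape,
          const std::vector<int64_t>& strides) {
  if (dim == static_cast<int>(shape.size())) {
    *data = node.scalar;
    return;
  }
  for (int64_t i = 0; i < shape[dim]; ++i) {
    Fill(node.items[i], data, dim + 1, shape, strides);
    data += strides[dim];
  }
}

int main() {
  // Nested sequence [[1, 2, 3], [4, 5, 6]] -> shape {2, 3}.
  Node root{0, {Node{0, {Node{1, {}}, Node{2, {}}, Node{3, {}}}},
                Node{0, {Node{4, {}}, Node{5, {}}, Node{6, {}}}}}};
  std::vector<int64_t> shape{2, 3};
  std::vector<int64_t> strides = RowMajorStrides(shape);
  std::vector<int64_t> buf(6, 0);
  Fill(root, buf.data(), 0, shape, strides);
  for (int64_t v : buf) { std::cout << v << " "; }  // 1 2 3 4 5 6
  std::cout << "\n";
}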
if (PyArray_Check(object)) { return TensorWithData(object, NullOpt, device, false); } - const auto& sizes = InferArraySizes(object); - const auto& tensor = JUST(functional::Empty(sizes, CHECK_JUST(DType::Get(dtype)), device)); + const auto& sizes = JUST(InferArraySizes(object)); + const auto& tensor = JUST(functional::Empty(*sizes, CHECK_JUST(DType::Get(dtype)), device)); // Prevent the python object release until the callback is complete. Py_INCREF(object); auto handle = std::shared_ptr(PyObjectPtr(object)); @@ -179,7 +184,7 @@ Maybe ConvertToIndexingTensor(PyObject* object) { [handle](uint64_t ofblob_ptr) { auto* of_blob = reinterpret_cast(ofblob_ptr); CHECK_JUST(Global::Get()->WithScopedAcquire([&]() -> Maybe { - ParseArrayToBlob(handle.get(), of_blob->mut_blob()); + JUST(ParseArrayToBlob(handle.get(), of_blob->mut_blob())); return Maybe::Ok(); })); }, @@ -188,28 +193,28 @@ Maybe ConvertToIndexingTensor(PyObject* object) { return tensor; } -IndexItem UnpackIndexItem(PyObject* object) { +Maybe UnpackIndexItem(PyObject* object) { if (object == Py_Ellipsis) { - return IndexItem(EllipsisIndex{}); + return std::make_shared(EllipsisIndex{}); } else if (PySlice_Check(object)) { Py_ssize_t start, end, step; - PySliceUnpack(object, &start, &end, &step); - return IndexItem(start, end, step); + JUST(PySliceUnpack(object, &start, &end, &step)); + return std::make_shared(start, end, step); } else if (PyLong_Check(object) && object != Py_False && object != Py_True) { - return IndexItem(static_cast(PyLong_AsLongLong(object))); + return std::make_shared(static_cast(PyLong_AsLongLong(object))); } else if (numpy::PyArrayCheckLongScalar(object)) { - return IndexItem(static_cast(PyLong_AsLongLong(object))); + return std::make_shared(static_cast(PyLong_AsLongLong(object))); } else if (object == Py_False || object == Py_True) { - return IndexItem(object == Py_True); + return std::make_shared(object == Py_True); } else if (object == Py_None) { - return IndexItem(NoneIndex{}); - } else if (PyTensor_Check(object)) { - return IndexItem(PyTensor_Unpack(object)); + return std::make_shared(NoneIndex{}); + } else if (PyTensorCheck(object)) { + auto obj = py::reinterpret_borrow(object); + return std::make_shared(py::cast>(obj)); } else if (PySequence_Check(object)) { - return IndexItem(ConvertToIndexingTensor(object).GetPtrOrThrow()); + return std::make_shared(JUST(ConvertToIndexingTensor(object))); } - THROW(TypeError) << "Invalid index " << Py_TYPE(object)->tp_name; - return IndexItem(); + UNIMPLEMENTED_THEN_RETURN() << "Invalid index of " << Py_TYPE(object)->tp_name; } } // namespace detail diff --git a/oneflow/api/python/functional/indexing.h b/oneflow/api/python/functional/indexing.h index 4e157ce5e94..550accbabbb 100644 --- a/oneflow/api/python/functional/indexing.h +++ b/oneflow/api/python/functional/indexing.h @@ -29,11 +29,11 @@ namespace functional { namespace detail { -void PySliceUnpack(PyObject* object, Py_ssize_t* start, Py_ssize_t* stop, Py_ssize_t* step); +Maybe PySliceUnpack(PyObject* object, Py_ssize_t* start, Py_ssize_t* stop, Py_ssize_t* step); Maybe ConvertToIndexingTensor(PyObject* object); -IndexItem UnpackIndexItem(PyObject* object); +Maybe UnpackIndexItem(PyObject* object); } // namespace detail diff --git a/oneflow/api/python/functional/py_function.cpp b/oneflow/api/python/functional/py_function.cpp new file mode 100644 index 00000000000..4c64448d930 --- /dev/null +++ b/oneflow/api/python/functional/py_function.cpp @@ -0,0 +1,124 @@ +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/api/python/functional/py_function.h" +#include "oneflow/api/python/functional/common.h" + +namespace oneflow { +namespace one { +namespace functional { + +void ReportKwargsError(const py::kwargs& kwargs, const FunctionDef& function, size_t max_pos_args) { + for (auto it = kwargs.begin(); it != kwargs.end(); ++it) { + if (!PyStringCheck(it->first.ptr())) { + THROW(TypeError) << function.name << "(): keywords must be strings."; + } + bool unexpected_param = true; + const std::string key = PyStringAsString(it->first.ptr()).GetOrThrow(); + for (const auto& arg : function.argument_def) { + if (arg.name == key) { + unexpected_param = false; + break; + } + } + if (unexpected_param) { + THROW(TypeError) << function.name // NOLINT + << "(): got an unexpected keyword argument '" << key << "'"; + } else { + THROW(TypeError) << function.name // NOLINT + << "(): got multiple values for argument '" << key << "'"; + } + } + THROW(TypeError) << function.name << "(): kwargs unknown error."; +} + +// The argument parsing refers to the implementation of Pytorch. +bool ParseArgs(const py::args& args, const py::kwargs& kwargs, std::vector* parsed_args, + const FunctionDef& function, size_t max_pos_args, bool raise_exception) { + bool treat_args_as_list = false; + size_t nargs = args.size(); + size_t remaining_kwargs = kwargs.size(); + + if (max_pos_args == 1) { + const auto& type = function.argument_def.at(0).type; + treat_args_as_list = IsIntegralListType(type) || type == kSHAPE || type == kTENSOR_TUPLE; + } + if (nargs > max_pos_args && !treat_args_as_list) { + if (raise_exception) { + THROW(TypeError) << function.name << "(): takes " << max_pos_args + << " positional arguments but " << nargs << " were given."; + } + return false; + } + int arg_pos = 0; + for (int i = 0; i < function.argument_def.size(); ++i) { + const auto& param = function.argument_def.at(i); + py::object obj; + if (arg_pos < nargs) { + if (param.keyword_only) { + if (raise_exception) { + THROW(TypeError) << function.name << "(): argument '" << param.name + << "' is keyword only."; + } + return false; + } + obj = args[arg_pos]; + } else { + if (kwargs.contains(param.name.c_str())) { + obj = kwargs[param.name.c_str()]; + remaining_kwargs--; + } + } + + if (obj) { + if (arg_pos == 0 && treat_args_as_list && !param.keyword_only + && (PyLong_Check(obj.ptr()) || PyTensorCheck(obj.ptr()))) { + obj = args; + arg_pos = nargs; + } else { + arg_pos++; + } + PythonArg arg(obj, param.size); + if ((obj == Py_None && param.optional) || PythonArgCheck(arg, param.type)) { + parsed_args->at(i) = std::move(arg); + } else { + if (raise_exception) { + THROW(TypeError) + << function.name << "(): argument '" << param.name << "' must be " + << ValueTypeName(param.type).GetOrThrow() << ", not " + << PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(obj.ptr()))).GetOrThrow(); + } + return false; + } + } else { + if (!param.has_default_value) { + if (raise_exception) { + THROW(TypeError) << 
function.name << "(): missing required argument " << param.name; + } + return false; + } + parsed_args->at(i) = PythonArg(param.default_value); + } + } + if (remaining_kwargs > 0) { + if (raise_exception) { ReportKwargsError(kwargs, function, max_pos_args); } + return false; + } + return true; +} + +} // namespace functional +} // namespace one +} // namespace oneflow diff --git a/oneflow/api/python/functional/py_function.h b/oneflow/api/python/functional/py_function.h index e69de29bb2d..cc23b7ea4d2 100644 --- a/oneflow/api/python/functional/py_function.h +++ b/oneflow/api/python/functional/py_function.h @@ -0,0 +1,156 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_API_PYTHON_FUNCTIONAL_PY_FUNCTION_H_ +#define ONEFLOW_API_PYTHON_FUNCTIONAL_PY_FUNCTION_H_ + +#include +#include +#include + +#include "oneflow/api/python/functional/common.h" +#include "oneflow/api/python/functional/function_def.h" +#include "oneflow/api/python/functional/python_arg.h" +#include "oneflow/api/python/functional/unpack_call.h" +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/throw.h" +#include "oneflow/core/common/util.h" +#include "oneflow/core/job/lazy_mode.h" +#include "oneflow/core/framework/op_interpreter/dispatch_frame.h" +#include "oneflow/api/python/env/env.h" + +namespace py = pybind11; + +namespace oneflow { +namespace one { +namespace functional { + +bool ParseArgs(const py::args& args, const py::kwargs& kwargs, // NOLINT + std::vector* parsed_args, const FunctionDef& function, + size_t max_pos_args, bool raise_exception); + +template +class PyFunctionDispatcher { + public: + static_assert(sizeof...(SchemaT) >= 1, "Requires 1 template argument at least."); + + template + using schema_t = typename std::tuple_element>::type; + + PyFunctionDispatcher() + : schema_size_(sizeof...(SchemaT)), func_name_(schema_t<0>::function_def.name) { + signatures_.resize(schema_size_); + InitSignatures(std::make_index_sequence{}); + } + + template + py::object call(const py::args& args, const py::kwargs& kwargs, + std::index_sequence) const { + using T = schema_t; + std::vector parsed_args(T::max_args); + if (ParseArgs(args, kwargs, &parsed_args, T::function_def, T::max_pos_args, + /*raise_exception*/ schema_size_ == 1)) { + return detail::unpack_call(*T::func, parsed_args); + } + return call(args, kwargs, std::index_sequence{}); + } + + py::object call(const py::args& args, const py::kwargs& kwargs, std::index_sequence<>) const { + std::ostringstream ss; + ss << func_name_ + << "(): received an invalid combination of arguments. 
The valid signatures are:"; + for (int i = 0; i < signatures_.size(); ++i) { + ss << "\n\t*" << i << ": " << signatures_.at(i); + } + THROW(TypeError) << ss.str(); + return py::none(); + } + + std::string func_name() { return func_name_; } + + private: + template + void InitSignatures(std::index_sequence) { + __attribute__((__unused__)) int dummy[] = { + ((void)(signatures_[I] = schema_t::signature), 0)...}; + } + + private: + size_t schema_size_; + const std::string func_name_; + std::vector signatures_; +}; + +namespace { +std::string get_cur_frame_stack_str(int32_t max_stack_depth) { + std::string cur_f_str; + PyFrameObject* cur_frame = PyEval_GetFrame(); + for (int32_t i = 0; i < max_stack_depth; i++) { + if (cur_frame == NULL) break; + const int32_t stack_index = (-1) * i - 1; + cur_f_str = "Python Stack[" + std::to_string(stack_index) + + "]: " + PyObjectToReprStr((PyObject*)cur_frame).GetOrThrow() + "; " + cur_f_str; + cur_frame = cur_frame->f_back; + } + return cur_f_str; +} + +int32_t get_cur_stack_depth() { + int32_t current_stack_depth = 0; + PyFrameObject* f = PyEval_GetFrame(); + while (f) { + current_stack_depth++; + f = f->f_back; + } + return current_stack_depth; +} + +std::string get_cur_frame_stack_str() { + const bool debug_mode = GetGraphDebugMode(); + const int32_t max_stack_depth = GetGraphDebugMaxPyStackDepth(); + if (debug_mode) { // show more info for the stack trace in debug mode + int32_t current_stack_depth = get_cur_stack_depth(); + std::string cur_f_str = get_cur_frame_stack_str(max_stack_depth); + if (current_stack_depth > max_stack_depth) { // show how many stack depth remaining to be shown + int32_t remaining_stack_depth = current_stack_depth - max_stack_depth; + cur_f_str += " ... " + std::to_string(remaining_stack_depth) + " more; "; + } + return cur_f_str; + } + + return get_cur_frame_stack_str(max_stack_depth); +} +} // namespace + +template +inline py::object PyFunction(const py::args& args, const py::kwargs& kwargs) { + static PyFunctionDispatcher dispatcher; + + if (OF_PREDICT_FALSE(LazyMode::is_enabled())) { + std::string cur_f_str = + get_cur_frame_stack_str() + "C API: "; + // User DispathFram to pass frame info to OpExpr or Interpreter. + DispatchFrame::Guard f_guard(cur_f_str); + return dispatcher.call(args, kwargs, std::make_index_sequence{}); + } else { + return dispatcher.call(args, kwargs, std::make_index_sequence{}); + } +} + +} // namespace functional +} // namespace one +} // namespace oneflow + +#endif // ONEFLOW_API_PYTHON_FUNCTIONAL_PY_FUNCTION_H_ diff --git a/oneflow/api/python/functional/python_arg.cpp b/oneflow/api/python/functional/python_arg.cpp index 03824d5e492..ac48f1513d4 100644 --- a/oneflow/api/python/functional/python_arg.cpp +++ b/oneflow/api/python/functional/python_arg.cpp @@ -13,9 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "oneflow/api/python/functional/python_arg.h" -#include "oneflow/api/python/framework/tensor.h" +#include "oneflow/api/python/functional/python_arg.h" #include "oneflow/api/python/functional/common.h" #include "oneflow/api/python/functional/indexing.h" #include "oneflow/extension/python/numpy.h" @@ -34,21 +33,17 @@ namespace oneflow { namespace one { namespace functional { -#define INSTANCE_OBJECT_AS_INTEGER(T) \ - template<> \ - T PythonArg::ObjectAs() const { \ - return static_cast(PyLong_AsLongLong(object_)); \ - } \ - template<> \ - std::vector PythonArg::ObjectAs>() const { \ - if (size_ > 0 && PyLong_Check(object_)) { \ - return std::vector(size_, static_cast(PyLong_AsLongLong(object_))); \ - } \ - return PyUnpackLongSequence(object_); \ - } \ - template<> \ - std::shared_ptr> PythonArg::ObjectAs>>() const { \ - return std::make_shared>(ObjectAs>()); \ +#define INSTANCE_OBJECT_AS_INTEGER(T) \ + template<> \ + Maybe PythonArg::ObjectAs() const { \ + return static_cast(PyLong_AsLongLong(object_)); \ + } \ + template<> \ + Maybe> PythonArg::ObjectAs>() const { \ + if (size_ > 0 && PyLong_Check(object_)) { \ + return std::make_shared>(size_, static_cast(PyLong_AsLongLong(object_))); \ + } \ + return PyUnpackLongSequence(object_); \ } OF_PP_FOR_EACH_TUPLE(INSTANCE_OBJECT_AS_INTEGER, INTEGER_TYPE_SEQ) @@ -56,142 +51,149 @@ OF_PP_FOR_EACH_TUPLE(INSTANCE_OBJECT_AS_INTEGER, INTEGER_TYPE_SEQ) #define INSTANCE_OBJECT_AS_FLOAT(T) \ template<> \ - T PythonArg::ObjectAs() const { \ + Maybe PythonArg::ObjectAs() const { \ return static_cast(PyFloat_AsDouble(object_)); \ } \ template<> \ - std::vector PythonArg::ObjectAs>() const { \ + Maybe> PythonArg::ObjectAs>() const { \ if (size_ > 0 && PyFloat_Check(object_)) { \ - return std::vector(size_, static_cast(PyFloat_AsDouble(object_))); \ + return std::make_shared>(size_, static_cast(PyFloat_AsDouble(object_))); \ } \ return PyUnpackFloatSequence(object_); \ - } \ - template<> \ - std::shared_ptr> PythonArg::ObjectAs>>() const { \ - return std::make_shared>(ObjectAs>()); \ } OF_PP_FOR_EACH_TUPLE(INSTANCE_OBJECT_AS_FLOAT, FLOATING_TYPE_SEQ) #undef INSTANCE_OBJECT_AS_FLOAT -#define INSTANCE_OBJECT_AS_SHARED_PTR(T) \ - template<> \ - std::shared_ptr PythonArg::ObjectAs>() const { \ - return std::make_shared(ObjectAs()); \ - } - template<> -std::string PythonArg::ObjectAs() const { - return PyStringAsString(object_); +Maybe PythonArg::ObjectAs() const { + return JUST(PyStringAsString(object_)); } -INSTANCE_OBJECT_AS_SHARED_PTR(std::string) template<> -Scalar PythonArg::ObjectAs() const { +Maybe PythonArg::ObjectAs() const { return PyUnpackScalar(object_); } -INSTANCE_OBJECT_AS_SHARED_PTR(Scalar) template<> -std::shared_ptr PythonArg::ObjectAs>() const { - return PyTensor_Unpack(object_); +Maybe> PythonArg::ObjectAs>() const { + return JUST(PyUnpackTensor(object_)); } template<> -one::TensorTuple PythonArg::ObjectAs() const { - if (PyTensorTupleCheck(object_)) { return *PyUnpackTensorTuple(object_); } - const auto& v = PyUnpackTensorSequence(object_); - one::TensorTuple values(v.size()); - for (int i = 0; i < v.size(); ++i) { values[i] = v.at(i); } +Maybe PythonArg::ObjectAs() const { + return PyUnpackTensor(object_); +} + +template<> +Maybe> PythonArg::ObjectAs>() + const { + if (PyTensorTupleCheck(object_)) { return JUST(PyUnpackTensorTuple(object_)); } + const auto& v = JUST(PyUnpackTensorSequence(object_)); + auto values = std::make_shared(v->size()); + for (int i = 0; i < v->size(); ++i) { values->at(i) = v->at(i); } return values; } 
-INSTANCE_OBJECT_AS_SHARED_PTR(one::TensorTuple) template<> -Symbol PythonArg::ObjectAs>() const { +Maybe PythonArg::ObjectAs() const { + return *JUST(ObjectAs>()); +} + +template<> +Maybe> PythonArg::ObjectAs>() const { return PyUnpackDType(object_); } template<> -std::vector> PythonArg::ObjectAs>>() const { +Maybe>> PythonArg::ObjectAs>>() const { return PyUnpackDTypeSequence(object_); } -INSTANCE_OBJECT_AS_SHARED_PTR(std::vector>) template<> -Shape PythonArg::ObjectAs() const { - const auto& shape = PyUnpackLongSequence(object_); - return Shape(DimVector(shape.begin(), shape.end())); +Maybe PythonArg::ObjectAs() const { + const auto& shape = JUST(PyUnpackLongSequence(object_)); + return std::make_shared(DimVector(shape->begin(), shape->end())); } -INSTANCE_OBJECT_AS_SHARED_PTR(Shape) template<> -std::vector PythonArg::ObjectAs>() const { +Maybe> PythonArg::ObjectAs>() const { return PyUnpackShapeSequence(object_); } -INSTANCE_OBJECT_AS_SHARED_PTR(std::vector) template<> -std::shared_ptr PythonArg::ObjectAs>() const { +Maybe> PythonArg::ObjectAs>() + const { + return JUST(PyUnpackGenerator(object_)); +} + +template<> +Maybe PythonArg::ObjectAs() const { return PyUnpackGenerator(object_); } template<> -Symbol PythonArg::ObjectAs>() const { +Maybe> PythonArg::ObjectAs>() const { if (PyStringCheck(object_)) { - std::string device_str = PyStringAsString(object_); - return Device::ParseAndNew(device_str).GetOrThrow(); + std::string device_str = *JUST(PyStringAsString(object_)); + return Device::ParseAndNew(device_str); } return PyUnpackDevice(object_); } template<> -Symbol PythonArg::ObjectAs>() const { +Maybe> PythonArg::ObjectAs>() const { return PyUnpackParallelDesc(object_); } template<> -Symbol PythonArg::ObjectAs>() const { +Maybe> PythonArg::ObjectAs>() const { return PyUnpackSbpParallel(object_); } template<> -std::vector> PythonArg::ObjectAs>>() const { +Maybe>> PythonArg::ObjectAs>>() + const { if (PySbpParallelCheck(object_)) { - return std::vector>(1, PyUnpackSbpParallel(object_)); + return std::make_shared>>(1, + JUST(PyUnpackSbpParallel(object_))); } return PyUnpackSbpParallelSequence(object_); } -INSTANCE_OBJECT_AS_SHARED_PTR(std::vector>) template<> -TensorIndex PythonArg::ObjectAs() const { +Maybe PythonArg::ObjectAs() const { return PyUnpackTensorIndex(object_); } -INSTANCE_OBJECT_AS_SHARED_PTR(TensorIndex) template<> -std::shared_ptr PythonArg::ObjectAs>() const { +Maybe> PythonArg::ObjectAs>() const { + return JUST(PyUnpackOpExpr(object_)); +} + +template<> +Maybe PythonArg::ObjectAs() const { return PyUnpackOpExpr(object_); } template<> -PyObject* PythonArg::ObjectAs() const { +Maybe PythonArg::ObjectAs() const { return object_; } template<> -std::vector PythonArg::ObjectAs>() const { - return PyUnpackSequence( - object_, [](PyObject* item) -> std::string { return PyStringAsString(item); }); +Maybe PythonArg::ObjectAs() const { + return object_; } -INSTANCE_OBJECT_AS_SHARED_PTR(std::vector) - -#undef INSTANCE_OBJECT_AS_SHARED_PTR +template<> +Maybe> PythonArg::ObjectAs>() const { + return PyUnpackSequence( + object_, [](PyObject* item) -> Maybe { return JUST(PyStringAsString(item)); }); +} -bool PythonArg::TypeCheck(ValueType type) const { - if (tag_ == HAS_DEFAULT) { return default_val_->value_type() == type; } +Maybe PythonArg::TypeCheck(ValueType type) const { + if (active_tag_ == HAS_IMMEDIATE) { return immediate_->value_type() == type; } switch (type) { case kINT32: case kUINT32: @@ -217,7 +219,7 @@ bool PythonArg::TypeCheck(ValueType type) const { return 
PyScalarCheck(object_) || numpy::PyArrayCheckLongScalar(object_) || numpy::PyArrayCheckFloatScalar(object_); case kTENSOR: - case kTENSOR_REF: return PyTensor_Check(object_); + case kTENSOR_REF: return PyTensorCheck(object_); case kTENSOR_TUPLE: return PyTensorTupleCheck(object_) || PyTensorSequenceCheck(object_); case kDTYPE: return PyDTypeCheck(object_); case kSHAPE: return PyLongSequenceCheck(object_); @@ -234,13 +236,15 @@ bool PythonArg::TypeCheck(ValueType type) const { case kDTYPE_LIST: return PyDTypeSequenceCheck(object_); case kSHAPE_LIST: return PyShapeSequenceCheck(object_); default: { - THROW(RuntimeError) << "Can not check type " << ValueTypeName(type); + OF_UNIMPLEMENTED() << "Can not check type " << JUST(ValueTypeName(type)); } } return false; } -bool PythonArgCheck(const PythonArg& arg, ValueType type) { return arg.TypeCheck(type); } +bool PythonArgCheck(const PythonArg& arg, ValueType type) { + return arg.TypeCheck(type).GetOrThrow(); +} } // namespace functional } // namespace one diff --git a/oneflow/api/python/functional/python_arg.h b/oneflow/api/python/functional/python_arg.h index 1c2dd490ccb..faab037a8bf 100644 --- a/oneflow/api/python/functional/python_arg.h +++ b/oneflow/api/python/functional/python_arg.h @@ -17,7 +17,6 @@ limitations under the License. #define ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_ARG_H_ #include -#include #include "oneflow/core/common/throw.h" #include "oneflow/api/python/functional/value_types.h" @@ -31,87 +30,103 @@ namespace functional { namespace detail { -struct DefaultVal { +struct Immediate { virtual ValueType value_type() const = 0; virtual const void* Ptr() const = 0; }; template -struct TypedDefaultVal final : public DefaultVal { +struct TypedImmediate final : public Immediate { T content; - explicit TypedDefaultVal(const T& v) : content(v) {} + explicit TypedImmediate(const T& v) : content(v) {} ValueType value_type() const override { return ValueTypeOf(); } const void* Ptr() const override { return &content; } }; -template -struct optional_traits { - using type = void; -}; - -template -struct optional_traits> { - using type = - decltype(std::declval>().Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()); -}; - } // namespace detail class PythonArg { public: PythonArg() = default; - PythonArg(const py::object& object, int size = 0) : PythonArg(object.ptr(), size) {} - PythonArg(PyObject* object, int size = 0) - : object_(object), default_val_(), size_(size), tag_(HAS_OBJECT) {} + PythonArg(const py::object& object) + : object_(object.ptr()), immediate_(), size_(0), active_tag_(HAS_OBJECT) {} - PythonArg(const std::shared_ptr& value, int size = 0) - : object_(nullptr), default_val_(value), size_(size), tag_(HAS_DEFAULT) {} + PythonArg(const py::object& object, int size) + : object_(object.ptr()), immediate_(), size_(size), active_tag_(HAS_OBJECT) {} + + PythonArg(const std::shared_ptr& value) + : object_(nullptr), immediate_(value), size_(0), active_tag_(HAS_IMMEDIATE) {} template::value, int>::type = 0> - PythonArg(const T& value, int size = 0) + PythonArg(const T& value) : object_(nullptr), - default_val_(std::make_shared>(value)), - size_(size), - tag_(HAS_DEFAULT) {} + immediate_(std::make_shared>(value)), + size_(0), + active_tag_(HAS_IMMEDIATE) {} virtual ~PythonArg() = default; - template::value, int>::type = 0> - T As() const { - if (tag_ == HAS_DEFAULT) { - CHECK_EQ_OR_THROW(ValueTypeOf(), default_val_->value_type()) - << "Could not convert default value from type " << default_val_->value_type() - << " to type " << 
ValueTypeOf(); - return *reinterpret_cast(default_val_->Ptr()); - } - CHECK_EQ_OR_THROW(tag_, HAS_OBJECT); - return ObjectAs>(); + PythonArg(const PythonArg& other) + : object_(other.object_), + immediate_(other.immediate_), + size_(other.size_), + active_tag_(other.active_tag_) {} + PythonArg(PythonArg&& other) + : object_(other.object_), + immediate_(std::move(other.immediate_)), + size_(other.size_), + active_tag_(other.active_tag_) {} + + PythonArg& operator=(const PythonArg& other) { + object_ = other.object_; + immediate_ = other.immediate_; + size_ = other.size_; + active_tag_ = other.active_tag_; + return *this; } + PythonArg& operator=(PythonArg&& other) { + object_ = other.object_; + immediate_ = std::move(other.immediate_); + size_ = other.size_; + active_tag_ = other.active_tag_; + return *this; + } + + template + struct ObjectAsHelper { + Maybe operator()(const PythonArg* self) { return self->ObjectAs(); } + }; + template + struct ObjectAsHelper> { + Maybe> operator()(const PythonArg* self) { + if (self->object_ == Py_None) { return std::make_shared>(); } + return std::make_shared>(JUST(self->ObjectAs())); + } + }; - template::value, int>::type = 0> + template T As() const { - if (tag_ == HAS_DEFAULT) { - CHECK_EQ_OR_THROW(ValueTypeOf(), default_val_->value_type()) - << "Could not convert default value from type " << default_val_->value_type() + if (active_tag_ == HAS_IMMEDIATE) { + CHECK_EQ_OR_THROW(ValueTypeOf(), immediate_->value_type()) + << "Could not convert immediate value from type " << immediate_->value_type() << " to type " << ValueTypeOf(); - return *reinterpret_cast(default_val_->Ptr()); + return *reinterpret_cast(immediate_->Ptr()); } - CHECK_EQ_OR_THROW(tag_, HAS_OBJECT); - if (object_ == Py_None) { return T(); } - return ObjectAs::type>(); + CHECK_EQ_OR_THROW(active_tag_, HAS_OBJECT); + return ObjectAsHelper>()(this).GetOrThrow(); } - bool TypeCheck(ValueType type) const; + Maybe TypeCheck(ValueType type) const; private: template - T ObjectAs() const; + Maybe ObjectAs() const; PyObject* object_; - std::shared_ptr default_val_; + std::shared_ptr immediate_; size_t size_; - enum { HAS_OBJECT, HAS_DEFAULT, HAS_NONE } tag_; + enum { HAS_OBJECT, HAS_IMMEDIATE, HAS_NONE } active_tag_; }; bool PythonArgCheck(const PythonArg& arg, ValueType type); diff --git a/oneflow/api/python/functional/python_arg_parser.cpp b/oneflow/api/python/functional/python_arg_parser.cpp deleted file mode 100644 index 736c683d058..00000000000 --- a/oneflow/api/python/functional/python_arg_parser.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
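In the restored PythonArg::As<T>() path above, ObjectAsHelper<Optional<T>> maps a Python None argument to a disengaged optional and wraps any other converted value. A tiny sketch of that convention follows, using stand-in Optional and PyArg types rather than OneFlow's real classes.

#include <iostream>
#include <utility>

// Minimal stand-in for an Optional<T>.
template<typename T>
class Optional {
 public:
  Optional() : has_value_(false), value_() {}
  explicit Optional(T v) : has_value_(true), value_(std::move(v)) {}
  bool has_value() const { return has_value_; }
  const T& value() const { return value_; }

 private:
  bool has_value_;
  T value_;
};

// Stand-in for "the argument was Python's None" vs. a concrete value.
struct PyArg {
  bool is_none;
  long raw;
};

Optional<long> AsOptionalLong(const PyArg& arg) {
  if (arg.is_none) { return Optional<long>(); }  // None -> empty optional
  return Optional<long>(arg.raw);                // value -> engaged optional
}

int main() {
  PyArg none{true, 0};
  PyArg five{false, 5};
  std::cout << AsOptionalLong(none).has_value() << " "  // 0
            << AsOptionalLong(five).value() << "\n";    // 5
}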
-*/ -#include "oneflow/api/python/functional/python_arg_parser.h" -#include "oneflow/api/python/functional/common.h" -#include "oneflow/api/python/functional/python_arg.h" - -namespace oneflow { -namespace one { -namespace functional { - -void FunctionSchema::ReportKwargsError(PyObject* kwargs) const { - PyObject *key = nullptr, *value = nullptr; - Py_ssize_t pos = 0; - - while (PyDict_Next(kwargs, &pos, &key, &value)) { - if (!PyStringCheck(key)) { THROW(TypeError) << def_->name << "(): keywords must be strings"; } - bool unexpected_param = true; - const std::string string_key = PyStringAsString(key); - for (const auto& arg : def_->argument_def) { - if (arg.name == string_key) { - unexpected_param = false; - break; - } - } - if (unexpected_param) { - THROW(TypeError) << def_->name // NOLINT - << "(): got an unexpected keyword argument '" << string_key << "'"; - } else { - THROW(TypeError) << def_->name // NOLINT - << "(): got multiple values for argument '" << string_key << "'"; - } - } - THROW(TypeError) << def_->name << "(): kwargs unknown error"; -} - -// The argument parsing refers to the implementation of Pytorch. -bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_args, - bool raise_exception) const { - bool treat_args_as_list = false; - size_t nargs = args ? PyTuple_Size(args) : 0; - size_t remaining_kwargs = kwargs ? PyDict_Size(kwargs) : 0; - - if (max_pos_nargs_ == 1) { - const auto& type = def_->argument_def.at(0).type; - treat_args_as_list = IsIntegralListType(type) || type == kSHAPE || type == kTENSOR_TUPLE; - } - if (nargs > max_pos_nargs_ && !treat_args_as_list) { - if (raise_exception) { - THROW(TypeError) << def_->name << "(): takes " << max_pos_nargs_ - << " positional arguments but " << nargs << " were given"; - } - return false; - } - int arg_pos = 0; - for (int i = 0; i < def_->argument_def.size(); ++i) { - const auto& param = def_->argument_def.at(i); - PyObject* obj = NULL; - if (args && arg_pos < nargs) { - if (param.keyword_only) { - if (raise_exception) { - THROW(TypeError) << def_->name << "(): argument '" << param.name << "' is keyword only"; - } - return false; - } - obj = PyTuple_GetItem(args, arg_pos); - } else if (kwargs) { - obj = PyDict_GetItemString(kwargs, param.name.c_str()); - if (obj) { remaining_kwargs--; } - } - - if (obj) { - if (arg_pos == 0 && treat_args_as_list && !param.keyword_only - && (PyLong_Check(obj) || PyTensor_Check(obj))) { - obj = args; - arg_pos = nargs; - } else { - arg_pos++; - } - PythonArg arg(obj, param.size); - if ((obj == Py_None && param.optional) || PythonArgCheck(arg, param.type)) { - parsed_args[i] = arg; - } else { - if (raise_exception) { - THROW(TypeError) << def_->name << "(): argument '" << param.name << "' must be " - << ValueTypeName(param.type) << ", not " - << PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(obj))); - } - return false; - } - } else { - if (!param.has_default_value) { - if (raise_exception) { - THROW(TypeError) << def_->name << "(): missing required argument " << param.name; - } - return false; - } - parsed_args[i] = param.default_value; - } - } - if (remaining_kwargs > 0) { - if (raise_exception) { ReportKwargsError(kwargs); } - return false; - } - return true; -} - -} // namespace functional -} // namespace one -} // namespace oneflow diff --git a/oneflow/api/python/functional/python_arg_parser.h b/oneflow/api/python/functional/python_arg_parser.h deleted file mode 100644 index 71845c6f562..00000000000 --- a/oneflow/api/python/functional/python_arg_parser.h +++ 
/dev/null @@ -1,108 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_ARG_PARSER_H_ -#define ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_ARG_PARSER_H_ - -#include - -#include "oneflow/api/python/functional/function_def.h" -#include "oneflow/api/python/functional/python_arg.h" -#include "oneflow/core/common/throw.h" -#include "oneflow/core/common/util.h" - -namespace oneflow { -namespace one { -namespace functional { - -template -class ParsedArgs { - public: - ParsedArgs() = default; - - const PythonArg& operator[](size_t idx) const { return data[idx]; } - PythonArg& operator[](size_t idx) { return data[idx]; } - - public: - PythonArg data[N]; -}; - -class FunctionSchema { - public: - FunctionSchema() = default; - FunctionSchema(const std::string& signature, const FunctionDef* def, size_t max_pos_nargs) - : signature_(signature), def_(def), max_pos_nargs_(max_pos_nargs) {} - - const std::string& signature() const { return signature_; } - - bool Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_args, bool raise_exception) const; - - private: - void ReportKwargsError(PyObject* kwargs) const; - - std::string signature_; - const FunctionDef* def_; - size_t max_pos_nargs_; -}; - -template -class PythonArgParser { - public: - static_assert(sizeof...(SchemaT) >= 1, "requires 1 template argument at least."); - static constexpr size_t kSchemaSize = sizeof...(SchemaT); - static constexpr size_t N = std::max({SchemaT::max_args...}); - - template - using schema_t = typename std::tuple_element>::type; - - PythonArgParser(const std::string& name) : name_(name) { - Init(std::make_index_sequence{}); - } - - int Parse(PyObject* args, PyObject* kwargs, ParsedArgs* parsed_args) const { - bool raise_exception = (kSchemaSize == 1); - for (int i = 0; i < kSchemaSize; ++i) { - if (schema_[i].Parse(args, kwargs, parsed_args->data, raise_exception)) { return i; } - } - ReportInvalidArgsError(args, kwargs); - return -1; - } - - private: - template - void Init(std::index_sequence) { - __attribute__((__unused__)) int dummy[] = { - ((void)(schema_[I] = FunctionSchema(schema_t::signature, &schema_t::function_def, - schema_t::max_pos_args)), - 0)...}; - } - - void ReportInvalidArgsError(PyObject* args, PyObject* kwargs) const { - std::ostringstream ss; - ss << name_ << "(): received an invalid combination of arguments. 
The valid signatures are:"; - for (int i = 0; i < kSchemaSize; ++i) { ss << "\n\t*" << i << ": " << schema_[i].signature(); } - THROW(TypeError) << ss.str(); - } - - private: - std::string name_; - FunctionSchema schema_[kSchemaSize]; -}; - -} // namespace functional -} // namespace one -} // namespace oneflow - -#endif // ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_ARG_PARSER_H_ diff --git a/oneflow/api/python/functional/python_frame.h b/oneflow/api/python/functional/python_frame.h deleted file mode 100644 index c6db38dac15..00000000000 --- a/oneflow/api/python/functional/python_frame.h +++ /dev/null @@ -1,90 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_FRAME_H_ -#define ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_FRAME_H_ - -#include - -#include "oneflow/api/python/functional/common.h" -#include "oneflow/core/framework/op_interpreter/dispatch_frame.h" -#include "oneflow/core/job/graph_scope_vars.h" - -namespace oneflow { -namespace one { -namespace functional { - -namespace { -std::string get_cur_frame_stack_str(int32_t max_stack_depth) { - std::string cur_f_str; - PyFrameObject* cur_frame = PyEval_GetFrame(); - for (int32_t i = 0; i < max_stack_depth; i++) { - if (cur_frame == NULL) break; - const int32_t stack_index = (-1) * i - 1; - cur_f_str = "Python Stack[" + std::to_string(stack_index) - + "]: " + PyObjectToReprStr((PyObject*)cur_frame) + "; " + cur_f_str; - cur_frame = cur_frame->f_back; - } - return cur_f_str; -} - -int32_t get_cur_stack_depth() { - int32_t current_stack_depth = 0; - PyFrameObject* f = PyEval_GetFrame(); - while (f) { - current_stack_depth++; - f = f->f_back; - } - return current_stack_depth; -} - -std::string get_cur_frame_stack_str() { - const bool debug_mode = GetGraphDebugMode(); - const int32_t max_stack_depth = GetGraphDebugMaxPyStackDepth(); - if (debug_mode) { // show more info for the stack trace in debug mode - int32_t current_stack_depth = get_cur_stack_depth(); - std::string cur_f_str = get_cur_frame_stack_str(max_stack_depth); - if (current_stack_depth > max_stack_depth) { // show how many stack depth remaining to be shown - int32_t remaining_stack_depth = current_stack_depth - max_stack_depth; - cur_f_str += " ... 
" + std::to_string(remaining_stack_depth) + " more; "; - } - return cur_f_str; - } - - return get_cur_frame_stack_str(max_stack_depth); -} -} // namespace - -class PythonFrameGuard { - public: - PythonFrameGuard() { - if (OF_PREDICT_FALSE(LazyMode::is_enabled())) { - prev_frame_str_ = DispatchFrame::get_str(); - DispatchFrame::set_str(get_cur_frame_stack_str()); - } - } - ~PythonFrameGuard() { - if (OF_PREDICT_FALSE(LazyMode::is_enabled())) { DispatchFrame::set_str(prev_frame_str_); } - } - - private: - std::string prev_frame_str_; -}; - -} // namespace functional -} // namespace one -} // namespace oneflow - -#endif // ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_FRAME_H_ diff --git a/oneflow/api/python/functional/tensor_api.cpp b/oneflow/api/python/functional/tensor_api.cpp index 90f8a80c279..465f1af01cc 100644 --- a/oneflow/api/python/functional/tensor_api.cpp +++ b/oneflow/api/python/functional/tensor_api.cpp @@ -50,7 +50,7 @@ class TensorWithDataFunctor { // its a eager tensor by Run functional::Empty() in LazyMode::Grad(false) LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); - if (PyTensor_Check(data)) { + if (PyTensorCheck(data)) { // Throw warnings like pytorch. auto ret = PyErr_WarnEx( PyExc_UserWarning, @@ -60,7 +60,7 @@ class TensorWithDataFunctor { 1); if (ret != 0) { return Error::RuntimeError(); } - const auto& other = PyTensor_Unpack(data); + const auto& other = JUST(PyUnpackTensor(data)); return MakeTensorFromOtherTensor(other, dtype, device, requires_grad); } else { // Make tensor from python sequence or numpy array. @@ -79,7 +79,7 @@ class ConsistentTensorWithDataFunctor { LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); JUST(CheckDeviceIdsIsValid(placement)); - if (PyTensor_Check(data)) { + if (PyTensorCheck(data)) { // Throw warnings like pytorch. auto ret = PyErr_WarnEx( PyExc_UserWarning, @@ -89,7 +89,7 @@ class ConsistentTensorWithDataFunctor { 1); if (ret != 0) { return Error::RuntimeError(); } - const auto& other = PyTensor_Unpack(data); + const auto& other = JUST(PyUnpackTensor(data)); return MakeTensorFromOtherTensor(other, dtype, placement, sbp_tuple, requires_grad); } // Make consistent tensor from python sequence or numpy array. @@ -138,8 +138,8 @@ class TensorWithDataCtorFunctor { LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); const auto& dtype = DType::Float(); - if (PyTensor_Check(data)) { - const auto& other = PyTensor_Unpack(data); + if (PyTensorCheck(data)) { + const auto& other = JUST(PyUnpackTensor(data)); return MakeTensorFromOtherTensor(other, dtype, device, /*requires_grad=*/false); } @@ -164,8 +164,8 @@ class ConsistentTensorWithDataCtorFunctor { LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); const auto& dtype = DType::Float(); - if (PyTensor_Check(data)) { - const auto& other = PyTensor_Unpack(data); + if (PyTensorCheck(data)) { + const auto& other = JUST(PyUnpackTensor(data)); return MakeTensorFromOtherTensor(other, dtype, placement, sbp_tuple, /*requires_grad=*/false); } diff --git a/oneflow/api/python/functional/unpack_call.h b/oneflow/api/python/functional/unpack_call.h new file mode 100644 index 00000000000..4cf05564ae0 --- /dev/null +++ b/oneflow/api/python/functional/unpack_call.h @@ -0,0 +1,84 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_API_PYTHON_FUNCTIONAL_UNPACK_CALL_H_ +#define ONEFLOW_API_PYTHON_FUNCTIONAL_UNPACK_CALL_H_ + +#include "oneflow/api/python/functional/python_arg.h" + +#include +#include +#include "oneflow/core/common/throw.h" +#include "oneflow/core/framework/tensor.h" +#include "oneflow/core/framework/tensor_tuple.h" +#include "oneflow/core/common/function_traits.h" + +namespace oneflow { +namespace one { +namespace functional { + +namespace detail { + +template +struct unpack_call_dispatcher { + template + static R apply(const F& f, const std::vector& args, std::index_sequence) { + return f(args[I] + .As::args_type>::type>>()...); + } +}; + +template +inline py::object CastToPyObject(T&& t) { + return py::cast(t); +} + +template<> +inline py::object CastToPyObject>(Maybe&& t) { + return py::cast(t.GetPtrOrThrow()); +} + +template<> +inline py::object CastToPyObject>(Maybe&& t) { + const auto& tensor_tuple = t.GetPtrOrThrow(); + py::tuple tup(tensor_tuple->size()); + for (int i = 0; i < tensor_tuple->size(); ++i) { tup[i] = py::cast(tensor_tuple->at(i)); } + return py::cast(tup); +} + +template<> +inline py::object CastToPyObject>(Maybe&& t) { + t.GetOrThrow(); + return py::none(); +} + +template +py::object unpack_call(const F& f, const std::vector& args) { + constexpr size_t nargs = function_traits::nargs; + CHECK_EQ_OR_THROW(nargs, args.size()) + << "Requires " << nargs << " arguments, but " << args.size() << " is given."; + using R = typename function_traits::return_type; + return CastToPyObject( + unpack_call_dispatcher::apply(f, args, std::make_index_sequence{})); +} + +} // namespace detail + +} // namespace functional +} // namespace one +} // namespace oneflow + +#endif // ONEFLOW_API_PYTHON_FUNCTIONAL_UNPACK_CALL_H_ diff --git a/oneflow/api/python/functional/value_types.cpp b/oneflow/api/python/functional/value_types.cpp index f71511f8701..61319bd77eb 100644 --- a/oneflow/api/python/functional/value_types.cpp +++ b/oneflow/api/python/functional/value_types.cpp @@ -15,7 +15,6 @@ limitations under the License. 
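unpack_call_dispatcher in the new unpack_call.h above expands a vector of type-erased PythonArg values into a typed call f(args[0].As<T0>(), args[1].As<T1>(), ...) using std::index_sequence. The self-contained sketch below shows the same compile-time expansion; AnyArg, Apply, and Repeat are illustrative names only.

#include <iostream>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Hypothetical type-erased argument with a templated accessor, standing in for PythonArg.
struct AnyArg {
  int i = 0;
  std::string s;
  template<typename T>
  T As() const;
};
template<>
int AnyArg::As<int>() const { return i; }
template<>
std::string AnyArg::As<std::string>() const { return s; }

// Expand args[0..N-1] into typed arguments, pairing each index with the matching parameter type.
template<typename R, typename... Args, std::size_t... I>
R ApplyImpl(R (*f)(Args...), const std::vector<AnyArg>& args, std::index_sequence<I...>) {
  return f(args[I].template As<typename std::decay<Args>::type>()...);
}

template<typename R, typename... Args>
R Apply(R (*f)(Args...), const std::vector<AnyArg>& args) {
  return ApplyImpl(f, args, std::make_index_sequence<sizeof...(Args)>{});
}

std::string Repeat(std::string word, int times) {
  std::string out;
  for (int i = 0; i < times; ++i) { out += word; }
  return out;
}

int main() {
  std::vector<AnyArg> args(2);
  args[0].s = "ab";
  args[1].i = 3;
  std::cout << Apply(Repeat, args) << "\n";  // ababab
}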
*/ #include "oneflow/api/python/functional/value_types.h" -#include "oneflow/core/common/throw.h" #include "oneflow/core/common/hash_container.h" namespace oneflow { @@ -71,10 +70,10 @@ HashMap* GetValueTypeNameMap() { return &value_type_name_map; } -const std::string& ValueTypeName(ValueType type) { +Maybe ValueTypeName(ValueType type) { const auto* type_name_map = GetValueTypeNameMap(); const auto& it = type_name_map->find(type); - CHECK_OR_THROW(it != type_name_map->end()) << "Value type " << type << " has no type name."; + CHECK_OR_RETURN(it != type_name_map->end()) << "Value type " << type << " has no type name."; return it->second; } diff --git a/oneflow/api/python/functional/value_types.h b/oneflow/api/python/functional/value_types.h index 9489e76e815..887be9f86ec 100644 --- a/oneflow/api/python/functional/value_types.h +++ b/oneflow/api/python/functional/value_types.h @@ -176,7 +176,7 @@ VALUE_TYPE_OF_IMPL(const PyObject*, kPY_OBJECT); #undef VALUE_TYPE_OF_IMPL -const std::string& ValueTypeName(ValueType type); +Maybe ValueTypeName(ValueType type); bool IsIntegralType(ValueType type); bool IsIntegralListType(ValueType type); diff --git a/oneflow/api/python/of_api_registry.h b/oneflow/api/python/of_api_registry.h index 0c8064011f9..e415a5be648 100644 --- a/oneflow/api/python/of_api_registry.h +++ b/oneflow/api/python/of_api_registry.h @@ -20,7 +20,6 @@ limitations under the License. #include #include #include "oneflow/api/python/caster/maybe.h" -#include "oneflow/api/python/caster/tensor.h" #include "oneflow/api/python/caster/optional.h" #include "oneflow/core/common/preprocessor.h" diff --git a/oneflow/api/python/utils/tensor_utils.cpp b/oneflow/api/python/utils/tensor_utils.cpp index 0eba252b46e..178965e1ec5 100644 --- a/oneflow/api/python/utils/tensor_utils.cpp +++ b/oneflow/api/python/utils/tensor_utils.cpp @@ -249,8 +249,8 @@ Maybe MakeConsistentTensorFromData(PyObject* data, const Optionaldtype())); std::vector> grad_sbp_tuple; - auto consistent_tensor = JUST(functional::ToConsistent(broadcast_tensor, placement, sbp_tuple, - grad_sbp_tuple, /* check_meta */ false)); + auto consistent_tensor = + JUST(functional::ToConsistent(broadcast_tensor, placement, sbp_tuple, grad_sbp_tuple)); JUST(consistent_tensor->set_requires_grad(requires_grad)); return consistent_tensor; } @@ -261,10 +261,10 @@ Maybe MakeTensorFromOtherTensor(const std::shared_ptr& other) { return functional::Copy(other, device->type(), device->device_id()); } else { const Symbol& nd_sbp = JUST(other->nd_sbp()); - const std::vector>& sbp_tuple = *JUST(GetSbpList(nd_sbp)); + std::vector> sbp_tuple(nd_sbp->sbp_parallel().size()); + for (int i = 0; i < sbp_tuple.size(); ++i) { sbp_tuple[i] = nd_sbp->sbp_parallel().Get(i); } std::vector> grad_sbp_tuple; - return functional::ToConsistent(other, JUST(other->parallel_desc()), sbp_tuple, grad_sbp_tuple, - /* check_meta */ false); + return functional::ToConsistent(other, JUST(other->parallel_desc()), sbp_tuple, grad_sbp_tuple); } } @@ -297,9 +297,8 @@ Maybe MakeTensorFromOtherTensor(const std::shared_ptr& other, const std::vector>& sbp_tuple, const bool& requires_grad) { std::vector> grad_sbp_tuple; - bool check_meta = other->is_consistent() ? 
false : true; std::shared_ptr tensor = - JUST(functional::ToConsistent(other, placement, sbp_tuple, grad_sbp_tuple, check_meta)); + JUST(functional::ToConsistent(other, placement, sbp_tuple, grad_sbp_tuple)); if (dtype) { const Symbol& dtype_ = JUST(dtype); if (tensor->dtype() != dtype_) { tensor = JUST(functional::Cast(tensor, dtype_)); } diff --git a/oneflow/api/python/utils/tensor_utils.h b/oneflow/api/python/utils/tensor_utils.h index fc498f984fe..d2c9411cfef 100644 --- a/oneflow/api/python/utils/tensor_utils.h +++ b/oneflow/api/python/utils/tensor_utils.h @@ -22,7 +22,6 @@ limitations under the License. #include #include -#include "oneflow/api/python/framework/tensor.h" #include "oneflow/extension/python/numpy.h" #include "oneflow/core/framework/device.h" #include "oneflow/core/framework/dtype.h" @@ -58,14 +57,14 @@ namespace one { Maybe EagerMirroredTensorZeros(const std::shared_ptr& t); template -inline static Maybe EagerMirroredTensorToNumpy(PyObject* py_tensor) { - const auto& t = PyTensor_Unpack(py_tensor); +inline static Maybe EagerMirroredTensorToNumpy(const py::handle& py_tensor) { + const std::shared_ptr t = py::cast>(py_tensor); std::shared_ptr tensor = JUST(t->AsMirroredTensor()); CHECK_OR_RETURN(JUST(tensor->device()) == JUST(Device::New("cpu"))); CHECK_OR_RETURN(tensor->is_eager()) << "eager tensors supported only."; // set base object attr - py::handle handle = py::handle(py_tensor); + py::handle handle = py::handle(py_tensor.ptr()); const size_t ndim = tensor->ndim(); const auto shape = numpy::OFShapeToNumpyShape(tensor->shape()->dim_vec()); @@ -82,11 +81,9 @@ inline static Maybe EagerMirroredTensorToNumpy(PyObject* py_tensor) { return builder->SyncAccessBlobByCallback(tensor, btb, Callback, "mut"); })); JUST(btb->WaitUntilCntEqualZero(VirtualMachine::GetPredicatorNoMoreInstructionsFinished())); - return py::array(py::buffer_info(data_ptr, sizeof(T), py::format_descriptor::format(), ndim, - shape, stride), - handle) - .release() - .ptr(); + return py::array( + py::buffer_info(data_ptr, sizeof(T), py::format_descriptor::format(), ndim, shape, stride), + handle); } template diff --git a/oneflow/core/autograd/autograd_engine.cpp b/oneflow/core/autograd/autograd_engine.cpp index ee03f5d1f6c..99fe98bea97 100644 --- a/oneflow/core/autograd/autograd_engine.cpp +++ b/oneflow/core/autograd/autograd_engine.cpp @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include "oneflow/core/autograd/autograd_engine.h" @@ -35,38 +34,6 @@ namespace one { namespace { -void GatherFunctionNodes(FunctionNode* node, std::stack>& stack) { - for (auto& prev_node : node->next_functions()) { - if (prev_node) { - if (prev_node.use_count() == 1) { stack.push(prev_node); } - } - } -} - -/* NOTE: - * Stack overflows when releasing a very deep computation graph without - * a custom deleter. - * - * For example, here is a very deep computation graph: - * Tensor -> FunctionNode -> Tensor -> FunctionNode -> ... -> Tensor -> FunctionNode - * When releasing the first Tensor, it will trigger the recursive deletion and stack overflow. - * - * So we must set a custom deleter and release them iteratively. 
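The NOTE being removed above explains why FunctionNode chains need a custom deleter: recursively destroying a very deep Tensor -> FunctionNode -> Tensor chain can overflow the C++ stack, so nodes are released iteratively from an explicit stack. The sketch below demonstrates that technique on a generic linked chain; Node and IterativeDeleter are illustrative, not the engine's actual code.

#include <iostream>
#include <memory>
#include <stack>

struct Node {
  std::shared_ptr<Node> next;
};

// Custom deleter: instead of letting ~Node() recurse down the chain, move each
// solely-owned successor onto an explicit stack and release nodes one by one.
void IterativeDeleter(Node* node) {
  std::stack<std::shared_ptr<Node>> pending;
  if (node->next && node->next.use_count() == 1) { pending.push(std::move(node->next)); }
  delete node;
  while (!pending.empty()) {
    auto cur = std::move(pending.top());
    pending.pop();
    if (cur->next && cur->next.use_count() == 1) { pending.push(std::move(cur->next)); }
    // `cur` is destroyed here without recursing: its `next` was already moved
    // onto the stack, so its own deleter call does constant work.
  }
}

int main() {
  // Build a chain long enough that naive recursive destruction could overflow the stack.
  auto head = std::shared_ptr<Node>(new Node, IterativeDeleter);
  Node* tail = head.get();
  for (int i = 0; i < 200000; ++i) {
    tail->next = std::shared_ptr<Node>(new Node, IterativeDeleter);
    tail = tail->next.get();
  }
  head.reset();  // releases the whole chain iteratively
  std::cout << "released\n";
}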
- */ -void FunctionNodeDeleter(FunctionNode* node) { - std::stack> stack; - node->ReleaseData(); - GatherFunctionNodes(node, stack); - delete node; - - while (!stack.empty()) { - auto now_node = std::move(stack.top()); - stack.pop(); - now_node->ReleaseData(); - GatherFunctionNodes(now_node.get(), stack); - } -} - bool IsReadyToRun(const std::vector>& out_meta_datas) { return std::any_of(out_meta_datas.begin(), out_meta_datas.end(), [](const std::shared_ptr& meta_data) { @@ -139,6 +106,35 @@ Maybe AutogradEngine::RunBackwardAndReturnInputsTensorGradIf( create_graph); } +StackFunctionNode::StackFunctionNode( + const std::string& op_type_name, + const std::shared_ptr(const TensorTuple&, TensorTuple*, bool)>>& + backward_fn, + const TensorTuple& inputs, const TensorTuple& outputs) + : FunctionNode(op_type_name) { + input_meta_data_.resize(inputs.size()); + next_functions_->reserve(inputs.size()); + for (int i = 0; i < inputs.size(); ++i) { + if (inputs.at(i)->requires_grad()) { + input_meta_data_.at(i) = inputs.at(i)->mut_autograd_meta(); + next_functions_->emplace_back(inputs.at(i)->mut_grad_fn_node()); + } + } + + output_meta_data_.resize(outputs.size()); + output_tensor_infos_.reserve(outputs.size()); + for (int i = 0; i < outputs.size(); ++i) { + const auto& autograd_meta = + NewAutogradMeta(outputs.at(i)->requires_grad(), outputs.at(i)->is_leaf()); + outputs.at(i)->set_autograd_meta(autograd_meta); + output_meta_data_.at(i) = outputs.at(i)->mut_autograd_meta(); + output_tensor_infos_.emplace_back(TensorInfo(*outputs.at(i))); + } + + backward_fn_ = backward_fn; + is_in_stack_ = false; +} + Maybe FunctionNode::AccGrad4RetainGradTensor() { for (const std::shared_ptr& out : output_meta_data_) { if (out->retain_grad()) { JUST(CopyOrAccGrad(out.get(), /*autograd_mode=*/false)); } @@ -159,9 +155,8 @@ Maybe FunctionNode::AccGrad4LeafTensor(bool create_graph) { auto& tensor_info = output_tensor_infos_[i]; const auto& placement = JUST(tensor_info.placement()); const auto& nd_sbp = JUST(tensor_info.sbp()); - JUST(out->set_acc_grad( - JUST(functional::ToConsistent(acc_grad, placement, *JUST(GetSbpList(nd_sbp)), - GetNoneSbpList(), /* check_meta */ false)))); + JUST(out->set_acc_grad(JUST(functional::ToConsistent( + acc_grad, placement, *JUST(GetSbpList(nd_sbp)), GetNoneSbpList())))); } } } @@ -174,9 +169,14 @@ void FunctionNode::ReleaseOutTensorArgs() { } } +void StackFunctionNode::ReleaseData() { + if (!input_meta_data_.empty()) { backward_fn_.reset(); } + is_in_stack_ = false; +} + Maybe FunctionNode::Apply(bool create_graph) { - CHECK_NOTNULL_OR_RETURN(backward_fn_) - << "This FunctionNode with name `" << name() << "` has been released.\n" + CHECK_NOTNULL_OR_RETURN(backward_fn_.get()) + << "This FunctionNode with name `" << GetOpTypeName() << "` has been released.\n" << "Maybe you try to backward through the node a second time. 
Specify retain_graph=True when " "calling .backward() or autograd.grad() the first time."; if (!IsReadyToRun(output_meta_data_)) { return false; } @@ -191,42 +191,132 @@ Maybe FunctionNode::Apply(bool create_graph) { JUST(JUST(oneflow::VectorAt(output_meta_data_, i))->current_grad()->GetAccTensor(hooks)); } } - JUST(backward_fn_->body(output_grads, &input_grads, create_graph)); + JUST((*backward_fn_)(output_grads, &input_grads, create_graph)); for (int i = 0; i < input_meta_data_.size(); ++i) { - if (JUST(VectorAt(input_grads, i))) { + if (input_grads.at(i)) { CHECK_NOTNULL_OR_RETURN(input_meta_data_.at(i)) - << name_ + << op_type_name_ << " calculate grad for tensor which requires_grad is False. Please submit an issue in " "`https://github.com/Oneflow-Inc/oneflow/issues` and we will fix it as soon as " - "possible"; + "possiable"; JUST(input_meta_data_.at(i)->current_grad()->PushPartialTensor(input_grads.at(i))); } } return true; } -void GraphFunctionNode::ReleaseData() { - if (backward_fn_ && backward_fn_->status()) { backward_fn_.reset(); } +void StackAutogradEngine::ClearEngine() { node_list_.clear(); } + +void StackAutogradEngine::ClearReleasedFunctionNodes() { + node_list_.erase(std::remove_if(node_list_.begin(), node_list_.end(), + [](const std::weak_ptr& node) { + return node.lock() == nullptr; + }), + node_list_.end()); +} + +Maybe StackAutogradEngine::RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) { + ClearReleasedFunctionNodes(); + for (int i = 0; i < outputs.size(); ++i) { + JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i))); + } + // Runs each FunctionNode + for (const auto& weak_func_node : node_list_) { + const auto& func_node = weak_func_node.lock(); + CHECK_NOTNULL_OR_RETURN(func_node); + if (JUST(func_node->Apply(create_graph))) { + JUST(func_node->AccGrad4LeafTensor(create_graph)); + JUST(func_node->AccGrad4RetainGradTensor()); + func_node->ReleaseOutTensorArgs(); + if (!retain_graph) { func_node->ReleaseData(); } + } + } + if (!retain_graph) { ClearEngine(); } + return Maybe::Ok(); +} + +Maybe StackAutogradEngine::RunBackwardAndReturnInputsTensorGrad( + const TensorTuple& outputs, const TensorTuple& inputs, const TensorTuple& out_grads, + bool retain_graph, bool create_graph) { + ClearReleasedFunctionNodes(); + std::shared_ptr input_current_grad = std::make_shared(inputs.size()); + std::vector ori_retain_grad(inputs.size()); + for (int i = 0; i < inputs.size(); ++i) { + ori_retain_grad.at(i) = inputs.at(i)->retain_grad(); + JUST(inputs.at(i)->set_retain_grad(true)); + } + for (int i = 0; i < outputs.size(); ++i) { + JUST(JUST(outputs.at(i)->current_grad())->PushPartialTensor(out_grads.at(i))); + } + // Runs each FunctionNode + for (const auto& weak_func_node : node_list_) { + const auto& func_node = weak_func_node.lock(); + CHECK_NOTNULL_OR_RETURN(func_node); + if (JUST(func_node->Apply(create_graph))) { + JUST(func_node->AccGrad4RetainGradTensor()); + func_node->ReleaseOutTensorArgs(); + if (!retain_graph) { func_node->ReleaseData(); } + } + } + // Gets input grads and resume retain_grad + for (int i = 0; i < inputs.size(); ++i) { + input_current_grad->at(i) = JUST(inputs.at(i)->acc_grad()); + if (!ori_retain_grad.at(i)) { + JUST(inputs.at(i)->set_acc_grad(nullptr)); + JUST(inputs.at(i)->set_retain_grad(false)); + } + } + if (!retain_graph) { ClearEngine(); } + return input_current_grad; } -/*static*/ std::shared_ptr GraphFunctionNode::New( - const 
std::string& name, const std::shared_ptr& backward_fn, - const TensorTuple& inputs, const TensorTuple& outputs) { - auto node = std::shared_ptr( - new GraphFunctionNode(name, backward_fn, inputs, outputs), FunctionNodeDeleter); - return node; +Maybe StackAutogradEngine::AddBackwardFuncPtr( + const std::string& op_type_name, + const std::shared_ptr(const TensorTuple&, TensorTuple*, bool)>>& + backward_fn, + const TensorTuple& inputs, TensorTuple* outputs) { + // Firstly push function_node of tensor in stack which is leaf and requires_grad + for (const std::shared_ptr& in_tensor : inputs) { + if (in_tensor->is_leaf() && in_tensor->requires_grad()) { + if (!in_tensor->grad_fn_node()) { JUST(AddAccumulateFunctionNode(in_tensor)); } + StackFunctionNode* stack_function_node = + dynamic_cast(in_tensor->mut_grad_fn_node().get()); + if (!stack_function_node->is_in_stack()) { + stack_function_node->set_is_in_stack(true); + node_list_.push_front(in_tensor->mut_grad_fn_node()); + } + } + } + + std::shared_ptr func_node = + std::make_shared(op_type_name, backward_fn, inputs, *outputs); + for (const std::shared_ptr& out_tensor : *outputs) { + out_tensor->set_grad_fn_node(func_node); + } + func_node->set_is_in_stack(true); + node_list_.push_front(func_node); + return std::static_pointer_cast(func_node); +} + +void GraphFunctionNode::ReleaseData() { + if (!input_meta_data_.empty()) { backward_fn_.reset(); } } -GraphFunctionNode::GraphFunctionNode(const std::string& name, - const std::shared_ptr& backward_fn, - const TensorTuple& inputs, const TensorTuple& outputs) - : FunctionNode(name, backward_fn) { +GraphFunctionNode::GraphFunctionNode( + const std::string& op_type_name, + const std::shared_ptr(const TensorTuple&, TensorTuple*, bool)>>& + backward_fn, + const TensorTuple& inputs, const TensorTuple& outputs) + : FunctionNode(op_type_name) { input_meta_data_.resize(inputs.size()); - next_functions_.reserve(inputs.size()); + next_functions_->reserve(inputs.size()); for (int i = 0; i < inputs.size(); ++i) { if (inputs.at(i)->requires_grad()) { input_meta_data_.at(i) = inputs.at(i)->mut_autograd_meta(); - next_functions_.emplace_back(inputs.at(i)->mut_grad_fn_node()); + next_functions_->emplace_back(inputs.at(i)->mut_grad_fn_node()); } } @@ -263,7 +353,7 @@ Maybe GraphTask::ComputeDependencies() { FunctionNode* node = stack.top(); stack.pop(); if (/*bool has_seen=*/!seen.insert(node).second) { continue; } - for (const auto& next_grad_fn : node->next_functions()) { + for (const auto& next_grad_fn : *(node->GetNextFunctions())) { FunctionNode* next_node = next_grad_fn.get(); dependencies_[next_node] += 1; if (seen.find(next_node) == seen.end()) { stack.push(next_node); } @@ -281,9 +371,9 @@ Maybe GraphTask::ComputeDependenciesAndPruneNode(const TensorTuple& inputs size_t next_function_idx_; FunctionNode* GetNextFunction() { - if (next_function_idx_ < node_->next_functions().size()) { + if (next_function_idx_ < node_->GetNextFunctions()->size()) { next_function_idx_ += 1; - return node_->next_functions().at(next_function_idx_ - 1).get(); + return node_->GetNextFunctions()->at(next_function_idx_ - 1).get(); } else { return nullptr; } @@ -313,11 +403,11 @@ Maybe GraphTask::ComputeDependenciesAndPruneNode(const TensorTuple& inputs continue; // recurse } } else { - bool need_execute = - std::any_of(frame.node_->next_functions().begin(), frame.node_->next_functions().end(), - [&](const std::shared_ptr& fn) { - return need_execute_.find(fn.get()) != need_execute_.end(); - }); + bool need_execute = 
std::any_of(frame.node_->GetNextFunctions()->begin(), + frame.node_->GetNextFunctions()->end(), + [&](const std::shared_ptr& fn) { + return need_execute_.find(fn.get()) != need_execute_.end(); + }); if (need_execute) { need_execute_.insert(frame.node_); } seen.insert(frame.node_); stack.pop(); @@ -345,7 +435,7 @@ Maybe GraphTask::Apply(bool save_grad_for_leaf) { node->ReleaseOutTensorArgs(); if (!retain_graph_) { node->ReleaseData(); } - for (const auto& next_grad_fn : node->next_functions()) { + for (const auto& next_grad_fn : *(node->GetNextFunctions())) { FunctionNode* next_node = next_grad_fn.get(); dependencies_[next_node] -= 1; if (dependencies_[next_node] == 0) { queue.push(next_node); } @@ -395,8 +485,10 @@ Maybe GraphAutogradEngine::RunBackwardAndReturnInputsTensorGrad( return input_current_grad; } -Maybe GraphAutogradEngine::AddNode( - const std::string& name, const std::shared_ptr& backward_fn, +Maybe GraphAutogradEngine::AddBackwardFuncPtr( + const std::string& op_type_name, + const std::shared_ptr(const TensorTuple&, TensorTuple*, bool)>>& + backward_fn, const TensorTuple& inputs, TensorTuple* outputs) { // Firstly push function_node of tensor in stack which is leaf and requires_grad for (const std::shared_ptr& in_tensor : inputs) { @@ -406,7 +498,7 @@ Maybe GraphAutogradEngine::AddNode( } std::shared_ptr func_node = - GraphFunctionNode::New(name, backward_fn, inputs, *outputs); + std::make_shared(op_type_name, backward_fn, inputs, *outputs); for (const std::shared_ptr& out_tensor : *outputs) { out_tensor->set_grad_fn_node(func_node); } @@ -414,17 +506,18 @@ Maybe GraphAutogradEngine::AddNode( } AutogradEngine* GetThreadLocalAutogradEngine() { + // thread_local static StackAutogradEngine autograd_engine; thread_local static GraphAutogradEngine autograd_engine; return &autograd_engine; } Maybe AddAccumulateFunctionNode(const std::shared_ptr& tensor) { - auto backward_fn = std::make_shared(); - backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, - bool create_graph) -> Maybe { return Maybe::Ok(); }; - backward_fn->status = []() { return false; }; - tensor->set_grad_fn_node(GraphFunctionNode::New( - "accumulate_grad", backward_fn, /*inputs=*/TensorTuple{}, /*outputs*/ TensorTuple{tensor})); + auto backward_fn = + std::make_shared(const TensorTuple&, TensorTuple*, bool)>>( + [=](const TensorTuple& out_grads, TensorTuple* in_grads, + bool create_graph) -> Maybe { return Maybe::Ok(); }); + tensor->set_grad_fn_node(std::make_shared( + "accumulate_grad", backward_fn, TensorTuple(), TensorTuple({tensor}))); return Maybe::Ok(); } diff --git a/oneflow/core/autograd/autograd_engine.h b/oneflow/core/autograd/autograd_engine.h index 2a0aafdf5b8..bcb710cbc41 100644 --- a/oneflow/core/autograd/autograd_engine.h +++ b/oneflow/core/autograd/autograd_engine.h @@ -31,13 +31,6 @@ namespace one { class Tensor; class TensorTuple; -using CaptureStatus = bool; - -struct BackwardFunction { - std::function(const TensorTuple&, TensorTuple*, bool)> body; - std::function status; -}; - // Calculates one backward op class FunctionNode { public: @@ -51,25 +44,26 @@ class FunctionNode { // `Apply` in second time virtual void ReleaseData() = 0; - const std::vector>& next_functions() const { + // Getters + const std::shared_ptr>>& GetNextFunctions() const { return next_functions_; } - const std::string& name() const { return name_; } + const std::string& GetOpTypeName() const { return op_type_name_; } protected: - explicit FunctionNode(const std::string& name, - const std::shared_ptr& 
backward_fn) - : name_(name), backward_fn_(backward_fn) {} + explicit FunctionNode(const std::string& op_type_name) + : op_type_name_(op_type_name), + next_functions_(new std::vector>{}) {} - const std::string name_; - std::vector> next_functions_; + const std::string op_type_name_; + std::shared_ptr>> next_functions_; std::vector> input_meta_data_; std::vector> output_meta_data_; std::vector output_tensor_infos_; - // Actual backward function builds in `AutogradInterpreter` to calculate one backward op - std::shared_ptr backward_fn_; + std::shared_ptr(const TensorTuple&, TensorTuple*, bool)>> + backward_fn_; }; class AutogradEngine { @@ -85,9 +79,11 @@ class AutogradEngine { bool retain_graph, bool create_graph); virtual void ClearEngine() = 0; // Builds FunctionNode, binding to all `outputs_` tensors and saving in AutogradEngine - virtual Maybe AddNode(const std::string& name, - const std::shared_ptr& backward_fn, - const TensorTuple& inputs, TensorTuple* outputs) = 0; + virtual Maybe AddBackwardFuncPtr( + const std::string& op_type_name, + const std::shared_ptr< + const std::function(const TensorTuple&, TensorTuple*, bool)>>& backward_fn, + const TensorTuple& inputs, TensorTuple* outputs) = 0; protected: AutogradEngine() = default; @@ -103,22 +99,67 @@ class AutogradEngine { bool create_graph) = 0; }; +// Stack Autograd Node and Engine +class StackFunctionNode final : public FunctionNode { + public: + OF_DISALLOW_COPY_AND_MOVE(StackFunctionNode); + StackFunctionNode( + const std::string& op_type_name, + const std::shared_ptr< + const std::function(const TensorTuple&, TensorTuple*, bool)>>& backward_fn, + const TensorTuple& inputs, const TensorTuple& outputs); + StackFunctionNode() = delete; + ~StackFunctionNode() override = default; + + void ReleaseData() override; + bool is_in_stack() const { return is_in_stack_; } + void set_is_in_stack(bool in_stack) { is_in_stack_ = in_stack; } + + private: + bool is_in_stack_; +}; + +class StackAutogradEngine final : public AutogradEngine { + public: + OF_DISALLOW_COPY_AND_MOVE(StackAutogradEngine); + StackAutogradEngine() = default; + ~StackAutogradEngine() override = default; + + void ClearEngine() override; + Maybe AddBackwardFuncPtr( + const std::string& op_type_name, + const std::shared_ptr< + const std::function(const TensorTuple&, TensorTuple*, bool)>>& backward_fn, + const TensorTuple& inputs, TensorTuple* outputs) override; + + private: + // StackFunctionNode must be saved in engine, because any node in list may be released at any + // moment. 
+ std::list> node_list_; + void ClearReleasedFunctionNodes(); + Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, + const TensorTuple& out_grads, bool retain_graph, + bool create_graph) override; + Maybe RunBackwardAndReturnInputsTensorGrad(const TensorTuple& outputs, + const TensorTuple& inputs, + const TensorTuple& out_grads, + bool retain_graph, + bool create_graph) override; +}; + // Graph Autograd Node and Engine class GraphFunctionNode final : public FunctionNode { public: OF_DISALLOW_COPY_AND_MOVE(GraphFunctionNode); - static std::shared_ptr New( - const std::string& name, const std::shared_ptr& backward_fn, + GraphFunctionNode( + const std::string& op_type_name, + const std::shared_ptr< + const std::function(const TensorTuple&, TensorTuple*, bool)>>& backward_fn, const TensorTuple& inputs, const TensorTuple& outputs); - GraphFunctionNode() = delete; ~GraphFunctionNode() override = default; void ReleaseData() override; - - private: - GraphFunctionNode(const std::string& name, const std::shared_ptr& backward_fn, - const TensorTuple& inputs, const TensorTuple& outputs); }; class GraphTask final { @@ -146,9 +187,11 @@ class GraphAutogradEngine final : public AutogradEngine { ~GraphAutogradEngine() override = default; void ClearEngine() override{}; - Maybe AddNode(const std::string& name, - const std::shared_ptr& backward_fn, - const TensorTuple& inputs, TensorTuple* outputs) override; + Maybe AddBackwardFuncPtr( + const std::string& op_type_name, + const std::shared_ptr< + const std::function(const TensorTuple&, TensorTuple*, bool)>>& backward_fn, + const TensorTuple& inputs, TensorTuple* outputs) override; private: Maybe RunBackwardAndSaveGrads4LeafTensor(const TensorTuple& outputs, diff --git a/oneflow/core/autograd/gradient_funcs/activation.cpp b/oneflow/core/autograd/gradient_funcs/activation.cpp index 328b9b51338..c22c684acb6 100644 --- a/oneflow/core/autograd/gradient_funcs/activation.cpp +++ b/oneflow/core/autograd/gradient_funcs/activation.cpp @@ -124,48 +124,6 @@ class HardSigmoid : public BaseActivation { } }; -struct HardShrinkCaptureState : public AutoGradCaptureState { - bool requires_grad = true; - double lambd = 0.5; -}; - -class HardShrink : public OpExprGradFunction { - public: - Maybe Init(const OpExpr& op) override { - const auto* fw_op_expr = dynamic_cast(&op); - CHECK_NOTNULL_OR_RETURN(fw_op_expr) << "Forward op must be not null"; - base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto()); - return Maybe::Ok(); - } - - Maybe Capture(HardShrinkCaptureState* ctx, const TensorTuple& inputs, - const TensorTuple& outputs, const AttrMap& attrs) const override { - CHECK_EQ_OR_RETURN(inputs.size(), 1) << "Input grad size must be equal 1"; - ctx->requires_grad = JUST(oneflow::VectorAt(inputs, 0))->requires_grad(); - if (!ctx->requires_grad) { return Maybe::Ok(); } - - ComposedAttrMap composed_attrs(attrs, base_attrs_); - ctx->lambd = JUST(composed_attrs.GetAttr("lambd")); - ctx->SaveTensorForBackward(JUST(oneflow::VectorAt(outputs, 0))); - return Maybe::Ok(); - } - - Maybe Apply(const HardShrinkCaptureState* ctx, const TensorTuple& out_grads, - TensorTuple* in_grads) const override { - CHECK_EQ_OR_RETURN(out_grads.size(), 1) << "Output grad size must be equal 1"; - in_grads->resize(1); - if (ctx->requires_grad) { - const auto& y = JUST(oneflow::VectorAt(ctx->SavedTensors(), 0)); - *JUST(oneflow::VectorAt(in_grads, 0)) = - JUST(functional::HardShrinkGrad(y, JUST(oneflow::VectorAt(out_grads, 0)), ctx->lambd)); - } - return Maybe::Ok(); - } - - private: 
- AttrMap base_attrs_; -}; - class HardSwish : public BaseActivation { public: Maybe Apply(const BaseActivationCaptureState* ctx, const TensorTuple& out_grads, @@ -550,7 +508,6 @@ REGISTER_OP_EXPR_GRAD_FUNCTION("softsign", Softsign); REGISTER_OP_EXPR_GRAD_FUNCTION("relu", ReLU); REGISTER_OP_EXPR_GRAD_FUNCTION("gelu", GeLU); REGISTER_OP_EXPR_GRAD_FUNCTION("hardsigmoid", HardSigmoid); -REGISTER_OP_EXPR_GRAD_FUNCTION("hardshrink", HardShrink); REGISTER_OP_EXPR_GRAD_FUNCTION("hardswish", HardSwish); REGISTER_OP_EXPR_GRAD_FUNCTION("leaky_relu", LeakyRelu); REGISTER_OP_EXPR_GRAD_FUNCTION("hardtanh", HardTanh); diff --git a/oneflow/core/autograd/gradient_funcs/avg_pooling.cpp b/oneflow/core/autograd/gradient_funcs/avg_pooling.cpp index 91a57d90d58..ba449ae04e1 100644 --- a/oneflow/core/autograd/gradient_funcs/avg_pooling.cpp +++ b/oneflow/core/autograd/gradient_funcs/avg_pooling.cpp @@ -28,6 +28,7 @@ namespace { struct AvgPoolingCaptureState : public AutoGradCaptureState { bool requires_grad; size_t input_index; + size_t output_index; std::string data_format; std::vector padding; @@ -64,6 +65,7 @@ Maybe AvgPoolingNdGrad::Capture(AvgPoolingCaptureState* ctx, const TensorT if (!ctx->requires_grad) { return Maybe::Ok(); } ctx->input_index = ctx->SaveTensorForBackward(inputs.at(0)); + ctx->output_index = ctx->SaveTensorForBackward(outputs.at(0)); ComposedAttrMap composed_attrs(attrs, base_attrs_); ctx->data_format = JUST(composed_attrs.GetAttr("data_format")); @@ -84,11 +86,12 @@ Maybe AvgPoolingNdGrad::Apply(const AvgPoolingCaptureState* ctx, const Ten int32_t ndims = ctx->kernel_size.size(); const auto& input = ctx->SavedTensors().at(ctx->input_index); + const auto& output = ctx->SavedTensors().at(ctx->output_index); in_grads->resize(1); in_grads->at(0) = JUST(functional::AvgPoolingNdGrad( - input, out_grads.at(0), ndims, ctx->data_format, ctx->padding, ctx->kernel_size, ctx->stride, - ctx->ceil_mode, ctx->count_include_pad, ctx->divisor_override)); + input, output, out_grads.at(0), ndims, ctx->data_format, ctx->padding, ctx->kernel_size, + ctx->stride, ctx->ceil_mode, ctx->count_include_pad, ctx->divisor_override)); return Maybe::Ok(); } diff --git a/oneflow/core/autograd/gradient_funcs/consistent_cast.cpp b/oneflow/core/autograd/gradient_funcs/consistent_cast.cpp index ae23dd24f14..1928687dd2b 100644 --- a/oneflow/core/autograd/gradient_funcs/consistent_cast.cpp +++ b/oneflow/core/autograd/gradient_funcs/consistent_cast.cpp @@ -57,9 +57,9 @@ class CastToConsistent : public OpExprGradFunction { { Symbol nd_sbp_constraint = ctx->nd_sbp; Symbol parallel_desc_constraint = ctx->parallel_desc; - out_grad = JUST(functional::ToConsistent(out_grad, parallel_desc_constraint, - *JUST(GetSbpList(nd_sbp_constraint)), - GetNoneSbpList(), /* check_meta */ false)); + out_grad = + JUST(functional::ToConsistent(out_grad, parallel_desc_constraint, + *JUST(GetSbpList(nd_sbp_constraint)), GetNoneSbpList())); } in_grads->at(0) = JUST(OpInterpUtil::Dispatch(*grad_op_, {out_grad})); return Maybe::Ok(); diff --git a/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp b/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp index 57c6245dfd7..f23f42abae9 100644 --- a/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp +++ b/oneflow/core/autograd/gradient_funcs/consistent_to_consistent.cpp @@ -57,8 +57,8 @@ class ConsistentToConsistentGradFunction : public OpExprGradFunctionnd_sbp())); const auto& grad_sbp_list = JUST(GetSbpList(grad_nd_sbp)); const auto& grad_grad_sbp_list = 
JUST(GetSbpList(ctx->nd_sbp)); - (*in_grads)[0] = JUST(one::functional::ToConsistent( - out_grad, ctx->parallel_desc, *grad_sbp_list, *grad_grad_sbp_list, /* check_meta */ false)); + in_grads->at(0) = JUST(one::functional::ToConsistent(out_grad, ctx->parallel_desc, + *grad_sbp_list, *grad_grad_sbp_list)); return Maybe::Ok(); } diff --git a/oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp b/oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp index 897c7680cb3..2af6356970e 100644 --- a/oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp +++ b/oneflow/core/autograd/gradient_funcs/partial_fc_sample.cpp @@ -57,7 +57,7 @@ Maybe PartialFCSample::Capture(PartialFCSampleState* ctx, const TensorTupl Maybe PartialFCSample::Apply(const PartialFCSampleState* ctx, const TensorTuple& out_grads, TensorTuple* in_grads) const { CHECK_EQ_OR_RETURN(out_grads.size(), 3); - in_grads->resize(2); + in_grads->resize(1); if (!ctx->requires_grad) { return Maybe::Ok(); } const auto& diff_sampled_weight = out_grads.at(2); // diff of sampled_weight diff --git a/oneflow/core/autograd/gradient_funcs/pooling.cpp b/oneflow/core/autograd/gradient_funcs/pooling.cpp index 64e09f94d61..8965f3e991e 100644 --- a/oneflow/core/autograd/gradient_funcs/pooling.cpp +++ b/oneflow/core/autograd/gradient_funcs/pooling.cpp @@ -29,6 +29,7 @@ namespace { struct PoolingCaptureState : public AutoGradCaptureState { bool requires_grad; size_t input_index; + size_t output_index; size_t indice_index; std::string data_format; @@ -71,6 +72,7 @@ Maybe PoolingNdGrad::Capture(PoolingCaptureState* ctx, const TensorTuple& if (!ctx->requires_grad) { return Maybe::Ok(); } ctx->input_index = ctx->SaveTensorForBackward(inputs.at(0)); + ctx->output_index = ctx->SaveTensorForBackward(outputs.at(0)); ctx->indice_index = ctx->SaveTensorForBackward(outputs.at(1)); ComposedAttrMap composed_attrs(attrs, base_attrs_); @@ -91,11 +93,12 @@ Maybe PoolingNdGrad::Apply(const PoolingCaptureState* ctx, const TensorTup int32_t ndims = ctx->kernel_size.size(); const auto& input = ctx->SavedTensors().at(ctx->input_index); + const auto& output = ctx->SavedTensors().at(ctx->output_index); const auto& indice = ctx->SavedTensors().at(ctx->indice_index); in_grads->resize(1); in_grads->at(0) = JUST(functional::PoolingNdGrad( - input, indice, out_grads.at(0), mode_, ndims, ctx->data_format, ctx->padding, + input, output, indice, out_grads.at(0), mode_, ndims, ctx->data_format, ctx->padding, ctx->kernel_size, ctx->stride, ctx->dilation, ctx->return_indices, ctx->ceil_mode)); return Maybe::Ok(); diff --git a/oneflow/core/common/check_level.cpp b/oneflow/core/common/check_level.cpp deleted file mode 100644 index 6396c484b7f..00000000000 --- a/oneflow/core/common/check_level.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include -#include -#include "oneflow/core/common/just.h" -#include "oneflow/core/common/maybe.h" -#include "oneflow/core/common/env_var/debug_mode.h" -#include "oneflow/xrt/utility/env.h" - -namespace oneflow { - -bool IsEnvEnabled(int32_t check_level) { - static const int env_check_level = EnvToInt(ONEFOW_CHECK_LEVEL, -1); - static const bool env_debug_mode = IsInDebugMode(); - return env_debug_mode || env_check_level >= check_level; -} - -} // namespace oneflow diff --git a/oneflow/core/common/check_level.h b/oneflow/core/common/check_level.h index 755b7fb0317..58df7a076d6 100644 --- a/oneflow/core/common/check_level.h +++ b/oneflow/core/common/check_level.h @@ -13,13 +13,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ONEFLOW_CORE_COMMON_CHECK_LEVEL_H_ -#define ONEFLOW_CORE_COMMON_CHECK_LEVEL_H_ +#ifndef ONEFLOW_CORE_CHECK_LEVEL_H_ +#define ONEFLOW_CORE_CHECK_LEVEL_H_ + +#include +#include +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/maybe.h" +#include "oneflow/core/common/env_var/debug_mode.h" +#include "oneflow/xrt/utility/env.h" namespace oneflow { -bool IsEnvEnabled(int32_t check_level); +bool IsEnvEnabled(int32_t check_level) { + static const int env_check_level = EnvToInt(ONEFOW_CHECK_LEVEL, -1); + static const bool env_debug_mode = IsInDebugMode(); + return env_debug_mode || env_check_level >= check_level; +} } // namespace oneflow -#endif // ONEFLOW_CORE_COMMON_CHECK_LEVEL_H_ +#endif // ONEFLOW_CORE_CHECK_LEVEL_H_ diff --git a/oneflow/core/common/just.h b/oneflow/core/common/just.h index 5872d62b95e..e26f097e0d3 100644 --- a/oneflow/core/common/just.h +++ b/oneflow/core/common/just.h @@ -28,7 +28,7 @@ template class Maybe; template -class Optional; +struct Optional; Maybe FormatErrorStr(const std::shared_ptr&); namespace { diff --git a/oneflow/core/device/nccl_util.h b/oneflow/core/device/nccl_util.h index 905b24cdc81..e4d90f8a07e 100644 --- a/oneflow/core/device/nccl_util.h +++ b/oneflow/core/device/nccl_util.h @@ -49,7 +49,6 @@ inline ncclDataType_t GetNcclDataType(const DataType& dt) { #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= 21003 case DataType::kBFloat16: return ncclBfloat16; #endif - case DataType::kUInt8: return ncclUint8; case DataType::kUInt32: return ncclUint32; case DataType::kUInt64: return ncclUint64; default: UNIMPLEMENTED(); diff --git a/oneflow/core/eager/eager_blob_object.cpp b/oneflow/core/eager/eager_blob_object.cpp index 7bc34976c90..d4058552394 100644 --- a/oneflow/core/eager/eager_blob_object.cpp +++ b/oneflow/core/eager/eager_blob_object.cpp @@ -64,8 +64,7 @@ Maybe EagerBlobObject::TryAllocateBlobBodyMemory(DeviceCtx* device_ctx) { return Maybe::Ok(); } if (tensor_storage_->blob_dptr() != nullptr) { - CHECK_GE_OR_RETURN(tensor_storage_->blob_bytes(), blob->ByteSizeOfBlobBody()) - << "This blob has been allocated memory, but less than needed space."; + CHECK_GE_OR_RETURN(tensor_storage_->blob_bytes(), required_body_bytes); return Maybe::Ok(); } { diff --git a/oneflow/core/embedding/cached_key_value_store.cu b/oneflow/core/embedding/cached_key_value_store.cu index 6557a7820d8..0003e9c0418 100644 --- a/oneflow/core/embedding/cached_key_value_store.cu +++ b/oneflow/core/embedding/cached_key_value_store.cu @@ -54,6 +54,7 @@ class CacheKeyValueStoreImpl : public KeyValueStore { } ~CacheKeyValueStoreImpl() { CudaCurrentDeviceGuard guard(device_index_); + 
SyncCacheToStore(); OF_CUDA_CHECK(cudaFree(num_buffer_)); OF_CUDA_CHECK(cudaFreeHost(host_num_buffer_)); if (max_query_length_ != 0) { @@ -176,14 +177,13 @@ void CacheKeyValueStoreImpl::LoadSnapshot( const std::string& name, const std::function& Hook) { CudaCurrentDeviceGuard guard(device_index_); std::lock_guard lock(mutex_); - CHECK_GT(max_query_length_, 0); cache_->Clear(); - auto device = - Global::Get()->GetDevice(DeviceType::kCUDA, device_index_); - CHECK(device); - auto* stream = device->CreateStream(); store_->LoadSnapshot(name, [&](KVIterator* iter) { if (cache_->Policy() == CacheOptions::Policy::kFull) { + auto device = + Global::Get()->GetDevice(DeviceType::kCUDA, device_index_); + CHECK(device); + auto* stream = device->CreateStream(); auto* cuda_stream = stream->As(); while (true) { iter->NextN(stream, max_query_length_, num_buffer_, keys_buffer_, values_buffer_); @@ -199,13 +199,13 @@ void CacheKeyValueStoreImpl::LoadSnapshot( CHECK_JUST(stream->Sync()); CHECK_EQ(*host_num_buffer_, 0); } + device->DestroyStream(stream); } if (Hook) { iter->Reset(); Hook(iter); } }); - device->DestroyStream(stream); store_->LoadSnapshot(name); } @@ -227,7 +227,6 @@ void CacheKeyValueStoreImpl::SyncCacheToStore() { auto* stream = device->CreateStream(); auto* cuda_stream = stream->As(); const uint64_t dump_capacity = cache_->DumpCapacity(); - CHECK_GT(max_query_length_, 0); for (uint64_t start_key_index = 0; start_key_index < dump_capacity; start_key_index += max_query_length_) { cache_->Dump(stream, start_key_index, diff --git a/oneflow/core/embedding/embedding_manager.cpp b/oneflow/core/embedding/embedding_manager.cpp index 01371fe1bec..e1efcdc5c86 100644 --- a/oneflow/core/embedding/embedding_manager.cpp +++ b/oneflow/core/embedding/embedding_manager.cpp @@ -24,8 +24,6 @@ namespace embedding { #ifdef WITH_CUDA -constexpr size_t kDefaultMaxQueryLength = 65536; - KeyValueStore* EmbeddingManager::GetKeyValueStore(const std::string& embedding_name, int64_t rank_id) { std::pair map_key = std::make_pair(embedding_name, rank_id); @@ -63,9 +61,7 @@ void EmbeddingManager::CreateKeyValueStore(const KeyValueStoreOptions& key_value std::unique_ptr cache = NewCache(cache_options.at(i)); store = NewCachedKeyValueStore(std::move(store), std::move(cache)); } - store->ReserveQueryLength(kDefaultMaxQueryLength); - CHECK(key_value_store_map_.emplace(map_key, std::move(store)).second) - << "Can't create an embedding with same name of an existing embedding, the name: " << name; + key_value_store_map_.emplace(map_key, std::move(store)); } void EmbeddingManager::SaveSnapshot(const std::string& embedding_name, int64_t local_rank_id, diff --git a/oneflow/core/embedding/hash_functions.cuh b/oneflow/core/embedding/hash_functions.cuh index 5d865ba22da..1359a496854 100644 --- a/oneflow/core/embedding/hash_functions.cuh +++ b/oneflow/core/embedding/hash_functions.cuh @@ -17,7 +17,6 @@ limitations under the License. 
#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_H_ #include -#include "oneflow/core/common/data_type.h" namespace oneflow { @@ -39,14 +38,14 @@ static const uint64_t PRIME64_5 = #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) -OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { +__device__ __host__ __forceinline__ uint64_t XXH64_round(uint64_t acc, uint64_t input) { acc += input * PRIME64_2; acc = XXH_rotl64(acc, 31); acc *= PRIME64_1; return acc; } -OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { +__device__ __host__ __forceinline__ uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { uint64_t acc = seed + PRIME64_5; acc += sizeof(uint64_t); acc = acc ^ XXH64_round(0, v); @@ -69,30 +68,33 @@ static const size_t kLruCacheHashSeed = 5; } // namespace struct ShardingHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } - OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } - OF_DEVICE_FUNC size_t operator()(int32_t v) { - return xxh64_uint64(static_cast(v), kShardingHashSeed); - } - OF_DEVICE_FUNC size_t operator()(int64_t v) { - return xxh64_uint64(static_cast(v), kShardingHashSeed); + __device__ __host__ __forceinline__ size_t operator()(uint64_t v) { + return xxh64_uint64(v, kShardingHashSeed); } }; struct LocalUniqueHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } + __device__ __host__ __forceinline__ size_t operator()(uint64_t v) { + return xxh64_uint64(v, kLocalUniqueHashSeed); + } }; struct GlobalUniqueHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } + __device__ __host__ __forceinline__ size_t operator()(uint64_t v) { + return xxh64_uint64(v, kGlobalUniqueHashSeed); + } }; struct FullCacheHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } + __device__ __host__ __forceinline__ size_t operator()(uint64_t v) { + return xxh64_uint64(v, kFullCacheHashSeed); + } }; struct LruCacheHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } + __device__ __host__ __forceinline__ size_t operator()(uint64_t v) { + return xxh64_uint64(v, kLruCacheHashSeed); + } }; } // namespace embedding diff --git a/oneflow/core/embedding/key_value_store_options.h b/oneflow/core/embedding/key_value_store_options.h index 2ee4d7d7825..cc663f3c1a7 100644 --- a/oneflow/core/embedding/key_value_store_options.h +++ b/oneflow/core/embedding/key_value_store_options.h @@ -133,7 +133,7 @@ class KeyValueStoreOptions final { } CHECK(persistent_table.contains("physical_block_size")); CHECK(persistent_table["physical_block_size"].is_number()); - persistent_table_physical_block_size_ = persistent_table["physical_block_size"].get(); + persistent_table_phisical_block_size_ = persistent_table["physical_block_size"].get(); if (persistent_table.contains("capacity_hint")) { CHECK(persistent_table["capacity_hint"].is_number()); persistent_table_capacity_hint_ = persistent_table["capacity_hint"].get(); @@ -148,7 +148,7 @@ class KeyValueStoreOptions final { int64_t LineSize() const { return line_size_; } const std::vector& GetCachesOptions() const { return cache_options_; } const std::vector& PersistentTablePaths() const { return persistent_table_paths_; } - int64_t PersistentTablePhysicalBlockSize() const { return persistent_table_physical_block_size_; } + int64_t PersistentTablePhysicalBlockSize() const { 
return persistent_table_phisical_block_size_; } int64_t PersistentTableCapacityHint() const { return persistent_table_capacity_hint_; } bool IsFullCache() const { if (cache_options_.size() > 0 && cache_options_.at(0).policy == CacheOptions::Policy::kFull) { @@ -163,7 +163,7 @@ class KeyValueStoreOptions final { std::string name_; int64_t line_size_; std::vector persistent_table_paths_; - int64_t persistent_table_physical_block_size_; + int64_t persistent_table_phisical_block_size_; int64_t persistent_table_capacity_hint_; std::vector cache_options_; }; diff --git a/oneflow/core/embedding/key_value_store_test.cpp b/oneflow/core/embedding/key_value_store_test.cpp index bb00f53e5e6..1caed72bbad 100644 --- a/oneflow/core/embedding/key_value_store_test.cpp +++ b/oneflow/core/embedding/key_value_store_test.cpp @@ -15,7 +15,6 @@ limitations under the License. */ #include "oneflow/core/embedding/persistent_table_key_value_store.h" #include "oneflow/core/embedding/cached_key_value_store.h" -#include "oneflow/core/embedding/mock_key_value_store.h" #include "oneflow/core/embedding/cache.h" #include "oneflow/core/device/cuda_util.h" #include @@ -245,22 +244,6 @@ TEST(CachedKeyValueStore, Full) { Global::Delete(); } -TEST(MockKeyValueStore, Mock) { - if (!HasCudaDevice()) { return; } - Global::New(); - MockKeyValueStoreOptions store_options{}; - std::string path = CreateTempDirectory(); - uint32_t value_length = 128; - store_options.value_size = value_length * sizeof(float); - store_options.key_size = GetSizeOfDataType(DataType::kUInt64); - std::unique_ptr store = NewMockKeyValueStore(store_options); - store->ReserveQueryLength(128); - TestKeyValueStore(store.get(), 1024, 1024, value_length); - store.reset(); - PosixFile::RecursiveDelete(path); - Global::Delete(); -} - #endif // WITH_CUDA } // namespace diff --git a/oneflow/core/embedding/mock_key_value_store.cu b/oneflow/core/embedding/mock_key_value_store.cu deleted file mode 100644 index f62c8779376..00000000000 --- a/oneflow/core/embedding/mock_key_value_store.cu +++ /dev/null @@ -1,249 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/embedding/mock_key_value_store.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace embedding { - -namespace { - -template -class IteratorImpl : public KVIterator { - public: - OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); - IteratorImpl(HashMap* store, uint32_t key_size, uint32_t value_size, - uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, - uint32_t* host_num_buffer) - : store_(store), - pos_(store->begin()), - key_size_(key_size), - value_size_(value_size), - max_query_length_(max_query_length), - host_keys_buffer_(host_keys_buffer), - host_values_buffer_(host_values_buffer), - host_num_buffer_(host_num_buffer) {} - ~IteratorImpl() override = default; - - void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, - void* values) override { - CHECK_LE(n_request, max_query_length_); - auto cuda_stream = stream->As(); - CHECK_JUST(cuda_stream->Sync()); - *host_num_buffer_ = 0; - while (*host_num_buffer_ < n_request && pos_ != store_->end()) { - reinterpret_cast(host_keys_buffer_)[*host_num_buffer_] = pos_->first; - std::memcpy(reinterpret_cast(host_values_buffer_) + *host_num_buffer_ * value_size_, - pos_->second.data(), value_size_); - } - OF_CUDA_CHECK(cudaMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), cudaMemcpyDefault, - cuda_stream->cuda_stream())); - const uint32_t num_keys = *host_num_buffer_; - if (num_keys != 0) { - OF_CUDA_CHECK(cudaMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, - cudaMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(cudaMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, - cudaMemcpyDefault, cuda_stream->cuda_stream())); - } - } - - void Reset() override { pos_ = store_->begin(); } - - private: - HashMap* store_; - typename HashMap::iterator pos_; - uint32_t key_size_; - uint32_t value_size_; - uint32_t max_query_length_; - void* host_keys_buffer_; - void* host_values_buffer_; - uint32_t* host_num_buffer_; -}; - -template -class KeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); - explicit KeyValueStoreImpl(const MockKeyValueStoreOptions& options) - : device_index_(-1), max_query_length_(0) { - OF_CUDA_CHECK(cudaGetDevice(&device_index_)); - key_size_ = options.key_size; - value_size_ = options.value_size; - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_query_values_), - value_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), - sizeof(uint32_t))); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * max_query_length_)); - } - ~KeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(cudaFreeHost(host_query_keys_)); - OF_CUDA_CHECK(cudaFreeHost(host_query_values_)); - OF_CUDA_CHECK(cudaFreeHost(host_missing_indices_)); - } - OF_CUDA_CHECK(cudaFreeHost(host_n_missing_)); - } - - uint32_t KeySize() const override { return key_size_; } - - uint32_t ValueSize() const override { return value_size_; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= 
max_query_length_) { return; } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(cudaFreeHost(host_query_keys_)); - OF_CUDA_CHECK(cudaFreeHost(host_query_values_)); - OF_CUDA_CHECK(cudaFreeHost(host_missing_indices_)); - } - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * query_length)); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - void SaveSnapshot(const std::string& name) override; - - private: - int device_index_; - uint32_t max_query_length_; - uint32_t key_size_; - uint32_t value_size_; - Key* host_query_keys_{}; - uint8_t* host_query_values_{}; - uint32_t* host_n_missing_{}; - uint32_t* host_missing_indices_{}; - HashMap store_; - HashMap> snapshots_; - std::mutex mutex_; -}; - -template -void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { - OF_CUDA_CHECK(cudaMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - OF_CUDA_CHECK(cudaMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, cudaMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - *host_n_missing_ = 0; - for (uint32_t i = 0; i < num_keys; ++i) { - auto it = store_.find(host_query_keys_[i]); - if (it != store_.end()) { - std::memcpy(host_query_values_ + i * value_size_, it->second.data(), value_size_); - } else { - host_missing_indices_[*host_n_missing_] = i; - *host_n_missing_ += 1; - } - } - OF_CUDA_CHECK(cudaMemcpyAsync(values, host_query_values_, num_keys * value_size_, - cudaMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(cudaMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), cudaMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(cudaMemcpyAsync(missing_indices, host_missing_indices_, - (*host_n_missing_) * sizeof(uint32_t), cudaMemcpyDefault, - cuda_stream->cuda_stream())); -} - -template -void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { return; } - OF_CUDA_CHECK(cudaMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, cudaMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(cudaMemcpyAsync(host_query_values_, values, value_size_ * num_keys, - cudaMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - for (uint32_t i = 0; i < num_keys; ++i) { - store_[host_query_keys_[i]] = std::string( - reinterpret_cast(host_query_values_) + i * value_size_, value_size_); - } -} - -template -bool KeyValueStoreImpl::SnapshotExists(const 
std::string& name) { - return snapshots_.find(name) != snapshots_.end(); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - LoadSnapshot(name, nullptr); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name, - const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - store_ = snapshots_[name]; - if (Hook) { - IteratorImpl iterator(&store_, KeySize(), ValueSize(), max_query_length_, host_query_keys_, - host_query_values_, host_n_missing_); - Hook(&iterator); - } -} - -template -void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - snapshots_[name] = store_; -} - -} // namespace - -std::unique_ptr NewMockKeyValueStore(const MockKeyValueStoreOptions& options) { - if (options.key_size == sizeof(uint64_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else if (options.key_size == sizeof(uint32_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace embedding - -} // namespace oneflow diff --git a/oneflow/core/embedding/mock_key_value_store.h b/oneflow/core/embedding/mock_key_value_store.h deleted file mode 100644 index 2e3d53d90e4..00000000000 --- a/oneflow/core/embedding/mock_key_value_store.h +++ /dev/null @@ -1,40 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EMBEDDING_MOCK_KEY_VALUE_STORE_H_ -#define ONEFLOW_CORE_EMBEDDING_MOCK_KEY_VALUE_STORE_H_ - -#include "oneflow/core/embedding/key_value_store.h" - -namespace oneflow { - -namespace embedding { - -#ifdef WITH_CUDA - -struct MockKeyValueStoreOptions { - uint32_t key_size = 0; - uint32_t value_size = 0; -}; - -std::unique_ptr NewMockKeyValueStore(const MockKeyValueStoreOptions& options); - -#endif // WITH_CUDA - -} // namespace embedding - -} // namespace oneflow - -#endif // ONEFLOW_CORE_EMBEDDING_MOCK_KEY_VALUE_STORE_H_ diff --git a/oneflow/core/embedding/persistent_table.cpp b/oneflow/core/embedding/persistent_table.cpp index 8c3eb8050d7..091c5a7b01b 100644 --- a/oneflow/core/embedding/persistent_table.cpp +++ b/oneflow/core/embedding/persistent_table.cpp @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "oneflow/core/embedding/persistent_table.h" #include "oneflow/core/common/util.h" -#include "oneflow/core/embedding/hash_functions.cuh" #ifdef __linux__ @@ -333,9 +332,6 @@ class Worker final { std::thread thread_; }; -template -class SnapshotIteratorImpl; - template class PersistentTableImpl : public PersistentTable { public: @@ -358,10 +354,8 @@ class PersistentTableImpl : public PersistentTable { void LoadSnapshot(const std::string& name, const std::function& Hook) override; void SaveSnapshot(const std::string& name) override; - Iterator* ReadSnapshot(const std::string& name) override; private: - friend class SnapshotIteratorImpl; std::string KeyFilePath(uint64_t chunk_id) const; std::string ValueFilePath(uint64_t chunk_id) const; std::string IndexFilePath(const std::string& name, uint64_t chunk_id) const; @@ -380,7 +374,6 @@ class PersistentTableImpl : public PersistentTable { uint64_t num_logical_blocks_per_chunk_; uint64_t num_values_per_chunk_; uint32_t num_values_per_block_; - uint32_t physical_block_size_; uint32_t logical_block_size_; std::vector>> workers_; @@ -402,7 +395,6 @@ PersistentTableImpl::PersistentTableImpl(const PersistentTableOptio : root_dir_(options.path), key_size_(options.key_size), value_size_(options.value_size), - physical_block_size_(options.physical_block_size), logical_block_size_(GetLogicalBlockSize(options.physical_block_size, value_size_)), blocks_buffer_(options.physical_block_size), writable_key_file_chunk_id_(-1) { @@ -497,8 +489,7 @@ void PersistentTableImpl::Get(uint32_t num_keys, const void* keys, std::lock_guard lock(mutex_); offsets_buffer_.resize(num_keys); void* blocks_ptr = nullptr; - if (value_size_ == logical_block_size_ - && reinterpret_cast(values) % physical_block_size_ == 0) { + if (value_size_ == logical_block_size_) { blocks_ptr = values; } else { blocks_buffer_.Resize(num_keys * logical_block_size_); @@ -581,8 +572,7 @@ void PersistentTableImpl::Put(uint32_t num_keys, const void* keys, const void* values) { std::lock_guard lock(mutex_); const void* blocks_ptr = nullptr; - if (value_size_ == logical_block_size_ - && reinterpret_cast(values) % physical_block_size_ == 0) { + if (value_size_ == logical_block_size_) { blocks_ptr = values; } else { const uint32_t num_blocks = RoundUp(num_keys, num_values_per_block_); @@ -742,12 +732,6 @@ void PersistentTableImpl::SaveSnapshot(const std::string& name) { SaveSnapshotImpl(name); } -template -PersistentTable::Iterator* PersistentTableImpl::ReadSnapshot(const std::string& name) { - return new SnapshotIteratorImpl(this, name, value_size_, logical_block_size_, - num_values_per_block_, num_values_per_chunk_); -} - template void PersistentTableImpl::ParallelFor(size_t total, const ForRange& for_range) { @@ -769,84 +753,6 @@ void PersistentTableImpl::ParallelFor(size_t total, bc.WaitForeverUntilCntEqualZero(); } -template -class SnapshotIteratorImpl : public PersistentTable::Iterator { - public: - OF_DISALLOW_COPY_AND_MOVE(SnapshotIteratorImpl); - SnapshotIteratorImpl(PersistentTableImpl* table, const std::string& snapshot_name, - uint32_t value_size, uint32_t logical_block_size, - uint32_t num_values_per_block, uint64_t num_values_per_chunk) - : table_(table), - snapshot_name_(snapshot_name), - value_size_(value_size), - logical_block_size_(logical_block_size), - num_values_per_block_(num_values_per_block), - num_values_per_chunk_(num_values_per_chunk), - current_chunk_(0) { - const std::string snapshot_list = table_->SnapshotListFilePath(snapshot_name); - std::ifstream 
list_if(snapshot_list); - std::string index_filename; - while (std::getline(list_if, index_filename)) { indices_names_.push_back(index_filename); } - } - ~SnapshotIteratorImpl() override = default; - - void Next(uint32_t num_keys, uint32_t* return_keys, void* keys, void* values) override { - *return_keys = 0; - while (current_chunk_ < indices_names_.size()) { - if (!chunk_iterator_) { - const std::string snapshot_base = table_->SnapshotDirPath(snapshot_name_); - const uint64_t chunk_id = GetChunkId(indices_names_[current_chunk_], kIndexFileNamePrefix); - PosixFile index_file(PosixFile::JoinPath(snapshot_base, indices_names_[current_chunk_]), - O_RDONLY, 0644); - const size_t index_file_size = index_file.Size(); - CHECK_EQ(index_file_size % sizeof(uint64_t), 0); - if (index_file_size == 0) { - current_chunk_ += 1; - continue; - } - const size_t n_entries = index_file_size / sizeof(uint64_t); - indices_file_.reset(new PosixMappedFile(std::move(index_file), index_file_size, PROT_READ)); - PosixFile key_file(table_->KeyFilePath(chunk_id), O_RDONLY, 0644); - keys_file_.reset(new PosixMappedFile(std::move(key_file), key_file.Size(), PROT_READ)); - PosixFile value_file(table_->ValueFilePath(chunk_id), O_RDONLY, 0644); - values_file_.reset( - new PosixMappedFile(std::move(value_file), value_file.Size(), PROT_READ)); - chunk_iterator_.reset(new ChunkIteratorImpl( - value_size_, logical_block_size_, num_values_per_block_, num_values_per_chunk_, - chunk_id, n_entries, static_cast(keys_file_->ptr()), - static_cast(indices_file_->ptr()), values_file_->ptr())); - } - chunk_iterator_->Next(num_keys, return_keys, keys, values); - if (*return_keys == 0) { - chunk_iterator_.reset(); - keys_file_.reset(); - values_file_.reset(); - indices_file_.reset(); - current_chunk_ += 1; - continue; - } else { - return; - } - } - } - - void Reset() override { UNIMPLEMENTED(); } - - private: - PersistentTableImpl* table_; - std::string snapshot_name_; - uint32_t value_size_; - uint32_t logical_block_size_; - uint32_t num_values_per_block_; - uint64_t num_values_per_chunk_; - size_t current_chunk_; - std::vector indices_names_; - std::unique_ptr keys_file_; - std::unique_ptr values_file_; - std::unique_ptr indices_file_; - std::unique_ptr> chunk_iterator_; -}; - template std::unique_ptr DispatchKeyType(const PersistentTableOptions& options) { if (options.key_size == 4) { diff --git a/oneflow/core/embedding/persistent_table.h b/oneflow/core/embedding/persistent_table.h index 39520c6fa2b..febf28675fb 100644 --- a/oneflow/core/embedding/persistent_table.h +++ b/oneflow/core/embedding/persistent_table.h @@ -60,7 +60,6 @@ class PersistentTable { virtual void LoadSnapshot(const std::string& name, const std::function& Hook) = 0; virtual void SaveSnapshot(const std::string& name) = 0; - virtual Iterator* ReadSnapshot(const std::string& name) = 0; }; std::unique_ptr NewPersistentTable(const PersistentTableOptions& options); diff --git a/oneflow/core/ep/cpu/cpu_device_manager.cpp b/oneflow/core/ep/cpu/cpu_device_manager.cpp index cb76db3af90..e72fa7874ca 100644 --- a/oneflow/core/ep/cpu/cpu_device_manager.cpp +++ b/oneflow/core/ep/cpu/cpu_device_manager.cpp @@ -30,7 +30,7 @@ DeviceManagerRegistry* CpuDeviceManager::registry() const { return registry_; } std::shared_ptr CpuDeviceManager::GetDevice(size_t device_index) { std::lock_guard lock(device_mutex_); if (!device_) { device_.reset(new CpuDevice(this)); } - device_->SetNumThreads(device_num_threads_); + dynamic_cast(device_.get())->SetNumThreads(device_num_threads_); return 
device_; } diff --git a/oneflow/core/ep/cpu/cpu_device_manager.h b/oneflow/core/ep/cpu/cpu_device_manager.h index f01e5380d93..f2cc30c6101 100644 --- a/oneflow/core/ep/cpu/cpu_device_manager.h +++ b/oneflow/core/ep/cpu/cpu_device_manager.h @@ -22,8 +22,6 @@ namespace oneflow { namespace ep { -class CpuDevice; - class CpuDeviceManager : public DeviceManager { public: OF_DISALLOW_COPY_AND_MOVE(CpuDeviceManager); @@ -41,7 +39,7 @@ class CpuDeviceManager : public DeviceManager { private: size_t device_num_threads_; std::mutex device_mutex_; - std::shared_ptr device_; + std::shared_ptr device_; DeviceManagerRegistry* registry_; }; diff --git a/oneflow/core/ep/cpu/cpu_stream.cpp b/oneflow/core/ep/cpu/cpu_stream.cpp index 4818d704bc8..4b5efae4c0c 100644 --- a/oneflow/core/ep/cpu/cpu_stream.cpp +++ b/oneflow/core/ep/cpu/cpu_stream.cpp @@ -21,7 +21,7 @@ namespace ep { DeviceType CpuStream::device_type() const { return DeviceType::kCPU; } -CpuDevice* CpuStream::device() const { return device_; } +Device* CpuStream::device() const { return device_; } Maybe CpuStream::Sync() { return Maybe::Ok(); } diff --git a/oneflow/core/ep/cpu/cpu_stream.h b/oneflow/core/ep/cpu/cpu_stream.h index 9171e19d924..b1ec9e72a9b 100644 --- a/oneflow/core/ep/cpu/cpu_stream.h +++ b/oneflow/core/ep/cpu/cpu_stream.h @@ -80,7 +80,7 @@ class CpuNumThreadsGuard { class CpuStream : public Stream { public: OF_DISALLOW_COPY_AND_MOVE(CpuStream); - explicit CpuStream(CpuDevice* device) : device_(device) { + explicit CpuStream(Device* device) : device_(device) { #ifdef WITH_ONEDNN onednn_engine_.reset(new dnnl::engine(dnnl::engine::kind::cpu, 0)); onednn_stream_.reset(new dnnl::stream(*onednn_engine_)); @@ -90,7 +90,7 @@ class CpuStream : public Stream { ~CpuStream() override = default; DeviceType device_type() const override; - CpuDevice* device() const override; + Device* device() const override; Maybe Sync() override; void RecordEvent(Event* event) override; @@ -103,7 +103,7 @@ class CpuStream : public Stream { void ParallelFor(int64_t begin, int64_t end, const F& func, size_t grain_size) { #if OF_CPU_THREADING_RUNTIME != OF_RUNTIME_SEQ auto DivUp = [](int64_t x, int64_t y) { return (x + y - 1) / y; }; - size_t num_threads = device()->GetNumThreads(); + size_t num_threads = dynamic_cast(device())->GetNumThreads(); #endif if (begin >= end) { return; } #if OF_CPU_THREADING_RUNTIME == OF_RUNTIME_OMP @@ -149,7 +149,7 @@ class CpuStream : public Stream { std::unique_ptr onednn_engine_; std::unique_ptr onednn_stream_; #endif - CpuDevice* device_; + Device* device_; static constexpr size_t kParallelForDefaultGrain = 32768; }; diff --git a/oneflow/core/ep/cpu/primitive/add.cpp b/oneflow/core/ep/cpu/primitive/add.cpp index 1389588cd1a..a90bb666655 100644 --- a/oneflow/core/ep/cpu/primitive/add.cpp +++ b/oneflow/core/ep/cpu/primitive/add.cpp @@ -89,7 +89,7 @@ class AddOneDnnImpl : public Add { if (srcs[i] == dst) { LOG(FATAL) << "Only the first parameter can be operated inplace"; } } CpuStream* cpu_stream = stream->As(); - size_t num_threads = cpu_stream->device()->GetNumThreads(); + size_t num_threads = static_cast(cpu_stream->device())->GetNumThreads(); CpuNumThreadsGuard guard(num_threads); dnnl::engine* onednn_engine = stream->As()->onednn_engine(); diff --git a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp index 439576e2442..0ad0ec1ab27 100644 --- a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp +++ 
b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp @@ -179,7 +179,7 @@ class OneDnnBroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { size_t num_src1_dims, const int64_t* src1_dims, const void* src1, void* dst) override { CpuStream* cpu_stream = stream->As(); - size_t num_threads = cpu_stream->device()->GetNumThreads(); + size_t num_threads = static_cast(cpu_stream->device())->GetNumThreads(); CpuNumThreadsGuard guard(num_threads); dnnl::engine* onednn_engine = stream->As()->onednn_engine(); diff --git a/oneflow/core/ep/cpu/primitive/permute.cpp b/oneflow/core/ep/cpu/primitive/permute.cpp index cee1eb8b974..d24e8e07706 100644 --- a/oneflow/core/ep/cpu/primitive/permute.cpp +++ b/oneflow/core/ep/cpu/primitive/permute.cpp @@ -15,8 +15,6 @@ limitations under the License. */ #include "oneflow/core/ep/include/primitive/permute.h" #include "oneflow/core/ep/common/primitive/permute_impl.h" -#include "oneflow/core/ep/cpu/cpu_stream.h" -#include "oneflow/core/ep/cpu/cpu_device.h" namespace oneflow { @@ -66,73 +64,6 @@ class PermuteImpl : public Permute { } }; -#ifdef WITH_ONEDNN -constexpr size_t kMaxOneDnnMovementSize = 4; -constexpr size_t kMaxOneDnnMapSize = 5; -uint32_t OnednnDatatypeTagMap[kMaxOneDnnMapSize] = {0, dnnl_u8, dnnl_f16, 0, dnnl_s32}; -class OneDnnPermuteImpl : public Permute { - public: - OF_DISALLOW_COPY_AND_MOVE(OneDnnPermuteImpl); - OneDnnPermuteImpl() = default; - ~OneDnnPermuteImpl() override = default; - - using Permute::Launch; - void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims, - const void* src, const int* permutation, void* dst) override { - CHECK_LE(num_dims, kMaxNumDims); - CHECK_GT(num_dims, 0); - CpuStream* cpu_stream = stream->As(); - size_t num_threads = static_cast(cpu_stream->device())->GetNumThreads(); // NOLINT - CpuNumThreadsGuard guard(num_threads); - - dnnl::engine* onednn_engine = stream->As()->onednn_engine(); - dnnl::stream* onednn_stream = stream->As()->onednn_stream(); - size_t onednn_num_dims = num_dims; - dnnl::memory::dims onednn_dims(kMaxNumDims + 1, 0); - dnnl::memory::dims onednn_permute(kMaxNumDims + 1, 0); - dnnl::memory::dims src_stride(kMaxNumDims + 1, 0); - dnnl::memory::dims dst_stride(kMaxNumDims + 1, 0); - for (int64_t dim = onednn_num_dims - 1; dim >= 0; dim--) { - onednn_dims[dim] = src_dims[dim]; - onednn_permute[dim] = permutation[dim]; - } - size_t movement_size = GetSizeOfDataType(data_type); - if (movement_size > kMaxOneDnnMovementSize) { - onednn_dims[onednn_num_dims] = movement_size / kMaxOneDnnMovementSize; - onednn_permute[onednn_num_dims] = onednn_num_dims; - onednn_num_dims = onednn_num_dims + 1; - movement_size = kMaxOneDnnMovementSize; - } - onednn_dims.resize(onednn_num_dims); - - src_stride[onednn_num_dims - 1] = 1; - dst_stride[onednn_permute[onednn_num_dims - 1]] = 1; - for (int64_t i = onednn_num_dims - 2; i >= 0; i--) { - src_stride[i] = src_stride[i + 1] * onednn_dims[i + 1]; - dst_stride[onednn_permute[i]] = - dst_stride[onednn_permute[i + 1]] * onednn_dims[onednn_permute[i + 1]]; - } - - dnnl::memory::data_type onednn_data_type = - static_cast(OnednnDatatypeTagMap[movement_size]); - // The reorder primitive requires the source and destination tensors to have the same shape. - // Implicit broadcasting is not supported. 
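The removed OneDnnPermuteImpl above expresses the permutation purely through strides: the source and destination memory descriptors share one set of dims (indexed by source axis), the source gets ordinary row-major strides, and the destination gets the strides of a tensor that is contiguous in permuted order, so the reorder primitive performs the data movement. A minimal standalone sketch of that stride computation follows; the helper name ComputePermuteStrides and the small driver are assumptions for illustration, not part of this patch.

#include <cstdint>
#include <iostream>
#include <vector>

// Row-major strides of the source, and strides of a destination that is
// contiguous in permuted order, both expressed over the *source* axes so the
// two descriptors share one shape (as the reorder primitive requires).
void ComputePermuteStrides(const std::vector<int64_t>& dims, const std::vector<int>& perm,
                           std::vector<int64_t>* src_stride, std::vector<int64_t>* dst_stride) {
  const int n = static_cast<int>(dims.size());
  src_stride->assign(n, 0);
  dst_stride->assign(n, 0);
  (*src_stride)[n - 1] = 1;
  (*dst_stride)[perm[n - 1]] = 1;
  for (int i = n - 2; i >= 0; --i) {
    (*src_stride)[i] = (*src_stride)[i + 1] * dims[i + 1];
    (*dst_stride)[perm[i]] = (*dst_stride)[perm[i + 1]] * dims[perm[i + 1]];
  }
}

int main() {
  // Permute a 2x3x4 tensor with permutation (2, 0, 1): the output shape is 4x2x3.
  const std::vector<int64_t> dims = {2, 3, 4};
  const std::vector<int> perm = {2, 0, 1};
  std::vector<int64_t> src_stride, dst_stride;
  ComputePermuteStrides(dims, perm, &src_stride, &dst_stride);
  for (int i = 0; i < 3; ++i) {
    std::cout << "axis " << i << ": src_stride=" << src_stride[i]
              << " dst_stride=" << dst_stride[i] << "\n";
  }
  // Expected: src_stride = {12, 4, 1}, dst_stride = {3, 1, 6}.
  return 0;
}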
- auto src_mem_desc = dnnl::memory::desc(onednn_dims, onednn_data_type, src_stride); - auto dst_mem_desc = dnnl::memory::desc(onednn_dims, onednn_data_type, dst_stride); - auto src_mem = dnnl::memory(src_mem_desc, *onednn_engine, const_cast(src)); - auto dst_mem = dnnl::memory(dst_mem_desc, *onednn_engine, dst); - auto reorder_primitive_desc = - dnnl::reorder::primitive_desc(*onednn_engine, src_mem_desc, *onednn_engine, dst_mem_desc); - auto reorder_primitive = dnnl::reorder(reorder_primitive_desc); - std::unordered_map reorder_args{{DNNL_ARG_SRC, src_mem}, - {DNNL_ARG_DST, dst_mem}}; - reorder_primitive.execute(*onednn_stream, reorder_args); - onednn_stream->wait(); - } -}; - -#endif // WITH_ONEDNN - class PermuteFactoryImpl : public PermuteFactory { public: OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl); @@ -141,12 +72,7 @@ class PermuteFactoryImpl : public PermuteFactory { std::unique_ptr New(size_t max_num_dims) override { if (max_num_dims <= kMaxNumDims) { -#ifdef WITH_ONEDNN - return std::unique_ptr(new OneDnnPermuteImpl()); -#else return std::unique_ptr(new PermuteImpl()); -#endif // WITH_ONEDNN - } else { return nullptr; } diff --git a/oneflow/core/ep/cuda/cuda_device_manager.cpp b/oneflow/core/ep/cuda/cuda_device_manager.cpp index 624c2e48a74..cf221e329e9 100644 --- a/oneflow/core/ep/cuda/cuda_device_manager.cpp +++ b/oneflow/core/ep/cuda/cuda_device_manager.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/ep/cuda/cuda_device_manager.h" +#include "oneflow/core/ep/cuda/cuda_device.h" #include "oneflow/core/device/cuda_util.h" #ifdef WITH_CUDA diff --git a/oneflow/core/ep/cuda/cuda_device_manager.h b/oneflow/core/ep/cuda/cuda_device_manager.h index 1463d085d0e..88ae1f6b86d 100644 --- a/oneflow/core/ep/cuda/cuda_device_manager.h +++ b/oneflow/core/ep/cuda/cuda_device_manager.h @@ -21,9 +21,8 @@ limitations under the License. #ifdef WITH_CUDA namespace oneflow { -namespace ep { -class CudaDevice; +namespace ep { class CudaDeviceManager : public DeviceManager { public: @@ -40,7 +39,7 @@ class CudaDeviceManager : public DeviceManager { private: std::mutex devices_mutex_; - std::vector> devices_; + std::vector> devices_; DeviceManagerRegistry* registry_; }; diff --git a/oneflow/core/ep/cuda/cuda_stream.cpp b/oneflow/core/ep/cuda/cuda_stream.cpp index 3970462fa24..5c7625a893a 100644 --- a/oneflow/core/ep/cuda/cuda_stream.cpp +++ b/oneflow/core/ep/cuda/cuda_stream.cpp @@ -141,7 +141,7 @@ Maybe CudaStream::OnExecutionContextTeardown() { return Maybe::Ok(); DeviceType CudaStream::device_type() const { return DeviceType::kCUDA; } -CudaDevice* CudaStream::device() const { return device_; } +Device* CudaStream::device() const { return device_; } Maybe CudaStream::Sync() { cudaError_t err = cudaStreamSynchronize(cuda_stream_); diff --git a/oneflow/core/ep/cuda/cuda_stream.h b/oneflow/core/ep/cuda/cuda_stream.h index efea18ad5d1..c0fbba966cf 100644 --- a/oneflow/core/ep/cuda/cuda_stream.h +++ b/oneflow/core/ep/cuda/cuda_stream.h @@ -17,7 +17,6 @@ limitations under the License. 
#define ONEFLOW_CORE_EP_CUDA_CUDA_STREAM_H_ #include "oneflow/core/ep/include/stream.h" -#include "oneflow/core/ep/cuda/cuda_device.h" #ifdef WITH_CUDA @@ -76,7 +75,7 @@ class CudaStream : public Stream { static constexpr uint32_t kDefaultBlockSize = 256; DeviceType device_type() const override; - CudaDevice* device() const override; + Device* device() const override; Maybe Sync() override; void RecordEvent(Event* event) override; diff --git a/oneflow/core/framework/consistency_check.cpp b/oneflow/core/framework/consistency_check.cpp index 259c6af7577..acaaf235395 100644 --- a/oneflow/core/framework/consistency_check.cpp +++ b/oneflow/core/framework/consistency_check.cpp @@ -29,7 +29,7 @@ namespace oneflow { namespace { -struct FlatMetaInfoConsistency; +class FlatMetaInfoConsistency; class CheckMetaInfoConsistencyAsyncTransportCtx : public AsyncTransportCtx { public: @@ -223,8 +223,8 @@ NonRecursiveMetaInfoConsistencyCheckScope::~NonRecursiveMetaInfoConsistencyCheck Maybe MetaInfoConsistencyCheck(const Symbol& placement, const Optional>& nd_sbp, const Optional>& grad_nd_sbp, - const size_t debug_level, bool force_check) { - if ((IsEnvEnabled(debug_level) || force_check) && !IsMetaInfoConsistencyCheckDisable()) { + const size_t debug_level) { + if (IsEnvEnabled(debug_level) && !IsMetaInfoConsistencyCheckDisable()) { JUST(MetaInfoConsistencyCheckUtil(placement, nd_sbp, grad_nd_sbp)); } return Maybe::Ok(); @@ -232,8 +232,8 @@ Maybe MetaInfoConsistencyCheck(const Symbol& placement, Maybe MetaInfoConsistencyCheck(const Symbol& placement, const Optional>& nd_sbp, - const size_t debug_level, bool force_check) { - if ((IsEnvEnabled(debug_level) || force_check) && !IsMetaInfoConsistencyCheckDisable()) { + const size_t debug_level) { + if (IsEnvEnabled(debug_level) && !IsMetaInfoConsistencyCheckDisable()) { JUST(MetaInfoConsistencyCheckUtil(placement, nd_sbp, Optional>())); } return Maybe::Ok(); @@ -242,22 +242,22 @@ Maybe MetaInfoConsistencyCheck(const Symbol& placement, Maybe MetaInfoConsistencyCheck(const Symbol& placement, const std::vector>& sbp_tuple, const std::vector>& grad_sbp_tuple, - const size_t debug_level, bool force_check) { + const size_t debug_level) { Optional> nd_sbp; Optional> grad_nd_sbp; if (!sbp_tuple.empty()) { grad_nd_sbp = JUST(GetNdSbp(sbp_tuple)); } if (!grad_sbp_tuple.empty()) { grad_nd_sbp = JUST(GetNdSbp(grad_sbp_tuple)); } - JUST(MetaInfoConsistencyCheck(placement, nd_sbp, grad_nd_sbp, debug_level, force_check)); + JUST(MetaInfoConsistencyCheck(placement, nd_sbp, grad_nd_sbp, debug_level)); return Maybe::Ok(); } Maybe MetaInfoConsistencyCheck(const Symbol& placement, const std::vector>& sbp_tuple, - const size_t debug_level, bool force_check) { + const size_t debug_level) { Optional> nd_sbp; Optional> grad_nd_sbp; if (!sbp_tuple.empty()) { grad_nd_sbp = JUST(GetNdSbp(sbp_tuple)); } - JUST(MetaInfoConsistencyCheck(placement, nd_sbp, grad_nd_sbp, debug_level, force_check)); + JUST(MetaInfoConsistencyCheck(placement, nd_sbp, grad_nd_sbp, debug_level)); return Maybe::Ok(); } diff --git a/oneflow/core/framework/consistency_check.h b/oneflow/core/framework/consistency_check.h index ebb768b5381..698c437b29c 100644 --- a/oneflow/core/framework/consistency_check.h +++ b/oneflow/core/framework/consistency_check.h @@ -36,20 +36,20 @@ Maybe DataConsistencyCheck(const void* buffer_ptr, size_t buffer_size, Maybe MetaInfoConsistencyCheck(const Symbol& placement, const Optional>& nd_sbp, const Optional>& grad_nd_sbp, - const size_t debug_level, bool force_check); + const size_t 
debug_level); Maybe MetaInfoConsistencyCheck(const Symbol& placement, const Optional>& nd_sbp, - const size_t debug_level, bool force_check); + const size_t debug_level); Maybe MetaInfoConsistencyCheck(const Symbol& placement, const std::vector>& sbp_tuple, const std::vector>& grad_sbp_tuple, - const size_t debug_level, bool force_check); + const size_t debug_level); Maybe MetaInfoConsistencyCheck(const Symbol& placement, const std::vector>& sbp_tuple, - const size_t debug_level, bool force_check); + const size_t debug_level); } // namespace oneflow diff --git a/oneflow/core/framework/multi_client_session_context.cpp b/oneflow/core/framework/multi_client_session_context.cpp index 01559d71b83..e26f32c2ed1 100644 --- a/oneflow/core/framework/multi_client_session_context.cpp +++ b/oneflow/core/framework/multi_client_session_context.cpp @@ -140,7 +140,6 @@ Maybe MultiClientSessionContext::TryInit(const std::string& config_proto_s } Maybe MultiClientSessionContext::UpdateResource(const Resource& reso_proto) { - CHECK_OR_RETURN(is_inited_) << " session must be inited when updating resource."; CHECK_NOTNULL_OR_RETURN((Global::Get())); Global::Get()->Update(reso_proto); return Maybe::Ok(); diff --git a/oneflow/core/framework/nn_graph.cpp b/oneflow/core/framework/nn_graph.cpp index 3cd9d979c0c..a64286c344d 100644 --- a/oneflow/core/framework/nn_graph.cpp +++ b/oneflow/core/framework/nn_graph.cpp @@ -348,9 +348,8 @@ Maybe NNGraph::GetVariableRealBlobAfterSyncPlan() { auto lazy_mode_disabled_guard = LazyMode::Guard(/*is_enabled*/ false); std::vector> grad_sbp_tuple; // To consistent from a local or consistent tensor. - bool check_meta = load_tensor_iter->second->is_consistent() ? false : true; tensor = JUST(one::functional::ToConsistent(load_tensor_iter->second, placement, *sbp_tuple, - grad_sbp_tuple, check_meta)); + grad_sbp_tuple)); JUST(vm::CurrentRankSync()); VLOG(2) << "Lazy nn.Graph name " << name_ << " op: " << op_attribute.op_conf().name() << " created in JobPass, nn.Graph has loaded the tensor from state dict for this " @@ -383,9 +382,8 @@ Maybe NNGraph::GetVariableRealBlobAfterSyncPlan() { } { auto lazy_mode_disabled_guard = LazyMode::Guard(/* is_enabled */ false); - const auto& new_tensor = JUST( - one::functional::ToConsistent(tensor, JUST(tensor->parallel_desc()), - optimized_sbp_parallels, {}, /* check_meta */ false)); + const auto& new_tensor = JUST(one::functional::ToConsistent( + tensor, JUST(tensor->parallel_desc()), optimized_sbp_parallels, {})); JUST(vm::CurrentRankSync()); // Use tensor.set_data inferface and make new TensorImpl instead of the old one. 
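The consistency_check.cpp hunks above revert MetaInfoConsistencyCheck to be gated only by an environment-controlled debug level; the removed variant additionally accepted a per-call force_check flag and still honored a global disable switch. A compact sketch of that gating pattern, with placeholder environment-variable and helper names rather than OneFlow's:

#include <cstdlib>
#include <iostream>

// The check runs when an environment-controlled debug level reaches the
// requested threshold, or when the caller explicitly forces it, and can still
// be disabled globally. Names here are illustrative placeholders.
static int DebugLevelFromEnv() {
  const char* v = std::getenv("MY_DEBUG_LEVEL");
  return v ? std::atoi(v) : 0;
}

bool ShouldRunConsistencyCheck(int required_level, bool force_check, bool globally_disabled) {
  const bool level_enabled = DebugLevelFromEnv() >= required_level;
  return (level_enabled || force_check) && !globally_disabled;
}

int main() {
  if (ShouldRunConsistencyCheck(/*required_level=*/1, /*force_check=*/false,
                                /*globally_disabled=*/false)) {
    std::cout << "running meta-info consistency check\n";  // expensive cross-rank path
  } else {
    std::cout << "skipping meta-info consistency check\n";
  }
  return 0;
}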
JUST(tensor->set_data(new_tensor)); diff --git a/oneflow/core/framework/op_expr_grad_function.h b/oneflow/core/framework/op_expr_grad_function.h index 02dacf23ebc..d4f3561a83b 100644 --- a/oneflow/core/framework/op_expr_grad_function.h +++ b/oneflow/core/framework/op_expr_grad_function.h @@ -192,8 +192,6 @@ class OpExprGradClosure { return impl_->ApplyIf(state_.get(), out_grads, in_grads); } - const std::shared_ptr& state() const { return state_; } - private: std::shared_ptr impl_; std::shared_ptr state_; diff --git a/oneflow/core/framework/op_interpreter/dispatch_frame.cpp b/oneflow/core/framework/op_interpreter/dispatch_frame.cpp index 656494b731d..4757c272e71 100644 --- a/oneflow/core/framework/op_interpreter/dispatch_frame.cpp +++ b/oneflow/core/framework/op_interpreter/dispatch_frame.cpp @@ -23,8 +23,8 @@ namespace oneflow { return &frame_str; } -/* static */ const std::string& DispatchFrame::get_str() { return *get_str_ptr(); } +/* static */ std::string DispatchFrame::get_str() { return *get_str_ptr(); } -/* static */ void DispatchFrame::set_str(const std::string& str) { *get_str_ptr() = str; } +/* static */ void DispatchFrame::set_str(std::string str) { *get_str_ptr() = std::move(str); } } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/dispatch_frame.h b/oneflow/core/framework/op_interpreter/dispatch_frame.h index 8ef97d465a5..9068bf29c1f 100644 --- a/oneflow/core/framework/op_interpreter/dispatch_frame.h +++ b/oneflow/core/framework/op_interpreter/dispatch_frame.h @@ -26,8 +26,7 @@ class DispatchFrame { DispatchFrame() = delete; ~DispatchFrame() = delete; - static const std::string& get_str(); - static void set_str(const std::string& str); + static std::string get_str(); class Guard { public: @@ -42,6 +41,7 @@ class DispatchFrame { private: static std::string* get_str_ptr(); + static void set_str(std::string str); }; } // namespace oneflow diff --git a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp index b9367514689..733a6705df5 100644 --- a/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp @@ -118,7 +118,7 @@ Maybe Interpret(const UserOpExpr& user_op_expr, const TensorTuple& inputs, if (inputs.empty()) { // check consistency placement and nd_sbp, do not check in non-src op because it is assumed that // InferSbp in op is a deterministic algorithm - JUST(MetaInfoConsistencyCheck(parallel_desc, ctx.nd_sbp, 1, /* force_check */ false)); + JUST(MetaInfoConsistencyCheck(parallel_desc, ctx.nd_sbp, 1)); const auto& infer_args = JUST(SrcOpConsistentTensorMetaInferArgs::New(ctx.attrs, parallel_desc, JUST(ctx.nd_sbp))); result = JUST(user_op_expr.mut_consistent_tensor_infer_cache()->GetOrInfer(*infer_args)); diff --git a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp index 54c2ed95884..f75228f9004 100644 --- a/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp @@ -719,7 +719,7 @@ Maybe LazyInterpreterApplyImplForSourceUserOpExpr(const UserOpExpr& op_exp CHECK_OR_RETURN(!ctx.device.has_value()); const auto& parallel_desc_sym = JUST(ctx.parallel_desc); parallel_desc = parallel_desc_sym.shared_from_symbol(); - JUST(MetaInfoConsistencyCheck(parallel_desc_sym, ctx.nd_sbp, 1, /* force_check */ false)); + 
JUST(MetaInfoConsistencyCheck(parallel_desc_sym, ctx.nd_sbp, 1)); is_local = false; } else { // NOTE(chengcheng): local diff --git a/oneflow/core/framework/op_interpreter/op_interpreter.cpp b/oneflow/core/framework/op_interpreter/op_interpreter.cpp index 32474bb50ac..778b2341ab2 100644 --- a/oneflow/core/framework/op_interpreter/op_interpreter.cpp +++ b/oneflow/core/framework/op_interpreter/op_interpreter.cpp @@ -99,16 +99,16 @@ Maybe AutogradInterpreter::Apply(const OpExpr& op_expr, const TensorTuple& std::shared_ptr grad_closure(nullptr); if (requires_grad && !LazyMode::is_enabled()) { grad_closure = JUST(op_expr.GetOrCreateOpGradClosure()); - auto backward_fn = std::make_shared(); - backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, - bool create_graph) -> Maybe { - autograd::AutoGradMode mode(create_graph); - JUST(grad_closure->Apply(out_grads, in_grads)); - return Maybe::Ok(); - }; - backward_fn->status = [=]() { return grad_closure->state()->SavedTensors().size() > 0; }; - JUST(GetThreadLocalAutogradEngine()->AddNode(op_expr.op_type_name() + "_backward", backward_fn, - inputs, outputs)); + auto backward_fn = + std::make_shared(const TensorTuple&, TensorTuple*, bool)>>( + [=](const TensorTuple& out_grads, TensorTuple* in_grads, + bool create_graph) -> Maybe { + autograd::AutoGradMode mode(create_graph); + JUST(grad_closure->Apply(out_grads, in_grads)); + return Maybe::Ok(); + }); + JUST(GetThreadLocalAutogradEngine()->AddBackwardFuncPtr(op_expr.op_type_name() + "_backward", + backward_fn, inputs, outputs)); } // Update outputs autograd meta // Note: if requires_grad is True, we will create a new autograd meta for each output diff --git a/oneflow/core/framework/tensor.h b/oneflow/core/framework/tensor.h index b95dbd3c280..e44305cde91 100644 --- a/oneflow/core/framework/tensor.h +++ b/oneflow/core/framework/tensor.h @@ -114,17 +114,8 @@ class Tensor : public std::enable_shared_from_this { virtual Maybe AsMirroredTensor() = 0; virtual Maybe AsConsistentTensor() = 0; - // The same tensor instance should share the python object to ensure that - // their id are consistent in Python. 
That is if x and y are hold the same tensor, - // then `id(x)` should equal to `id(y)` - void* pyobject() const { return pyobject_; } - void set_pyobject(void* object) { pyobject_ = object; } - protected: - Tensor() : pyobject_(nullptr) {} - - private: - void* pyobject_; + Tensor() = default; }; class StaticZerosTensor final : public Tensor { diff --git a/oneflow/core/framework/tensor_methods.cpp b/oneflow/core/framework/tensor_methods.cpp index df52daae301..eaa58513674 100644 --- a/oneflow/core/framework/tensor_methods.cpp +++ b/oneflow/core/framework/tensor_methods.cpp @@ -102,20 +102,20 @@ Maybe Reshape(const std::shared_ptr& input, const Shape& target_ if (autograd::GradMode::is_enabled() && input->requires_grad()) { Shape input_shape(input->shape()->dim_vec()); - auto backward_fn = std::make_shared(); - backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, - bool create_graph) -> Maybe { - autograd::AutoGradMode mode(create_graph); - CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg) - in_grads->resize(1); - *JUST(oneflow::VectorAt(in_grads, 0)) = - JUST(functional::Reshape(JUST(oneflow::VectorAt(out_grads, 0)), input_shape)); - return Maybe::Ok(); - }; - backward_fn->status = []() { return false; }; + auto backward_fn = + std::make_shared(const TensorTuple&, TensorTuple*, bool)>>( + [=](const TensorTuple& out_grads, TensorTuple* in_grads, + bool create_graph) -> Maybe { + autograd::AutoGradMode mode(create_graph); + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + in_grads->resize(1); + *JUST(oneflow::VectorAt(in_grads, 0)) = + JUST(functional::Reshape(JUST(oneflow::VectorAt(out_grads, 0)), input_shape)); + return Maybe::Ok(); + }); TensorTuple outputs{output}; - JUST(GetThreadLocalAutogradEngine()->AddNode("view::reshape_backward", backward_fn, {input}, - &outputs)); + JUST(GetThreadLocalAutogradEngine()->AddBackwardFuncPtr("view::reshape_backward", backward_fn, + {input}, &outputs)); } return output; } @@ -156,20 +156,21 @@ Maybe Slice(const std::shared_ptr& input, const std::vectorrequires_grad()) { - auto backward_fn = std::make_shared(); - backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, - bool create_graph) -> Maybe { - autograd::AutoGradMode mode(create_graph); - CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg) - in_grads->resize(1); - (*in_grads)[0] = JUST(functional::SliceGrad( - JUST(VectorAt(out_grads, 0)), Shape(input->shape()->dim_vec()), starts, ends, steps)); - return Maybe::Ok(); - }; - backward_fn->status = []() { return true; }; + auto backward_fn = + std::make_shared(const TensorTuple&, TensorTuple*, bool)>>( + [=](const TensorTuple& out_grads, TensorTuple* in_grads, + bool create_graph) -> Maybe { + autograd::AutoGradMode mode(create_graph); + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + in_grads->resize(1); + (*in_grads)[0] = JUST(functional::SliceGrad(JUST(VectorAt(out_grads, 0)), + Shape(input->shape()->dim_vec()), starts, + ends, steps)); + return Maybe::Ok(); + }); TensorTuple outputs{output}; - JUST(GetThreadLocalAutogradEngine()->AddNode("view::slice_backward", backward_fn, {input}, - &outputs)); + JUST(GetThreadLocalAutogradEngine()->AddBackwardFuncPtr("view::slice_backward", backward_fn, + {input}, &outputs)); } return output; } @@ -204,20 +205,20 @@ Maybe Unsqueeze(const std::shared_ptr& input, const int32_t& exp JUST(BasicView(input, Shape(target_dim_vec), Stride(target_stride_vec), storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { 
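The op_interpreter.cpp and tensor_methods.cpp hunks above swap a plain std::function callback for a small BackwardFunction object that pairs the gradient computation (body) with a status query the engine can ask without running it. A self-contained sketch of that shape follows; the BackwardFunction members and the AddNode name mirror the removed lines, while the toy engine and the example gradient are assumptions for illustration.

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using TensorTuple = std::vector<double>;  // stand-in for oneflow::one::TensorTuple

// Mirrors the removed BackwardFunction: a callable body plus a status callback
// the engine can query without executing the backward pass.
struct BackwardFunction {
  std::function<void(const TensorTuple& out_grads, TensorTuple* in_grads, bool create_graph)> body;
  std::function<bool()> status;
};

class ToyAutogradEngine {
 public:
  void AddNode(const std::string& name, std::shared_ptr<BackwardFunction> fn) {
    nodes_.emplace_back(name, std::move(fn));
  }
  void Backward(const TensorTuple& out_grads) {
    for (auto it = nodes_.rbegin(); it != nodes_.rend(); ++it) {
      TensorTuple in_grads;
      it->second->body(out_grads, &in_grads, /*create_graph=*/false);
      std::cout << it->first << " has_saved_state=" << it->second->status()
                << " grad=" << in_grads.at(0) << "\n";
    }
  }

 private:
  std::vector<std::pair<std::string, std::shared_ptr<BackwardFunction>>> nodes_;
};

int main() {
  ToyAutogradEngine engine;
  const double scale = 3.0;  // captured state, analogous to tensors saved for backward
  auto fn = std::make_shared<BackwardFunction>();
  fn->body = [scale](const TensorTuple& out_grads, TensorTuple* in_grads, bool /*create_graph*/) {
    in_grads->resize(1);
    (*in_grads)[0] = out_grads.at(0) * scale;  // d(scale * x)/dx = scale
  };
  fn->status = [] { return true; };
  engine.AddNode("scalar_mul_backward", fn);
  engine.Backward({1.0});
  return 0;
}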
- auto backward_fn = std::make_shared(); - backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, - bool create_graph) -> Maybe { - autograd::AutoGradMode mode(create_graph); - CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg) - in_grads->resize(1); - *JUST(oneflow::VectorAt(in_grads, 0)) = - JUST(functional::Reshape(JUST(oneflow::VectorAt(out_grads, 0)), *shape)); - return Maybe::Ok(); - }; - backward_fn->status = []() { return false; }; + auto backward_fn = + std::make_shared(const TensorTuple&, TensorTuple*, bool)>>( + [=](const TensorTuple& out_grads, TensorTuple* in_grads, + bool create_graph) -> Maybe { + autograd::AutoGradMode mode(create_graph); + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + in_grads->resize(1); + *JUST(oneflow::VectorAt(in_grads, 0)) = + JUST(functional::Reshape(JUST(oneflow::VectorAt(out_grads, 0)), *shape)); + return Maybe::Ok(); + }); TensorTuple outputs{output}; - JUST(GetThreadLocalAutogradEngine()->AddNode("view::unsqueeze_backward", backward_fn, {input}, - &outputs)); + JUST(GetThreadLocalAutogradEngine()->AddBackwardFuncPtr("view::unsqueeze_backward", backward_fn, + {input}, &outputs)); } return output; } @@ -253,20 +254,20 @@ Maybe Squeeze(const std::shared_ptr& input, JUST(BasicView(input, Shape(target_dim_vec), Stride(target_stride_vec), storage_offset)); if (autograd::GradMode::is_enabled() && input->requires_grad()) { - auto backward_fn = std::make_shared(); - backward_fn->body = [=](const TensorTuple& out_grads, TensorTuple* in_grads, - bool create_graph) -> Maybe { - autograd::AutoGradMode mode(create_graph); - CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg) - in_grads->resize(1); - *JUST(oneflow::VectorAt(in_grads, 0)) = JUST(functional::Reshape( - JUST(oneflow::VectorAt(out_grads, 0)), Shape(input->shape()->dim_vec()))); - return Maybe::Ok(); - }; - backward_fn->status = []() { return true; }; + auto backward_fn = + std::make_shared(const TensorTuple&, TensorTuple*, bool)>>( + [=](const TensorTuple& out_grads, TensorTuple* in_grads, + bool create_graph) -> Maybe { + autograd::AutoGradMode mode(create_graph); + CHECK_EQ_OR_RETURN(out_grads.size(), 1); + in_grads->resize(1); + *JUST(oneflow::VectorAt(in_grads, 0)) = JUST(functional::Reshape( + JUST(oneflow::VectorAt(out_grads, 0)), Shape(input->shape()->dim_vec()))); + return Maybe::Ok(); + }); TensorTuple outputs{output}; - JUST(GetThreadLocalAutogradEngine()->AddNode("view::squeeze_backward", backward_fn, {input}, - &outputs)); + JUST(GetThreadLocalAutogradEngine()->AddBackwardFuncPtr("view::squeeze_backward", backward_fn, + {input}, &outputs)); } return output; } diff --git a/oneflow/core/framework/tensor_rpc_util.cpp b/oneflow/core/framework/tensor_rpc_util.cpp index 029b897c63b..b331f09a467 100644 --- a/oneflow/core/framework/tensor_rpc_util.cpp +++ b/oneflow/core/framework/tensor_rpc_util.cpp @@ -30,7 +30,7 @@ limitations under the License. namespace oneflow { namespace private_details { -struct FlatTensorConsistency; +class FlatTensorConsistency; class CheckConsistencyAsyncTransportCtx : public AsyncTransportCtx { public: diff --git a/oneflow/core/framework/tensor_rpc_util.h b/oneflow/core/framework/tensor_rpc_util.h index deb36bc45d9..486a0c08a47 100644 --- a/oneflow/core/framework/tensor_rpc_util.h +++ b/oneflow/core/framework/tensor_rpc_util.h @@ -21,7 +21,6 @@ limitations under the License. 
#include "oneflow/core/common/optional.h" #include "oneflow/core/common/decorator.h" #include "oneflow/core/rpc/include/global_process_ctx.h" -#include "oneflow/core/common/check_level.h" namespace oneflow { @@ -54,16 +53,13 @@ struct CheckConsistentTensorMeta&, Args template&, Args...)> static RetT Call(const std::shared_ptr& tensor, Args... args) { std::shared_ptr ctx; - static bool is_env_enabled_check = IsEnvEnabled(/* check_level */ 1); int64_t* depth = private_details::MutThreadLocalTensorMetaCheckDepth(); - if (*depth == 0 && is_env_enabled_check) { - ctx = JUST(private_details::LaunchTensorMetaConsistencyCheck(*tensor)); - } + if (*depth == 0) { ctx = JUST(private_details::LaunchTensorMetaConsistencyCheck(*tensor)); } ++*depth; RetT ret = func(tensor, args...); --*depth; // Always synchronize consistent tensor meta even if `func` failed. - if (*depth == 0 && is_env_enabled_check) { JUST(private_details::BusyWaitAndCheck(ctx)); } + if (*depth == 0) { JUST(private_details::BusyWaitAndCheck(ctx)); } return ret; } }; diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml index 9d5f43647a1..632a8ad582f 100755 --- a/oneflow/core/functional/functional_api.yaml +++ b/oneflow/core/functional/functional_api.yaml @@ -56,14 +56,6 @@ ] bind_python: true -- name: "addcmul" - signature: "Tensor (Tensor input, Tensor tensor1, Tensor tensor2, *, Scalar value=1) => Addcmul" - bind_python: true - -- name: "addcmul_" - signature: "Tensor (Tensor input, Tensor tensor1, Tensor tensor2, *, Scalar value=1) => InplaceAddcmul" - bind_python: true - - name: "div" signature: [ @@ -282,7 +274,7 @@ bind_python: True - name: "reduce_prod" - signature: "Tensor (Tensor x, Int32List axis, Bool keepdims=False, *, DataType dtype=None) => ReduceProd" + signature: "Tensor (Tensor x, Int32List axis, Bool keepdims=False) => ReduceProd" bind_python: True - name: "reduce_min_device_stage" @@ -338,15 +330,10 @@ bind_python: True - name: "swapaxes" - signature: "Tensor (Tensor input, Int32 dim0, Int32 dim1) => Swapaxes" - bind_python: True - -- name: "swapdims" - signature: "Tensor (Tensor input, Int32 dim0, Int32 dim1) => Swapdims" - bind_python: True - -- name: "amax" - signature: "Tensor (Tensor input, Int32List[1] dim=None, Bool keepdim=False) => Amax" + signature: + [ + "Tensor (Tensor input, Int32 dim0, Int32 dim1) => Swapaxes", + ] bind_python: True - name: "permute" @@ -361,14 +348,6 @@ signature: "Tensor (Tensor input) => TransposeAllDimFunction" bind_python: True -- name: "not_equal_zero" - signature: "Tensor (Tensor x) => NotEqualZero" - bind_python: False - -- name: "not_equal_zero_grad" - signature: "Tensor (Tensor x, Tensor dy) => NotEqualZeroGrad" - bind_python: False - - name: "reciprocal" signature: "Tensor (Tensor x) => Reciprocal" bind_python: True @@ -562,14 +541,6 @@ signature: "Tensor (Tensor dy, Tensor x) => HardSigmoidGrad" bind_python: False -- name: "hardshrink" - signature: "Tensor (Tensor x, *, Double lambd=0.5, Bool inplace=False) => HardShrink" - bind_python: True - -- name: "hardshrink_grad" - signature: "Tensor (Tensor y, Tensor dy, Double lambd=0.5) => HardShrinkGrad" - bind_python: False - - name: "softmax" signature: "Tensor (Tensor x, Int64 dim=None) => Softmax" bind_python: True @@ -591,7 +562,7 @@ bind_python: False - name: "leaky_relu" - signature: "Tensor (Tensor x, Float alpha, Bool inplace=False) => LeakyRelu" + signature: "Tensor (Tensor x, Float alpha) => LeakyRelu" bind_python: True - name: "leaky_relu_grad" @@ -799,23 +770,23 @@ - 
name: "deconv1d" signature: - 'Tensor (Tensor x, Tensor weight, Tensor bias=None, Int32List[1] stride=1, - Int32List[1] padding=0, Int32List[1] output_padding=0, Int32 groups=1, - Int32List[1] dilation=1, String data_format="channels_first") => Deconv1d' + "Tensor (Tensor x, Tensor weight, Tensor bias=None, Int32 filters, + Int32List padding, String data_format, Int32List kernel_size, + Int32List output_padding, Int32List strides, Int32List dilation, Int32 groups=1) => Deconv1d" bind_python: True - name: "deconv2d" signature: - 'Tensor (Tensor x, Tensor weight, Tensor bias=None, Int32List[2] stride=1, - Int32List[2] padding=0, Int32List[2] output_padding=0, Int32 groups=1, - Int32List[2] dilation=1, String data_format="channels_first") => Deconv2d' + "Tensor (Tensor x, Tensor weight, Tensor bias=None, Int32 filters, + Int32List padding, String data_format, Int32List kernel_size, + Int32List output_padding, Int32List strides, Int32List dilation, Int32 groups=1) => Deconv2d" bind_python: True - name: "deconv3d" signature: - 'Tensor (Tensor x, Tensor weight, Tensor bias=None, Int32List[3] stride=1, - Int32List[3] padding=0, Int32List[3] output_padding=0, Int32 groups=1, - Int32List[3] dilation=1, String data_format="channels_first") => Deconv3d' + "Tensor (Tensor x, Tensor weight, Tensor bias=None, Int32 filters, + Int32List padding, String data_format, Int32List kernel_size, + Int32List output_padding, Int32List strides, Int32List dilation, Int32 groups=1) => Deconv3d" bind_python: True - name: "expand" @@ -1168,7 +1139,7 @@ - name: "pooling_grad" signature: - "Tensor (Tensor x, Tensor indice, Tensor dy, String mode, Int32 ndims, + "Tensor (Tensor x, Tensor y, Tensor indice, Tensor dy, String mode, Int32 ndims, String data_format, Int32List padding, Int32List kernel_size, Int32List stride, Int32List dilation, Bool return_indices, Bool ceil_mode) => PoolingNdGrad" bind_python: False @@ -1704,7 +1675,7 @@ - name: "avg_pooling_grad" signature: - "Tensor (Tensor x, Tensor dy, Int32 ndims, String data_format, Int32List padding, + "Tensor (Tensor x, Tensor y, Tensor dy, Int32 ndims, String data_format, Int32List padding, Int32List kernel_size, Int32List stride, Bool ceil_mode, Bool count_include_pad, Int32 divisor_override=0) => AvgPoolingNdGrad" bind_python: False @@ -1738,7 +1709,7 @@ bind_python: False - name: "to_global" - signature: "Tensor (Tensor x, Placement placement, SbpList sbp, SbpList grad_sbp, Bool check_meta) => ToConsistent" + signature: "Tensor (Tensor x, Placement placement, SbpList sbp, SbpList grad_sbp) => ToConsistent" bind_python: True - name: "to_local" @@ -1897,12 +1868,6 @@ ] bind_python: True -- name: "unbind" - signature: [ - "TensorTuple (Tensor x, Int64 dim=0) => Unbind", - ] - bind_python: True - - name: "chunk" signature: [ "TensorTuple (Tensor x, Int64 chunks, Int64 dim=0) => Chunk", @@ -2021,11 +1986,6 @@ signature: 'TensorTuple (TensorTuple tensors, String indexing="ij") => Meshgrid' bind_python: True -- name: "index_select" - signature: - "Tensor (Tensor input, Int64 dim, Tensor index) => IndexSelect" - bind_python: True - - name: "decode_onerec" signature: "Tensor (Tensor input, String key, DataType dtype, Shape shape, Bool is_dynamic=False, Shape reshape=None, Shape batch_padding=None) => DecodeOneRec" bind_python: True @@ -2063,7 +2023,7 @@ bind_python: True - name: "cumsum" - signature: "Tensor (Tensor input, Int64 dim, *, DataType dtype=None) => Cumsum" + signature: "Tensor (Tensor input, Int64 dim) => Cumsum" bind_python: True - name: "cumsum_grad" @@ 
-2071,7 +2031,7 @@ bind_python: False - name: "cumprod" - signature: "Tensor (Tensor input, Int64 dim, *, DataType dtype=None) => Cumprod" + signature: "Tensor (Tensor input, Int64 dim) => Cumprod" bind_python: True - name: "cumprod_grad" @@ -2079,7 +2039,7 @@ bind_python: False - name: "one_embedding_id_shuffle" - signature: "TensorTuple (Tensor ids, Tensor table_ids=None, Int32 num_tables=1) => OneEmbeddingIdShuffle" + signature: "TensorTuple (Tensor ids, Tensor column_ids=None, Int32 num_columns=1) => OneEmbeddingIdShuffle" bind_python: True - name: "one_embedding_embedding_shuffle" @@ -2091,23 +2051,11 @@ bind_python: True - name: "one_embedding_lookup" - signature: "Tensor (Tensor shadow, Tensor ids, Tensor table_ids=None, DataType dtype, Int64 embedding_size, Int32 num_tables, String embedding_tables, String embedding_store_options) => OneEmbeddingLookup" + signature: "Tensor (Tensor shadow, Tensor ids, Tensor column_ids=None, DataType dtype, Int64 embedding_size, Int32 num_columns, String embedding_columns, String embedding_store_options) => OneEmbeddingLookup" bind_python: True - name: "one_embedding_unique_key_value_pair" - signature: "TensorTuple (Tensor keys, Tensor values=None, Int32 num_tables) => OneEmbeddingUniqueKeyValuePair" - bind_python: True - -- name: "one_embedding_sgd_update" - signature: "Tensor (Tensor num_unique_ids, Tensor unique_embeddings, Tensor embedding_grad, Tensor learning_rate, Tensor down_scale_by_tensor, Tensor skip_if, Double scale, Float weight_decay, Float momentum) => OneEmbeddingSgdUpdate" - bind_python: True - -- name: "one_embedding_adam_update" - signature: "Tensor (Tensor num_unique_ids, Tensor unique_embeddings, Tensor embedding_grad, Tensor learning_rate, Tensor down_scale_by_tensor, Tensor skip_if, Tensor bias_correction1=None, Tensor bias_correction2=None, Double scale=1.0, Float weight_decay=0.0, Float beta1=0.9, Float beta2=0.999, Float epsilon=0, Bool do_bias_correction=True) => OneEmbeddingAdamUpdate" - bind_python: True - -- name: "one_embedding_adagrad_update" - signature: "Tensor (Tensor num_unique_ids, Tensor unique_embeddings, Tensor embedding_grad, Tensor learning_rate, Tensor down_scale_by_tensor, Tensor skip_if, Tensor train_step, Double scale=1.0, Float weight_decay=0.0, Float lr_decay=0.0, Float epsilon=0) => OneEmbeddingAdagradUpdate" + signature: "TensorTuple (Tensor keys, Tensor values=None, Int32 num_columns) => OneEmbeddingUniqueKeyValuePair" bind_python: True - name: "einsum" @@ -2125,7 +2073,3 @@ - name: "isinf" signature: "Tensor (Tensor input) => IsInf" bind_python: True - -- name: "roc_auc_score" - signature: "Tensor (Tensor label, Tensor pred) => RocAucScore" - bind_python: True diff --git a/oneflow/core/functional/impl/activation_functor.cpp b/oneflow/core/functional/impl/activation_functor.cpp index b21053ce236..2f0be03608b 100644 --- a/oneflow/core/functional/impl/activation_functor.cpp +++ b/oneflow/core/functional/impl/activation_functor.cpp @@ -268,51 +268,6 @@ class HardSigmoidGradFunctor : public BinaryFunctor { CHECK_JUST(one::OpBuilder("hardsigmoid_grad").Input("dy").Input("x").Output("dx").Build()); } }; - -class HardShrinkFunctor { - public: - HardShrinkFunctor() { - op_ = CHECK_JUST(one::OpBuilder("hardshrink").Input("in").Output("out").Build()); - } - - Maybe operator()(const std::shared_ptr& x, const double& lambd, - bool inplace) const { - MutableAttrMap attrs; - CHECK_GT_OR_RETURN(lambd, 0) << "lambd must be greater than 0"; - JUST(attrs.SetAttr("lambd", lambd)); - if (inplace) { - 
JUST(CheckInplaceValid(x)); - std::shared_ptr outputs = std::make_shared(1); - // outputs->at(0) = x; - *JUST(oneflow::VectorAt(outputs.get(), 0)) = x; - JUST(OpInterpUtil::Dispatch(*op_, {x}, outputs.get(), attrs)); - // return outputs->at(0); - return *JUST(oneflow::VectorAt(outputs.get(), 0)); - } else { - return OpInterpUtil::Dispatch(*op_, {x}, attrs); - } - } - - private: - std::shared_ptr op_; -}; - -class HardShrinkGradFunctor { - public: - HardShrinkGradFunctor() { - op_ = CHECK_JUST(one::OpBuilder("hardshrink_grad").Input("dy").Input("y").Output("dx").Build()); - } - Maybe operator()(const std::shared_ptr& y, const std::shared_ptr& dy, - const double& lambd) const { - MutableAttrMap attrs; - JUST(attrs.SetAttr("lambd", lambd)); - return OpInterpUtil::Dispatch(*op_, {dy, y}, attrs); - } - - private: - std::shared_ptr op_; -}; - class SoftmaxFunctorBase { public: Maybe operator()(const std::shared_ptr& input, @@ -407,19 +362,10 @@ class LeakyReluFunctor { LeakyReluFunctor() { op_ = CHECK_JUST(one::OpBuilder("leaky_relu").Input("x").Output("y").Build()); } - Maybe operator()(const std::shared_ptr& x, const float& alpha, - bool inplace) const { + Maybe operator()(const std::shared_ptr& x, const float& alpha) const { MutableAttrMap attrs; JUST(attrs.SetAttr("alpha", alpha)); - if (inplace) { - JUST(CheckInplaceValid(x)); - std::shared_ptr outputs = std::make_shared(1); - *JUST(oneflow::VectorAt(outputs.get(), 0)) = x; - JUST(OpInterpUtil::Dispatch(*op_, {x}, outputs.get(), attrs)); - return *JUST(oneflow::VectorAt(outputs.get(), 0)); - } else { - return OpInterpUtil::Dispatch(*op_, {x}, attrs); - } + return OpInterpUtil::Dispatch(*op_, {x}, attrs); } private: @@ -622,8 +568,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Glu"); m.add_functor("HardSigmoid"); m.add_functor("HardSigmoidGrad"); - m.add_functor("HardShrink"); - m.add_functor("HardShrinkGrad"); m.add_functor("Softmax"); m.add_functor("SoftmaxGrad"); m.add_functor("LogSoftmax"); diff --git a/oneflow/core/functional/impl/array_functor.cpp b/oneflow/core/functional/impl/array_functor.cpp index e0dd52f7334..89a0ffa4f5c 100644 --- a/oneflow/core/functional/impl/array_functor.cpp +++ b/oneflow/core/functional/impl/array_functor.cpp @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "oneflow/core/autograd/autograd_mode.h" -#include "oneflow/core/common/data_type.pb.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/scalar.h" #include "oneflow/core/common/global.h" @@ -2151,23 +2150,6 @@ class SplitFunctor { } }; -class UnbindFunctor { - public: - UnbindFunctor() {} - Maybe operator()(const std::shared_ptr& x, const int64_t& dim) const { - int32_t axis = dim; - const int32_t ndim = x->ndim(); - if (axis < 0) { axis += ndim; } - CHECK_OR_RETURN((dim >= -ndim) && (dim < ndim)) - << "Dimension out of range (expected to be in range of [" << -ndim << "," << ndim - 1 - << "], but got " << dim << ")"; - int32_t dim_size = x->shape()->At(axis); - TensorTuple unbinds(dim_size); - for (int i = 0; i < dim_size; ++i) { unbinds[i] = JUST(Select(x, axis, i)); } - return unbinds; - } -}; - class ChunkFunctor { public: ChunkFunctor() {} @@ -2420,41 +2402,8 @@ class MeshgridFunctor { } }; -class IndexSelectFunctor { - public: - Maybe operator()(const std::shared_ptr& input, const int64_t& dim, - const std::shared_ptr& index) const { - const int64_t input_num_axes = input->shape()->NumAxes(); - const int64_t index_num_axes = index->shape()->NumAxes(); - CHECK_EQ_OR_RETURN(index_num_axes, 1) - << "IndexError: index_select(): Index is supposed to be a vector"; - bool index_dtype_flag = - (index->dtype()->data_type() == kInt32) || (index->dtype()->data_type() == kInt64); - CHECK_EQ_OR_RETURN(index_dtype_flag, true) - << "RuntimeError: index_select(): Expected dtype int32 or int64 for index"; - int64_t new_dim = dim; - if (dim < 0) { new_dim += input_num_axes; } - CHECK_LE_OR_RETURN(new_dim, input_num_axes) - << "IndexError: Dimension out of range (expected to be in range of [" << -input_num_axes - << ", " << input_num_axes - 1 << "], but got " << new_dim << ")"; - DimVector index_broad_cast(input_num_axes); - for (int i = 0; i < input_num_axes; i++) { index_broad_cast[i] = input->shape()->At(i); } - index_broad_cast[new_dim] = 1; - Shape expand_shape(index_broad_cast); - auto index_gather = - JUST(functional::Expand(JUST(functional::Slice(index, {0}, {1}, {1})), expand_shape)); - for (int i = 1; i < index->dim(0); i++) { - index_gather = JUST(functional::Concat( - {index_gather, JUST(functional::Expand(JUST(functional::Slice(index, {i}, {i + 1}, {1})), - expand_shape))}, - new_dim)); - } - - return JUST(functional::DimGather(input, new_dim, index_gather, false)); - } -}; - namespace { + inline Maybe device_equal(const std::string& device_name, const int device_id, Symbol device) { return (device_name == device->type() && device_id == device->device_id()); @@ -2859,7 +2808,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("ReduceSumLike"); m.add_functor("BroadcastReduceSumLike"); m.add_functor("Split"); - m.add_functor("Unbind"); m.add_functor("Chunk"); m.add_functor("SplitLike"); m.add_functor("SplitWithSize"); @@ -2867,7 +2815,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("UnsortedBatchSegmentSum"); m.add_functor("MaskedFill"); m.add_functor("Meshgrid"); - m.add_functor("IndexSelect"); m.add_functor("To"); m.add_functor("TopK"); m.add_functor("InTopK"); diff --git a/oneflow/core/functional/impl/binary_functor.cpp b/oneflow/core/functional/impl/binary_functor.cpp index 52fac7e4ef6..c331f6c5777 100644 --- a/oneflow/core/functional/impl/binary_functor.cpp +++ b/oneflow/core/functional/impl/binary_functor.cpp @@ -25,7 +25,6 @@ limitations under the License. 
#include "oneflow/core/framework/tensor_tuple.h" #include "oneflow/core/functional/functional.h" #include "oneflow/core/functional/function_library.h" -#include "oneflow/core/functional/sequence_function.h" namespace oneflow { namespace one { @@ -179,40 +178,6 @@ class InplaceMulFunctor { std::shared_ptr broadcast_mul_op_; }; -class AddcmulBaseFunctor { - public: - AddcmulBaseFunctor() = default; - Maybe operator()(const std::shared_ptr& input, - const std::shared_ptr& tensor1, - const std::shared_ptr& tensor2, const Scalar& value, - bool inplace) const { - return SequenceFunction()>([&]() { return functional::Mul(tensor1, tensor2); }) - .then([&](const auto& x) { return functional::ScalarMul(value, x); }) - .then([&](const auto& x) { return functional::Add(input, x, /*alpha=*/1, inplace); }) - .call(); - } -}; - -class AddcmulFunctor : public AddcmulBaseFunctor { - public: - AddcmulFunctor() = default; - Maybe operator()(const std::shared_ptr& input, - const std::shared_ptr& tensor1, - const std::shared_ptr& tensor2, const Scalar& value) const { - return AddcmulBaseFunctor::operator()(input, tensor1, tensor2, value, /*inplace=*/false); - } -}; - -class InplaceAddcmulFunctor : public AddcmulBaseFunctor { - public: - InplaceAddcmulFunctor() = default; - Maybe operator()(const std::shared_ptr& input, - const std::shared_ptr& tensor1, - const std::shared_ptr& tensor2, const Scalar& value) const { - return AddcmulBaseFunctor::operator()(input, tensor1, tensor2, value, /*inplace=*/true); - } -}; - class DivFunctor : public BinaryFloatFunctor { public: DivFunctor() { @@ -401,8 +366,6 @@ class ReshapeLikeFunctor : public BinaryFunctor { ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Add"); - m.add_functor("Addcmul"); - m.add_functor("InplaceAddcmul"); m.add_functor("Atan2"); m.add_functor("Sub"); m.add_functor("Mul"); diff --git a/oneflow/core/functional/impl/consistent_cast.cpp b/oneflow/core/functional/impl/consistent_cast.cpp index 113efbcc720..8b0efecae90 100644 --- a/oneflow/core/functional/impl/consistent_cast.cpp +++ b/oneflow/core/functional/impl/consistent_cast.cpp @@ -44,7 +44,6 @@ limitations under the License. #include "oneflow/core/common/cpp_attribute.h" #include "oneflow/core/ccl/ccl.h" #include "oneflow/core/common/constant.h" -#include "oneflow/core/common/env_var/debug_mode.h" namespace oneflow { namespace one { @@ -54,28 +53,6 @@ namespace impl { namespace { -// NOTE: use env variable 'ONEFLOW_EAGER_LOCAL_TO_GLOBAL_BALANCED_OVERRIDE' indicate whether the -// shape and dtype of input tensor on each rank is the same when cast local tensor to global tensor. -// If set true, there will be no meta-information synchronization on each rank. 
-Optional ParseEagerLocalToGlobalBalancedOverride() { - const char* env_p = std::getenv("ONEFLOW_EAGER_LOCAL_TO_GLOBAL_BALANCED_OVERRIDE"); - if (env_p == nullptr) { - return Optional(); - } else { - return ParseBooleanFromEnv("ONEFLOW_EAGER_LOCAL_TO_GLOBAL_BALANCED_OVERRIDE", false); - } -} - -bool NeedSyncAndCheckShapeAndDtype(bool check_meta_hint) { - thread_local Optional eager_local_to_global_balanced_override = - ParseEagerLocalToGlobalBalancedOverride(); - if (eager_local_to_global_balanced_override.has_value()) { - return IsInDebugMode() || !CHECK_JUST(eager_local_to_global_balanced_override); - } else { - return IsInDebugMode() || check_meta_hint; - } -} - // clang-format off FLAT_MSG_BEGIN(FlatShapeAndDataType); // Methods @@ -210,18 +187,7 @@ Maybe GetConcatenatedShapeAndCheckDtype( Maybe GetLogicalShapeAndDataType(Shape* logical_shape, DataType* /* in and out */ dtype, std::shared_ptr physical_shape, - Symbol parallel_desc, Symbol nd_sbp, - bool sync_and_check_meta) { - if (!sync_and_check_meta) { - if (JUST(RankGroup::New(parallel_desc)) != JUST(RankGroupScope::CurrentRankGroup())) { - const auto& flat_shape_dtype = - JUST(BroadcastShapeAndDtype(*physical_shape, *dtype, parallel_desc)); - physical_shape = JUST(flat_shape_dtype->ToShape()); - *dtype = flat_shape_dtype->dtype(); - } - *logical_shape = *JUST(GetLogicalShape(*physical_shape, *nd_sbp, *parallel_desc)); - return Maybe::Ok(); - } + Symbol parallel_desc, Symbol nd_sbp) { if (nd_sbp->sbp_parallel_size() == 1 && nd_sbp->sbp_parallel(0).has_split_parallel()) { const auto& rank2flat_shape_dtype = JUST(BroadcastGatherShapeAndDataType(*physical_shape, *dtype, parallel_desc)); @@ -288,7 +254,7 @@ Maybe ConsistentToConsistent(const std::shared_ptr& x, Maybe LocalToConsistent(const std::shared_ptr& x, Symbol parallel_desc, const std::vector>& sbp_parallels, - const std::shared_ptr& op, bool check_meta_hint) { + const std::shared_ptr& op) { CHECK_OR_RETURN(!x->is_lazy()) << "local_tensor.to_global() is not supported within nn.Graph for now"; CHECK_OR_RETURN(x->is_local()) << Error::UnimplementedError() << "local tensors supported only"; @@ -314,9 +280,7 @@ Maybe LocalToConsistent(const std::shared_ptr& x, Symbol nd_sbp = JUST(GetNdSbp(sbp_parallels)); const auto& shape = std::make_shared(); DataType dtype = x->dtype()->data_type(); - bool sync_and_check_meta = NeedSyncAndCheckShapeAndDtype(check_meta_hint); - JUST(GetLogicalShapeAndDataType(shape.get(), &dtype, x->shape(), parallel_desc, nd_sbp, - sync_and_check_meta)); + JUST(GetLogicalShapeAndDataType(shape.get(), &dtype, x->shape(), parallel_desc, nd_sbp)); MutableAttrMap attrs; JUST(attrs.SetAttr("shape", *shape)); JUST(attrs.SetAttr("dtype", dtype)); @@ -340,7 +304,7 @@ class LocalToConsistentFunctor { const Shape& shape, const Symbol& dtype) const { JUST(CheckDeviceIdsIsValid(parallel_desc)); NonRecursiveMetaInfoConsistencyCheckScope no_recursive_meta_info_conisitency_check_scope; - JUST(MetaInfoConsistencyCheck(parallel_desc, sbp_parallels, 1, /* force_check */ false)); + JUST(MetaInfoConsistencyCheck(parallel_desc, sbp_parallels, 1)); CHECK_OR_RETURN(x->is_local()); std::shared_ptr input = x; // copy to right device first if input's device type is wrong @@ -380,18 +344,15 @@ class ToConsistentFunctor { Maybe operator()(const std::shared_ptr& x, Symbol parallel_desc, const std::vector>& sbp_parallels, - const std::vector>& grad_sbp_parallels, - bool check_meta) const { + const std::vector>& grad_sbp_parallels) const { JUST(CheckDeviceIdsIsValid(parallel_desc)); 
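GetLogicalShapeAndDataType above recovers the global (logical) shape from one rank's physical shape given the placement and nd_sbp. In the balanced single-axis case this reduces to: broadcast keeps the local shape, and split(axis) multiplies that axis by the number of ranks. A minimal sketch under those simplifying assumptions (uneven splits and multi-dimensional SBP are ignored; names are illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

enum class Sbp { kBroadcast, kSplit };

// Balanced-case recovery of a global shape from one rank's local shape on a
// 1-D placement; real code must also handle uneven splits and nd-sbp.
std::vector<int64_t> LogicalShape(const std::vector<int64_t>& physical, Sbp sbp, int split_axis,
                                  int64_t world_size) {
  std::vector<int64_t> logical = physical;
  if (sbp == Sbp::kSplit) { logical.at(split_axis) *= world_size; }
  return logical;
}

int main() {
  const std::vector<int64_t> local = {4, 8};
  const auto global = LogicalShape(local, Sbp::kSplit, /*split_axis=*/0, /*world_size=*/4);
  std::cout << global[0] << " x " << global[1] << "\n";  // 16 x 8
  return 0;
}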
NonRecursiveMetaInfoConsistencyCheckScope scope; - JUST(MetaInfoConsistencyCheck(parallel_desc, sbp_parallels, grad_sbp_parallels, 1, - /* force_check */ check_meta)); + JUST(MetaInfoConsistencyCheck(parallel_desc, sbp_parallels, grad_sbp_parallels, 1)); std::shared_ptr tensor; if (x->is_consistent()) { tensor = JUST(ConsistentToConsistent(x, parallel_desc, sbp_parallels, grad_sbp_parallels)); } else { - tensor = JUST( - LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_, check_meta)); + tensor = JUST(LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)); } return tensor; } diff --git a/oneflow/core/functional/impl/math_functor.cpp b/oneflow/core/functional/impl/math_functor.cpp index d6e32a57133..d3286a5f5c4 100644 --- a/oneflow/core/functional/impl/math_functor.cpp +++ b/oneflow/core/functional/impl/math_functor.cpp @@ -411,25 +411,6 @@ class Min2Functor { } }; -class AmaxFunctor { - public: - Maybe operator()(const std::shared_ptr& x, - const Optional>& dim, const bool& keepdim) const { - if (!dim.has_value()) { return ReduceMax(x, {}, keepdim); } - - const int32_t ndim = x->ndim(); - std::vector& dims = *JUST(dim); - for (int i = 0; i < dims.size(); i++) { - if (dims[i] < -ndim || dims[i] >= ndim) { - return Error::IndexError() << "Dimension out of range (expected to be in range of [" - << -ndim << ", " << ndim - 1 << "], but got " << dims[i] << ")"; - } - if (dims[i] < 0) { dims[i] += ndim; } - } - return ReduceMax(x, dims, keepdim); - } -}; - class ReduceSumFunctor { public: ReduceSumFunctor() { @@ -677,29 +658,27 @@ class ReduceProdFunctor { one::OpBuilder("reduce_prod").Input("input_tensor").Output("output_tensor").Build()); } Maybe operator()(const std::shared_ptr& x, const std::vector& axis, - const bool& keepdims, const Optional>& dtype) const { + const bool& keepdims) const { MutableAttrMap attrs; - std::shared_ptr tensor = x; - if (dtype.has_value() && (dtype != x->dtype())) { tensor = JUST(Cast(tensor, JUST(dtype))); } TensorProcessor tensor_processor; Symbol lowest_dtype; - if (DType::priority_order[tensor->dtype()->data_type()] + if (DType::priority_order[x->dtype()->data_type()] == DType::priority_order[DType::Bool()->data_type()]) { lowest_dtype = DType::Int64(); } else { - lowest_dtype = tensor->dtype(); + lowest_dtype = x->dtype(); } - JUST(tensor_processor.AddInputs({tensor}, lowest_dtype).Apply()); + JUST(tensor_processor.AddInputs({x}, lowest_dtype).Apply()); TensorTuple input_tuple = JUST(tensor_processor.GetInputs()); if (axis.empty()) { - std::vector reduce_axis(tensor->shape()->NumAxes()); + std::vector reduce_axis(x->shape()->NumAxes()); std::iota(reduce_axis.begin(), reduce_axis.end(), 0); JUST(attrs.SetAttr>("axis", reduce_axis)); } else { JUST(attrs.SetAttr>("axis", axis)); } JUST(attrs.SetAttr("keepdims", keepdims)); - return JUST(OpInterpUtil::Dispatch(*op_, input_tuple, attrs)); + return OpInterpUtil::Dispatch(*op_, input_tuple, attrs); } private: @@ -753,11 +732,9 @@ class Transpose2dimFunctor { if (dim1 < 0) { dim_1 += ndim; } CHECK_OR_RETURN(dim_0 >= 0 && dim0 < ndim) - << "Dimension out of range (expected to be in range of [" << -ndim << ", " << ndim - 1 - << "], but got " << dim_0 << ")"; + << "Invalid dim0:" << dim_0 << " len(shape):" << ndim; CHECK_OR_RETURN(dim_1 >= 0 && dim1 < ndim) - << "Dimension out of range (expected to be in range of [" << -ndim << ", " << ndim - 1 - << "], but got " << dim_1 << ")"; + << "Invalid dim1:" << dim_1 << " len(shape):" << ndim; for (int32_t i = 0; i < ndim; ++i) { 
permute.emplace_back(i); } std::swap(permute[dim_0], permute[dim_1]); @@ -816,6 +793,28 @@ class AsStridedGradFunctor { std::shared_ptr op_; }; +class SwapaxesFunctor { + public: + SwapaxesFunctor() {} + Maybe operator()(const std::shared_ptr& x, const int32_t dim0, + const int32_t dim1) const { + const int64_t ndim = x->shape()->NumAxes(); + int32_t dim_0 = dim0; + int32_t dim_1 = dim1; + + if (dim0 < 0) { dim_0 += ndim; } + if (dim1 < 0) { dim_1 += ndim; } + + CHECK_OR_RETURN(dim_0 >= 0 && dim0 < ndim) + << "Dimension out of range (expected to be in range of [" << -ndim << ", " << ndim - 1 + << "], but got " << dim_0 << ")"; + CHECK_OR_RETURN(dim_1 >= 0 && dim1 < ndim) + << "Dimension out of range (expected to be in range of [" << -ndim << ", " << ndim - 1 + << "], but got " << dim_1 << ")"; + return Transpose2dim(x, dim0, dim1); + } +}; + class ArangeFunctor { public: ArangeFunctor() { op_ = CHECK_JUST(one::OpBuilder("arange").Output("out").Build()); } @@ -1100,7 +1099,8 @@ class VectorNormFunctor { if (ord.IsIntegral() || ord.IsFloatingPoint()) { double ord_val = JUST(ord.As()); if (ord_val == 0) { - res = JUST(ReduceSum(JUST(functional::NotEqualZero(x)), dim, keepdim)); + std::vector dim_column(1, 0); + res = JUST(ReduceSum(JUST(ScalarLogicalNotEqual(x, 0)), dim_column, keepdim)); } else if (ord_val == INFINITY) { res = JUST(ReduceMax(JUST(Abs(x)), dim, keepdim)); } else if (ord_val == -INFINITY) { @@ -1550,15 +1550,10 @@ class MinimumFunctor { Maybe operator()(const std::shared_ptr& x, const std::shared_ptr& y) const { - TensorProcessor tensor_processor; - JUST(tensor_processor.PromoteInputsToCommonDtype(true).AddInputs({x, y}).Apply()); - TensorTuple input_tuple = JUST(tensor_processor.GetInputs()); if (*x->shape() == *y->shape()) { - return OpInterpUtil::Dispatch(*elementwise_minimum_op_, - {input_tuple[0], input_tuple[1]}); + return OpInterpUtil::Dispatch(*elementwise_minimum_op_, {x, y}); } else { - return OpInterpUtil::Dispatch(*broadcast_minimum_op_, - {input_tuple[0], input_tuple[1]}); + return OpInterpUtil::Dispatch(*broadcast_minimum_op_, {x, y}); } } @@ -1578,15 +1573,10 @@ class MaximumFunctor { Maybe operator()(const std::shared_ptr& x, const std::shared_ptr& y) const { - TensorProcessor tensor_processor; - JUST(tensor_processor.PromoteInputsToCommonDtype(true).AddInputs({x, y}).Apply()); - TensorTuple input_tuple = JUST(tensor_processor.GetInputs()); if (*x->shape() == *y->shape()) { - return OpInterpUtil::Dispatch(*elementwise_maximum_op_, - {input_tuple[0], input_tuple[1]}); + return OpInterpUtil::Dispatch(*elementwise_maximum_op_, {x, y}); } else { - return OpInterpUtil::Dispatch(*broadcast_maximum_op_, - {input_tuple[0], input_tuple[1]}); + return OpInterpUtil::Dispatch(*broadcast_maximum_op_, {x, y}); } } @@ -2139,8 +2129,7 @@ class CumBaseFunctor { explicit CumBaseFunctor(std::string op_name) { op_ = CHECK_JUST(one::OpBuilder(op_name).Input("x").Output("y").Build()); } - Maybe operator()(const std::shared_ptr& input, int64_t dim, - const Optional>& dtype) const { + Maybe operator()(const std::shared_ptr& input, int64_t dim) const { auto ndim = input->ndim(); if (dim < 0) { dim += ndim; } CHECK_OR_RETURN(dim >= 0 && dim < ndim) @@ -2150,11 +2139,7 @@ class CumBaseFunctor { MutableAttrMap attrs; JUST(attrs.SetAttr("dim", dim)); TensorProcessor tensor_processor; - if (dtype) { - JUST(tensor_processor.AddInputs({input}, JUST(dtype)).Apply()); - } else { - JUST(tensor_processor.AddInputs({input}, DType::Int64()).Apply()); - } + JUST(tensor_processor.AddInputs({input}, 
DType::Int64()).Apply()); TensorTuple input_tuple = JUST(tensor_processor.GetInputs()); return OpInterpUtil::Dispatch(*op_, input_tuple, attrs); } @@ -2802,7 +2787,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("ReduceMean"); m.add_functor("ReduceMin"); m.add_functor("Min"); - m.add_functor("Amax"); m.add_functor("ReduceSum"); m.add_functor("ReduceAll"); m.add_functor("ReduceAny"); @@ -2820,15 +2804,14 @@ ONEFLOW_FUNCTION_LIBRARY(m) { m.add_functor("Permute"); m.add_functor("AsStrided"); m.add_functor("AsStridedGrad"); - m.add_functor("Swapaxes"); - m.add_functor("Swapdims"); + m.add_functor("Swapaxes"); m.add_functor("Arange"); m.add_functor("ConsistentArange"); m.add_functor("Cast"); m.add_functor("Clamp"); m.add_functor("ClampInplace"); - m.add_functor("Clip"); - m.add_functor("ClipInplace"); + m.add_functor("Clip"); + m.add_functor("ClipInplace"); m.add_functor("SqrtSquareSum"); m.add_functor("VectorNorm"); m.add_functor("MatrixNorm"); diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp index 9b78dae1f97..bcd3a45f301 100644 --- a/oneflow/core/functional/impl/nn_functor.cpp +++ b/oneflow/core/functional/impl/nn_functor.cpp @@ -135,30 +135,24 @@ class Conv3dFunctor : public ConvBaseFunctor { class DeConvBaseFunctor { public: - explicit DeConvBaseFunctor(const int& num_spatial_dims) : num_spatial_dims_(num_spatial_dims) { + explicit DeConvBaseFunctor() { bias_op_ = CHECK_JUST(one::OpBuilder("bias_add").Input("a").Input("b").Output("out").Build()); } virtual ~DeConvBaseFunctor() = default; Maybe operator()(const std::shared_ptr& x, const std::shared_ptr& weight, - const Optional& bias, const std::vector& stride, - const std::vector& padding, - const std::vector& output_padding, const int32_t& groups, - const std::vector& dilation, - const std::string& data_format) const { + const Optional& bias, const int32_t& filters, + const std::vector& padding, const std::string& data_format, + const std::vector& kernel_size, + const std::vector& output_padding, + const std::vector& strides, + const std::vector& dilation, const int32_t& groups) const { MutableAttrMap deconv_attrs; - std::vector kernel_size_vec(num_spatial_dims_); - int32_t kernel_idx_offset = 2; - if (data_format == "channels_last") { kernel_idx_offset = 1; } - for (int i = 0; i < num_spatial_dims_; i++) { - kernel_size_vec[i] = ((weight->shape())->At(i + kernel_idx_offset)); - } - - JUST(deconv_attrs.SetAttr("filters", (weight->shape())->At(1) * groups)); + JUST(deconv_attrs.SetAttr("filters", filters)); JUST(deconv_attrs.SetAttr>("padding_before", padding)); - JUST(deconv_attrs.SetAttr>("kernel_size", kernel_size_vec)); + JUST(deconv_attrs.SetAttr>("kernel_size", kernel_size)); JUST(deconv_attrs.SetAttr>("output_padding", output_padding)); - JUST(deconv_attrs.SetAttr>("strides", stride)); + JUST(deconv_attrs.SetAttr>("strides", strides)); JUST(deconv_attrs.SetAttr>("dilation_rate", dilation)); JUST(deconv_attrs.SetAttr("groups", groups)); JUST(deconv_attrs.SetAttr("data_format", data_format)); @@ -176,12 +170,11 @@ class DeConvBaseFunctor { protected: std::shared_ptr deconv_op_; std::shared_ptr bias_op_; - int32_t num_spatial_dims_; }; class DeConv1dFunctor : public DeConvBaseFunctor { public: - DeConv1dFunctor() : DeConvBaseFunctor(/*num_spatial_dims_=*/1) { + DeConv1dFunctor() { deconv_op_ = CHECK_JUST(one::OpBuilder("deconv1d").Input("in").Input("weight").Output("out").Build()); } @@ -189,7 +182,7 @@ class DeConv1dFunctor : public DeConvBaseFunctor { class DeConv2dFunctor : 
public DeConvBaseFunctor { public: - DeConv2dFunctor() : DeConvBaseFunctor(/*num_spatial_dims_=*/2) { + DeConv2dFunctor() { deconv_op_ = CHECK_JUST(one::OpBuilder("deconv2d").Input("in").Input("weight").Output("out").Build()); } @@ -197,7 +190,7 @@ class DeConv2dFunctor : public DeConvBaseFunctor { class DeConv3dFunctor : public DeConvBaseFunctor { public: - DeConv3dFunctor() : DeConvBaseFunctor(/*num_spatial_dims_=*/3) { + DeConv3dFunctor() { deconv_op_ = CHECK_JUST(one::OpBuilder("deconv3d").Input("in").Input("weight").Output("out").Build()); } @@ -320,7 +313,7 @@ class FusedMLPFunctor { k = n; } -#if CUDA_VERSION >= 11060 +#if CUDA_VERSION >= 11050 DeviceType device_type{}; if (x->is_consistent()) { device_type = JUST(x->parallel_desc())->device_type(); @@ -339,7 +332,7 @@ class FusedMLPFunctor { JUST(attrs.SetAttr("skip_final_activation", skip_final_activation)); return OpInterpUtil::Dispatch(*fused_op_[weight_size], input, attrs); } -#endif // CUDA_VERSION >= 11060 +#endif // CUDA_VERSION >= 11050 // Fall back to Naive matmul + bias_add + relu std::shared_ptr out = x; @@ -1098,10 +1091,10 @@ class SparseSoftmaxCrossEntropyFunctor { s0s1_sbp_parallels.emplace_back(logits_nd_sbp.sbp_parallel(1)); max_global_stage_input0 = JUST(functional::ToConsistent( max_device_stage->at(0), JUST(max_device_stage->at(0)->parallel_desc()), - new_sbp_parallels, s0s1_sbp_parallels, /* check_meta */ false)); + new_sbp_parallels, s0s1_sbp_parallels)); max_global_stage_input1 = JUST(functional::ToConsistent( max_device_stage->at(2), JUST(max_device_stage->at(0)->parallel_desc()), - new_sbp_parallels, s0s1_sbp_parallels, /* check_meta */ false)); + new_sbp_parallels, s0s1_sbp_parallels)); } // op_reduce_max_global_stage_ attrs.clear(); @@ -1113,7 +1106,7 @@ class SparseSoftmaxCrossEntropyFunctor { if (logits_nd_sbp.sbp_parallel_size() == 2) { broadcast_sub_input = JUST(functional::ToConsistent( broadcast_sub_input, JUST(max_device_stage->at(0)->parallel_desc()), new_sbp_parallels, - new_sbp_parallels, /* check_meta */ false)); + new_sbp_parallels)); } // op_broadcast_sub_ attrs.clear(); @@ -1132,7 +1125,7 @@ class SparseSoftmaxCrossEntropyFunctor { std::vector> empty_grad_sbp_parallels; broadcast_div_input1 = JUST(functional::ToConsistent( output_reduce_sum->at(0), JUST(output_reduce_sum->at(0)->parallel_desc()), - new_sbp_parallels, new_sbp_parallels, /* check_meta */ false)); + new_sbp_parallels, new_sbp_parallels)); } // op_broadcast_div_ attrs.clear(); @@ -2373,43 +2366,43 @@ class FusedDotFeatureInteractionFunctor { class OneEmbeddingIdShuffleFunctor { public: OneEmbeddingIdShuffleFunctor() { - op_table_ids_has_in_out_ = CHECK_JUST(one::OpBuilder("id_shuffle") - .Input("ids") - .Input("table_ids") - .Output("num_unique_matrix") - .Output("inverse_unique_partition_indices") - .Output("cur_rank_num_unique") - .Output("cur_rank_unique_ids") - .Output("cur_rank_unique_table_ids") - .Output("cur_rank_inverse_indices") - .Build()); - op_table_ids_no_in_has_out_ = CHECK_JUST(one::OpBuilder("id_shuffle") - .Input("ids") - .Output("num_unique_matrix") - .Output("inverse_unique_partition_indices") - .Output("cur_rank_num_unique") - .Output("cur_rank_unique_ids") - .Output("cur_rank_unique_table_ids") - .Output("cur_rank_inverse_indices") - .Build()); + op_column_ids_has_in_out_ = CHECK_JUST(one::OpBuilder("id_shuffle") + .Input("ids") + .Input("column_ids") + .Output("num_unique_matrix") + .Output("inverse_unique_partition_indices") + .Output("cur_rank_num_unique") + .Output("cur_rank_unique_ids") + 
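The FusedMLP hunk above only moves the compile-time gate from CUDA 11.6 to 11.5; the surrounding structure is a preprocessor gate with a runtime fallback to plain matmul + bias_add + relu. The sketch below shows that shape only; every type and helper in it is a stand-in, not OneFlow's real dispatch code.

#include <vector>

// Illustrative shape of the CUDA_VERSION gate in FusedMLPFunctor: the fused
// cuBLAS path is compiled in only for new-enough toolkits, and the naive
// per-layer matmul + bias_add + relu loop below the #endif is always available.
struct Tensor { std::vector<float> data; };

static Tensor NaiveLayer(const Tensor& in) { return in; }  // matmul+bias+relu stand-in

Tensor FusedMlpOrFallback(const Tensor& x, int num_layers) {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11050
  // the fused cublas kernel would be dispatched here and return early
#endif  // CUDA_VERSION >= 11050
  Tensor out = x;
  for (int i = 0; i < num_layers; ++i) { out = NaiveLayer(out); }
  return out;
}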
.Output("cur_rank_unique_column_ids") + .Output("cur_rank_inverse_indices") + .Build()); + op_column_ids_no_in_has_out_ = CHECK_JUST(one::OpBuilder("id_shuffle") + .Input("ids") + .Output("num_unique_matrix") + .Output("inverse_unique_partition_indices") + .Output("cur_rank_num_unique") + .Output("cur_rank_unique_ids") + .Output("cur_rank_unique_column_ids") + .Output("cur_rank_inverse_indices") + .Build()); } Maybe operator()(const std::shared_ptr& ids, - const Optional& table_ids, - const int32_t& num_tables) const { + const Optional& column_ids, + const int32_t& num_columns) const { MutableAttrMap attrs; - JUST(attrs.SetAttr("num_tables", num_tables)); - if (table_ids) { - return OpInterpUtil::Dispatch(*op_table_ids_has_in_out_, {ids, JUST(table_ids)}, - attrs); + JUST(attrs.SetAttr("num_columns", num_columns)); + if (column_ids) { + return OpInterpUtil::Dispatch(*op_column_ids_has_in_out_, + {ids, JUST(column_ids)}, attrs); } else { - return OpInterpUtil::Dispatch(*op_table_ids_no_in_has_out_, {ids}, attrs); + return OpInterpUtil::Dispatch(*op_column_ids_no_in_has_out_, {ids}, attrs); } } private: - std::shared_ptr op_table_ids_has_in_out_; - std::shared_ptr op_table_ids_no_in_has_out_; + std::shared_ptr op_column_ids_has_in_out_; + std::shared_ptr op_column_ids_no_in_has_out_; }; class OneEmbeddingEmbeddingShuffleFunctor { @@ -2467,42 +2460,42 @@ class OneEmbeddingEmbeddingGradientShuffleFunctor { class OneEmbeddingLookupFunctor { public: OneEmbeddingLookupFunctor() { - op_has_table_ids_ = CHECK_JUST(one::OpBuilder("embedding_lookup_placeholder") + op_has_column_ids_ = CHECK_JUST(one::OpBuilder("embedding_lookup_placeholder") + .Input("shadow") + .Input("ids") + .Input("column_ids") + .Output("embeddings") + .Build()); + op_no_column_ids_ = CHECK_JUST(one::OpBuilder("embedding_lookup_placeholder") .Input("shadow") .Input("ids") - .Input("table_ids") .Output("embeddings") .Build()); - op_no_table_ids_ = CHECK_JUST(one::OpBuilder("embedding_lookup_placeholder") - .Input("shadow") - .Input("ids") - .Output("embeddings") - .Build()); } Maybe operator()(const std::shared_ptr& shadow, const std::shared_ptr& ids, - const Optional& table_ids, const Symbol& dtype, - const int64_t embedding_size, const int32_t num_tables, - const std::string& embedding_tables, + const Optional& column_ids, const Symbol& dtype, + const int64_t embedding_size, const int32_t num_columns, + const std::string& embedding_columns, const std::string& key_value_store_options) const { MutableAttrMap attrs; JUST(attrs.SetAttr("dtype", dtype->data_type())); JUST(attrs.SetAttr("embedding_size", embedding_size)); - JUST(attrs.SetAttr("num_tables", num_tables)); - JUST(attrs.SetAttr("embedding_tables", embedding_tables)); + JUST(attrs.SetAttr("num_columns", num_columns)); + JUST(attrs.SetAttr("embedding_columns", embedding_columns)); JUST(attrs.SetAttr("key_value_store_options", key_value_store_options)); - if (table_ids) { - return OpInterpUtil::Dispatch(*op_has_table_ids_, {shadow, ids, JUST(table_ids)}, + if (column_ids) { + return OpInterpUtil::Dispatch(*op_has_column_ids_, {shadow, ids, JUST(column_ids)}, attrs); } else { - return OpInterpUtil::Dispatch(*op_no_table_ids_, {shadow, ids}, attrs); + return OpInterpUtil::Dispatch(*op_no_column_ids_, {shadow, ids}, attrs); } } private: - std::shared_ptr op_has_table_ids_; - std::shared_ptr op_no_table_ids_; + std::shared_ptr op_has_column_ids_; + std::shared_ptr op_no_column_ids_; }; class OneEmbeddingUniqueKeyValuePairFunctor { @@ -2527,9 +2520,9 @@ class 
OneEmbeddingUniqueKeyValuePairFunctor { Maybe operator()(const std::shared_ptr& keys, const Optional& values, - const int32_t num_tables) const { + const int32_t num_columns) const { MutableAttrMap attrs; - JUST(attrs.SetAttr("num_tables", num_tables)); + JUST(attrs.SetAttr("num_columns", num_columns)); if (values) { return OpInterpUtil::Dispatch(*op_has_input_value_, {keys, JUST(values)}, attrs); } else { @@ -2542,180 +2535,6 @@ class OneEmbeddingUniqueKeyValuePairFunctor { std::shared_ptr op_no_input_value_; }; -class OneEmbeddingSgdUpdateFunctor { - public: - OneEmbeddingSgdUpdateFunctor() { - // This functor is just for unittest - sgd_op_ = CHECK_JUST(one::OpBuilder("sgd_embedding_update") - .Input("num_unique_ids") - .Input("unique_embeddings") - .Input("embedding_grad") - .Input("learning_rate") - .Input("down_scale_by_tensor") - .Input("skip_if") - .Output("updated_unique_embeddings") - .Build()); - momentum_op_ = CHECK_JUST(one::OpBuilder("momentum_embedding_update") - .Input("num_unique_ids") - .Input("unique_embeddings") - .Input("embedding_grad") - .Input("learning_rate") - .Input("down_scale_by_tensor") - .Input("skip_if") - .Output("updated_unique_embeddings") - .Build()); - } - - Maybe operator()(const std::shared_ptr& num_unique_ids, - const std::shared_ptr& unique_embeddings, - const std::shared_ptr& embedding_grad, - const std::shared_ptr& learning_rate, - const std::shared_ptr& down_scale_by_tensor, - const std::shared_ptr& skip_if, const double scale, - const float weight_decay, const float momentum) const { - MutableAttrMap attrs; - JUST(attrs.SetAttr("scale", scale)); - JUST(attrs.SetAttr("weight_decay", weight_decay)); - if (momentum == 0) { - return OpInterpUtil::Dispatch(*sgd_op_, - {num_unique_ids, unique_embeddings, embedding_grad, - learning_rate, down_scale_by_tensor, skip_if}, - attrs); - } else { - JUST(attrs.SetAttr("beta", momentum)); - return OpInterpUtil::Dispatch(*momentum_op_, - {num_unique_ids, unique_embeddings, embedding_grad, - learning_rate, down_scale_by_tensor, skip_if}, - attrs); - } - } - - private: - std::shared_ptr sgd_op_; - std::shared_ptr momentum_op_; -}; - -class OneEmbeddingAdamUpdateFunctor { - public: - OneEmbeddingAdamUpdateFunctor() { - // This functor is just for unittest - no_bias_correction_op_ = CHECK_JUST(one::OpBuilder("adam_embedding_update") - .Input("num_unique_ids") - .Input("unique_embeddings") - .Input("embedding_grad") - .Input("learning_rate") - .Input("down_scale_by_tensor") - .Input("skip_if") - .Output("updated_unique_embeddings") - .Build()); - do_bias_correction_op_ = CHECK_JUST(one::OpBuilder("adam_embedding_update") - .Input("num_unique_ids") - .Input("unique_embeddings") - .Input("embedding_grad") - .Input("learning_rate") - .Input("down_scale_by_tensor") - .Input("skip_if") - .Input("bias_correction1") - .Input("bias_correction2") - .Output("updated_unique_embeddings") - .Build()); - } - - Maybe operator()(const std::shared_ptr& num_unique_ids, - const std::shared_ptr& unique_embeddings, - const std::shared_ptr& embedding_grad, - const std::shared_ptr& learning_rate, - const std::shared_ptr& down_scale_by_tensor, - const std::shared_ptr& skip_if, - const Optional& bias_correction1, - const Optional& bias_correction2, const double scale, - const float weight_decay, const float beta1, const float beta2, - const float epsilon, const bool do_bias_correction) const { - MutableAttrMap attrs; - JUST(attrs.SetAttr("scale", scale)); - JUST(attrs.SetAttr("weight_decay", weight_decay)); - JUST(attrs.SetAttr("beta1", 
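The OneEmbeddingSgdUpdateFunctor removed below is, per its own comment, a unit-test helper around the sgd/momentum embedding-update kernels. For orientation, the conventional SGD-with-momentum step those kernels presumably implement is sketched here; the exact ordering of weight decay and the down_scale input is an assumption, not taken from the patch.

#include <cstddef>
#include <vector>

// Reference math only: plain SGD step with optional momentum (beta).
void SgdMomentumStep(std::vector<float>& w, std::vector<float>& v,
                     const std::vector<float>& grad, float lr, float scale,
                     float weight_decay, float beta) {
  for (std::size_t i = 0; i < w.size(); ++i) {
    const float g = scale * grad[i] + weight_decay * w[i];
    if (beta == 0.0f) {
      w[i] -= lr * g;
    } else {
      v[i] = beta * v[i] + g;   // momentum buffer update
      w[i] -= lr * v[i];
    }
  }
}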
beta1)); - JUST(attrs.SetAttr("beta2", beta2)); - JUST(attrs.SetAttr("epsilon", epsilon)); - JUST(attrs.SetAttr("do_bias_correction", do_bias_correction)); - if (do_bias_correction) { - CHECK(bias_correction1); - CHECK(bias_correction2); - return OpInterpUtil::Dispatch( - *do_bias_correction_op_, - {num_unique_ids, unique_embeddings, embedding_grad, learning_rate, down_scale_by_tensor, - skip_if, JUST(bias_correction1), JUST(bias_correction2)}, - attrs); - } else { - return OpInterpUtil::Dispatch(*no_bias_correction_op_, - {num_unique_ids, unique_embeddings, embedding_grad, - learning_rate, down_scale_by_tensor, skip_if}, - attrs); - } - } - - private: - std::shared_ptr no_bias_correction_op_; - std::shared_ptr do_bias_correction_op_; -}; - -class OneEmbeddingAdagradUpdateFunctor { - public: - OneEmbeddingAdagradUpdateFunctor() { - // This functor is just for unittest - op_ = CHECK_JUST(one::OpBuilder("adagrad_embedding_update") - .Input("num_unique_ids") - .Input("unique_embeddings") - .Input("embedding_grad") - .Input("learning_rate") - .Input("down_scale_by_tensor") - .Input("skip_if") - .Input("train_step") - .Output("updated_unique_embeddings") - .Build()); - } - - Maybe operator()(const std::shared_ptr& num_unique_ids, - const std::shared_ptr& unique_embeddings, - const std::shared_ptr& embedding_grad, - const std::shared_ptr& learning_rate, - const std::shared_ptr& down_scale_by_tensor, - const std::shared_ptr& skip_if, - const std::shared_ptr& train_step, const double scale, - const float weight_decay, const float lr_decay, - const float epsilon) const { - MutableAttrMap attrs; - JUST(attrs.SetAttr("scale", scale)); - JUST(attrs.SetAttr("weight_decay", weight_decay)); - JUST(attrs.SetAttr("lr_decay", lr_decay)); - JUST(attrs.SetAttr("epsilon", epsilon)); - return OpInterpUtil::Dispatch( - *op_, - {num_unique_ids, unique_embeddings, embedding_grad, learning_rate, down_scale_by_tensor, - skip_if, train_step}, - attrs); - } - - private: - std::shared_ptr op_; -}; - -class RocAucScoreFunctor { - public: - RocAucScoreFunctor() { - op_ = CHECK_JUST( - one::OpBuilder("roc_auc_score").Input("label").Input("pred").Output("out").Build()); - } - - Maybe operator()(const std::shared_ptr& label, - const std::shared_ptr& pred) const { - return OpInterpUtil::Dispatch(*op_, {label, pred}); - } - - private: - std::shared_ptr op_; -}; - } // namespace impl ONEFLOW_FUNCTION_LIBRARY(m) { @@ -2793,10 +2612,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { "OneEmbeddingEmbeddingGradientShuffle"); m.add_functor("OneEmbeddingLookup"); m.add_functor("OneEmbeddingUniqueKeyValuePair"); - m.add_functor("OneEmbeddingSgdUpdate"); - m.add_functor("OneEmbeddingAdamUpdate"); - m.add_functor("OneEmbeddingAdagradUpdate"); - m.add_functor("RocAucScore"); }; } // namespace functional diff --git a/oneflow/core/functional/impl/nn_grad_functor.cpp b/oneflow/core/functional/impl/nn_grad_functor.cpp index 3057df2e36e..437977dd2c0 100644 --- a/oneflow/core/functional/impl/nn_grad_functor.cpp +++ b/oneflow/core/functional/impl/nn_grad_functor.cpp @@ -118,6 +118,7 @@ class PoolingNdGradFunctor { const auto& op_type_name = GetOpTypeName(mode, ndims); op_expr_map_[op_type_name] = CHECK_JUST(one::OpBuilder(op_type_name) .Input("x") + .Input("y") .Input("indice") .Input("dy") .Output("dx") @@ -129,6 +130,7 @@ class PoolingNdGradFunctor { return mode + "pool_" + std::to_string(ndims) + "d_grad"; } Maybe operator()(const std::shared_ptr& x, + const std::shared_ptr& y, const std::shared_ptr& indice, const std::shared_ptr& dy, const std::string& 
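The removed OneEmbeddingAdamUpdateFunctor takes bias_correction1/bias_correction2 as extra tensor inputs when do_bias_correction is set. For reference, the standard Adam step with externally supplied bias corrections (1 - beta^t) looks as follows; treating the kernel as exactly this formula is an assumption.

#include <cmath>
#include <cstddef>
#include <vector>

// Reference math only: Adam with precomputed bias_correction1/2.
void AdamStep(std::vector<float>& w, std::vector<float>& m, std::vector<float>& v,
              const std::vector<float>& grad, float lr, float beta1, float beta2,
              float epsilon, float bias_correction1, float bias_correction2) {
  for (std::size_t i = 0; i < w.size(); ++i) {
    m[i] = beta1 * m[i] + (1.0f - beta1) * grad[i];
    v[i] = beta2 * v[i] + (1.0f - beta2) * grad[i] * grad[i];
    const float m_hat = m[i] / bias_correction1;
    const float v_hat = v[i] / bias_correction2;
    w[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
  }
}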
mode, const int32_t& ndims, const std::string& data_format, @@ -149,7 +151,7 @@ class PoolingNdGradFunctor { CHECK_OR_RETURN(it != op_expr_map_.end()) << "Encounter unsupported op " << op_type_name << " in PoolingNdGradFunctor."; CHECK_NOTNULL_OR_RETURN(it->second); - return OpInterpUtil::Dispatch(*it->second, {x, indice, dy}, attrs); + return OpInterpUtil::Dispatch(*it->second, {x, y, indice, dy}, attrs); } protected: @@ -648,14 +650,15 @@ class AvgPoolingNdGradFunctor { AvgPoolingNdGradFunctor() { for (int ndims = 1; ndims <= 3; ++ndims) { const auto& op_type_name = GetOpTypeName(ndims); - op_expr_map_[op_type_name] = - CHECK_JUST(one::OpBuilder(op_type_name).Input("x").Input("dy").Output("dx").Build()); + op_expr_map_[op_type_name] = CHECK_JUST( + one::OpBuilder(op_type_name).Input("x").Input("y").Input("dy").Output("dx").Build()); } } static std::string GetOpTypeName(const int32_t& ndims) { return "avgpool_" + std::to_string(ndims) + "d_grad"; } Maybe operator()(const std::shared_ptr& x, + const std::shared_ptr& y, const std::shared_ptr& dy, const int32_t& ndims, const std::string& data_format, const std::vector& padding, const std::vector& kernel_size, @@ -674,7 +677,7 @@ class AvgPoolingNdGradFunctor { CHECK_OR_RETURN(it != op_expr_map_.end()) << "Encounter unsupported op " << op_type_name << " in PoolingNdGradFunctor."; CHECK_NOTNULL_OR_RETURN(it->second); - return OpInterpUtil::Dispatch(*it->second, {x, dy}, attrs); + return OpInterpUtil::Dispatch(*it->second, {x, y, dy}, attrs); } protected: diff --git a/oneflow/core/functional/impl/unary_functor.cpp b/oneflow/core/functional/impl/unary_functor.cpp index 936506d88ee..d18c83a897f 100644 --- a/oneflow/core/functional/impl/unary_functor.cpp +++ b/oneflow/core/functional/impl/unary_functor.cpp @@ -67,8 +67,7 @@ namespace impl { OF_PP_MAKE_TUPLE_SEQ("sqrt", Sqrt) \ OF_PP_MAKE_TUPLE_SEQ("square", Square) \ OF_PP_MAKE_TUPLE_SEQ("tan", Tan) \ - OF_PP_MAKE_TUPLE_SEQ("tanh", Tanh) \ - OF_PP_MAKE_TUPLE_SEQ("not_equal_zero", NotEqualZero) + OF_PP_MAKE_TUPLE_SEQ("tanh", Tanh) #define LOGICAL_FLOAT_UNARY_FUNC_SEQ OF_PP_MAKE_TUPLE_SEQ("logical_not", LogicalNot) @@ -152,7 +151,6 @@ ONEFLOW_FUNCTION_LIBRARY(m) { ADD_UNARY_FUNCTOR(Square, "Square"); ADD_UNARY_FUNCTOR(Tan, "Tan"); ADD_UNARY_FUNCTOR(Tanh, "Tanh"); - ADD_UNARY_FUNCTOR(NotEqualZero, "NotEqualZero") m.add_functor("LogicalNot"); m.add_functor("Sin_"); m.add_functor("Floor_"); diff --git a/oneflow/core/functional/tensor_index.cpp b/oneflow/core/functional/tensor_index.cpp index 048da540131..312664a184d 100644 --- a/oneflow/core/functional/tensor_index.cpp +++ b/oneflow/core/functional/tensor_index.cpp @@ -334,9 +334,9 @@ Maybe ApplyAdvancedIndexing(const std::shared_ptr& input, const auto& broadcast_sbp = JUST(MakeBroadcastSbpParallel()); int n = JUST(input->nd_sbp())->sbp_parallel_size(); std::vector> grad_sbp_tuple; - packed_indices = JUST(ToConsistent(packed_indices, placement, - std::vector>(n, broadcast_sbp), - grad_sbp_tuple, /* check_meta */ false)); + packed_indices = + JUST(ToConsistent(packed_indices, placement, + std::vector>(n, broadcast_sbp), grad_sbp_tuple)); } else { Symbol device = JUST(transposed_input->device()); if (JUST(packed_indices->device()) != device) { diff --git a/oneflow/core/job/graph_scope_vars.cpp b/oneflow/core/job/graph_scope_vars.cpp index 758e7ca1b35..d545642ef48 100644 --- a/oneflow/core/job/graph_scope_vars.cpp +++ b/oneflow/core/job/graph_scope_vars.cpp @@ -19,20 +19,10 @@ namespace oneflow { namespace { -bool* GetGraphVerboseStepLr() { - static 
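The pooling-grad hunks above add the forward output y as an extra input and keep one op expression per ndims in a map keyed by a generated type name. The lookup-and-dispatch shape, with stand-in types, is roughly:

#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct OpExpr {};

// Per-ndims op registry as used by PoolingNdGradFunctor / AvgPoolingNdGradFunctor:
// ops are built once into a map keyed by "<mode>pool_<N>d_grad" and looked up later.
struct NdGradRegistry {
  std::map<std::string, std::shared_ptr<OpExpr>> op_expr_map;

  static std::string OpTypeName(const std::string& mode, int ndims) {
    return mode + "pool_" + std::to_string(ndims) + "d_grad";
  }

  const OpExpr& Lookup(const std::string& mode, int ndims) const {
    const auto it = op_expr_map.find(OpTypeName(mode, ndims));
    if (it == op_expr_map.end()) {
      throw std::runtime_error("Encounter unsupported op " + OpTypeName(mode, ndims));
    }
    return *it->second;
  }
};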
thread_local bool graph_verbose_step_lr = false; +std::atomic* GetGraphVerboseStepLr() { + static std::atomic graph_verbose_step_lr{false}; return &graph_verbose_step_lr; } - -int32_t* GetGraphDebugMaxPyStackDepthVar() { - static thread_local int32_t graph_debug_max_py_stack_depth = 2; - return &graph_debug_max_py_stack_depth; -} - -bool* GetGraphDebugModeFlag() { - static thread_local bool graph_debug_mode_flag = false; - return &graph_debug_mode_flag; -} } // namespace bool IsOpenGraphVerboseStepLr() { @@ -46,9 +36,13 @@ void SetGraphVerboseStepLr(bool verbose) { *graph_verbose_step_lr = verbose; } -void SetGraphDebugMaxPyStackDepth(int32_t depth) { *GetGraphDebugMaxPyStackDepthVar() = depth; } -int32_t GetGraphDebugMaxPyStackDepth() { return *GetGraphDebugMaxPyStackDepthVar(); } +std::atomic* GetGraphDebugMaxPyStackDepthVar() { + static std::atomic graph_debug_max_py_stack_depth{2}; + return &graph_debug_max_py_stack_depth; +} -void SetGraphDebugMode(bool mode) { *GetGraphDebugModeFlag() = mode; } -bool GetGraphDebugMode() { return *GetGraphDebugModeFlag(); } +std::atomic* GetGraphDebugModeFlag() { + static std::atomic graph_debug_mode_flag{false}; + return &graph_debug_mode_flag; +} } // namespace oneflow diff --git a/oneflow/core/job/graph_scope_vars.h b/oneflow/core/job/graph_scope_vars.h index de6d39c6312..0fa922ac113 100644 --- a/oneflow/core/job/graph_scope_vars.h +++ b/oneflow/core/job/graph_scope_vars.h @@ -16,16 +16,15 @@ limitations under the License. #ifndef ONEFLOW_CORE_JOB_GRAPH_SCOPE_VARS_H_ #define ONEFLOW_CORE_JOB_GRAPH_SCOPE_VARS_H_ -#include +#include "oneflow/core/common/maybe.h" + namespace oneflow { bool IsOpenGraphVerboseStepLr(); void SetGraphVerboseStepLr(bool verbose); -void SetGraphDebugMaxPyStackDepth(int32_t depth); -int32_t GetGraphDebugMaxPyStackDepth(); -void SetGraphDebugMode(bool mode); -bool GetGraphDebugMode(); +std::atomic* GetGraphDebugMaxPyStackDepthVar(); +std::atomic* GetGraphDebugModeFlag(); } // namespace oneflow #endif // ONEFLOW_CORE_JOB_GRAPH_SCOPE_VARS_H_ diff --git a/oneflow/core/job/job_conf.proto b/oneflow/core/job/job_conf.proto index 664a0ac5989..36c0aca4483 100644 --- a/oneflow/core/job/job_conf.proto +++ b/oneflow/core/job/job_conf.proto @@ -60,8 +60,8 @@ message AdagradModelUpdateConf { } message ClipByGlobalNormConf { - optional float max_norm = 1 [default = 1.0]; - optional double norm_type = 2 [default = 2.0]; + required float clip_norm = 1; + optional float global_norm = 2; } message ClipConf { diff --git a/oneflow/core/job/runtime.cpp b/oneflow/core/job/runtime.cpp index 7cfdc1735dc..f15357abc47 100644 --- a/oneflow/core/job/runtime.cpp +++ b/oneflow/core/job/runtime.cpp @@ -58,11 +58,10 @@ bool HasNonCtrlConsumedRegstDescId(const TaskProto& task) { } // namespace Runtime::Runtime(const Plan& plan, const HashMap& variable_op_name2eager_blob) { - DumpThreadIdsFromPlan(plan); { // NOTE(chengcheng): All runtime Global objects AddPlan Global::Get()->AddPlan(plan, variable_op_name2eager_blob); - Global::Get()->AddThreads(thread_ids_); + Global::Get()->AddPlan(plan); Global::Get()->AddPlan(plan); collective_boxing_scheduler_plan_token_ = Global::Get()->AddPlan(plan); @@ -107,27 +106,7 @@ Runtime::~Runtime() { Global::Get()->WaitUntilCntEqualZero(GetRunningActorCountKeyByJobId(pair.first)); } OF_SESSION_BARRIER(); - Global::Get()->DeleteThreads(independent_thread_ids_); Global::Get()->DeletePlan(collective_boxing_scheduler_plan_token_); } -void Runtime::DumpThreadIdsFromPlan(const Plan& plan) { - const int64_t this_rank = 
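The graph_scope_vars.cpp hunk swaps between two flavors of the same accessor: the master side keeps each flag thread-local, the reverted side shares one std::atomic across threads. Side by side, as a minimal sketch:

#include <atomic>

// Per-thread state (master side): every thread sees its own copy of the flag.
bool* GetFlagThreadLocal() {
  static thread_local bool flag = false;
  return &flag;
}

// Process-wide state (reverted side): one shared, lock-free atomic flag.
std::atomic<bool>* GetFlagAtomic() {
  static std::atomic<bool> flag{false};
  return &flag;
}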
GlobalProcessCtx::Rank(); - for (const TaskProto& task : plan.task()) { - TaskId task_id = DecodeTaskIdFromInt64(task.task_id()); - StreamId stream_id = task_id.stream_id(); - if (stream_id.rank() != this_rank) { continue; } - int64_t thrd_id = EncodeStreamIdToInt64(stream_id); - thread_ids_.insert(thrd_id); - // NOTE(chengcheng): there is not a interface to query whether a task type is indenpendent, - // so use hard code. - if (task.task_type() == TaskType::kWaitAndSendIds - || task.task_type() == TaskType::kCriticalSectionWaitTick) { - CHECK(independent_thread_ids_.insert(thrd_id).second) - << " RuntimeError! Thread : " << thrd_id - << " not independent with task proto: " << task.DebugString(); - } - } -} - } // namespace oneflow diff --git a/oneflow/core/job/runtime.h b/oneflow/core/job/runtime.h index c305aaa4a1f..d784de07fb1 100644 --- a/oneflow/core/job/runtime.h +++ b/oneflow/core/job/runtime.h @@ -33,11 +33,7 @@ class Runtime final { Runtime(const Plan& plan, const HashMap& variable_op_name2eager_blob); private: - void DumpThreadIdsFromPlan(const Plan& plan); - HashMap job_id2actor_size_; - HashSet thread_ids_; - HashSet independent_thread_ids_; boxing::collective::SchedulerPlanToken* collective_boxing_scheduler_plan_token_; }; diff --git a/oneflow/core/job_rewriter/autograd.cpp b/oneflow/core/job_rewriter/autograd.cpp index b66a765d5c4..38b905ef3c2 100644 --- a/oneflow/core/job_rewriter/autograd.cpp +++ b/oneflow/core/job_rewriter/autograd.cpp @@ -20,7 +20,6 @@ limitations under the License. #include "oneflow/core/operator/variable_op.h" #include "oneflow/core/register/op_blob_arg.pb.h" #include "oneflow/core/common/protobuf.h" -#include "oneflow/core/common/container_util.h" #include "oneflow/core/framework/framework.h" #include "oneflow/core/job_rewriter/job_pass.h" #include "oneflow/core/job_rewriter/dynamic_loss_scale_job_pass_state.h" @@ -438,22 +437,22 @@ std::string AddLbns(JobBuilder* job_builder, const std::vector& lbn } } -std::string AddParallelCast(JobBuilder* job_builder, const std::string& in_lbn, - const std::string& sbp_str, const ParallelConf& parallel_conf, - const std::string& op_name_prefix) { +std::string AddCastToP(JobBuilder* job_builder, const std::string& in_lbn, + const ParallelConf& parallel_conf, const std::string& op_name_prefix) { ParallelConf flat_parallel_conf = parallel_conf; flat_parallel_conf.mutable_hierarchy()->clear_dim(); const int64_t scope_symbol_id = MakeScopeSymbolId(job_builder->job().job_conf(), flat_parallel_conf); - std::vector sbp = {sbp_str}; + std::vector cast_nd_sbp; + cast_nd_sbp.emplace_back("P"); auto parallel_cast_op = user_op::UserOpConfWrapperBuilder(op_name_prefix + NewUniqueId()) .Op("hierarchical_parallel_cast") .Input("in", in_lbn) .Output("out") - .Attr>("nd_sbp", sbp) + .Attr>("nd_sbp", cast_nd_sbp) .Attr("grad_mode", "auto") - .Attr>("grad_nd_sbp", std::vector{}) + .Attr>("grad_nd_sbp", std::vector()) .ScopeSymbolId(scope_symbol_id) .Build(); job_builder->AddOps(flat_parallel_conf, {parallel_cast_op.op_conf()}); @@ -468,300 +467,123 @@ bool IsBroadcast(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc) { return true; } -bool HasSplit(const NdSbp& nd_sbp, const ParallelDesc& parallel_desc) { - if (parallel_desc.parallel_num() == 1) { return false; } - for (const auto& sbp : nd_sbp.sbp_parallel()) { - if (sbp.has_split_parallel()) { return true; } - } - return false; -} - -OperatorConf GenConstantLikeOp(const std::string& op_name, int64_t scope_symbol_id, - const std::string& like_lbn, double value, DataType dtype) 
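The removed Runtime::DumpThreadIdsFromPlan walks the plan, keeps only tasks whose stream lives on this rank, records their thread ids, and additionally flags the two hard-coded "independent" task types. In outline (TaskLike is a stand-in for the few TaskProto fields the loop actually reads):

#include <cstdint>
#include <set>
#include <vector>

struct TaskLike {
  int64_t rank;       // decoded from the task's stream id
  int64_t thrd_id;    // EncodeStreamIdToInt64(stream_id)
  bool independent;   // kWaitAndSendIds or kCriticalSectionWaitTick
};

void CollectThreadIds(const std::vector<TaskLike>& tasks, int64_t this_rank,
                      std::set<int64_t>* thread_ids, std::set<int64_t>* independent_ids) {
  for (const TaskLike& t : tasks) {
    if (t.rank != this_rank) { continue; }
    thread_ids->insert(t.thrd_id);
    if (t.independent) { independent_ids->insert(t.thrd_id); }
  }
}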
{ - OperatorConf op_conf; - op_conf.set_name(op_name); - op_conf.set_scope_symbol_id(scope_symbol_id); - ConstantLikeOpConf* constant_like_conf = op_conf.mutable_constant_like_conf(); - constant_like_conf->set_like(like_lbn); - if (dtype == DataType::kInt32) { - constant_like_conf->set_int_operand(static_cast(value)); - } else if (dtype == DataType::kInt64) { - constant_like_conf->set_int_operand(static_cast(value)); - } else if (dtype == DataType::kFloat) { - constant_like_conf->set_float_operand(static_cast(value)); - } else if (dtype == DataType::kDouble) { - constant_like_conf->set_float_operand(value); - } else { - UNIMPLEMENTED(); - } - constant_like_conf->set_data_type(dtype); - constant_like_conf->set_out("out"); - return op_conf; -} - -std::string GlobalAbsMaxMin(const OpGraph& op_graph, JobBuilder* job_builder, - const HashMap& lbi2diff_lbi, - bool max_or_min, ParallelConf* out_parallel_conf) { - // max(abs(x)) - bool all_same_parallel_desc = true; - const ParallelDesc& any_parallel_desc = - op_graph.OpNode4OpName(lbi2diff_lbi.begin()->first.op_name())->parallel_desc(); - std::vector group_reduce_lbns; - - auto GroupReduce = [&](const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, - const std::vector& lbis) { - if (!parallel_desc.EqualsIgnoringHierarchy(any_parallel_desc)) { - all_same_parallel_desc = false; - } - int64_t scope_symbol_id = - MakeScopeSymbolId(job_builder->job().job_conf(), parallel_desc.parallel_conf()); - bool has_split = HasSplit(nd_sbp, parallel_desc); - if (job_builder->job().job_conf().enable_gradients_stats_aggregation()) { - std::string multi_reduce_op_type_name = - has_split ? (max_or_min ? "local_multi_reduce_max_abs" : "local_multi_reduce_min_abs") - : (max_or_min ? "multi_reduce_max_abs" : "multi_reduce_min_abs"); - std::string multi_reduce_op_name = - "System-ClipGradient-GlobalNorm-MultiReduceXimumAbs-" + NewUniqueId(); - auto multi_reduce_op_builder = user_op::UserOpConfWrapperBuilder(multi_reduce_op_name) - .Op(multi_reduce_op_type_name) - .Output("y") - .ScopeSymbolId(scope_symbol_id); - for (const auto& lbi : lbis) { - multi_reduce_op_builder.Input("x", GenLogicalBlobName(lbi2diff_lbi.at(lbi))); - } - auto multi_reduce_op = multi_reduce_op_builder.Build(); - job_builder->AddOps(parallel_desc.parallel_conf(), {multi_reduce_op.op_conf()}); - if (has_split) { - std::string group_reduce_op_type_name = max_or_min ? "reduce_max" : "reduce_min"; - std::string group_reduce_op_name = - "System-ClipGradient-GlobalNorm-GroupReduceXimum-" + NewUniqueId(); - auto group_reduce_op = user_op::UserOpConfWrapperBuilder(group_reduce_op_name) - .Op(group_reduce_op_type_name) - .Input("input_tensor", multi_reduce_op.output("y", 0)) - .Output("output_tensor") - .Attr("axis", std::vector{0}) - .Attr("keepdims", false) - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(parallel_desc.parallel_conf(), {group_reduce_op.op_conf()}); - group_reduce_lbns.push_back(group_reduce_op.output("output_tensor", 0)); - } else { - group_reduce_lbns.push_back(multi_reduce_op.output("y", 0)); - } - } else { - UNIMPLEMENTED(); - } - }; - ForEachAggregatedParamGroup(op_graph, lbi2diff_lbi, GroupReduce); - CHECK_GT(group_reduce_lbns.size(), 0); - - *out_parallel_conf = all_same_parallel_desc ? 
any_parallel_desc.parallel_conf() - : GenParallelConfOfCpuZeroOnMaster(); - out_parallel_conf->mutable_hierarchy()->clear_dim(); - if (group_reduce_lbns.size() == 1) { - return group_reduce_lbns[0]; - } else { - // stack all group max and go on max - const int64_t scope_symbol_id = - MakeScopeSymbolId(job_builder->job().job_conf(), *out_parallel_conf); - auto stack_op_builder = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-GlobalStack-" - + NewUniqueId()) - .Op("stack") - .Output("out") - .Attr("axis", int64_t(0)) - .Attr("max_dim_size", static_cast(group_reduce_lbns.size())) - .ScopeSymbolId(scope_symbol_id); - for (const auto& lbn : group_reduce_lbns) { stack_op_builder.Input("in", lbn); } - auto stack_op = stack_op_builder.Build(); - job_builder->AddOps(*out_parallel_conf, {stack_op.op_conf()}); - - std::string reduce_op_type_name = max_or_min ? "reduce_max" : "reduce_min"; - std::string reduce_op_name = - "System-ClipGradient-GlobalNorm-GlobalReduceXimum-" + NewUniqueId(); - auto reduce_op = user_op::UserOpConfWrapperBuilder(reduce_op_name) - .Op(reduce_op_type_name) - .Input("input_tensor", stack_op.output("out", 0)) - .Output("output_tensor") - .Attr("axis", std::vector{0}) - .Attr("keepdims", false) - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(*out_parallel_conf, {reduce_op.op_conf()}); - return reduce_op.output("output_tensor", 0); - } -} - -std::string GlobalNorm(const OpGraph& op_graph, JobBuilder* job_builder, - const HashMap& lbi2diff_lbi, float p, - ParallelConf* out_parallel_conf) { - bool all_same_parallel_desc = true; - const ParallelDesc& any_parallel_desc = - op_graph.OpNode4OpName(lbi2diff_lbi.begin()->first.op_name())->parallel_desc(); - bool all_broadcast = true; - std::vector group_lbns; - std::vector group_parallel_confs; - group_lbns.reserve(lbi2diff_lbi.size()); - group_parallel_confs.reserve(lbi2diff_lbi.size()); - - auto GroupNorm = [&](const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, - const std::vector& lbis) { - if (!parallel_desc.EqualsIgnoringHierarchy(any_parallel_desc)) { - all_same_parallel_desc = false; - } - int64_t scope_symbol_id = - MakeScopeSymbolId(job_builder->job().job_conf(), parallel_desc.parallel_conf()); - if (!IsBroadcast(nd_sbp, parallel_desc)) { all_broadcast = false; } - group_parallel_confs.emplace_back(parallel_desc.parallel_conf()); - - if (job_builder->job().job_conf().enable_gradients_stats_aggregation()) { - auto multi_reduce_sum_op_builder = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-MultiReduceSumPowAbs-" - + NewUniqueId()) - .Op("multi_reduce_sum_pow_abs") - .Attr("p", p) - .Output("y") - .ScopeSymbolId(scope_symbol_id); - for (const auto& lbi : lbis) { - multi_reduce_sum_op_builder.Input("x", GenLogicalBlobName(lbi2diff_lbi.at(lbi))); - } - const auto multi_reduce_sum_op = multi_reduce_sum_op_builder.Build(); - job_builder->AddOps(parallel_desc.parallel_conf(), {multi_reduce_sum_op.op_conf()}); - group_lbns.emplace_back(multi_reduce_sum_op.output("y", 0)); - } else { - std::vector lbns_to_add; - lbns_to_add.reserve(lbis.size()); - for (const auto& lbi : lbis) { - const LogicalBlobId& diff_lbi = lbi2diff_lbi.at(lbi); - const auto square_sum_op = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-ReduceSumPowAbs-" - + NewUniqueId()) - .Op("multi_reduce_sum_pow_abs") - .Input("x", GenLogicalBlobName(diff_lbi)) - .Attr("p", p) - .Output("y") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(parallel_desc.parallel_conf(), 
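Numerically, the removed GlobalAbsMaxMin / GlobalNorm graph rewrites compute, over all gradient elements, either max|g| (norm_type = +inf, min|g| for -inf) or (sum |g|^p)^(1/p); the multi_reduce ops, the stack/reduce combination and the final scalar_pow are just that reduction distributed across parameter groups. A scalar reference over a flattened gradient vector:

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference for the total gradient norm the removed helpers assemble.
double TotalGradNorm(const std::vector<double>& grads, double p) {
  if (std::isinf(p)) {
    double m = 0.0;
    for (double g : grads) { m = std::max(m, std::fabs(g)); }
    return m;  // +inf norm; the -inf variant takes the minimum of |g| instead
  }
  double acc = 0.0;
  for (double g : grads) { acc += std::pow(std::fabs(g), p); }
  return std::pow(acc, 1.0 / p);
}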
{square_sum_op.op_conf()}); - lbns_to_add.emplace_back(square_sum_op.output("y", 0)); - } - group_lbns.emplace_back(AddLbns(job_builder, lbns_to_add, parallel_desc.parallel_conf(), - scope_symbol_id, "System-ClipGradient-GlobalNorm-Add-")); - } - }; - ForEachAggregatedParamGroup(op_graph, lbi2diff_lbi, GroupNorm); - - // sum in group - *out_parallel_conf = all_same_parallel_desc ? any_parallel_desc.parallel_conf() - : GenParallelConfOfCpuZeroOnMaster(); - const int64_t scope_symbol_id = - MakeScopeSymbolId(job_builder->job().job_conf(), *out_parallel_conf); - std::vector sum_group_lbns; - if (all_broadcast) { - sum_group_lbns = std::move(group_lbns); - } else { - sum_group_lbns.reserve(group_lbns.size()); - for (size_t i = 0; i < group_lbns.size(); ++i) { - std::string lbn; - if (all_same_parallel_desc) { - // reduce many times P->B (allreduce) to 1 times - lbn = AddParallelCast(job_builder, group_lbns.at(i), "P", group_parallel_confs.at(i), - "System-ClipGradient-ParallelCast-"); - } else { - // sum will run on cpu 0, we need do P->B first, - // because when execution is on single device, only B is accepted - lbn = AddParallelCast(job_builder, group_lbns.at(i), "B", group_parallel_confs.at(i), - "System-ClipGradient-ParallelCast-"); - } - sum_group_lbns.push_back(std::move(lbn)); - } - out_parallel_conf->mutable_hierarchy()->clear_dim(); - } - auto global_reduce_sum_lbn = AddLbns(job_builder, sum_group_lbns, *out_parallel_conf, - scope_symbol_id, "System-ClipGradient-GlobalNorm-Add-"); - - auto global_pow_op = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-GlobalPow-" + NewUniqueId()) - .Op("scalar_pow") - .Input("in", global_reduce_sum_lbn) - .Attr("float_operand", 1.0 / p) - .Attr("has_float_operand", true) - .Output("out") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(*out_parallel_conf, {global_pow_op.op_conf()}); - - return global_pow_op.output("out", 0); -} - void ClipGradientByGlobalNorm(const OpGraph& op_graph, JobBuilder* job_builder, HashMap* lbi2diff_lbi, const ClipByGlobalNormConf& conf) { if (lbi2diff_lbi->empty()) { return; } - ParallelConf parallel_conf; - std::string total_norm_lbn; - CHECK(conf.has_norm_type()); - double norm_type = conf.norm_type(); - if (std::isinf(norm_type) && norm_type > 0) { - total_norm_lbn = GlobalAbsMaxMin(op_graph, job_builder, *lbi2diff_lbi, true, ¶llel_conf); - } else if (std::isinf(norm_type) && norm_type < 0) { - total_norm_lbn = GlobalAbsMaxMin(op_graph, job_builder, *lbi2diff_lbi, false, ¶llel_conf); + bool all_same_parallel_desc = true; + const ParallelDesc& any_parallel_desc = + op_graph.OpNode4OpName(lbi2diff_lbi->begin()->first.op_name())->parallel_desc(); + const size_t loop_size = lbi2diff_lbi->size(); + std::vector partial_square_sum_lbns; + partial_square_sum_lbns.reserve(loop_size); + std::vector is_broadcast_nd_sbp; + is_broadcast_nd_sbp.reserve(loop_size); + std::vector param_group_parallel_confs; + param_group_parallel_confs.reserve(loop_size); + ForEachAggregatedParamGroup( + op_graph, *lbi2diff_lbi, + [&](const ParallelDesc& parallel_desc, const NdSbp& nd_sbp, + const std::vector& lbis) { + if (!parallel_desc.EqualsIgnoringHierarchy(any_parallel_desc)) { + all_same_parallel_desc = false; + } + int64_t scope_symbol_id = + MakeScopeSymbolId(job_builder->job().job_conf(), parallel_desc.parallel_conf()); + is_broadcast_nd_sbp.emplace_back(IsBroadcast(nd_sbp, parallel_desc)); + param_group_parallel_confs.emplace_back(parallel_desc.parallel_conf()); + if 
(job_builder->job().job_conf().enable_gradients_stats_aggregation()) { + auto multi_square_sum_op_builder = + user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-MultiSquareSum-" + + NewUniqueId()) + .Op("multi_square_sum") + .Output("y") + .ScopeSymbolId(scope_symbol_id); + for (const auto& lbi : lbis) { + multi_square_sum_op_builder.Input("x", GenLogicalBlobName(lbi2diff_lbi->at(lbi))); + } + const auto multi_square_sum_op = multi_square_sum_op_builder.Build(); + job_builder->AddOps(parallel_desc.parallel_conf(), {multi_square_sum_op.op_conf()}); + partial_square_sum_lbns.emplace_back(multi_square_sum_op.output("y", 0)); + } else { + std::vector lbns_to_add; + lbns_to_add.reserve(lbis.size()); + for (const auto& lbi : lbis) { + const LogicalBlobId& diff_lbi = lbi2diff_lbi->at(lbi); + const auto square_sum_op = + user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-SquareSum-" + + NewUniqueId()) + .Op("square_sum") + .Input("x", GenLogicalBlobName(diff_lbi)) + .Output("y") + .ScopeSymbolId(scope_symbol_id) + .Build(); + job_builder->AddOps(parallel_desc.parallel_conf(), {square_sum_op.op_conf()}); + lbns_to_add.emplace_back(square_sum_op.output("y", 0)); + } + partial_square_sum_lbns.emplace_back( + AddLbns(job_builder, lbns_to_add, parallel_desc.parallel_conf(), scope_symbol_id, + "System-ClipGradient-GlobalNorm-Add-")); + } + }); + ParallelConf global_norm_parallel_conf = all_same_parallel_desc + ? any_parallel_desc.parallel_conf() + : GenParallelConfOfCpuZeroOnMaster(); + const bool all_group_broadcast = + std::all_of(is_broadcast_nd_sbp.begin(), is_broadcast_nd_sbp.end(), [](bool i) { return i; }); + std::vector square_sum_lbns_for_add; + if (!all_group_broadcast) { + for (int64_t i = 0; i < partial_square_sum_lbns.size(); ++i) { + square_sum_lbns_for_add.emplace_back(AddCastToP(job_builder, partial_square_sum_lbns.at(i), + param_group_parallel_confs.at(i), + "System-ClipGradient-ParallelCast-")); + } + global_norm_parallel_conf.mutable_hierarchy()->clear_dim(); } else { - total_norm_lbn = GlobalNorm(op_graph, job_builder, *lbi2diff_lbi, norm_type, ¶llel_conf); + square_sum_lbns_for_add = std::move(partial_square_sum_lbns); } - int64_t scope_symbol_id = MakeScopeSymbolId(job_builder->job().job_conf(), parallel_conf); - - auto add_eps_ops = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-AddEps-" + NewUniqueId()) - .Op("scalar_add") - .Input("in", total_norm_lbn) - .Attr("float_operand", 1e-6) - .Attr("has_float_operand", true) - .Output("out") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(parallel_conf, {add_eps_ops.op_conf()}); - - auto inv_op = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-Inv-" + NewUniqueId()) - .Op("reciprocal_no_nan") - .Input("x", add_eps_ops.output("out", 0)) - .Output("y") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(parallel_conf, {inv_op.op_conf()}); - - auto coeff_op = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-Coeff-" + NewUniqueId()) - .Op("scalar_mul") - .Input("in", inv_op.output("y", 0)) - .Attr("float_operand", static_cast(conf.max_norm())) - .Attr("has_float_operand", true) - .Output("out") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(parallel_conf, {coeff_op.op_conf()}); - - auto clamp_coeff_op = - user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-Clamp-" + NewUniqueId()) - .Op("clip_by_scalar_max") - .Input("x", coeff_op.output("out", 0)) - .Attr("floating_max", 1.0) - 
.Output("y") + const int64_t scope_symbol_id = + MakeScopeSymbolId(job_builder->job().job_conf(), global_norm_parallel_conf); + const std::string square_sum_lbn = + AddLbns(job_builder, square_sum_lbns_for_add, global_norm_parallel_conf, scope_symbol_id, + "System-ClipGradient-GlobalNorm-Add-"); + auto inv_global_norm_op = user_op::UserOpConfWrapperBuilder( + "System-ClipGradient-GlobalNorm-InvGlobalNorm-" + NewUniqueId()) + .Op("rsqrt") + .Input("x", square_sum_lbn) + .Output("y") + .ScopeSymbolId(scope_symbol_id) + .Build(); + job_builder->AddOps(global_norm_parallel_conf, {inv_global_norm_op.op_conf()}); + OperatorConf inv_clip_norm_op_conf{}; + inv_clip_norm_op_conf.set_name("System-ClipGradient-GlobalNorm-InvClipNorm-" + NewUniqueId()); + ConstantLikeOpConf* inv_clip_norm_constant_like_conf = + inv_clip_norm_op_conf.mutable_constant_like_conf(); + inv_clip_norm_constant_like_conf->set_like(inv_global_norm_op.output("y", 0)); + inv_clip_norm_constant_like_conf->set_float_operand(1.0 / conf.clip_norm()); + inv_clip_norm_constant_like_conf->set_out("out"); + inv_clip_norm_op_conf.set_scope_symbol_id(scope_symbol_id); + job_builder->AddOps(global_norm_parallel_conf, {inv_clip_norm_op_conf}); + auto minimum_op = + user_op::UserOpConfWrapperBuilder("System-ClipGradient-GlobalNorm-Minimum-" + NewUniqueId()) + .Op("broadcast_minimum") + .Input("x", inv_global_norm_op.output("y", 0)) + .Input("y", GenLogicalBlobName(inv_clip_norm_op_conf.name(), + inv_clip_norm_constant_like_conf->out())) + .Output("z") .ScopeSymbolId(scope_symbol_id) .Build(); - job_builder->AddOps(parallel_conf, {clamp_coeff_op.op_conf()}); - - const std::string& coeff_lbn = clamp_coeff_op.output("y", 0); + job_builder->AddOps(global_norm_parallel_conf, {minimum_op.op_conf()}); + const std::string gradient_scale_factor_lbn = minimum_op.output("z", 0); for (auto& pair : *lbi2diff_lbi) { const LogicalBlobId& lbi = pair.first; LogicalBlobId& diff_lbi = pair.second; - auto mul_op_name = "System-ClipGradient-GlobalNorm-ScalarMul-" + NewUniqueId(); - auto scalar_mul_op = user_op::UserOpConfWrapperBuilder(mul_op_name) + auto scalar_mul_op = user_op::UserOpConfWrapperBuilder( + "System-ClipGradient-GlobalNorm-ScalarMul-" + NewUniqueId()) .Op("scalar_mul_by_tensor") .Input("x", GenLogicalBlobName(diff_lbi)) - .Input("scalar", coeff_lbn) + .Input("scalar", gradient_scale_factor_lbn) .Output("y") .ScopeSymbolId(ScopeSymbolId4Lbi(op_graph, lbi)) .Build(); @@ -1234,9 +1056,9 @@ Maybe CountNotFiniteIfNeeded(JobPassCtx* ctx, const OpGraph& op_graph, : GenParallelConfOfCpuZeroOnMaster(); if (!all_group_broadcast) { for (int64_t i = 0; i < partial_count_not_finite_lbns.size(); ++i) { - count_not_finite_lbns_for_add.emplace_back(AddParallelCast( - job_builder, JUST(VectorAt(partial_count_not_finite_lbns, i)), "P", - JUST(VectorAt(param_group_parallel_confs, i)), "System-DynamicLossScale-ParallelCast-")); + count_not_finite_lbns_for_add.emplace_back( + AddCastToP(job_builder, partial_count_not_finite_lbns.at(i), + param_group_parallel_confs.at(i), "System-DynamicLossScale-ParallelCast-")); } count_all_parallel_conf.mutable_hierarchy()->clear_dim(); } else { diff --git a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp index 11b7b09973d..5c96addce73 100644 --- a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp +++ b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp @@ -373,12 +373,7 @@ bool TryBuildNcclLogicalOpConf(OperatorConf* ret, const OpNode* src_node, 
const // NOTE(chengcheng): nccl donot support dynamic shape. if (logical_blob_desc.is_dynamic()) { return false; } - CHECK_GT(logical_blob_desc.shape().elem_cnt(), 0) - << dst_node->op().op_name() << " consume " << GenLogicalBlobName(lbi) << ", " - << *CHECK_JUST(PlacementToString(*src_reduced_parallel_desc)) << " " - << NdSbpToString(*src_reduced_nd_sbp) << " -> " - << *CHECK_JUST(PlacementToString(*dst_reduced_parallel_desc)) << " " - << NdSbpToString(*dst_reduced_nd_sbp); + CHECK_GT(logical_blob_desc.shape().elem_cnt(), 0); int64_t scope_symbol_id = CHECK_JUST(BuildScopeWithReducedParallelDesc( src_node->op().op_conf().scope_symbol_id(), *src_reduced_parallel_desc)); diff --git a/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp b/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp index 00d579f9069..0c4bd8daf5d 100644 --- a/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp +++ b/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp @@ -127,9 +127,11 @@ void BuildEmbeddingLookup(JobPassCtx* ctx, JobBuilder* job_builder, const int64_ bool has_embedding_prefetch, const ParallelConf& parallel_conf, const user_op::UserOpConfWrapper& embedding_op, const std::string& num_unique_ids_lbn, const std::string& unique_ids_lbn, - const std::string& unique_table_ids_lbn, std::string* embedding_lbn, - std::string* unique_values_lbn, OperatorConf* embedding_prefetch_op_conf, - OperatorConf* embedding_lookup_op_conf) { + const std::string& unique_columns_lbn, std::string* embedding_lbn, + std::string* unique_values_lbn) { + auto AddIdentityOp = [&](const std::string& in_lbn) -> std::string { + return BuildIdentityOp(job_builder, in_lbn, parallel_conf, embedding_op); + }; std::string context_lbn; if (has_embedding_prefetch) { // embedding prefetch op @@ -137,34 +139,35 @@ void BuildEmbeddingLookup(JobPassCtx* ctx, JobBuilder* job_builder, const int64_ + "_embedding_prefetch"); user_op::UserOpConfWrapper embedding_prefetch_op = embedding_prefetch_op_builder.OpTypeName("embedding_prefetch") - .Input("num_unique_ids", num_unique_ids_lbn) - .Input("unique_ids", unique_ids_lbn) - .Input("table_ids", unique_table_ids_lbn) + .Input("num_unique_ids", AddIdentityOp(num_unique_ids_lbn)) + .Input("unique_ids", AddIdentityOp(unique_ids_lbn)) + .Input("column_ids", AddIdentityOp(unique_columns_lbn)) .Output("context") .Attr("embedding_size", embedding_size) .Attr("line_size", line_size) - .Attr("embedding_tables", - embedding_op.attr("embedding_tables")) + .Attr("embedding_columns", + embedding_op.attr("embedding_columns")) .Attr("embedding_name", embedding_name) .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()) .Build(); - *embedding_prefetch_op_conf = embedding_prefetch_op.op_conf(); - embedding_prefetch_op_conf->set_stream_name_hint(embedding_name + "_EMBEDDING"); - context_lbn = embedding_prefetch_op.output("context", 0); + OperatorConf embedding_prefetch_new_op_conf = embedding_prefetch_op.op_conf(); + embedding_prefetch_new_op_conf.set_stream_name_hint(embedding_name + "_EMBEDDING"); + job_builder->AddOps(parallel_conf, {embedding_prefetch_new_op_conf}); + context_lbn = AddIdentityOp(embedding_prefetch_op.output("context", 0)); } // embedding lookup op user_op::UserOpConfWrapperBuilder embedding_lookup_op_builder(embedding_op.op_name() + "_embedding_lookup"); embedding_lookup_op_builder.OpTypeName("embedding_lookup") - .Input("num_unique_ids", num_unique_ids_lbn) - .Input("unique_ids", unique_ids_lbn) - .Input("table_ids", unique_table_ids_lbn) + .Input("num_unique_ids", 
AddIdentityOp(num_unique_ids_lbn)) + .Input("unique_ids", AddIdentityOp(unique_ids_lbn)) + .Input("column_ids", AddIdentityOp(unique_columns_lbn)) .Output("unique_values") .Attr("dtype", embedding_op.attr("dtype")) .Attr("embedding_size", embedding_size) .Attr("line_size", line_size) - .Attr("embedding_tables", embedding_op.attr("embedding_tables")) + .Attr("embedding_columns", embedding_op.attr("embedding_columns")) .Attr("embedding_name", embedding_name) .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()); if (has_embedding_prefetch) { embedding_lookup_op_builder.Input("context", context_lbn); } @@ -178,8 +181,9 @@ void BuildEmbeddingLookup(JobPassCtx* ctx, JobBuilder* job_builder, const int64_ .Attr("embeddings_dtype", embeddings_dtype); } user_op::UserOpConfWrapper embedding_lookup_op = embedding_lookup_op_builder.Build(); - *embedding_lookup_op_conf = embedding_lookup_op.op_conf(); - embedding_lookup_op_conf->set_stream_name_hint(embedding_name + "_EMBEDDING"); + OperatorConf embedding_lookup_new_op_conf = embedding_lookup_op.op_conf(); + embedding_lookup_new_op_conf.set_stream_name_hint(embedding_name + "_EMBEDDING"); + job_builder->AddOps(parallel_conf, {embedding_lookup_new_op_conf}); if (has_embeddings_output) { *embedding_lbn = embedding_lookup_op.output("embeddings", 0); } else { @@ -196,14 +200,18 @@ void BuildEmbeddingShuffle(JobBuilder* job_builder, const std::string& embedding const std::string& num_unique_matrix_lbn, const std::string& embedding_lbn, std::vector* add_ops, std::string* new_embeddings_lbn) { + auto AddIdentityOp = [&](const std::string& in_lbn) -> std::string { + return BuildIdentityOp(job_builder, in_lbn, parallel_conf, embedding_op); + }; user_op::UserOpConfWrapperBuilder embedding_shuffle_op_builder(embedding_op.op_name() + "_embedding_shuffle"); user_op::UserOpConfWrapper embedding_shuffle_op = embedding_shuffle_op_builder.OpTypeName("embedding_shuffle") .Input("cur_rank_embeddings", embedding_lbn) - .Input("cur_rank_inverse_indices", inverse_indices_lbn) - .Input("inverse_unique_partition_indices", inner_inverse_unique_partition_indices_lbn) - .Input("num_unique_matrix", num_unique_matrix_lbn) + .Input("cur_rank_inverse_indices", AddIdentityOp(inverse_indices_lbn)) + .Input("inverse_unique_partition_indices", + AddIdentityOp(inner_inverse_unique_partition_indices_lbn)) + .Input("num_unique_matrix", AddIdentityOp(num_unique_matrix_lbn)) .Output("embeddings") .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()) .Build(); @@ -225,6 +233,9 @@ void BuildEmbeddingGradientShuffle(JobPassCtx* ctx, const OpGraph& op_graph, const std::string& num_unique_matrix_lbn, const std::string& update_embedding_grad, std::string* cur_rank_unique_embedding_grad_lbn) { + auto AddIdentityOp = [&](const std::string& in_lbn) -> std::string { + return BuildIdentityOp(job_builder, in_lbn, parallel_conf, embedding_op); + }; std::string update_embedding_grad_lbn = update_embedding_grad; if (ctx->job_desc().enable_auto_mixed_precision() && ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_GRADIENT_SHUFFLE_USE_FP16", true)) { @@ -259,10 +270,11 @@ void BuildEmbeddingGradientShuffle(JobPassCtx* ctx, const OpGraph& op_graph, embedding_op.op_name() + "_embedding_gradient_shuffle"); user_op::UserOpConfWrapper embedding_gradient_shuffle_op = embedding_gradient_shuffle_op_builder.OpTypeName("embedding_gradient_shuffle") - .Input("cur_rank_inverse_indices", inverse_indices_lbn) - .Input("inverse_unique_partition_indices", inner_inverse_unique_partition_indices_lbn) + 
.Input("cur_rank_inverse_indices", AddIdentityOp(inverse_indices_lbn)) + .Input("inverse_unique_partition_indices", + AddIdentityOp(inner_inverse_unique_partition_indices_lbn)) .Input("embedding_grad", update_embedding_grad_lbn) - .Input("num_unique_matrix", num_unique_matrix_lbn) + .Input("num_unique_matrix", AddIdentityOp(num_unique_matrix_lbn)) .Output("cur_rank_unique_embedding_grad") .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()) .Build(); @@ -349,22 +361,22 @@ void BuildIdShuffle(bool use_system_gather, const std::string& embedding_name, std::vector* add_ops, std::string* inner_inverse_unique_partition_indices_lbn, std::string* num_unique_ids_lbn, std::string* unique_ids_lbn, - std::string* unique_table_ids_lbn, std::string* inverse_indices_lbn, + std::string* unique_columns_lbn, std::string* inverse_indices_lbn, std::string* num_unique_matrix_lbn) { - const int32_t num_tables = embedding_op.attr("num_tables"); + const int32_t num_columns = embedding_op.attr("num_columns"); if (use_system_gather) { user_op::UserOpConfWrapperBuilder unique_op_builder(embedding_op.op_name() - + "_unique_ids_and_tables"); + + "_unique_ids_and_columns"); unique_op_builder.OpTypeName("unique_key_value_pair") .Input("keys", embedding_op.input("ids", 0)) .Output("num_unique") .Output("unique_keys") .Output("unique_values") .Output("inverse_indices") - .Attr("num_tables", num_tables) + .Attr("num_columns", num_columns) .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()); - if (embedding_op.has_input("table_ids", 0)) { - unique_op_builder.Input("values", embedding_op.input("table_ids", 0)); + if (embedding_op.has_input("column_ids", 0)) { + unique_op_builder.Input("values", embedding_op.input("column_ids", 0)); } user_op::UserOpConfWrapper unique_op = unique_op_builder.Build(); OperatorConf unique_new_op_conf = unique_op.op_conf(); @@ -372,7 +384,7 @@ void BuildIdShuffle(bool use_system_gather, const std::string& embedding_name, add_ops->push_back(unique_new_op_conf); *num_unique_ids_lbn = unique_op.output("num_unique", 0); *unique_ids_lbn = unique_op.output("unique_keys", 0); - *unique_table_ids_lbn = unique_op.output("unique_values", 0); + *unique_columns_lbn = unique_op.output("unique_values", 0); *inverse_indices_lbn = unique_op.output("inverse_indices", 0); } else { user_op::UserOpConfWrapperBuilder id_shuffle_op_builder(embedding_op.op_name() + "_id_shuffle"); @@ -381,13 +393,13 @@ void BuildIdShuffle(bool use_system_gather, const std::string& embedding_name, .Output("inverse_unique_partition_indices") .Output("cur_rank_num_unique") .Output("cur_rank_unique_ids") - .Output("cur_rank_unique_table_ids") + .Output("cur_rank_unique_column_ids") .Output("cur_rank_inverse_indices") .Output("num_unique_matrix") - .Attr("num_tables", num_tables) + .Attr("num_columns", num_columns) .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()); - if (embedding_op.has_input("table_ids", 0)) { - id_shuffle_op_builder.Input("table_ids", embedding_op.input("table_ids", 0)); + if (embedding_op.has_input("column_ids", 0)) { + id_shuffle_op_builder.Input("column_ids", embedding_op.input("column_ids", 0)); } user_op::UserOpConfWrapper id_shuffle_op = id_shuffle_op_builder.Build(); OperatorConf id_shuffle_new_op_conf = id_shuffle_op.op_conf(); @@ -397,40 +409,24 @@ void BuildIdShuffle(bool use_system_gather, const std::string& embedding_name, id_shuffle_op.output("inverse_unique_partition_indices", 0); *num_unique_ids_lbn = id_shuffle_op.output("cur_rank_num_unique", 0); *unique_ids_lbn = 
id_shuffle_op.output("cur_rank_unique_ids", 0); - *unique_table_ids_lbn = id_shuffle_op.output("cur_rank_unique_table_ids", 0); + *unique_columns_lbn = id_shuffle_op.output("cur_rank_unique_column_ids", 0); *inverse_indices_lbn = id_shuffle_op.output("cur_rank_inverse_indices", 0); *num_unique_matrix_lbn = id_shuffle_op.output("num_unique_matrix", 0); } } -void MakeConstantInitializerAttr(const int64_t embedding_size, const int64_t line_size, - const std::vector& values, std::string* initializer_attr) { - if (embedding_size == line_size) { return; } - const int32_t num_states = line_size / embedding_size - 1; - CHECK_GT(num_states, 0) << "num_states " << num_states; - CHECK(values.size() == 0 || num_states == values.size()) - << "must set " << num_states << " optimizer states init value, but get " << values.size(); - nlohmann::json initializers; - for (int32_t i = 0; i < num_states; ++i) { - nlohmann::json initializer; - initializer["type"] = "constant"; - const float initial_value = values.size() > 0 ? values.at(i) : 0.0; - initializer["value"] = initial_value; - initializers.push_back(initializer); - } - *initializer_attr = initializers.dump(); -} - void BuildEmbeddingUpdate(JobPassCtx* ctx, const OpGraph& op_graph, JobBuilder* job_builder, const ParallelConf& parallel_conf, const int64_t embedding_size, - const int64_t line_size, const float l1, const float l2, const std::string& embedding_name, const OptimizerConf& optimizer_conf, const user_op::UserOpConfWrapper& embedding_op, const std::string& num_unique_ids_lbn, const std::string& unique_ids_lbn, const std::string& unique_values_lbn, const std::string& embedding_grad_lbn, - const std::string& learning_rate_lbn, std::string* state_initializer) { + const std::string& learning_rate_lbn) { const TrainConf& train_conf = job_builder->job().job_conf().train_conf(); + auto AddIdentityOp = [&](const std::string& in_lbn) -> std::string { + return BuildIdentityOp(job_builder, in_lbn, parallel_conf, embedding_op); + }; auto AddAdamBiasCorrectionFactorOp = [&](float beta_val, const std::string& op_name) -> std::string { user_op::UserOpConfWrapperBuilder op_builder(embedding_op.op_name() + op_name); @@ -446,7 +442,6 @@ void BuildEmbeddingUpdate(JobPassCtx* ctx, const OpGraph& op_graph, JobBuilder* }; user_op::UserOpConfWrapperBuilder embedding_update_op_builder(embedding_op.op_name() + "_embedding_update"); - std::vector state_constant_init_values; if (optimizer_conf.has_naive_conf()) { embedding_update_op_builder.OpTypeName("sgd_embedding_update"); } else if (optimizer_conf.has_momentum_conf()) { @@ -467,25 +462,13 @@ void BuildEmbeddingUpdate(JobPassCtx* ctx, const OpGraph& op_graph, JobBuilder* embedding_update_op_builder.Input("bias_correction1", bias_correction1_lbn) .Input("bias_correction2", bias_correction2_lbn); } - } else if (optimizer_conf.has_adagrad_conf()) { - const AdagradModelUpdateConf& adagrad_conf = optimizer_conf.adagrad_conf(); - state_constant_init_values.push_back(adagrad_conf.initial_accumulator_value()); - embedding_update_op_builder.OpTypeName("adagrad_embedding_update") - .Input("train_step", train_conf.train_step_lbn()) - .Attr("lr_decay", adagrad_conf.lr_decay()) - .Attr("epsilon", adagrad_conf.epsilon()); } else { UNIMPLEMENTED(); } - MakeConstantInitializerAttr(embedding_size, line_size, state_constant_init_values, - state_initializer); - embedding_update_op_builder.Input("num_unique_ids", num_unique_ids_lbn) + embedding_update_op_builder.Input("num_unique_ids", AddIdentityOp(num_unique_ids_lbn)) 
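The removed MakeConstantInitializerAttr serializes one constant initializer per optimizer-state slot beyond the embedding itself (num_states = line_size / embedding_size - 1) into a JSON array, e.g. a single accumulator entry for Adagrad. A condensed sketch of the same serialization, assuming nlohmann::json as in the removed code and omitting the size check it performs:

#include <cstdint>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

// Produces a string like: [{"type":"constant","value":0.1}]
std::string ConstantStateInitializers(int64_t embedding_size, int64_t line_size,
                                      const std::vector<float>& values) {
  nlohmann::json initializers = nlohmann::json::array();
  const int32_t num_states = static_cast<int32_t>(line_size / embedding_size) - 1;
  for (int32_t i = 0; i < num_states; ++i) {
    nlohmann::json initializer;
    initializer["type"] = "constant";
    initializer["value"] = values.empty() ? 0.0f : values.at(i);
    initializers.push_back(initializer);
  }
  return initializers.dump();
}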
.Input("unique_embeddings", unique_values_lbn) .Input("embedding_grad", embedding_grad_lbn) .Input("learning_rate", learning_rate_lbn) - .Attr("weight_decay", optimizer_conf.weight_decay_conf().weight_decay_rate()) - .Attr("l1", l1) - .Attr("l2", l2) .Output("updated_unique_embeddings"); double scale = GetLossInstanceNumScaleFactor(op_graph, job_builder); if (train_conf.has_dynamic_loss_scale_policy()) { @@ -509,8 +492,8 @@ void BuildEmbeddingUpdate(JobPassCtx* ctx, const OpGraph& op_graph, JobBuilder* + "_embedding_put"); user_op::UserOpConfWrapper embedding_put_op = embedding_put_op_builder.OpTypeName("embedding_put") - .Input("num_unique_ids", num_unique_ids_lbn) - .Input("unique_ids", unique_ids_lbn) + .Input("num_unique_ids", AddIdentityOp(num_unique_ids_lbn)) + .Input("unique_ids", AddIdentityOp(unique_ids_lbn)) .Input("unique_embeddings", embedding_update_op.output("updated_unique_embeddings", 0)) .Attr("embedding_name", embedding_name) .ScopeSymbolId(embedding_op.op_conf().scope_symbol_id()) @@ -536,7 +519,6 @@ void UpdateConsumerOpConf(const OpNode* consumer, const LogicalBlobId& out, } } } - } // namespace class ReplaceEmbeddingOps final : public JobPass { @@ -581,28 +563,23 @@ Maybe ReplaceEmbeddingOps::Apply(const OpGraph& op_graph, JobBuilder* job_ std::string inner_inverse_unique_partition_indices_lbn; std::string num_unique_ids_lbn; std::string unique_ids_lbn; - std::string unique_table_ids_lbn; + std::string unique_columns_lbn; std::string inverse_indices_lbn; std::string num_unique_matrix_lbn; BuildIdShuffle(use_system_gather, options.Name(), embedding_op, &add_ops, &inner_inverse_unique_partition_indices_lbn, &num_unique_ids_lbn, - &unique_ids_lbn, &unique_table_ids_lbn, &inverse_indices_lbn, + &unique_ids_lbn, &unique_columns_lbn, &inverse_indices_lbn, &num_unique_matrix_lbn); - const bool is_train_job = job_builder->job().job_conf().has_train_conf(); - const bool no_optimizer_states = (embedding_size == options.LineSize()); - const bool has_embedding_prefetch = - (!options.IsFullCache()) && (is_train_job || no_optimizer_states); - OperatorConf embedding_prefetch_op_conf; - OperatorConf embedding_lookup_op_conf; + bool has_embedding_prefetch = (!options.IsFullCache()) ? 
true : false; + // embedding lookup op std::string embedding_lbn, unique_values_lbn; BuildEmbeddingLookup(ctx, job_builder, embedding_size, options.LineSize(), options.Name(), has_embedding_prefetch, op_node->parallel_desc().parallel_conf(), - embedding_op, num_unique_ids_lbn, unique_ids_lbn, unique_table_ids_lbn, - &embedding_lbn, &unique_values_lbn, &embedding_prefetch_op_conf, - &embedding_lookup_op_conf); + embedding_op, num_unique_ids_lbn, unique_ids_lbn, unique_columns_lbn, + &embedding_lbn, &unique_values_lbn); if (use_system_gather) { user_op::UserOpConfWrapperBuilder gather_op_builder(embedding_op.op_name() + "_gather"); @@ -641,7 +618,6 @@ Maybe ReplaceEmbeddingOps::Apply(const OpGraph& op_graph, JobBuilder* job_ } } - std::string state_initializer; // find update op const OpNode* producer = op_graph.OpNode4OpName(GenLogicalBlobId(embedding_op.input("ids", 0)).op_name()); @@ -683,43 +659,16 @@ Maybe ReplaceEmbeddingOps::Apply(const OpGraph& op_graph, JobBuilder* job_ if (found_embedding_optimizer == true) { break; } } CHECK_EQ(found_embedding_optimizer, true); - - const OpNode* shadow_node = op_graph.OpNode4OpName(shadow_op_name); - const VariableOpConf& shadow_variable_conf = shadow_node->op().op_conf().variable_conf(); - float l1 = 0.0; - float l2 = 0.0; - if (shadow_variable_conf.has_regularizer()) { - const RegularizerConf& regularizer_conf = shadow_variable_conf.regularizer(); - if (regularizer_conf.has_l1_l2_conf()) { - l1 = regularizer_conf.l1_l2_conf().l1(); - l2 = regularizer_conf.l1_l2_conf().l2(); - } - } const std::string& learning_rate_lbn = AddScheduleOp(op_graph, job_builder, embedding_optimizer_conf, "System-Train-LearningRate-Scheduler_" + NewUniqueId()); BuildEmbeddingUpdate(ctx, op_graph, job_builder, op_node->parallel_desc().parallel_conf(), - embedding_size, options.LineSize(), l1, l2, options.Name(), - embedding_optimizer_conf, embedding_op, num_unique_ids_lbn, - unique_ids_lbn, unique_values_lbn, embedding_grad_lbn, - learning_rate_lbn, &state_initializer); + embedding_size, options.Name(), embedding_optimizer_conf, embedding_op, + num_unique_ids_lbn, unique_ids_lbn, unique_values_lbn, + embedding_grad_lbn, learning_rate_lbn); } } - if ((state_initializer == "") && !no_optimizer_states) { - CHECK(!is_train_job) << "train job must have set state initializer"; - MakeConstantInitializerAttr(embedding_size, options.LineSize(), {}, &state_initializer); - } - auto state_initializer_attr = ::oneflow::AttrValue(); - state_initializer_attr.set_at_string(state_initializer); - if (has_embedding_prefetch) { - (*(embedding_prefetch_op_conf.mutable_user_conf()->mutable_attr()))["state_initializer"] = - state_initializer_attr; - add_ops.push_back(embedding_prefetch_op_conf); - } - (*(embedding_lookup_op_conf.mutable_user_conf()->mutable_attr()))["state_initializer"] = - state_initializer_attr; - add_ops.push_back(embedding_lookup_op_conf); job_builder->DelOps(delete_op_names); job_builder->AddOps(op_node->parallel_desc().parallel_conf(), add_ops); }); diff --git a/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp b/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp index 19851e21852..1d621ec91fd 100644 --- a/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp +++ b/oneflow/core/job_rewriter/split_sparse_softmax_cross_entropy_op_pass.cpp @@ -50,8 +50,6 @@ class SplitSparseSoftmaxCrossEntropyOpPass final : public JobPass { Maybe SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, JobBuilder* 
job_builder) const { - std::vector to_del_op_names; - HashMap consumer_op_name2op_confs; op_graph.ForEachNode([&](const OpNode* node) { const OperatorConf& op_conf = node->op().op_conf(); if (!op_conf.has_user_conf()) { return; } @@ -59,13 +57,14 @@ Maybe SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, const int64_t scope_symbol_id = node->op().op_conf().scope_symbol_id(); user_op::UserOpConfWrapper cur_op(op_conf); - const std::string& op_prediction_blob_name = cur_op.input("prediction", 0); - const std::string& op_label_blob_name = cur_op.input("label", 0); + const std::string op_prediction_blob_name = cur_op.input("prediction", 0); + const std::string op_label_blob_name = cur_op.input("label", 0); + const int64_t depth = cur_op.attr("depth"); const int32_t split_axis = node->LogicalBlobDesc4Lbi(node->op().BnInOp2Lbi("prediction_0")).shape().NumAxes() - 1; const std::vector axis_vec(1, split_axis); - const std::string& op_name = node->op().op_name(); + std::string op_name = node->op().op_name(); const auto& prediction_nd_sbp = node->NdSbp4BnInOp("prediction_0"); NdSbp stat_distribution_for_consumer; @@ -84,7 +83,6 @@ Maybe SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, } if (!has_split_axis_parallel) { return; } - to_del_op_names.push_back(op_name); auto reduce_max_device_stage_op = user_op::UserOpConfWrapperBuilder(op_name + "-split_softmax_reduce_max_device_stage") @@ -182,7 +180,6 @@ Maybe SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, } else { reduce_sum_op_out = reduce_sum_op.output("output_tensor", 0); } - auto broadcast_div_op = user_op::UserOpConfWrapperBuilder(op_name + "-split_softmax_div") .Op("broadcast_div") .Input("x", exp_op.output("y", 0)) @@ -191,65 +188,19 @@ Maybe SplitSparseSoftmaxCrossEntropyOpPass::Apply(const OpGraph& op_graph, .ScopeSymbolId(scope_symbol_id) .Build(); job_builder->AddOps(node->parallel_desc().parallel_conf(), {broadcast_div_op.op_conf()}); - - auto log_op = user_op::UserOpConfWrapperBuilder(op_name + "-log") - .Op("log") - .Input("x", reduce_sum_op_out) - .Output("y") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(node->parallel_desc().parallel_conf(), {log_op.op_conf()}); - - auto broadcast_sub_op = user_op::UserOpConfWrapperBuilder(op_name + "-broadcast_add") - .Op("broadcast_sub") - .Input("x", broadcast_sub_max_op.output("z", 0)) - .Input("y", log_op.output("y", 0)) - .Output("z") - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(node->parallel_desc().parallel_conf(), {broadcast_sub_op.op_conf()}); - - auto nll_op = user_op::UserOpConfWrapperBuilder(op_name + "-nll") - .Op("nll") - .Input("input", broadcast_sub_op.output("z", 0)) - .Input("target", op_label_blob_name) - .Output("out") - .Output("total_weight") - .Attr("ignore_index", -100) - .ScopeSymbolId(scope_symbol_id) - .Build(); - job_builder->AddOps(node->parallel_desc().parallel_conf(), {nll_op.op_conf()}); - - const std::string& prob_lbn = cur_op.output("prob", 0); - const std::string& out_lbn = cur_op.output("out", 0); - const std::string& new_prob_lbn = broadcast_div_op.output("z", 0); - const std::string& new_out_lbn = nll_op.output("out", 0); - - for (const OpEdge* out_edge : node->out_edges()) { - const OpNode* consumer = out_edge->dst_node(); - const std::string& consumer_op_name = consumer->op().op_name(); - if (consumer_op_name2op_confs.find(consumer_op_name) == consumer_op_name2op_confs.end()) { - consumer_op_name2op_confs[consumer_op_name] = consumer->op().op_conf(); - } - 
OperatorConf& consumer_op_conf = consumer_op_name2op_confs[consumer_op_name]; - for (const std::string& ibn : consumer->op().input_bns()) { - const std::string& input_lbn = GenLogicalBlobName(consumer->op().BnInOp2Lbi(ibn)); - if (input_lbn == prob_lbn) { - const auto& old_lbn = - ReplaceInputLbnInOpCustomizedConf(&consumer_op_conf, ibn, new_prob_lbn); - CHECK_EQ(old_lbn, prob_lbn); - } else if (input_lbn == out_lbn) { - const auto& old_lbn = - ReplaceInputLbnInOpCustomizedConf(&consumer_op_conf, ibn, new_out_lbn); - CHECK_EQ(old_lbn, out_lbn); - } else { - // does not care - } - } - } + UpdateProbConsumerOpConf(broadcast_div_op.output("z", 0), node, job_builder); + + auto sparse_cross_entropy_ms_op = user_op::UserOpConfWrapperBuilder(op_name) + .Op("sparse_cross_entropy_ms") + .Input("prediction", broadcast_div_op.output("z", 0)) + .Input("label", op_label_blob_name) + .Output("out") + .Attr("depth", depth) + .ScopeSymbolId(scope_symbol_id) + .Build(); + + job_builder->MutOpsOnlyOnce({sparse_cross_entropy_ms_op.op_conf()}); }); - for (const auto& pair : consumer_op_name2op_confs) { job_builder->MutOpsOnlyOnce({pair.second}); } - job_builder->DelOps(to_del_op_names); return Maybe::Ok(); } diff --git a/oneflow/core/kernel/cuda_check_numerics_kernel_observer.cu b/oneflow/core/kernel/cuda_check_numerics_kernel_observer.cu index 1d7a032e46c..9314e43ac62 100644 --- a/oneflow/core/kernel/cuda_check_numerics_kernel_observer.cu +++ b/oneflow/core/kernel/cuda_check_numerics_kernel_observer.cu @@ -66,7 +66,6 @@ bool HasNotFiniteGpu(ep::Stream* stream, const Blob* blob, bool* has_not_finite_ auto* cuda_stream = stream->As(); const DataType dtype = blob->data_type(); const int64_t elem_cnt = blob->shape().elem_cnt(); - if (elem_cnt == 0) { return false; } if (dtype == kFloat) { return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, has_not_finite_device); diff --git a/oneflow/core/operator/operator.cpp b/oneflow/core/operator/operator.cpp index 86f4ac5066d..87a6dde4a82 100644 --- a/oneflow/core/operator/operator.cpp +++ b/oneflow/core/operator/operator.cpp @@ -845,9 +845,7 @@ Maybe Operator::InferNdSbpSignature( for (const auto& ibn : input_bns()) { const NdSbpInferHint* hint = JUST(NdSbpInferHint4Ibn(ibn)); if (hint->nd_sbp().sbp_parallel_size() != 1) { - CHECK_OR_RETURN(Is1dSbp(hint->nd_sbp()) || hint->parallel_desc().parallel_num() == 1) - << op_name() << ", " << *JUST(PlacementToString(hint->parallel_desc())) << ", " - << NdSbpToString(hint->nd_sbp()); + CHECK_OR_RETURN(Is1dSbp(hint->nd_sbp()) || hint->parallel_desc().parallel_num() == 1); } ibn2sbp_infer_hint.emplace(ibn, SbpInferHint(&hint->parallel_desc(), &hint->logical_blob_desc(), diff --git a/oneflow/core/profiler/kernel.h b/oneflow/core/profiler/kernel.h index 58f391f2aed..6f42b283329 100644 --- a/oneflow/core/profiler/kernel.h +++ b/oneflow/core/profiler/kernel.h @@ -21,7 +21,7 @@ limitations under the License. namespace oneflow { class Kernel; -class KernelContext; +struct KernelContext; class Blob; namespace profiler { diff --git a/oneflow/core/thread/thread.cpp b/oneflow/core/thread/thread.cpp index 0af7609a878..d394bf505c7 100644 --- a/oneflow/core/thread/thread.cpp +++ b/oneflow/core/thread/thread.cpp @@ -60,9 +60,7 @@ void Thread::PollMsgChannel() { local_msg_queue_.pop(); if (msg.msg_type() == ActorMsgType::kCmdMsg) { if (msg.actor_cmd() == ActorCmd::kStopThread) { - CHECK(id2actor_ptr_.empty()) - << " RuntimeError! 
Thread: " << thrd_id_ - << " NOT empty when stop with actor num: " << id2actor_ptr_.size(); + CHECK(id2actor_ptr_.empty()); break; } else if (msg.actor_cmd() == ActorCmd::kConstructActor) { ConstructActor(msg.dst_actor_id()); diff --git a/oneflow/core/thread/thread_manager.cpp b/oneflow/core/thread/thread_manager.cpp index cad421fdbb3..16db1d8f151 100644 --- a/oneflow/core/thread/thread_manager.cpp +++ b/oneflow/core/thread/thread_manager.cpp @@ -17,6 +17,7 @@ limitations under the License. #include "oneflow/core/job/resource_desc.h" #include "oneflow/core/job/global_for.h" #include "oneflow/core/control/global_process_ctx.h" +#include "oneflow/core/job/global_for.h" namespace oneflow { @@ -25,47 +26,27 @@ ThreadMgr::~ThreadMgr() { ActorMsg msg = ActorMsg::BuildCommandMsg(-1, ActorCmd::kStopThread); thread_pair.second->GetMsgChannelPtr()->Send(msg); thread_pair.second.reset(); - VLOG(1) << " Actor thread: " << thread_pair.first << " finished when process exits."; + VLOG(3) << "actor thread " << thread_pair.first << " finish"; } } Thread* ThreadMgr::GetThrd(int64_t thrd_id) { auto iter = threads_.find(thrd_id); - CHECK(iter != threads_.end()) << " Thread: " << thrd_id << " not found"; + CHECK(iter != threads_.end()) << "thread " << thrd_id << " not found"; return iter->second.get(); } -void ThreadMgr::AddThreads(const HashSet& thread_ids) { +void ThreadMgr::AddPlan(const Plan& plan) { const int64_t this_rank = GlobalProcessCtx::Rank(); - for (int64_t thrd_id : thread_ids) { - const auto& it = threads_.find(thrd_id); - if (it != threads_.end()) { - // NOTE(chengcheng): check thread is not null. - CHECK(it->second) << " RuntimeError! Thread: " << thrd_id << " in manager must be NOT null."; - VLOG(1) << " Actor thread: " << thrd_id << " reused."; - continue; - } - StreamId stream_id = DecodeStreamIdFromInt64(thrd_id); + for (const TaskProto& task : plan.task()) { + TaskId task_id = DecodeTaskIdFromInt64(task.task_id()); + StreamId stream_id = task_id.stream_id(); if (stream_id.rank() != this_rank) { continue; } + int64_t thrd_id = EncodeStreamIdToInt64(stream_id); + if (threads_.find(thrd_id) != threads_.end()) { continue; } Thread* thread = new Thread(stream_id); CHECK_NOTNULL(thread); threads_[thrd_id].reset(thread); - VLOG(1) << " Actor thread: " << thrd_id << " created."; - } -} - -void ThreadMgr::DeleteThreads(const HashSet& thread_ids) { - std::unique_lock lock(mutex4del_threads_); - for (int64_t thrd_id : thread_ids) { - const auto& it = threads_.find(thrd_id); - CHECK((it != threads_.end()) && (it->second)) - << " RuntimeError! Actor thread: " << thrd_id << " non-existent but want to delete"; - auto& thread = it->second; - ActorMsg msg = ActorMsg::BuildCommandMsg(-1, ActorCmd::kStopThread); - thread->GetMsgChannelPtr()->Send(msg); - thread.reset(); - VLOG(1) << " Actor thread: " << thrd_id << " finished when the graph is destructed."; - threads_.erase(it); } } diff --git a/oneflow/core/thread/thread_manager.h b/oneflow/core/thread/thread_manager.h index 7d562a8b756..39602dd8681 100644 --- a/oneflow/core/thread/thread_manager.h +++ b/oneflow/core/thread/thread_manager.h @@ -16,7 +16,6 @@ limitations under the License. 
#ifndef ONEFLOW_CORE_THREAD_THREAD_MANAGER_H_ #define ONEFLOW_CORE_THREAD_THREAD_MANAGER_H_ -#include #include "oneflow/core/common/channel.h" #include "oneflow/core/common/protobuf.h" #include "oneflow/core/common/auto_registration_factory.h" @@ -37,15 +36,13 @@ class ThreadMgr final { ThreadMgr() = default; ~ThreadMgr(); - void AddThreads(const HashSet& thread_ids); - void DeleteThreads(const HashSet& thread_ids); + void AddPlan(const Plan& plan); Thread* GetThrd(int64_t thrd_id); private: friend class Global; HashMap> threads_; - std::mutex mutex4del_threads_; }; void SingleThreadLoop(size_t num, std::function Callback); diff --git a/oneflow/core/vm/control_stream_type.h b/oneflow/core/vm/control_stream_type.h index a5e66dcd6a5..3e1940acdf6 100644 --- a/oneflow/core/vm/control_stream_type.h +++ b/oneflow/core/vm/control_stream_type.h @@ -22,7 +22,7 @@ limitations under the License. namespace oneflow { namespace vm { -class InstructionMsg; +struct InstructionMsg; class ControlStreamType final : public StreamType { public: diff --git a/oneflow/core/vm/instruction.h b/oneflow/core/vm/instruction.h index 2180fdc3bdf..303dc6c5ad9 100644 --- a/oneflow/core/vm/instruction.h +++ b/oneflow/core/vm/instruction.h @@ -100,7 +100,7 @@ FLAT_MSG_BEGIN(InstructionStatusBuffer); FLAT_MSG_END(InstructionStatusBuffer); // clang-format on -class Instruction; +struct Instruction; class InstructionEdge final : public intrusive::Base, public intrusive::EnableObjectPool; diff --git a/oneflow/core/vm/stream.h b/oneflow/core/vm/stream.h index 3e1936f5b2d..85e65672f03 100644 --- a/oneflow/core/vm/stream.h +++ b/oneflow/core/vm/stream.h @@ -23,7 +23,7 @@ limitations under the License. namespace oneflow { namespace vm { -class ThreadCtx; +struct ThreadCtx; class Stream final : public intrusive::Base { public: diff --git a/oneflow/core/vm/stream_runtime_desc.h b/oneflow/core/vm/stream_runtime_desc.h index 6e7aa400c55..f889884400a 100644 --- a/oneflow/core/vm/stream_runtime_desc.h +++ b/oneflow/core/vm/stream_runtime_desc.h @@ -23,7 +23,7 @@ namespace oneflow { namespace vm { class StreamType; -class StreamDesc; +struct StreamDesc; // Rt is short for Runtime class StreamRtDesc final : public intrusive::Base { diff --git a/oneflow/core/vm/stream_type.h b/oneflow/core/vm/stream_type.h index 8fee7b6054d..50c56bd15c6 100644 --- a/oneflow/core/vm/stream_type.h +++ b/oneflow/core/vm/stream_type.h @@ -28,10 +28,10 @@ namespace oneflow { namespace vm { -class Stream; +struct Stream; struct InstructionStatusBuffer; -class Instruction; -class InstructionMsg; +struct Instruction; +struct InstructionMsg; class InstructionType; class StreamType { diff --git a/oneflow/core/vm/virtual_machine_engine.h b/oneflow/core/vm/virtual_machine_engine.h index 374f92cef91..6d230f1d6b4 100644 --- a/oneflow/core/vm/virtual_machine_engine.h +++ b/oneflow/core/vm/virtual_machine_engine.h @@ -46,7 +46,7 @@ class ScheduleCtx { virtual void OnWorkerLoadPending(vm::ThreadCtx* thread_ctx) const = 0; }; -class VmDesc; +struct VmDesc; class VirtualMachineEngine final : public intrusive::Base { public: // types diff --git a/oneflow/core/vm/vm_object.h b/oneflow/core/vm/vm_object.h index cfc6b69a784..994971c7556 100644 --- a/oneflow/core/vm/vm_object.h +++ b/oneflow/core/vm/vm_object.h @@ -28,8 +28,8 @@ namespace oneflow { namespace vm { -class Instruction; -class MirroredObject; +struct Instruction; +struct MirroredObject; enum OperandAccessType { kConstOperandAccess = 0, diff --git a/oneflow/core/vm/vm_util.h b/oneflow/core/vm/vm_util.h index 
6b8445670fa..223213deb43 100644 --- a/oneflow/core/vm/vm_util.h +++ b/oneflow/core/vm/vm_util.h @@ -23,7 +23,7 @@ limitations under the License. namespace oneflow { namespace vm { -class InstructionMsg; +struct InstructionMsg; Maybe Run(vm::InstructionMsgList* instr_msg_list); Maybe ClusterSync(); diff --git a/oneflow/ir/include/OneFlow/OneFlowBase.td b/oneflow/ir/include/OneFlow/OneFlowBase.td index 22d99134aeb..72f3f0f5c6f 100644 --- a/oneflow/ir/include/OneFlow/OneFlowBase.td +++ b/oneflow/ir/include/OneFlow/OneFlowBase.td @@ -257,6 +257,7 @@ class OneFlow_MaxPoolGradBaseOp traits = []> : let summary = "OneFlow Max Pooling Grad operation"; let input = (ins AnyType:$x, + AnyType:$y, AnyType:$indice, AnyType:$dy ); @@ -283,6 +284,7 @@ class OneFlow_AvgPoolGradBaseOp traits = []> : let summary = "OneFlow Average Pooling Grad operation"; let input = (ins AnyType:$x, + AnyType:$y, AnyType:$dy ); let output = (outs diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index e082caeea93..1a767be37cc 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -211,8 +211,8 @@ class OneFlow_NormalizationAddReluBaseOp : OneFlow_BaseOp<"normalization_add_rel #endif // GET_ONEFLOW_BASE_OP_DEFINITIONS // Group: BINARY -// bias_add, cast_like, celu_grad, diag_grad, diagonal_grad, dot, dropout_grad, elementwise_maximum, elementwise_minimum, elu_grad, floordiv, gelu_grad, grid_sample, hardsigmoid_grad, hardshrink_grad, hardswish_grad, l1_l2_regularize_gradient, leaky_relu_grad, masked_fill, mish_grad, multiply, narrow_grad, pow, prelu, relu_grad, selu_grad, sigmoid_grad, silu_grad, softshrink_grad, threshold_grad, tf_prelu, unfold_tensor_grad, xdivy, xlogy -// Total: 34 +// bias_add, cast_like, celu_grad, diag_grad, diagonal_grad, dot, dropout_grad, elementwise_maximum, elementwise_minimum, elu_grad, floordiv, gelu_grad, grid_sample, hardsigmoid_grad, hardswish_grad, l1_l2_regularize_gradient, leaky_relu_grad, masked_fill, mish_grad, multiply, narrow_grad, pow, prelu, relu_grad, selu_grad, sigmoid_grad, silu_grad, softshrink_grad, threshold_grad, tf_prelu, unfold_tensor_grad, xdivy, xlogy +// Total: 33 #ifdef GET_ONEFLOW_BINARY_OP_DEFINITIONS @@ -438,23 +438,6 @@ def OneFlow_HardsigmoidGradOp : OneFlow_BaseOp<"hardsigmoid_grad", [NoSideEffect let has_data_type_infer_fn = 1; } -def OneFlow_HardShrinkGradOp : OneFlow_BaseOp<"hardshrink_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$y, - OneFlow_Tensor:$dy - ); - let output = (outs - OneFlow_Tensor:$dx - ); - let attrs = (ins - DefaultValuedAttr:$lambd - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - def OneFlow_HardswishGradOp : OneFlow_BaseOp<"hardswish_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$x, @@ -4672,71 +4655,6 @@ def OneFlow_MultiSquareSumOp : OneFlow_BaseOp<"multi_square_sum", [NoSideEffect, let has_data_type_infer_fn = 1; } -def OneFlow_MultiReduceSumPowAbsOp : OneFlow_BaseOp<"multi_reduce_sum_pow_abs", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - Variadic:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let attrs = (ins - DefaultValuedAttr:$p - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_data_type_infer_fn = 1; - let has_get_sbp_fn = 1; -} - -def OneFlow_MultiReduceMaxAbsOp : 
OneFlow_BaseOp<"multi_reduce_max_abs", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - Variadic:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_data_type_infer_fn = 1; - let has_get_sbp_fn = 1; -} - -def OneFlow_MultiReduceMinAbsOp : OneFlow_BaseOp<"multi_reduce_min_abs", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - Variadic:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_data_type_infer_fn = 1; - let has_get_sbp_fn = 1; -} - -def OneFlow_LocalMultiReduceMaxAbsOp : OneFlow_BaseOp<"local_multi_reduce_max_abs", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - Variadic:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_data_type_infer_fn = 1; - let has_get_sbp_fn = 1; -} - -def OneFlow_LocalMultiReduceMinAbsOp : OneFlow_BaseOp<"local_multi_reduce_min_abs", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - Variadic:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_data_type_infer_fn = 1; - let has_get_sbp_fn = 1; -} - def OneFlow_NllOp : OneFlow_BaseOp<"nll", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$input, @@ -7714,8 +7632,8 @@ def OneFlow_TestUserOpAttrAutoTypeOp : OneFlow_BaseOp<"test_user_op_attr_auto_ty #endif // GET_ONEFLOW_TEST_OP_DEFINITIONS // Group: TRIGONOMETRIC -// acos, acos_grad, acosh, acosh_grad, asin, asin_grad, asinh, asinh_grad, atan, atan2, atan2_x_grad, atan2_y_grad, atan_grad, atanh, atanh_grad, cos, cos_grad, cosh, cosh_grad, hardtanh, hardtanh_grad, sin, sin_grad, sinh, sinh_grad, tan, tan_grad, tanh, tanh_grad, not_equal_zero, not_equal_zero_grad -// Total: 31 +// acos, acos_grad, acosh, acosh_grad, asin, asin_grad, asinh, asinh_grad, atan, atan2, atan2_x_grad, atan2_y_grad, atan_grad, atanh, atanh_grad, cos, cos_grad, cosh, cosh_grad, hardtanh, hardtanh_grad, sin, sin_grad, sinh, sinh_grad, tan, tan_grad, tanh, tanh_grad +// Total: 29 #ifdef GET_ONEFLOW_TRIGONOMETRIC_OP_DEFINITIONS @@ -8122,37 +8040,10 @@ def OneFlow_TanhGradOp : OneFlow_BaseOp<"tanh_grad", [NoSideEffect, DeclareOpInt let has_data_type_infer_fn = 1; } -def OneFlow_NotEqualZeroOp : OneFlow_BaseOp<"not_equal_zero", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$x - ); - let output = (outs - OneFlow_Tensor:$y - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - -def OneFlow_NotEqualZeroGradOp : OneFlow_BaseOp<"not_equal_zero_grad", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$x, - OneFlow_Tensor:$dy - ); - let output = (outs - OneFlow_Tensor:$dx - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - #endif // GET_ONEFLOW_TRIGONOMETRIC_OP_DEFINITIONS // Group: UNARY -// acc, affine_grid, affine_grid_grad, bernoulli, cast, cast_to_static_shape, cast_to_tick, celu, copy, count_not_finite, diag, diagonal, elu, expand, expand_dims, flatten, flip, flip_grad, fold, gelu, hardsigmoid, hardshrink, hardswish, leaky_relu, log2, logical_not, mish, narrow, one_hot, pack, random_mask_like, repeat, roll, selu, 
sigmoid, silu, softshrink, softsign, sort, square_sum, squeeze, threshold, transpose, tril, triu, unfold, unfold_tensor, unpack, zero_like, to_contiguous, isnan, isinf +// acc, affine_grid, affine_grid_grad, bernoulli, cast, cast_to_static_shape, cast_to_tick, celu, copy, count_not_finite, diag, diagonal, elu, expand, expand_dims, flatten, flip, flip_grad, fold, gelu, hardsigmoid, hardswish, leaky_relu, log2, logical_not, mish, narrow, one_hot, pack, random_mask_like, repeat, roll, selu, sigmoid, silu, softshrink, softsign, sort, square_sum, squeeze, threshold, transpose, tril, triu, unfold, unfold_tensor, unpack, zero_like, to_contiguous, isnan, isinf // Total: 51 #ifdef GET_ONEFLOW_UNARY_OP_DEFINITIONS @@ -8495,22 +8386,6 @@ def OneFlow_HardsigmoidOp : OneFlow_BaseOp<"hardsigmoid", [NoSideEffect, Declare let has_data_type_infer_fn = 1; } -def OneFlow_HardShrinkOp : OneFlow_BaseOp<"hardshrink", [NoSideEffect, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$in - ); - let output = (outs - OneFlow_Tensor:$out - ); - let attrs = (ins - DefaultValuedAttr:$lambd - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - def OneFlow_HardswishOp : OneFlow_BaseOp<"hardswish", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$in @@ -9373,7 +9248,7 @@ def OneFlow_EmbeddingLookupPlaceholderOp : OneFlow_BaseOp<"embedding_lookup_plac let input = (ins OneFlow_Tensor:$shadow, OneFlow_Tensor:$ids, - Optional:$table_ids + Optional:$column_ids ); let output = (outs OneFlow_Tensor:$embeddings @@ -9381,8 +9256,8 @@ def OneFlow_EmbeddingLookupPlaceholderOp : OneFlow_BaseOp<"embedding_lookup_plac let attrs = (ins OneFlow_DataType:$dtype, DefaultValuedAttr:$embedding_size, - DefaultValuedAttr:$num_tables, - StrAttr:$embedding_tables, + DefaultValuedAttr:$num_columns, + StrAttr:$embedding_columns, StrAttr:$key_value_store_options ); let has_logical_tensor_desc_infer_fn = 1; @@ -9418,7 +9293,7 @@ def OneFlow_UniqueKeyValuePairOp : OneFlow_BaseOp<"unique_key_value_pair", [NoSi OneFlow_Tensor:$inverse_indices ); let attrs = (ins - DefaultValuedAttr:$num_tables + DefaultValuedAttr:$num_columns ); let same_output_regst_num = 1; let has_logical_tensor_desc_infer_fn = 1; @@ -9430,20 +9305,20 @@ def OneFlow_UniqueKeyValuePairOp : OneFlow_BaseOp<"unique_key_value_pair", [NoSi def OneFlow_IdShuffleOp : OneFlow_BaseOp<"id_shuffle", [NoSideEffect, DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$ids, - Optional:$table_ids + Optional:$column_ids ); let output = (outs OneFlow_Tensor:$num_unique_matrix, OneFlow_Tensor:$inverse_unique_partition_indices, OneFlow_Tensor:$cur_rank_num_unique, OneFlow_Tensor:$cur_rank_unique_ids, - OneFlow_Tensor:$cur_rank_unique_table_ids, + OneFlow_Tensor:$cur_rank_unique_column_ids, OneFlow_Tensor:$cur_rank_inverse_indices ); let attrs = (ins - DefaultValuedAttr:$num_tables + DefaultValuedAttr:$num_columns ); - let same_output_regst_num = 2; + let same_output_regst_num = 1; let has_logical_tensor_desc_infer_fn = 1; let has_physical_tensor_desc_infer_fn = 1; let has_get_sbp_fn = 1; @@ -9488,7 +9363,7 @@ def OneFlow_EmbeddingPrefetchOp : OneFlow_BaseOp<"embedding_prefetch", [NoSideEf let input = (ins OneFlow_Tensor:$num_unique_ids, OneFlow_Tensor:$unique_ids, - OneFlow_Tensor:$table_ids + OneFlow_Tensor:$column_ids ); let output = (outs OneFlow_Tensor:$context //no practical sense, control lookup run after prefetch. 
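Note: the state_initializer string attribute removed from the embedding prefetch/lookup ops below was produced by the MakeConstantInitializerAttr helper that this revert also deletes from replace_embedding_ops_pass.cpp. A minimal standalone sketch of that JSON serialization, assuming float init values and that nlohmann/json.hpp is available (the inputs in main are illustrative, not taken from a real job):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

// One constant initializer per optimizer-state slot; there are line_size / embedding_size - 1 slots.
std::string MakeConstantInitializerAttrSketch(int64_t embedding_size, int64_t line_size,
                                              const std::vector<float>& values) {
  if (embedding_size == line_size) { return ""; }  // no optimizer states to initialize
  const int32_t num_states = line_size / embedding_size - 1;
  assert(num_states > 0);
  assert(values.empty() || static_cast<int32_t>(values.size()) == num_states);
  nlohmann::json initializers = nlohmann::json::array();
  for (int32_t i = 0; i < num_states; ++i) {
    nlohmann::json initializer;
    initializer["type"] = "constant";
    initializer["value"] = values.empty() ? 0.0f : values.at(i);
    initializers.push_back(initializer);
  }
  return initializers.dump();
}

int main() {
  // e.g. one accumulator state initialized to 0.5 -> [{"type":"constant","value":0.5}]
  std::cout << MakeConstantInitializerAttrSketch(128, 256, {0.5f}) << std::endl;
}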
@@ -9497,8 +9372,7 @@ def OneFlow_EmbeddingPrefetchOp : OneFlow_BaseOp<"embedding_prefetch", [NoSideEf DefaultValuedAttr:$line_size, DefaultValuedAttr:$embedding_size, StrAttr:$embedding_name, - StrAttr:$embedding_tables, - StrAttr:$state_initializer + StrAttr:$embedding_columns ); let same_output_regst_num = 1; let has_logical_tensor_desc_infer_fn = 1; @@ -9511,7 +9385,7 @@ def OneFlow_EmbeddingLookupOp : OneFlow_BaseOp<"embedding_lookup", [NoSideEffect let input = (ins OneFlow_Tensor:$num_unique_ids, OneFlow_Tensor:$unique_ids, - OneFlow_Tensor:$table_ids, + OneFlow_Tensor:$column_ids, Optional:$context ); let output = (outs @@ -9524,8 +9398,7 @@ def OneFlow_EmbeddingLookupOp : OneFlow_BaseOp<"embedding_lookup", [NoSideEffect DefaultValuedAttr:$line_size, DefaultValuedAttr:$embedding_size, StrAttr:$embedding_name, - StrAttr:$embedding_tables, - StrAttr:$state_initializer + StrAttr:$embedding_columns ); let same_output_regst_num = 1; let has_logical_tensor_desc_infer_fn = 1; @@ -9547,10 +9420,7 @@ def OneFlow_SgdEmbeddingUpdateOp : OneFlow_BaseOp<"sgd_embedding_update", [AttrS OneFlow_Tensor:$updated_unique_embeddings ); let attrs = (ins - DefaultValuedAttr:$scale, - DefaultValuedAttr:$l1, - DefaultValuedAttr:$l2, - DefaultValuedAttr:$weight_decay + DefaultValuedAttr:$scale ); let same_output_regst_num = 1; let has_logical_tensor_desc_infer_fn = 1; @@ -9573,9 +9443,6 @@ def OneFlow_MomentumEmbeddingUpdateOp : OneFlow_BaseOp<"momentum_embedding_updat ); let attrs = (ins DefaultValuedAttr:$scale, - DefaultValuedAttr:$l1, - DefaultValuedAttr:$l2, - DefaultValuedAttr:$weight_decay, DefaultValuedAttr:$beta ); let same_output_regst_num = 1; @@ -9601,9 +9468,6 @@ def OneFlow_AdamEmbeddingUpdateOp : OneFlow_BaseOp<"adam_embedding_update", [Att ); let attrs = (ins DefaultValuedAttr:$scale, - DefaultValuedAttr:$l1, - DefaultValuedAttr:$l2, - DefaultValuedAttr:$weight_decay, DefaultValuedAttr:$beta1, DefaultValuedAttr:$beta2, DefaultValuedAttr:$epsilon, @@ -9616,34 +9480,6 @@ def OneFlow_AdamEmbeddingUpdateOp : OneFlow_BaseOp<"adam_embedding_update", [Att let has_data_type_infer_fn = 1; } -def OneFlow_AdagradEmbeddingUpdateOp : OneFlow_BaseOp<"adagrad_embedding_update", [AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$num_unique_ids, - OneFlow_Tensor:$unique_embeddings, - OneFlow_Tensor:$embedding_grad, - OneFlow_Tensor:$learning_rate, - OneFlow_Tensor:$train_step, - Optional:$down_scale_by_tensor, - Optional:$skip_if - ); - let output = (outs - OneFlow_Tensor:$updated_unique_embeddings - ); - let attrs = (ins - DefaultValuedAttr:$scale, - DefaultValuedAttr:$l1, - DefaultValuedAttr:$l2, - DefaultValuedAttr:$weight_decay, - DefaultValuedAttr:$lr_decay, - DefaultValuedAttr:$epsilon - ); - let same_output_regst_num = 1; - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - def OneFlow_EmbeddingPutOp : OneFlow_BaseOp<"embedding_put", [DeclareOpInterfaceMethods]> { let input = (ins OneFlow_Tensor:$num_unique_ids, @@ -9660,18 +9496,4 @@ def OneFlow_EmbeddingPutOp : OneFlow_BaseOp<"embedding_put", [DeclareOpInterface let has_data_type_infer_fn = 1; } -def OneFlow_RocAucScoreOp : OneFlow_BaseOp<"roc_auc_score", [NoSideEffect, NoGrad, DeclareOpInterfaceMethods]> { - let input = (ins - OneFlow_Tensor:$label, - OneFlow_Tensor:$pred - ); - let output = (outs - OneFlow_Tensor:$out - ); - let has_logical_tensor_desc_infer_fn = 1; - let has_physical_tensor_desc_infer_fn = 
1; - let has_get_sbp_fn = 1; - let has_data_type_infer_fn = 1; -} - #endif // GET_ONEFLOW_ONE_EMBEDDING_OP_DEFINITIONS diff --git a/oneflow/user/kernels/activation_kernels.cpp b/oneflow/user/kernels/activation_kernels.cpp index c0984d3925f..9c345d96dc7 100644 --- a/oneflow/user/kernels/activation_kernels.cpp +++ b/oneflow/user/kernels/activation_kernels.cpp @@ -22,7 +22,6 @@ namespace oneflow { REGISTER_CELU_KERNEL(DeviceType::kCPU, dtype); \ REGISTER_HARDSWISH_KERNEL(DeviceType::kCPU, dtype); \ REGISTER_HARDSIGMOID_KERNEL(DeviceType::kCPU, dtype); \ - REGISTER_HARDSHRINK_KERNEL(DeviceType::kCPU, dtype); \ REGISTER_HARDTANH_KERNEL(DeviceType::kCPU, dtype); \ REGISTER_MISH_KERNEL(DeviceType::kCPU, dtype); \ REGISTER_SILU_KERNEL(DeviceType::kCPU, dtype); \ diff --git a/oneflow/user/kernels/activation_kernels.cu b/oneflow/user/kernels/activation_kernels.cu index f1b8c3bf93b..db4a3eec89d 100644 --- a/oneflow/user/kernels/activation_kernels.cu +++ b/oneflow/user/kernels/activation_kernels.cu @@ -122,29 +122,6 @@ struct HardswishGradFunctor { } }; -template<> -struct HardShrinkFunctor { - OF_DEVICE_FUNC explicit HardShrinkFunctor(float lambd) - : lambd(lambd), float_functor(HardShrinkFunctor(lambd)) {} - OF_DEVICE_FUNC half operator()(half x) const { - return __float2half(float_functor(__half2float(x))); - } - const float lambd; - HardShrinkFunctor float_functor; -}; - -template<> -struct HardShrinkGradFunctor { - OF_DEVICE_FUNC explicit HardShrinkGradFunctor(float lambd) - : lambd(lambd), float_functor(HardShrinkGradFunctor(lambd)) {} - OF_DEVICE_FUNC half operator()(half y, half dy) const { - return __float2half(float_functor(__half2float(y), __half2float(dy))); - } - - const float lambd; - HardShrinkGradFunctor float_functor; -}; - template<> struct MishFunctor { OF_DEVICE_FUNC explicit MishFunctor() : float_functor(MishFunctor()) {} @@ -284,7 +261,6 @@ struct SoftShrinkGradFunctor { REGISTER_CELU_KERNEL(DeviceType::kCUDA, dtype); \ REGISTER_HARDSWISH_KERNEL(DeviceType::kCUDA, dtype); \ REGISTER_HARDSIGMOID_KERNEL(DeviceType::kCUDA, dtype); \ - REGISTER_HARDSHRINK_KERNEL(DeviceType::kCUDA, dtype); \ REGISTER_HARDTANH_KERNEL(DeviceType::kCUDA, dtype); \ REGISTER_MISH_KERNEL(DeviceType::kCUDA, dtype); \ REGISTER_SILU_KERNEL(DeviceType::kCUDA, dtype); \ diff --git a/oneflow/user/kernels/activation_kernels.h b/oneflow/user/kernels/activation_kernels.h index bbf9d3f3894..dc90351eebc 100644 --- a/oneflow/user/kernels/activation_kernels.h +++ b/oneflow/user/kernels/activation_kernels.h @@ -118,26 +118,6 @@ struct HardsigmoidGradFunctor { } }; -template -struct HardShrinkFunctor { - OF_DEVICE_FUNC explicit HardShrinkFunctor(double lambd) : lambd(lambd) {} - OF_DEVICE_FUNC T operator()(T x) const { - return (x <= lambd && x >= -lambd) ? static_cast(0) : x; - } - - const T lambd; -}; - -template -struct HardShrinkGradFunctor { - OF_DEVICE_FUNC explicit HardShrinkGradFunctor(double lambd) : lambd(lambd) {} - OF_DEVICE_FUNC T operator()(T y, T dy) const { - return y == static_cast(0) ? 
static_cast(0) : dy; - } - - const T lambd; -}; - template struct HardtanhFunctor { OF_DEVICE_FUNC explicit HardtanhFunctor(float min_val, float max_val) @@ -390,40 +370,6 @@ struct SoftShrinkGradFunctor { [](user_op::KernelComputeContext* ctx) { return HardsigmoidGradFunctor(); }, "dx", \ "x", "dy"); -#define REGISTER_HARDSHRINK_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("hardshrink") \ - .SetCreateFn([]() { \ - return user_op::NewOpKernel< \ - UnaryElemwiseXpuKernel, dtype, dtype>>( \ - [](user_op::KernelComputeContext* ctx) { \ - return HardShrinkFunctor(ctx->Attr("lambd")); \ - }, \ - "out", "in"); \ - }) \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); \ - REGISTER_USER_KERNEL("hardshrink_grad") \ - .SetCreateFn([]() { \ - return user_op::NewOpKernel< \ - BinaryElemwiseXpuKernel, dtype, dtype, dtype>>( \ - [](user_op::KernelComputeContext* ctx) { \ - return HardShrinkGradFunctor(ctx->Attr("lambd")); \ - }, \ - "dx", "y", "dy"); \ - }) \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "dy", 0, true)); \ - return Maybe::Ok(); \ - }); - #define REGISTER_HARDTANH_KERNEL(device, dtype) \ REGISTER_USER_KERNEL("hardtanh") \ .SetCreateFn([]() { \ diff --git a/oneflow/user/kernels/bias_add_kernel.h b/oneflow/user/kernels/bias_add_kernel.h index 96ad83b8a46..5bf87a267d0 100644 --- a/oneflow/user/kernels/bias_add_kernel.h +++ b/oneflow/user/kernels/bias_add_kernel.h @@ -38,7 +38,6 @@ class BiasAddUserKernel final : public user_op::OpKernel, public user_op::CudaGr void Compute(user_op::KernelComputeContext* ctx) const override { const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - if (a_tensor->shape().elem_cnt() == 0 || b_tensor->shape().elem_cnt() == 0) { return; } auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); const int32_t bias_add_axis = ctx->Attr("axis"); const int64_t outer_size = a_tensor->shape().Count(0, bias_add_axis); diff --git a/oneflow/user/kernels/binary_cross_entropy_kernel.cu b/oneflow/user/kernels/binary_cross_entropy_kernel.cu index 96c163bac09..2262858da20 100644 --- a/oneflow/user/kernels/binary_cross_entropy_kernel.cu +++ b/oneflow/user/kernels/binary_cross_entropy_kernel.cu @@ -52,8 +52,8 @@ struct BinaryCrossEntropyFunctor { __device__ __forceinline__ float operator()(float input_val, float target_val) const { assert(input_val >= zero_); assert(input_val <= one_); - return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_) - - target_val * max(logf(input_val), negative_hundred_); + return (target_val - one_) * max(__logf(one_ - input_val), negative_hundred_) + - target_val * max(__logf(input_val), negative_hundred_); } __device__ __forceinline__ float operator()(float input_val, float target_val, diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu index c2b5c94c433..9f12bec0091 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu 
+++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.cu @@ -70,7 +70,7 @@ struct BinaryCrossEntropyWithLogitsFunctor { __device__ __forceinline__ float operator()(float input_val, float target_val) const { const float max_val = -input_val < zero_ ? zero_ : -input_val; return (one_ - target_val) * input_val + max_val - + (logf(expf(-max_val) + expf(-input_val - max_val))); + + (__logf(__expf(-max_val) + __expf(-input_val - max_val))); } }; @@ -85,7 +85,7 @@ struct BinaryCrossEntropyWithLogitsFunctor { const float pos_weight_processed_val = weight_val - target_val + one_; return (one_ - target_val) * input_val + (pos_weight_processed_val - * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val)); + * (__logf(__expf(-max_val) + __expf(-input_val - max_val)) + max_val)); } }; diff --git a/oneflow/user/kernels/conv_cudnn_kernels.cpp b/oneflow/user/kernels/conv_cudnn_kernels.cpp index df04b81aa6e..6ae4ad3889d 100644 --- a/oneflow/user/kernels/conv_cudnn_kernels.cpp +++ b/oneflow/user/kernels/conv_cudnn_kernels.cpp @@ -175,7 +175,6 @@ class ConvGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphS void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, const user_op::OpKernelCache* cache) const override { const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape().elem_cnt() == 0) return; const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); @@ -219,7 +218,6 @@ class ConvGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphS && (user_op::HobDataType("in", 0) == GetDataType::value)) \ .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ const auto& in = ctx->InputTensorDesc("in", 0); \ - if (in.shape().elem_cnt() == 0) return 0; \ const auto& weight = ctx->InputTensorDesc("weight", 0); \ const auto* out = ctx->OutputTensorDesc("out", 0); \ const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); \ @@ -252,7 +250,6 @@ class ConvDataGradGpuKernel final : public user_op::OpKernel, public user_op::Cu const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* filter = ctx->Tensor4ArgNameAndIndex("filter", 0); user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - if (dx->shape().elem_cnt() == 0) return; user_op::Tensor* buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); @@ -300,7 +297,6 @@ class ConvDataGradGpuKernel final : public user_op::OpKernel, public user_op::Cu const auto& dy = ctx->InputTensorDesc("dy", 0); \ const auto& filter = ctx->InputTensorDesc("filter", 0); \ const auto* dx = ctx->OutputTensorDesc("dx", 0); \ - if (dx->shape().elem_cnt() == 0) return 0; \ const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); \ return InferTmpSizeWithCudnn( \ dx, &filter, &dy, *ctx, cudnn_conf.has_cudnn_conv_force_bwd_data_algo(), \ @@ -332,11 +328,6 @@ class ConvFilterGradGpuKernel final : public user_op::OpKernel, public user_op:: const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); user_op::Tensor* filter_diff = ctx->Tensor4ArgNameAndIndex("filter_diff", 0); - if (x->shape().elem_cnt() == 0) { - Memset(ctx->stream(), filter_diff->mut_dptr(), 0, - filter_diff->shape().elem_cnt() * sizeof(T)); - return; - } user_op::Tensor* buf = 
ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); @@ -370,7 +361,6 @@ class ConvFilterGradGpuKernel final : public user_op::OpKernel, public user_op:: .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ const auto& dy = ctx->InputTensorDesc("dy", 0); \ const auto& x = ctx->InputTensorDesc("x", 0); \ - if (x.shape().elem_cnt() == 0) return 0; \ const auto* filter_diff = ctx->OutputTensorDesc("filter_diff", 0); \ const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); \ return InferTmpSizeWithCudnn( \ diff --git a/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu b/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu index 6fdc212040a..a890606240c 100644 --- a/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu +++ b/oneflow/user/kernels/cublas_bias_add_relu_matmul_grad_kernel.cu @@ -13,7 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/kernel/cuda_graph_support.h" #include "oneflow/user/kernels/cublas_fused_mlp_util.cuh" // CUBLAS_AUX_EPILOGUE only support in cuda11.4 or higher version, in cuda11.4 it need static link. #if CUDA_VERSION >= 11040 @@ -21,8 +20,7 @@ limitations under the License. namespace oneflow { template -class CublasBiasAddReluMatmulGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { +class CublasBiasAddReluMatmulGradKernel final : public user_op::OpKernel { public: CublasBiasAddReluMatmulGradKernel() = default; ~CublasBiasAddReluMatmulGradKernel() override = default; diff --git a/oneflow/user/kernels/cublas_fused_mlp_kernel.cu b/oneflow/user/kernels/cublas_fused_mlp_kernel.cu index c5baaeccecd..17e13d59aa1 100644 --- a/oneflow/user/kernels/cublas_fused_mlp_kernel.cu +++ b/oneflow/user/kernels/cublas_fused_mlp_kernel.cu @@ -13,7 +13,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "oneflow/core/kernel/cuda_graph_support.h" #include "oneflow/user/kernels/cublas_fused_mlp_util.cuh" // CUBLAS_AUX_EPILOGUE only support in cuda11.4 or higher version, in cuda11.4 it need static link. #if CUDA_VERSION >= 11040 @@ -21,7 +20,7 @@ limitations under the License. namespace oneflow { template -class CublasFusedMLPKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { +class CublasFusedMLPKernel final : public user_op::OpKernel { public: CublasFusedMLPKernel() = default; ~CublasFusedMLPKernel() override = default; diff --git a/oneflow/user/kernels/data_shuffle_kernel.cu b/oneflow/user/kernels/data_shuffle_kernel.cu index df72470c090..a92959be900 100644 --- a/oneflow/user/kernels/data_shuffle_kernel.cu +++ b/oneflow/user/kernels/data_shuffle_kernel.cu @@ -22,7 +22,6 @@ limitations under the License. 
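Note: IdShuffleTmpBufferManager, used later in this kernel's diff, carves the single tmp_buffer tensor into typed sub-buffers via accumulated offsets, and is also constructed with a null base pointer inside SetInferTmpSizeFn purely to compute TotalBufferSize(). A standalone sketch of that bump-allocation pattern; the alignment and the plain slot-index key are illustrative (the real manager keys slices by IdShuffleBufferType, and its exact alignment is not shown in this hunk):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

class TmpBufferManagerSketch {
 public:
  // Construct with the tmp buffer base pointer, or with nullptr just to measure the total size.
  explicit TmpBufferManagerSketch(void* base) : base_(base), total_(0) {}

  // Reserve `bytes`, return the slot index of the new slice.
  int AllocBuffer(size_t bytes) {
    constexpr size_t kAlign = 512;  // illustrative alignment, not necessarily OneFlow's value
    offsets_.push_back(total_);
    total_ += (bytes + kAlign - 1) / kAlign * kAlign;
    return static_cast<int>(offsets_.size()) - 1;
  }

  template<typename T>
  T* Ptr(int slot) const {
    return reinterpret_cast<T*>(static_cast<char*>(base_) + offsets_.at(slot));
  }

  size_t TotalBufferSize() const { return total_; }

 private:
  void* base_;
  size_t total_;
  std::vector<size_t> offsets_;
};

int main() {
  TmpBufferManagerSketch size_only(nullptr);      // size-only pass, as in SetInferTmpSizeFn
  size_only.AllocBuffer(4 * sizeof(uint32_t));    // e.g. per-rank unique counts
  size_only.AllocBuffer(1024 * sizeof(int64_t));  // e.g. partitioned unique ids
  printf("total tmp buffer bytes: %zu\n", size_only.TotalBufferSize());
}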
#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" #include "oneflow/core/cuda/atomic.cuh" #include "oneflow/core/embedding/hash_functions.cuh" -#include "oneflow/core/cuda/elementwise.cuh" namespace oneflow { @@ -90,16 +89,16 @@ __global__ void HashTableUniqueAndPartitionPairs(const uint32_t table_capacity, } template -__global__ void GenerateTableIds(int32_t elem_cnt, int32_t num_tables, U* table_ids) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { table_ids[i] = i % num_tables; } +__global__ void GenerateColumnIds(int32_t elem_cnt, int32_t num_columns, U* column_ids) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { column_ids[i] = i % num_columns; } } template void UniqueAndPartition(cudaStream_t cuda_stream, int64_t num_ids, size_t capacity, - int64_t num_partition, const K* ids, const V* table_ids, + int64_t num_partition, const K* ids, const V* column_ids, IDX* num_partitioned_unique_ids_ptr, K* partitioned_unique_ids, - V* partitioned_unique_table_ids, IDX* inverse_unique_partition_indices, - void* workspace_ptr, size_t workspace_bytes, bool need_process_table_ids) { + V* partitioned_unique_column_ids, IDX* inverse_unique_partition_indices, + void* workspace_ptr, size_t workspace_bytes, bool need_process_column_ids) { size_t table_capacity_bytes = capacity * sizeof(TableEntry); CHECK_GE(workspace_bytes, table_capacity_bytes); OF_CUDA_CHECK(cudaMemsetAsync(workspace_ptr, 0, table_capacity_bytes, cuda_stream)); @@ -108,8 +107,8 @@ void UniqueAndPartition(cudaStream_t cuda_stream, int64_t num_ids, size_t capaci HashTableUniqueAndPartitionPairs <<>>( capacity, num_ids, num_partition, num_partitioned_unique_ids_ptr, - reinterpret_cast*>(workspace_ptr), ids, table_ids, partitioned_unique_ids, - partitioned_unique_table_ids, inverse_unique_partition_indices, need_process_table_ids); + reinterpret_cast*>(workspace_ptr), ids, column_ids, partitioned_unique_ids, + partitioned_unique_column_ids, inverse_unique_partition_indices, need_process_column_ids); } template @@ -156,12 +155,12 @@ void MakeShuffleParams(const IDX* host_num_unique_matrix, const int64_t num_ids, } template -void ShuffleIdsAndTableIds(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, DataType ids_data_type, - DataType table_ids_data_type, IDX* host_num_unique_matrix, - K* partitioned_unique_ids, U* partitioned_unique_table_ids, - K* received_ids, U* received_table_ids, int64_t* received_elem_cnt, - bool need_process_table_ids) { +void ShuffleIdsAndColumns(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, DataType ids_data_type, + DataType column_ids_data_type, IDX* host_num_unique_matrix, + K* partitioned_unique_ids, U* partitioned_unique_column_ids, + K* received_ids, U* received_column_ids, int64_t* received_elem_cnt, + bool need_process_column_ids) { std::vector send_offsets; std::vector send_elem_cnt; std::vector recv_offsets; @@ -171,9 +170,9 @@ void ShuffleIdsAndTableIds(cudaStream_t cuda_stream, ncclComm_t comm, int64_t pa ShuffleData(cuda_stream, comm, ids_data_type, send_offsets, send_elem_cnt, partitioned_unique_ids, recv_offsets, recv_elem_cnt, received_ids); *received_elem_cnt = recv_offsets.at(parallel_num - 1) + recv_elem_cnt.at(parallel_num - 1); - if (need_process_table_ids) { - ShuffleData(cuda_stream, comm, table_ids_data_type, send_offsets, send_elem_cnt, - partitioned_unique_table_ids, recv_offsets, recv_elem_cnt, received_table_ids); + if (need_process_column_ids) { + ShuffleData(cuda_stream, comm, 
column_ids_data_type, send_offsets, send_elem_cnt, + partitioned_unique_column_ids, recv_offsets, recv_elem_cnt, received_column_ids); } } @@ -181,9 +180,9 @@ enum class IdShuffleBufferType { kNumPartitionedUnique = 0, kPartitionedUniqueIds, kReceivedIds, - kTableIds, - kPartitionedUniqueTableIds, - kReceivedTableIds, + kColumnIds, + kPartitionedUniqueColumnIds, + kReceivedColumnIds, kWorkspace, kMaxType }; @@ -193,21 +192,21 @@ class IdShuffleTmpBufferManager final { public: OF_DISALLOW_COPY_AND_MOVE(IdShuffleTmpBufferManager); IdShuffleTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t parallel_num, - bool need_table_ids, bool need_process_table_ids) + bool need_column_ids, bool need_process_column_ids) : offset_(0), offsets_(static_cast(IdShuffleBufferType::kMaxType), -1), sizes_(static_cast(IdShuffleBufferType::kMaxType)), ptr_(ptr) { - const int64_t num_table_ids = need_process_table_ids ? num_ids : 0; - const size_t table_ids_bytes = need_table_ids ? num_ids * sizeof(U) : 0; + const int64_t num_column_ids = need_process_column_ids ? num_ids : 0; + const size_t column_ids_bytes = need_column_ids ? num_ids * sizeof(U) : 0; AllocBuffer(IdShuffleBufferType::kNumPartitionedUnique, parallel_num * sizeof(IDX)); size_t partitioned_ids_bytes = parallel_num * num_ids * sizeof(K); AllocBuffer(IdShuffleBufferType::kPartitionedUniqueIds, partitioned_ids_bytes); AllocBuffer(IdShuffleBufferType::kReceivedIds, partitioned_ids_bytes); - AllocBuffer(IdShuffleBufferType::kTableIds, table_ids_bytes); - size_t partitioned_table_ids_bytes = parallel_num * num_table_ids * sizeof(U); - AllocBuffer(IdShuffleBufferType::kPartitionedUniqueTableIds, partitioned_table_ids_bytes); - AllocBuffer(IdShuffleBufferType::kReceivedTableIds, partitioned_table_ids_bytes); + AllocBuffer(IdShuffleBufferType::kColumnIds, column_ids_bytes); + size_t partitioned_column_ids_bytes = parallel_num * num_column_ids * sizeof(U); + AllocBuffer(IdShuffleBufferType::kPartitionedUniqueColumnIds, partitioned_column_ids_bytes); + AllocBuffer(IdShuffleBufferType::kReceivedColumnIds, partitioned_column_ids_bytes); const size_t hash_table_capacity = parallel_num * num_ids; AllocBuffer(IdShuffleBufferType::kWorkspace, hash_table_capacity * sizeof(TableEntry)); } @@ -322,50 +321,51 @@ class IdShuffleKernel final : public user_op::OpKernel { ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); user_op::Tensor* cur_rank_num_unique = ctx->Tensor4ArgNameAndIndex("cur_rank_num_unique", 0); user_op::Tensor* cur_rank_unique_ids = ctx->Tensor4ArgNameAndIndex("cur_rank_unique_ids", 0); - user_op::Tensor* cur_rank_unique_table_ids = - ctx->Tensor4ArgNameAndIndex("cur_rank_unique_table_ids", 0); + user_op::Tensor* cur_rank_unique_column_ids = + ctx->Tensor4ArgNameAndIndex("cur_rank_unique_column_ids", 0); user_op::Tensor* cur_rank_inverse_indices = ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - const bool has_table_ids = ctx->has_input("table_ids", 0); - const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); - const bool need_process_table_ids = (has_table_ids || num_tables > 1); + const int32_t num_columns = ctx->Attr("num_columns"); + const bool has_column_ids = ctx->has_input("column_ids", 0); + const bool need_gen_column_ids = (!has_column_ids && num_columns > 1); + const bool need_process_column_ids = (has_column_ids || num_columns > 1); const int64_t num_ids = 
ids->shape().elem_cnt(); const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); cudaStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - IdShuffleTmpBufferManager buffer_manager( - tmp_buffer->mut_dptr(), num_ids, parallel_num, need_gen_table_ids, need_process_table_ids); + IdShuffleTmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_ids, + parallel_num, need_gen_column_ids, + need_process_column_ids); CHECK_GE(tmp_buffer->shape().elem_cnt(), buffer_manager.TotalBufferSize()); - const U* table_ids_ptr; - if (has_table_ids) { - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - table_ids_ptr = reinterpret_cast(table_ids->dptr()); - } else if (need_gen_table_ids) { - GenerateTableIds<<>>( - num_ids, num_tables, buffer_manager.template Ptr(IdShuffleBufferType::kTableIds)); - table_ids_ptr = buffer_manager.template Ptr(IdShuffleBufferType::kTableIds); + const U* column_ids_ptr; + if (has_column_ids) { + const user_op::Tensor* column_ids = ctx->Tensor4ArgNameAndIndex("column_ids", 0); + column_ids_ptr = reinterpret_cast(column_ids->dptr()); + } else if (need_gen_column_ids) { + GenerateColumnIds<<>>( + num_ids, num_columns, buffer_manager.template Ptr(IdShuffleBufferType::kColumnIds)); + column_ids_ptr = buffer_manager.template Ptr(IdShuffleBufferType::kColumnIds); } else { - table_ids_ptr = nullptr; + column_ids_ptr = nullptr; } IDX* num_partitioned_unique = buffer_manager.template Ptr(IdShuffleBufferType::kNumPartitionedUnique); K* partitioned_unique_ids = buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueIds); - U* partitioned_unique_table_ids = - buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueTableIds); + U* partitioned_unique_column_ids = + buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueColumnIds); IDX* num_unique_matrix_ptr = reinterpret_cast(num_unique_matrix->mut_dptr()); size_t hash_table_capacity = parallel_num * num_ids; void* workspace_ptr = buffer_manager.Ptr(IdShuffleBufferType::kWorkspace); size_t workspace_size = buffer_manager.Size(IdShuffleBufferType::kWorkspace); UniqueAndPartition( cuda_stream, num_ids, hash_table_capacity, parallel_num, - reinterpret_cast(ids->dptr()), table_ids_ptr, num_partitioned_unique, - partitioned_unique_ids, partitioned_unique_table_ids, + reinterpret_cast(ids->dptr()), column_ids_ptr, num_partitioned_unique, + partitioned_unique_ids, partitioned_unique_column_ids, reinterpret_cast(inverse_unique_partition_indices->mut_dptr()), workspace_ptr, - workspace_size, need_process_table_ids); + workspace_size, need_process_column_ids); ncclComm_t comm = kernel_state->comm(); OF_NCCL_CHECK(ncclAllGather(num_partitioned_unique, num_unique_matrix_ptr, parallel_num, GetNcclDataType(num_unique_matrix->data_type()), comm, @@ -377,21 +377,22 @@ class IdShuffleKernel final : public user_op::OpKernel { CHECK_JUST(ctx->stream()->Sync()); K* received_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedIds); - U* received_table_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedTableIds); + U* received_column_ids = + buffer_manager.template Ptr(IdShuffleBufferType::kReceivedColumnIds); int64_t received_elem_cnt = 0; - ShuffleIdsAndTableIds(cuda_stream, comm, parallel_id, parallel_num, num_ids, ids->data_type(), - cur_rank_unique_table_ids->data_type(), host_num_unique_matrix, - partitioned_unique_ids, partitioned_unique_table_ids, received_ids, - received_table_ids, 
&received_elem_cnt, need_process_table_ids); + ShuffleIdsAndColumns(cuda_stream, comm, parallel_id, parallel_num, num_ids, ids->data_type(), + cur_rank_unique_column_ids->data_type(), host_num_unique_matrix, + partitioned_unique_ids, partitioned_unique_column_ids, received_ids, + received_column_ids, &received_elem_cnt, need_process_column_ids); UniqueAndPartition( - cuda_stream, received_elem_cnt, hash_table_capacity, 1, received_ids, received_table_ids, + cuda_stream, received_elem_cnt, hash_table_capacity, 1, received_ids, received_column_ids, reinterpret_cast(cur_rank_num_unique->mut_dptr()), reinterpret_cast(cur_rank_unique_ids->mut_dptr()), - reinterpret_cast(cur_rank_unique_table_ids->mut_dptr()), + reinterpret_cast(cur_rank_unique_column_ids->mut_dptr()), reinterpret_cast(cur_rank_inverse_indices->mut_dptr()), workspace_ptr, workspace_size, - need_process_table_ids); - if (!need_process_table_ids) { - OF_CUDA_CHECK(cudaMemsetAsync(cur_rank_unique_table_ids->mut_dptr(), 0, + need_process_column_ids); + if (!need_process_column_ids) { + OF_CUDA_CHECK(cudaMemsetAsync(cur_rank_unique_column_ids->mut_dptr(), 0, received_elem_cnt * sizeof(U), cuda_stream)); } } @@ -404,45 +405,37 @@ class IdShuffleKernel final : public user_op::OpKernel { OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) -#define TABLE_ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - #define IDX_DATA_TYPE_SEQ \ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) -#define REGISTER_CUDA_ID_SHUFFLE_KERNEL(k_dtype_pair, table_id_dtype_pair, idx_dtype_pair) \ +#define REGISTER_CUDA_ID_SHUFFLE_KERNEL(k_dtype_pair, column_dtype_pair, idx_dtype_pair) \ REGISTER_USER_KERNEL("id_shuffle") \ .SetCreateFn< \ - IdShuffleKernel>() \ .SetIsMatchedHob( \ (user_op::HobDeviceType() == DeviceType::kCUDA) \ && (user_op::HobDataType("ids", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ - && (user_op::HobDataType("cur_rank_unique_table_ids", 0) \ - == OF_PP_PAIR_SECOND(table_id_dtype_pair)) \ + && (user_op::HobDataType("cur_rank_unique_column_ids", 0) \ + == OF_PP_PAIR_SECOND(column_dtype_pair)) \ && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ const user_op::TensorDesc& ids = ctx->InputTensorDesc("ids", 0); \ - const bool has_table_ids = ctx->has_input("table_ids", 0); \ - const int32_t num_tables = ctx->Attr("num_tables"); \ - const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); \ - const bool need_process_table_ids = (has_table_ids || num_tables > 1); \ + const bool has_column_ids = ctx->has_input("column_ids", 0); \ + const int32_t num_columns = ctx->Attr("num_columns"); \ + const bool need_gen_column_ids = (!has_column_ids && num_columns > 1); \ + const bool need_process_column_ids = (has_column_ids || num_columns > 1); \ IdShuffleTmpBufferManager \ buffer_manager(nullptr, ids.shape().elem_cnt(), ctx->parallel_desc().parallel_num(), \ - need_gen_table_ids, need_process_table_ids); \ + need_gen_column_ids, need_process_column_ids); \ return buffer_manager.TotalBufferSize(); \ }); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ID_SHUFFLE_KERNEL, ID_DATA_TYPE_SEQ, 
- TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) template void ShuffleEmbeddings(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, @@ -459,391 +452,6 @@ void ShuffleEmbeddings(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parall reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); } -// Quantized Version. -template -void ShuffleEmbeddings(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - int8_t* reverse_unique_cur_rank_embeddings, int8_t* received_embeddings, - T* reverse_cur_rank_quantize_factor, T* recv_quantize_factor) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - // shuffle quantized_embedding - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, - reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); - // shuffle quantize_factor - MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, - parallel_num, &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - reverse_cur_rank_quantize_factor, recv_offsets, recv_elem_cnt, recv_quantize_factor); -} - -__device__ float RoundHalfAwayFromZero(const float x) { - float abs_val = abs(x); - float floor_val = floor(abs_val + static_cast(0.5)); - return copysignf(floor_val, x); -} - -// warp reduce version. 
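For orientation, the quantized-communication path removed below works in two steps: a warp-level shuffle reduction finds each row's absolute maximum, which is stored as that row's quantize factor, and every element is then rounded half away from zero into int8 with scale 127 / max_abs (dequantization later multiplies back by factor / 127). The standalone CUDA sketch that follows illustrates only that core pattern under simplifying assumptions (one 32-lane warp per row, cols no larger than 32, hypothetical names QuantizeRowPerWarp and RoundAway); it is not the removed kernel, which additionally handles packing, padding, and multiple rows per thread group.

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Round half away from zero, mirroring the removed RoundHalfAwayFromZero helper.
__device__ float RoundAway(float x) { return copysignf(floorf(fabsf(x) + 0.5f), x); }

// One 32-lane warp quantizes one row: reduce the row's abs-max via __shfl_xor_sync,
// publish it as the per-row factor, then scale every element into int8 range.
__global__ void QuantizeRowPerWarp(const float* src, int8_t* dst, float* factor, int cols) {
  const int row = blockIdx.x;
  const int lane = threadIdx.x;  // launched with blockDim.x == 32
  float v = (lane < cols) ? src[row * cols + lane] : 0.0f;
  float m = fabsf(v);
  for (int mask = 16; mask > 0; mask /= 2) { m = fmaxf(m, __shfl_xor_sync(0xffffffffu, m, mask)); }
  if (lane == 0) { factor[row] = m; }
  const float scale = (m > 0.0f) ? 127.0f / m : 0.0f;
  if (lane < cols) { dst[row * cols + lane] = static_cast<int8_t>(RoundAway(v * scale)); }
}

int main() {
  const int rows = 4, cols = 32;
  float* src = nullptr;
  float* factor = nullptr;
  int8_t* dst = nullptr;
  cudaMallocManaged(&src, rows * cols * sizeof(float));
  cudaMallocManaged(&factor, rows * sizeof(float));
  cudaMallocManaged(&dst, rows * cols * sizeof(int8_t));
  for (int i = 0; i < rows * cols; ++i) { src[i] = 0.01f * (i - 60); }
  QuantizeRowPerWarp<<<rows, 32>>>(src, dst, factor, cols);
  cudaDeviceSynchronize();
  printf("row 0: factor=%f, dst[0]=%d\n", factor[0], static_cast<int>(dst[0]));
  return 0;
}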
-constexpr int32_t kWarpSize = 32; -constexpr int32_t kMaxColSize = 1024; - -template -__inline__ __device__ T WarpMaxAllReduce(T val) { - for (int32_t lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { - val = max(val, __shfl_xor_sync(0xffffffff, val, lane_mask, thread_group_width)); - } - return val; -} - -inline cudaError_t GetWarpImplNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, - int* num_blocks) { - int dev; - { - cudaError_t err = cudaGetDevice(&dev); - if (err != cudaSuccess) { return err; } - } - int sm_count; - { - cudaError_t err = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev); - if (err != cudaSuccess) { return err; } - } - int tpm; - { - cudaError_t err = cudaDeviceGetAttribute(&tpm, cudaDevAttrMaxThreadsPerMultiProcessor, dev); - if (err != cudaSuccess) { return err; } - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); - return cudaSuccess; -} - -template -__global__ void QuantizeWarpImplKernel(const T* src, int8_t* dst, T* quantize_factor, - const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StoreType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_abs_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; row_id++) { - ComputeType* row_buf = buf[row_id]; - thread_abs_max[row_id] = 0.0; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; pack_id++) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - LoadPack load_pack; - if (!padding || col < cols) { - const int64_t load_offset = ((row + row_id) * cols + col) / pack_size; - load_pack.storage = *(reinterpret_cast(src) + load_offset); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - row_buf[pack_offset + i] = static_cast(load_pack.elem[i]); - thread_abs_max[row_id] = max(thread_abs_max[row_id], abs(row_buf[pack_offset + i])); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { row_buf[pack_offset + i] = 0.0; } - } - } - } - ComputeType warp_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; row_id++) { - warp_max[row_id] = WarpMaxAllReduce(thread_abs_max[row_id]); - if (threadIdx.x == 0) { quantize_factor[row + row_id] = static_cast(warp_max[row_id]); } - ComputeType* row_buf = buf[row_id]; - ComputeType quantize_factor_val = static_cast(127.0) / warp_max[row_id]; -#pragma unroll - for (int col = 0; col < cols_per_thread; col++) { - row_buf[col] = RoundHalfAwayFromZero(row_buf[col] * quantize_factor_val); - } -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; pack_id++) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * 
thread_group_width + lane_id) * pack_size; - StorePack store_pack; - if (!padding || col < cols) { - const int64_t store_offset = ((row + row_id) * cols + col) / pack_size; - for (int i = 0; i < pack_size; i++) { - store_pack.elem[i] = static_cast(row_buf[pack_id * pack_size + i]); - } - *(reinterpret_cast(dst) + store_offset) = store_pack.storage; - } - } - } - } -} - -template -inline cudaError_t LaunchQuantizeWarpImpl(cudaStream_t stream, const T* src, int8_t* dst, - T* quantize_factor, const int64_t rows, - const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x = 0; - - cudaError_t err = GetWarpImplNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != cudaSuccess) { return err; } - - QuantizeWarpImplKernel - <<>>(src, dst, quantize_factor, rows, cols); - return cudaPeekAtLastError(); -} - -template -inline cudaError_t DispatchQuantizeWarpImplPadding(cudaStream_t stream, const T* src, int8_t* dst, - T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, - cols); - } else { - return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, - cols); - } -} - -template -typename std::enable_if::type DispatchQuantizeWarpImplCols( - cudaStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return cudaErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } else { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchQuantizeWarpImplPadding( \ - stream, src, dst, quantize_factor, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return cudaErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchQuantizeWarpImplCols( - cudaStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return cudaErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= 
(thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } else { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchQuantizeWarpImplPadding( \ - stream, src, dst, quantize_factor, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return cudaErrorInvalidValue; - } -} - -template -struct DispatchQuantizeWarpImplPackSize { - cudaError_t operator()(cudaStream_t stream, const T* src, int8_t* dst, T* quantize_factor, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, - rows, cols); - } else { - return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, - rows, cols); - } - } -}; - -template -__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, - IDX elem_cnt); - -template -__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, - IDX elem_cnt) { - IDX global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - for (int index = global_thread_id * pack_size; index < elem_cnt; - index += gridDim.x * blockDim.x * pack_size) { - IDX quantize_factor_idx = index / col_size; - ComputeType quantize_factor_val = static_cast(quantize_factor[quantize_factor_idx]) - / static_cast(127.0); - using LoadPackType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StorePackType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - LoadPack load_pack{}; - StorePack store_pack{}; - load_pack.storage = *(reinterpret_cast(x) + index / pack_size); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - store_pack.elem[i] = - static_cast(static_cast(load_pack.elem[i]) * quantize_factor_val); - } - *(reinterpret_cast(out) + index / pack_size) = store_pack.storage; - } -} - -template -cudaError_t DispatchDequantizeKernelPackSize(cudaStream_t stream, const int8_t* src, - T* quantize_factor, T* dst, const int64_t col_size, - const int64_t elem_cnt) { - const int64_t pack_num = elem_cnt / pack_size; - int grid_size = 0; - cudaError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (err != cudaSuccess) { return err; } - DequantizeKernel - <<>>(src, quantize_factor, dst, col_size, - elem_cnt); - return cudaSuccess; -} - -template -inline cudaError_t LaunchDequantizeKernel(cudaStream_t stream, const int8_t* src, - T* quantize_factor, T* dst, const int64_t col_size, - const int64_t elem_cnt) { - constexpr int quantized_src_pack_size = cuda::elementwise::PackSize(); - constexpr int dst_pack_size = cuda::elementwise::PackSize(); - int launch_pack_size = std::min(quantized_src_pack_size, dst_pack_size); - if (launch_pack_size == 8 && col_size % 8 == 0) { - cudaError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - 
if (err != cudaSuccess) { return err; } - } else if (launch_pack_size == 4 && col_size % 4 == 0) { - cudaError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != cudaSuccess) { return err; } - } else if (launch_pack_size == 2 && col_size % 2 == 0) { - cudaError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != cudaSuccess) { return err; } - } else { - cudaError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != cudaSuccess) { return err; } - } - return cudaPeekAtLastError(); -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - template class EmbeddingShuffleKernel final : public user_op::OpKernel { public: @@ -870,21 +478,13 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - ncclComm_t comm = kernel_state->comm(); - using ComputeType = typename DefaultComputeType::type; + const int64_t embedding_size = cur_rank_embeddings->shape().At(1); IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); DataType data_type = cur_rank_embeddings->data_type(); const int64_t num_ids = inverse_unique_partition_indices->shape().elem_cnt(); const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - bool enable_quantized_comm_env_var = - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); - bool enable_quantized_comm = enable_quantized_comm_env_var && (embedding_size < kMaxColSize); - if (enable_quantized_comm_env_var && !enable_quantized_comm) { - LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " - "embedding_size less equal than 1024 can use quantized communication. 
"; - } cudaStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); OF_CUDA_CHECK(cudaMemcpyAsync( host_num_unique_matrix, reinterpret_cast(num_unique_matrix->dptr()), @@ -894,121 +494,34 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { for (int64_t i = 0; i < parallel_num; ++i) { cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; } - size_t full_elem_cnt = parallel_num * num_ids * embedding_size; - CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape().elem_cnt()); - if (!enable_quantized_comm) { - size_t reverse_unique_cur_rank_embeddings_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - - CHECK_GE(tmp_buffer->shape().elem_cnt(), - reverse_unique_cur_rank_embeddings_size + received_embeddings_size); - - T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); - T* received_embeddings = reinterpret_cast(tmp_buffer->mut_dptr() - + reverse_unique_cur_rank_embeddings_size); - // reverse cur_rank unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, cur_rank_embeddings->dptr(), - Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, embedding_size}), - reverse_unique_cur_rank_embeddings, 0); - - ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, - received_embeddings); - - // reverse unique_partition - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape().elem_cnt(), received_embeddings, - Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); - } else { - size_t reverse_unique_cur_rank_embeddings_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); - size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - size_t quantize_cur_rank_embeddings_size = reverse_unique_cur_rank_embeddings_size; - size_t reverse_recv_quantize_cur_rank_embeddings_size = - reverse_unique_cur_rank_embeddings_size; - size_t cur_rank_quantize_factor_size = - GetCudaAlignedSize(cur_rank_embeddings->shape().At(0) * sizeof(T)); - size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; - size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; - size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; - CHECK_GE(tmp_buffer->shape().elem_cnt(), - reverse_unique_cur_rank_embeddings_size + received_embeddings_size - + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size - + reverse_recv_quantize_factor_size); - int8_t* reverse_unique_cur_rank_embeddings = - reinterpret_cast(tmp_buffer->mut_dptr()); - int8_t* received_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size); - int8_t* quantize_cur_rank_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size); - int8_t* reverse_recv_quantize_cur_rank_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size); - T* cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + 
reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size); - T* reverse_cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size); - T* recv_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size); - T* reverse_recv_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size); - DispatchQuantizeWarpImplPackSize()( - cuda_stream, cur_rank_embeddings->dptr(), quantize_cur_rank_embeddings, - cur_rank_quantize_factor, cur_rank_num_ids, embedding_size); - // reverse cur_rank embedding unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, quantize_cur_rank_embeddings, - Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, embedding_size}), - reverse_unique_cur_rank_embeddings, 0); - // reverse cur_rank quantize factor unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, cur_rank_quantize_factor, - Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, 1}), - reverse_cur_rank_quantize_factor, 0); + CHECK_EQ(parallel_num * num_ids * embedding_size, cur_rank_embeddings->shape().elem_cnt()); + size_t reverse_unique_cur_rank_embeddings_size = + GetCudaAlignedSize(parallel_num * num_ids * embedding_size * sizeof(T)); + size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; + T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); + T* received_embeddings = reinterpret_cast(tmp_buffer->mut_dptr() + + reverse_unique_cur_rank_embeddings_size); + CHECK_GE(tmp_buffer->shape().elem_cnt(), + reverse_unique_cur_rank_embeddings_size + received_embeddings_size); + + // reverse cur_rank unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, cur_rank_embeddings->dptr(), + Shape({1, cur_rank_embeddings->shape().elem_cnt() / embedding_size, embedding_size}), + reverse_unique_cur_rank_embeddings, 0); - ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, - received_embeddings, reverse_cur_rank_quantize_factor, - recv_quantize_factor); - - // reverse unique_partition - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape().elem_cnt(), received_embeddings, - Shape({1, parallel_num * num_ids, embedding_size}), - reverse_recv_quantize_cur_rank_embeddings, 0); - - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape().elem_cnt(), 
recv_quantize_factor, - Shape({1, parallel_num * num_ids, 1}), reverse_recv_quantize_factor, 0); + ncclComm_t comm = kernel_state->comm(); + ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, + data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, + received_embeddings); - int32_t dequantize_row_size = inverse_unique_partition_indices->shape().elem_cnt(); - IDX dequantize_elem_cnt = dequantize_row_size * embedding_size; - OF_CUDA_CHECK((LaunchDequantizeKernel( - cuda_stream, reverse_recv_quantize_cur_rank_embeddings, reverse_recv_quantize_factor, - embeddings->mut_dptr(), embedding_size, dequantize_elem_cnt))); - } + // reverse unique_partition + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape().elem_cnt(), received_embeddings, + Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -1024,39 +537,15 @@ class EmbeddingShuffleKernel final : public user_op::OpKernel { .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ const user_op::TensorDesc& cur_rank_embeddings = \ ctx->InputTensorDesc("cur_rank_embeddings", 0); \ - bool enable_quantized_comm = \ - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ - && (cur_rank_embeddings.shape().At(1) < kMaxColSize); \ - size_t tmp_size = 0; \ - if (!enable_quantized_comm) { \ - size_t reverse_cur_rank_embeddings_size = GetCudaAlignedSize( \ - cur_rank_embeddings.shape().elem_cnt() * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t recv_unique_embeddings_size = reverse_cur_rank_embeddings_size; \ - tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings_size; \ - } else { \ - size_t total_elem_cnt = cur_rank_embeddings.shape().elem_cnt(); \ - size_t reverse_cur_rank_embeddings_size = \ - GetCudaAlignedSize(total_elem_cnt * sizeof(int8_t)); \ - size_t recv_unique_embeddings = reverse_cur_rank_embeddings_size; \ - size_t quantize_cur_rank_embeddings_size = reverse_cur_rank_embeddings_size; \ - size_t reverse_recv_quantize_cur_rank_embeddings_size = \ - reverse_cur_rank_embeddings_size; \ - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ - cur_rank_embeddings.shape().At(0) * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; \ - tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings \ - + quantize_cur_rank_embeddings_size \ - + reverse_recv_quantize_cur_rank_embeddings_size \ - + cur_rank_quantize_factor_size + reverse_cur_rank_quantize_factor_size \ - + recv_quantize_factor_size + reverse_recv_quantize_factor_size; \ - } \ - return tmp_size; \ + const user_op::TensorDesc& embeddings = ctx->InputTensorDesc("embeddings", 0); \ + size_t reverse_cur_rank_embeddings_size = GetCudaAlignedSize( \ + cur_rank_embeddings.shape().elem_cnt() * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t recv_unique_embeddings = reverse_cur_rank_embeddings_size; \ + return reverse_cur_rank_embeddings_size + recv_unique_embeddings; \ }); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL, - FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, 
IDX_DATA_TYPE_SEQ) template void ShuffleEmbeddingsGrad(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, @@ -1074,31 +563,6 @@ void ShuffleEmbeddingsGrad(cudaStream_t cuda_stream, ncclComm_t comm, int64_t pa received_embeddings_grad); } -// Quantize Version. -template -void ShuffleEmbeddingsGrad(cudaStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - int8_t* unique_partition_embedding_grad, - int8_t* received_embeddings_grad, T* cur_rank_quantize_factor, - T* received_cur_rank_quantize_factor) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - // Shuffle Embedding Grad. - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, - unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, - received_embeddings_grad); - // Shuffle Quantize factor. - MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, - parallel_num, &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, cur_rank_quantize_factor, - recv_offsets, recv_elem_cnt, received_cur_rank_quantize_factor); -} - template class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { public: @@ -1117,7 +581,6 @@ class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { auto* kernel_state = dynamic_cast*>(state); CHECK(kernel_state != nullptr); const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); const user_op::Tensor* cur_rank_inverse_indices = ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); @@ -1132,148 +595,51 @@ class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - ncclComm_t comm = kernel_state->comm(); - using ComputeType = typename DefaultComputeType::type; - bool enable_quantized_comm_env_var = - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); - bool enable_quantized_comm = enable_quantized_comm_env_var && (embedding_size < kMaxColSize); - if (enable_quantized_comm_env_var && !enable_quantized_comm) { - LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " - "embedding_size less equal than 1024 can use quantized communication. 
"; - } + cudaStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); OF_CUDA_CHECK(cudaMemcpyAsync(host_num_unique_matrix, num_unique_matrix->dptr(), parallel_num * parallel_num * sizeof(IDX), cudaMemcpyDefault, cuda_stream)); CHECK_JUST(ctx->stream()->Sync()); - int64_t cur_rank_num_ids = 0; for (int64_t i = 0; i < parallel_num; ++i) { cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; } - size_t full_num_ids = parallel_num * num_ids; - size_t full_elem_cnt = full_num_ids * embedding_size; - size_t unique_partition_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - if (!enable_quantized_comm) { - size_t received_embedding_grad_size = unique_partition_embedding_grad_size; - T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); - T* received_embedding_grad = - reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - CHECK_GE(tmp_buffer->shape().elem_cnt(), - unique_partition_embedding_grad_size + received_embedding_grad_size); + size_t unique_partition_embedding_grad_size = + GetCudaAlignedSize(parallel_num * num_ids * embedding_size * sizeof(T)); + size_t received_embedding_grad_size = unique_partition_embedding_grad_size; + T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); + T* received_embedding_grad = + reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); + CHECK_GE(tmp_buffer->shape().elem_cnt(), + unique_partition_embedding_grad_size + received_embedding_grad_size); - // unique and partition embedding grad - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t offset = i * num_ids * embedding_size; - const int64_t valid_value_size = - host_num_unique_matrix[parallel_id * parallel_num + i] * embedding_size * sizeof(T); - OF_CUDA_CHECK(cudaMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, - cuda_stream)); - } - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - embedding_grad->dptr(), num_ids, parallel_num * num_ids, 1, embedding_size, 0, - unique_partition_embedding_grad); - - ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, unique_partition_embedding_grad, - received_embedding_grad); - - // unique cur_rank embedding grad - OF_CUDA_CHECK(cudaMemsetAsync(cur_rank_unique_embedding_grad->mut_dptr(), 0, - cur_rank_num_ids * embedding_size * sizeof(T), cuda_stream)); - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - received_embedding_grad, cur_rank_num_ids, cur_rank_num_ids, 1, embedding_size, 0, - cur_rank_unique_embedding_grad->mut_dptr()); - } else { - size_t received_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); - size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize(full_num_ids * sizeof(T)); - size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; - size_t dequantize_cur_rank_embedding_grad_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - CHECK_GE(tmp_buffer->shape().elem_cnt(), - unique_partition_embedding_grad_size + received_embedding_grad_size - + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size - + received_cur_rank_quantize_factor_size - + dequantize_cur_rank_embedding_grad_size); - T* 
unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); - int8_t* received_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - - int8_t* quantize_cur_rank_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size); - T* cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size); - T* received_cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size - + cur_rank_quantize_factor_size); - T* dequantize_cur_rank_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size - + cur_rank_quantize_factor_size + received_cur_rank_quantize_factor_size); - - // unique and partition embedding grad - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t offset = i * num_ids * embedding_size; - const int64_t valid_value_size = - host_num_unique_matrix[parallel_id * parallel_num + i] * embedding_size * sizeof(T); - OF_CUDA_CHECK(cudaMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, - cuda_stream)); - } - - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - embedding_grad->dptr(), num_ids, parallel_num * num_ids, 1, embedding_size, 0, - unique_partition_embedding_grad); - - // Quantize. - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t embedding_grad_offset = i * num_ids * embedding_size; - const int64_t quantize_factor_offset = i * num_ids; - const int64_t valid_row_size = host_num_unique_matrix[parallel_id * parallel_num + i]; - DispatchQuantizeWarpImplPackSize()( - cuda_stream, unique_partition_embedding_grad + embedding_grad_offset, - quantize_cur_rank_embedding_grad + embedding_grad_offset, - cur_rank_quantize_factor + quantize_factor_offset, valid_row_size, embedding_size); - } - - ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, quantize_cur_rank_embedding_grad, - received_embedding_grad, cur_rank_quantize_factor, - received_cur_rank_quantize_factor); - - int64_t dequantize_cur_rank_num = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - /* - Host num unique matrix: - | Partition0 | Partition1 | - | Rank0 | 2 | 4 | - | Rank1 | 3 | 3 | - After ShuffleEmbeddingGrads, each rank will exchange partition. - For example: - Rank0 will have (matrix[rank0][part0] + matrix[rank1][part0]) grad tensor. - Rank1 will have (matrix[rank0][part1] + matrix[rank1][part1]) grad tensor. 
- */ - dequantize_cur_rank_num += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - IDX dequantize_elem_cnt = dequantize_cur_rank_num * embedding_size; - OF_CUDA_CHECK((LaunchDequantizeKernel( - cuda_stream, received_embedding_grad, received_cur_rank_quantize_factor, - dequantize_cur_rank_embedding_grad, embedding_size, dequantize_elem_cnt))); - // unique cur_rank embedding grad - OF_CUDA_CHECK(cudaMemsetAsync(cur_rank_unique_embedding_grad->mut_dptr(), 0, - cur_rank_num_ids * embedding_size * sizeof(T), cuda_stream)); - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - dequantize_cur_rank_embedding_grad, cur_rank_num_ids, cur_rank_num_ids, 1, embedding_size, - 0, cur_rank_unique_embedding_grad->mut_dptr()); + // unique and partition embedding grad + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t offset = i * num_ids * embedding_size; + const int64_t valid_value_size = + host_num_unique_matrix[parallel_id * parallel_num + i] * embedding_size * sizeof(T); + OF_CUDA_CHECK(cudaMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, + cuda_stream)); } + UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + embedding_grad->dptr(), num_ids, parallel_num * num_ids, 1, embedding_size, 0, + unique_partition_embedding_grad); + + ncclComm_t comm = kernel_state->comm(); + ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, + data_type, host_num_unique_matrix, unique_partition_embedding_grad, + received_embedding_grad); + + // unique cur_rank embedding grad + OF_CUDA_CHECK(cudaMemsetAsync(cur_rank_unique_embedding_grad->mut_dptr(), 0, + cur_rank_num_ids * embedding_size * sizeof(T), cuda_stream)); + UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + received_embedding_grad, cur_rank_num_ids, cur_rank_num_ids, 1, embedding_size, 0, + cur_rank_unique_embedding_grad->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -1289,33 +655,10 @@ class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ const user_op::TensorDesc& cur_rank_unique_embedding_grad = \ ctx->InputTensorDesc("cur_rank_unique_embedding_grad", 0); \ - size_t cur_rank_embedding_grad_num = cur_rank_unique_embedding_grad.shape().At(0); \ - size_t embedding_size = cur_rank_unique_embedding_grad.shape().At(1); \ - size_t cur_rank_embedding_grad_elem_cnt = cur_rank_embedding_grad_num * embedding_size; \ - bool enable_quantized_comm = \ - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ - && (embedding_size < kMaxColSize); \ - size_t tmp_size = 0; \ - if (!enable_quantized_comm) { \ - size_t cur_rank_embedding_grad_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - tmp_size = 2 * cur_rank_embedding_grad_size; \ - } else { \ - size_t unique_partition_embedding_grad_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t received_embedding_grad_size = \ - GetCudaAlignedSize(cur_rank_embedding_grad_elem_cnt * sizeof(int8_t)); \ - size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; \ - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_num 
* sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t dequantize_cur_rank_embedding_grad_size = unique_partition_embedding_grad_size; \ - tmp_size = unique_partition_embedding_grad_size + received_embedding_grad_size \ - + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size \ - + received_cur_rank_quantize_factor_size \ - + dequantize_cur_rank_embedding_grad_size; \ - } \ - return tmp_size; \ + size_t cur_rank_embedding_grad_size = \ + GetCudaAlignedSize(cur_rank_unique_embedding_grad.shape().elem_cnt() \ + * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + return 2 * cur_rank_embedding_grad_size; \ }); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL, @@ -1337,9 +680,9 @@ class UniqueKeyValuePairKernel final : public user_op::OpKernel { user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); user_op::Tensor* inverse_indices = ctx->Tensor4ArgNameAndIndex("inverse_indices", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t num_tables = ctx->Attr("num_tables"); + const int32_t num_columns = ctx->Attr("num_columns"); const bool has_values = ctx->has_input("values", 0); - const bool need_values_buffer = (!has_values && num_tables > 1); + const bool need_values_buffer = (!has_values && num_columns > 1); size_t values_buffer_bytes = need_values_buffer ? GetCudaAlignedSize(keys->shape().elem_cnt() * sizeof(V)) : 0; const int64_t num_keys = keys->shape().elem_cnt(); @@ -1353,13 +696,13 @@ class UniqueKeyValuePairKernel final : public user_op::OpKernel { values_ptr = reinterpret_cast(values->dptr()); } else if (need_values_buffer) { V* values_buffer_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - GenerateTableIds<<>>( - num_keys, num_tables, values_buffer_ptr); + GenerateColumnIds<<>>(num_keys, num_columns, values_buffer_ptr); values_ptr = values_buffer_ptr; } else { values_ptr = nullptr; } - const bool need_process_table_ids = (has_values || num_tables > 1); + const bool need_process_column_ids = (has_values || num_columns > 1); TableEntry* workspace_ptr = reinterpret_cast*>(tmp_buffer->mut_dptr() + values_buffer_bytes); UniqueAndPartition( @@ -1368,7 +711,7 @@ class UniqueKeyValuePairKernel final : public user_op::OpKernel { reinterpret_cast(unique_keys->mut_dptr()), reinterpret_cast(unique_values->mut_dptr()), reinterpret_cast(inverse_indices->mut_dptr()), workspace_ptr, workspace_bytes, - need_process_table_ids); + need_process_column_ids); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -1389,9 +732,9 @@ class UniqueKeyValuePairKernel final : public user_op::OpKernel { const int64_t hash_capacity = num_keys; \ const size_t workspace_bytes = GetCudaAlignedSize( \ hash_capacity * sizeof(TableEntry)); \ - const int32_t num_tables = ctx->Attr("num_tables"); \ + const int32_t num_columns = ctx->Attr("num_columns"); \ const bool has_values = ctx->has_input("values", 0); \ - const bool need_values_buffer = (!has_values && num_tables > 1); \ + const bool need_values_buffer = (!has_values && num_columns > 1); \ size_t values_buffer_bytes = \ need_values_buffer \ ? 
GetCudaAlignedSize(num_keys * sizeof(OF_PP_PAIR_FIRST(value_dtype_pair))) \ diff --git a/oneflow/user/kernels/deconv_cudnn_kernel.cpp b/oneflow/user/kernels/deconv_cudnn_kernel.cpp index 1706170b4dd..793b921bc99 100644 --- a/oneflow/user/kernels/deconv_cudnn_kernel.cpp +++ b/oneflow/user/kernels/deconv_cudnn_kernel.cpp @@ -144,7 +144,6 @@ class DeConvGpuKernel final : public user_op::OpKernel { && (user_op::HobDataType("in", 0) == GetDataType::value)) \ .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ const auto& in = ctx->InputTensorDesc("in", 0); \ - if (in.shape().elem_cnt() == 0) return 0; \ const auto& weight = ctx->InputTensorDesc("weight", 0); \ const auto* out = ctx->OutputTensorDesc("out", 0); \ const auto& cudnn_conf = Global::Get()->resource().cudnn_conf(); \ diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cpp b/oneflow/user/kernels/dim_gather_kernel_util.cpp index c210bc5c7fa..8cf01d029b5 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.cpp @@ -24,10 +24,9 @@ template struct DimGatherFunctor final { void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input, - IN_T* output) { - DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, - index, input, output); + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { + DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, + output); } }; diff --git a/oneflow/user/kernels/dim_gather_kernel_util.cu b/oneflow/user/kernels/dim_gather_kernel_util.cu index c8963c54e04..1dca3380949 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.cu +++ b/oneflow/user/kernels/dim_gather_kernel_util.cu @@ -24,21 +24,19 @@ namespace user_op { template __global__ void DoCUDADimGather(const DimOpIndexNdHelper input_nd_helper, const DimOpIndexNdHelper index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim_length, int32_t dim, - const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, - input, output); + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { + DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, + output); } template struct DimGatherFunctor final { void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input, - IN_T* output) { + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output) { RUN_CUDA_KERNEL((DoCUDADimGather), stream, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, input, - output); + input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, input, output); } }; @@ -47,10 +45,9 @@ template struct DimGatherFunctor final { void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const float16* input, - float16* output) { + int32_t dim, const IDX_T* index, const float16* input, float16* output) { RUN_CUDA_KERNEL((DoCUDADimGather), stream, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, 
index_nd_helper, ndim, elem_cnt, dim_length, dim, index, + input_nd_helper, index_nd_helper, ndim, elem_cnt, dim, index, reinterpret_cast(input), reinterpret_cast(output)); } }; diff --git a/oneflow/user/kernels/dim_gather_kernel_util.h b/oneflow/user/kernels/dim_gather_kernel_util.h index 310784a4dc7..f1da1706b3c 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.h +++ b/oneflow/user/kernels/dim_gather_kernel_util.h @@ -43,24 +43,17 @@ template struct DimGatherFunctor final { void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input, - IN_T* output); + int32_t dim, const IDX_T* index, const IN_T* input, IN_T* output); }; template OF_DEVICE_FUNC void DoDimGather(const DimOpIndexNdHelper& input_nd_helper, const DimOpIndexNdHelper& index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim_length, int32_t dim, - const IDX_T* index, const IN_T* input, IN_T* output) { + int64_t elem_cnt, int32_t dim, const IDX_T* index, + const IN_T* input, IN_T* output) { XPU_1D_KERNEL_LOOP(index_offset, elem_cnt) { IDX_T coordinate[kDimGatherMaxDimCount] = {0}; const IDX_T x = index[index_offset]; -#ifdef __CUDA_ARCH__ - assert(x < dim_length && "gather index is out of bounds"); -#else - CHECK_LE(x, dim_length) << "RuntimeError: index " << x << " is out of bounds for dimension " - << dim << " with size " << dim_length; -#endif index_nd_helper.OffsetToNdIndex(index_offset, coordinate, ndim); coordinate[dim] = x; diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index 81d50aa8b2d..aac416a24e6 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -59,9 +59,10 @@ class DimGatherKernel final : public user_op::OpKernel { DimOpIndexNdHelper input_nd_helper(shape_vec.data(), ndim); shape2dims(index_tensor->shape()); DimOpIndexNdHelper index_nd_helper(shape_vec.data(), ndim); - DimGatherFunctor()( - ctx->stream(), input_nd_helper, index_nd_helper, ndim, index_tensor->shape().elem_cnt(), - input_tensor->shape().At(dim), dim, index, input, output); + + DimGatherFunctor()(ctx->stream(), input_nd_helper, index_nd_helper, + ndim, index_tensor->shape().elem_cnt(), dim, index, + input, output); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; diff --git a/oneflow/user/kernels/distributions/normal_kernel.h b/oneflow/user/kernels/distributions/normal_kernel.h index d5358b2c8c6..a04e88cd80f 100644 --- a/oneflow/user/kernels/distributions/normal_kernel.h +++ b/oneflow/user/kernels/distributions/normal_kernel.h @@ -20,7 +20,6 @@ limitations under the License. 
#include "oneflow/core/framework/framework.h" #include "oneflow/user/kernels/distributions/common.h" #include "oneflow/user/kernels/distributions/normal_distribution.h" -#include "oneflow/user/kernels/random_seed_util.h" namespace oneflow { @@ -35,9 +34,7 @@ class NormalKernel final : public user_op::OpKernel { std::shared_ptr CreateOpKernelState( user_op::KernelInitContext* ctx) const override { const auto& generator = CHECK_JUST(one::MakeGenerator(device_type)); - // When SBP is Split, each rank uses a different seeds, otherwise, ranks use the same seed - generator->set_current_seed( - CHECK_JUST(GetOpKernelRandomSeedInCurrentRank(ctx, ctx->Attr("seed")))); + generator->set_current_seed(ctx->Attr("seed")); return std::make_shared(generator); } diff --git a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu index 76cfbc22db2..e371bb89d5d 100644 --- a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu +++ b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.cu @@ -17,7 +17,6 @@ limitations under the License. #include "oneflow/core/ep/cuda/cuda_stream.h" #include "oneflow/core/ep/include/primitive/copy_nd.h" #include "oneflow/core/ep/include/primitive/batch_matmul.h" -#include "oneflow/core/kernel/cuda_graph_support.h" namespace oneflow { @@ -190,8 +189,7 @@ void ConcatFeaturesGrad(user_op::KernelComputeContext* ctx, const int64_t batch_ } // namespace template -class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { +class FusedDotFeatureInteractionKernel final : public user_op::OpKernel { public: FusedDotFeatureInteractionKernel() = default; ~FusedDotFeatureInteractionKernel() override = default; @@ -285,8 +283,7 @@ REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(float) REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(half) template -class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { +class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel { public: FusedDotFeatureInteractionGradKernel() = default; ~FusedDotFeatureInteractionGradKernel() override = default; diff --git a/oneflow/user/kernels/gather_kernel.cpp b/oneflow/user/kernels/gather_kernel.cpp index 42a0a6dc976..e9f02511ae1 100644 --- a/oneflow/user/kernels/gather_kernel.cpp +++ b/oneflow/user/kernels/gather_kernel.cpp @@ -118,12 +118,6 @@ class GatherKernel final : public user_op::OpKernel, public user_op::CudaGraphSu OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GATHER_KERNEL, DEVICE_TYPE_SEQ, GATHER_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) -#ifdef WITH_CUDA -// For Half -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GATHER_KERNEL, OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - HALF_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) -#endif - } // namespace user_op } // namespace oneflow diff --git a/oneflow/user/kernels/gather_kernel_util.cu b/oneflow/user/kernels/gather_kernel_util.cu index 492eca7b825..d2d83a4e7bd 100644 --- a/oneflow/user/kernels/gather_kernel_util.cu +++ b/oneflow/user/kernels/gather_kernel_util.cu @@ -115,8 +115,8 @@ struct GatherKernelUtilImpl final { #define INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ template struct GatherKernelUtilImpl; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL, - GATHER_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, GATHER_INDEX_TYPE_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL, GATHER_DATA_TYPE_SEQ, + GATHER_INDEX_TYPE_SEQ); 
#undef INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL } // namespace oneflow diff --git a/oneflow/user/kernels/gather_kernel_util.h b/oneflow/user/kernels/gather_kernel_util.h index 2eb1a774c84..e11e3397abe 100644 --- a/oneflow/user/kernels/gather_kernel_util.h +++ b/oneflow/user/kernels/gather_kernel_util.h @@ -34,7 +34,7 @@ struct GatherKernelUtilImpl final { const Shape& flat_in_shape, T* out, int64_t offset); }; -#define GATHER_DATA_TYPE_SEQ ARITHMETIC_DATA_TYPE_SEQ +#define GATHER_DATA_TYPE_SEQ ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ #define GATHER_INDEX_TYPE_SEQ INDEX_DATA_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) } // namespace oneflow diff --git a/oneflow/user/kernels/math_unary_elementwise_func.h b/oneflow/user/kernels/math_unary_elementwise_func.h index 27f991e3127..3791b438c8e 100644 --- a/oneflow/user/kernels/math_unary_elementwise_func.h +++ b/oneflow/user/kernels/math_unary_elementwise_func.h @@ -153,13 +153,6 @@ struct AtanhFunctor { } }; -template<> -struct NotEqualZeroFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return x != 0; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - template<> struct CeilFunctor { static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(ceil, x); } @@ -429,13 +422,6 @@ struct AtanhFunctor { } }; -template<> -struct NotEqualZeroFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return x != 0; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0f; } -}; - template<> struct CeilFunctor { static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(ceil, x); } @@ -731,13 +717,6 @@ struct CeilFunctor { static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } }; -template<> -struct NotEqualZeroFunctor { - static OF_HALF_FUNC half Forward(const half x) { return x != static_cast(0.0); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - template<> struct CosFunctor { static OF_HALF_FUNC half Forward(const half x) { return hcos(x); } diff --git a/oneflow/user/kernels/model_update_kernel_util.h b/oneflow/user/kernels/model_update_kernel_util.h index 8c98f16e482..03ae9b819c0 100644 --- a/oneflow/user/kernels/model_update_kernel_util.h +++ b/oneflow/user/kernels/model_update_kernel_util.h @@ -113,8 +113,7 @@ struct AdagradUpdateFunctor { CastScaleRegularizeGradientFunctor()(*model_diff, model_val, scale, l1, l2); const T next_sum = *sum + model_diff_t * model_diff_t; *sum = next_sum; - *model = model_val - learning_rate / (sqrt(next_sum) + epsilon) * model_diff_t - - learning_rate * weight_decay * model_val; + *model = model_val - learning_rate / (sqrt(next_sum) + epsilon) * model_diff_t; } }; diff --git a/oneflow/user/kernels/multi_reduce_kernel_util.h b/oneflow/user/kernels/multi_reduce_kernel_util.h deleted file mode 100644 index dce31f4d443..00000000000 --- a/oneflow/user/kernels/multi_reduce_kernel_util.h +++ /dev/null @@ -1,102 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_USER_KERNELS_MULTI_REDUCE_KERNEL_UTIL_H_ -#define ONEFLOW_USER_KERNELS_MULTI_REDUCE_KERNEL_UTIL_H_ - -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/device_type.h" -#include "oneflow/core/common/device_type.pb.h" -#include "oneflow/core/ep/include/stream.h" - -namespace oneflow { - -template -struct MultiReduceParam { - const T* data; - size_t size; -}; - -template -struct MultiReduce { - void operator()(ep::Stream* stream, TransformFn transform, - const std::vector>& params, T init, T* ret, T* temp); -}; - -template -struct MultiReduce { - void operator()(ep::Stream* stream, TransformFn transform, - const std::vector>& params, T init, T* ret, T* temp) { - *ret = init; - ReduceFn reduce{}; - FOR_RANGE(size_t, i, 0, params.size()) { - const auto& p = params[i]; - FOR_RANGE(size_t, j, 0, p.size) { *ret = reduce(*ret, transform(p.data[j])); } - } - } -}; - -template -struct BinaryAdd { - OF_DEVICE_FUNC T operator()(const T& x, const T& y) const { return x + y; } -}; - -template -struct BinaryMax { - OF_DEVICE_FUNC T operator()(const T& x, const T& y) const { return x > y ? x : y; } -}; - -template -struct BinaryMin { - OF_DEVICE_FUNC T operator()(const T& x, const T& y) const { return x < y ? x : y; } -}; - -template -struct Abs { - OF_DEVICE_FUNC T operator()(const T& x) const { return x < GetZeroVal() ? -x : x; } -}; - -template -struct PowByZero { - OF_DEVICE_FUNC T operator()(const T& x) const { - return x != GetZeroVal() ? GetOneVal() : x; - } -}; - -template -struct Square { - OF_DEVICE_FUNC T operator()(const T& x) const { return x * x; } -}; - -template -struct AbsPow { - explicit AbsPow(const T& base) : base_(base) {} - - OF_DEVICE_FUNC T operator()(const T& x) { - T abs_x = x < GetZeroVal() ? -x : x; -#if defined(__CUDA_ARCH__) - return pow(abs_x, base_); -#else - return std::pow(abs_x, base_); -#endif - } - - private: - T base_; -}; - -} // namespace oneflow - -#endif // ONEFLOW_USER_KERNELS_MULTI_REDUCE_KERNEL_UTIL_H_ diff --git a/oneflow/user/kernels/multi_reduce_kernels.cpp b/oneflow/user/kernels/multi_reduce_kernels.cpp deleted file mode 100644 index 5b9a3d5c6d2..00000000000 --- a/oneflow/user/kernels/multi_reduce_kernels.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/multi_reduce_kernels.h" - -namespace oneflow { - -#define REGISTER_MULTI_REDUCE_SUM_POW_ABS_CPU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("multi_reduce_sum_pow_abs") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNEL(op_type_name, ximum_enum, dtype) \ - REGISTER_USER_KERNEL(op_type_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNELS(dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNEL("multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNEL("multi_reduce_min_abs", Ximum::kMin, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNEL("local_multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNEL("local_multi_reduce_min_abs", Ximum::kMin, dtype) - -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CPU_KERNEL(float) -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CPU_KERNEL(double) - -REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNELS(float) -REGISTER_MULTI_REDUCE_XIMUM_ABS_CPU_KERNELS(double) - -} // namespace oneflow diff --git a/oneflow/user/kernels/multi_reduce_kernels.cu b/oneflow/user/kernels/multi_reduce_kernels.cu deleted file mode 100644 index 7de71f83751..00000000000 --- a/oneflow/user/kernels/multi_reduce_kernels.cu +++ /dev/null @@ -1,141 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/multi_reduce_kernels.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/cuda/atomic.cuh" -#include "oneflow/core/device/cuda_util.h" -#include -#include - -namespace oneflow { - -namespace { - -constexpr int64_t kMultiReduceMaxPackSize = 64; - -template -struct MultiReduceParamsPack { - MultiReduceParam params[kMultiReduceMaxPackSize]; - size_t size; -}; - -template -__global__ void MultiBlockReduceGpu(TransformFn transform, - const MultiReduceParamsPack pack_params, const T init, - T* out) { - ReduceFn reduce_fn{}; - T t_out = init; - for (int i = 0; i < pack_params.size; ++i) { - const auto& param = pack_params.params[i]; - CUDA_1D_KERNEL_LOOP(j, param.size) { t_out = reduce_fn(t_out, transform(param.data[j])); } - } - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_out = BlockReduce(temp_storage).Reduce(t_out, reduce_fn); - if (threadIdx.x == 0) { out[blockIdx.x] = b_out; } -} - -size_t InferTempStorageSize(user_op::InferContext* ctx) { - auto input_size = ctx->input_size("x"); - if (input_size == 0) { return 0; } - int64_t max_elem_cnt = 0; - int64_t pack_size = 0; - int32_t num_blocks = 0; - for (size_t i = 0; i < input_size; ++i) { - int64_t elem_cnt = ctx->InputShape("x", i).elem_cnt(); - max_elem_cnt = std::max(max_elem_cnt, elem_cnt); - pack_size++; - if (pack_size == kMultiReduceMaxPackSize || i == input_size - 1) { - CHECK_LT(max_elem_cnt, std::numeric_limits::max()); - num_blocks += BlocksNum4ThreadsNum(static_cast(max_elem_cnt)); - max_elem_cnt = 0; - pack_size = 0; - } - } - CHECK_LT(num_blocks, kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock) - << "Too much blocks needed for computing " << ctx->op_name() << ", should be less than " - << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock - << ", but got " << num_blocks; - size_t elem_size = GetSizeOfDataType(ctx->InputDType("x", 0)); - return GetCudaAlignedSize(num_blocks * elem_size * 2); -} - -} // namespace - -template -struct MultiReduce { - void operator()(ep::Stream* stream, TransformFn transform, - const std::vector>& params, T init, T* ret, T* temp) { - CHECK_NOTNULL(temp); - int32_t total_num_blocks = 0; - for (size_t i = 0; i < params.size(); i += kMultiReduceMaxPackSize) { - MultiReduceParamsPack pack_params{}; - size_t max_elem_cnt = 0; - pack_params.size = std::min(kMultiReduceMaxPackSize, params.size() - i); - for (size_t j = 0; j < pack_params.size; ++j) { - pack_params.params[j] = params[i + j]; - max_elem_cnt = std::max(max_elem_cnt, pack_params.params[j].size); - } - int32_t num_blocks = BlocksNum4ThreadsNum(max_elem_cnt); - MultiBlockReduceGpu - <<As()->cuda_stream()>>>( - transform, pack_params, init, temp + total_num_blocks); - total_num_blocks += num_blocks; - } - size_t wksp_size = 0; - auto DeviceReduce = [&](void* temp_storage) -> void { - OF_CUDA_CHECK(cub::DeviceReduce::Reduce(temp_storage, wksp_size, temp, ret, total_num_blocks, - ReduceFn{}, init, - stream->As()->cuda_stream())); - }; - DeviceReduce(nullptr); - // NOTE(zwx): We have allocated the temp storage with the space - // that can hold all the elements to reduce, - // normally the `temp_storage_bytes` for cub::DeviceReduce shouldn't exceed it. 
- CHECK_LE(wksp_size, total_num_blocks * sizeof(T)) - << wksp_size << " size in bytes of temp storage is needed for doing cub::DeviceReduce, " - << "but only allocated " << total_num_blocks * sizeof(T); - DeviceReduce(temp + total_num_blocks); - } -}; - -#define REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("multi_reduce_sum_pow_abs") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTempStorageSize); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL(op_type_name, ximum_enum, dtype) \ - REGISTER_USER_KERNEL(op_type_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTempStorageSize); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_min_abs", Ximum::kMin, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_min_abs", Ximum::kMin, dtype) - -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(float) -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(double) - -REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(float) -REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(double) - -} // namespace oneflow diff --git a/oneflow/user/kernels/multi_reduce_kernels.h b/oneflow/user/kernels/multi_reduce_kernels.h deleted file mode 100644 index 276532380f0..00000000000 --- a/oneflow/user/kernels/multi_reduce_kernels.h +++ /dev/null @@ -1,115 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_USER_KERNELS_MULTI_REDUCE_KERNELS_H_ -#define ONEFLOW_USER_KERNELS_MULTI_REDUCE_KERNELS_H_ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/user/kernels/multi_reduce_kernel_util.h" - -namespace oneflow { - -template -class MultiReduceSumPowAbsKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MultiReduceSumPowAbsKernel() = default; - ~MultiReduceSumPowAbsKernel() override = default; - - private: - using user_op::OpKernel::Compute; - - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache*) const override { - std::vector> params; - params.resize(ctx->input_size("x")); - for (size_t i = 0; i < params.size(); ++i) { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", i); - params[i].size = x->shape().elem_cnt(); - params[i].data = x->dptr(); - } - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - T* y_dptr = y->mut_dptr(); - user_op::Tensor* temp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - T* tmp_dptr = temp ? 
temp->mut_dptr() : nullptr; - float p = ctx->Attr("p"); - if (p == 0) { - PowByZero func{}; - MultiReduce> reduce_sum{}; - reduce_sum(ctx->stream(), func, params, GetZeroVal(), y_dptr, tmp_dptr); - } else if (p == 1) { - Abs func{}; - MultiReduce> reduce_sum{}; - reduce_sum(ctx->stream(), func, params, GetZeroVal(), y_dptr, tmp_dptr); - } else if (p == 2) { - Square func{}; - MultiReduce> reduce_sum{}; - reduce_sum(ctx->stream(), func, params, GetZeroVal(), y_dptr, tmp_dptr); - } else { - AbsPow func{p}; - MultiReduce> reduce_sum{}; - reduce_sum(ctx->stream(), func, params, GetZeroVal(), y_dptr, tmp_dptr); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -enum class Ximum { - kMax = 0, - kMin = 1, -}; - -template -class MultiReduceXimumAbsKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - MultiReduceXimumAbsKernel() = default; - ~MultiReduceXimumAbsKernel() override = default; - - private: - using user_op::OpKernel::Compute; - - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache*) const override { - std::vector> params; - params.resize(ctx->input_size("x")); - for (size_t i = 0; i < params.size(); ++i) { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", i); - params[i].size = x->shape().elem_cnt(); - params[i].data = x->dptr(); - } - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* temp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - T* tmp_dptr = temp ? temp->mut_dptr() : nullptr; - Abs abs{}; - if (X == Ximum::kMax) { - MultiReduce> reduce_max{}; - reduce_max(ctx->stream(), abs, params, GetZeroVal(), y->mut_dptr(), tmp_dptr); - } else if (X == Ximum::kMin) { - MultiReduce> reduce_min{}; - reduce_min(ctx->stream(), abs, params, std::numeric_limits::max(), y->mut_dptr(), - tmp_dptr); - } else { - UNIMPLEMENTED(); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace oneflow - -#endif // ONEFLOW_USER_KERNELS_MULTI_REDUCE_KERNELS_H_ diff --git a/oneflow/user/kernels/one_embedding_kernels.cu b/oneflow/user/kernels/one_embedding_kernels.cu index 0f3b6098ecf..15d9ca35948 100644 --- a/oneflow/user/kernels/one_embedding_kernels.cu +++ b/oneflow/user/kernels/one_embedding_kernels.cu @@ -29,7 +29,10 @@ namespace oneflow { namespace { -enum class InitializerType { kUniform, kNormal, kConstant }; +enum class InitializerType { + kUniform, + kNormal, +}; struct EmbeddingInitializer { InitializerType type; @@ -42,144 +45,57 @@ struct EmbeddingInitializer { float mean; float std; } normal_param; - struct { - float value; - } constant_param; }; +}; - bool operator==(const EmbeddingInitializer& rhs) const { - if (this->type != rhs.type) { return false; } - if (rhs.type == InitializerType::kUniform) { - return (this->uniform_param.low == rhs.uniform_param.low) - && (this->uniform_param.high == rhs.uniform_param.high); - } else if (rhs.type == InitializerType::kNormal) { - return (this->normal_param.mean == rhs.normal_param.mean) - && (this->normal_param.std == rhs.normal_param.std); - } else if (rhs.type == InitializerType::kConstant) { - return this->constant_param.value == rhs.constant_param.value; - } else { - UNIMPLEMENTED(); - return false; - } - } +struct EmbeddingColumn { + EmbeddingInitializer initializer; }; -void ParseInitializerFromJson(const nlohmann::json& initializer, - EmbeddingInitializer* embedding_initializer) { +constexpr size_t kMaxColumns = 128; + +struct 
ColumnsParam { + int32_t num_columns = 0; + EmbeddingColumn columns[kMaxColumns]; +}; + +void ParseColumnFromJson(const nlohmann::json& initializer, EmbeddingColumn* embedding_column) { CHECK(initializer.contains("type")); CHECK(initializer["type"].is_string()); std::string type = initializer["type"].get(); if (type == "uniform") { - embedding_initializer->type = InitializerType::kUniform; + embedding_column->initializer.type = InitializerType::kUniform; CHECK(initializer.contains("low")); CHECK(initializer.contains("high")); CHECK(initializer["low"].is_number()); CHECK(initializer["high"].is_number()); - embedding_initializer->uniform_param.low = initializer["low"]; - embedding_initializer->uniform_param.high = initializer["high"]; + embedding_column->initializer.uniform_param.low = initializer["low"]; + embedding_column->initializer.uniform_param.high = initializer["high"]; } else if (type == "normal") { CHECK(initializer.contains("mean")); CHECK(initializer.contains("std")); CHECK(initializer["mean"].is_number()); CHECK(initializer["std"].is_number()); - embedding_initializer->type = InitializerType::kNormal; - embedding_initializer->normal_param.mean = initializer["mean"]; - embedding_initializer->normal_param.std = initializer["std"]; - } else if (type == "constant") { - CHECK(initializer.contains("value")); - CHECK(initializer["value"].is_number()); - embedding_initializer->type = InitializerType::kConstant; - embedding_initializer->constant_param.value = initializer["value"]; + embedding_column->initializer.type = InitializerType::kNormal; + embedding_column->initializer.normal_param.mean = initializer["mean"]; + embedding_column->initializer.normal_param.std = initializer["std"]; } else { - UNIMPLEMENTED() << "Unsupported initializer type"; - } -} - -int32_t ParseJsonToUniqueInitializerVecAndReturnOffset( - const nlohmann::json& initializer, std::vector* initializers) { - EmbeddingInitializer embedding_initializer; - ParseInitializerFromJson(initializer, &embedding_initializer); - for (int32_t i = 0; i < initializers->size(); ++i) { - if (initializers->at(i) == embedding_initializer) { return i; } + UNIMPLEMENTED(); } - initializers->push_back(embedding_initializer); - return initializers->size() - 1; } -void SetInitializerIndex(int32_t row_id, int32_t col_start, int32_t col_end, int64_t line_size, - int8_t index, std::vector* initializer_index) { - int64_t row_offset = row_id * line_size; - for (int32_t col = col_start; col < col_end; ++col) { - initializer_index->at(row_offset + col) = index; - } -} - -void ParseAndSetStateInitializerIndex(const std::string& state_initializer, - const int32_t num_tables, const int64_t line_size, - const int64_t embedding_size, - std::vector* initializer_params, - std::vector* initializer_index) { - if (line_size == embedding_size) { return; } - CHECK(state_initializer != ""); - auto initializers = nlohmann::json::parse(state_initializer); - CHECK(initializers.is_array()); - const int num_states = line_size / embedding_size - 1; - CHECK_EQ(num_states, initializers.size()); - for (int32_t i = 0; i < num_states; ++i) { - int32_t offset = - ParseJsonToUniqueInitializerVecAndReturnOffset(initializers.at(i), initializer_params); - int32_t col_start = embedding_size + i * embedding_size; - int32_t col_end = col_start + embedding_size; - CHECK_LE(col_end, line_size); - for (int32_t j = 0; j < num_tables; ++j) { - SetInitializerIndex(j, col_start, col_end, line_size, offset, initializer_index); - } - } -} - -void ParseAndSetModelInitializerIndex(const 
nlohmann::json& tables, - const std::vector& column_dims, - const int32_t num_tables, const int32_t num_columns, - const int64_t line_size, const int64_t embedding_size, - std::vector* initializer_params, - std::vector* initializer_index) { - for (int32_t i = 0; i < num_tables; ++i) { - auto table = tables.at(i); - CHECK(table.contains("columns")); - auto columns = table["columns"]; - CHECK(columns.is_array()); - CHECK_EQ(num_columns, columns.size()) << "columns size must equal to num embedding dims"; - int32_t col_start = 0; - for (int k = 0; k < columns.size(); ++k) { - auto column = columns.at(k); - CHECK(column.contains("initializer")); - int32_t offset = - ParseJsonToUniqueInitializerVecAndReturnOffset(column["initializer"], initializer_params); - int32_t col_end = col_start + column_dims.at(k); - SetInitializerIndex(i, col_start, col_end, line_size, offset, initializer_index); - col_start = col_end; - } - CHECK_EQ(col_start, embedding_size); - } -} - -void ParseInitializers(const int64_t line_size, const int64_t embedding_size, - const std::string& state_initializer, const std::string& json_serialized, - std::vector* initializer_params, - std::vector* initializer_index) { +void ParseEmbeddingColumns(const std::string& json_serialized, ColumnsParam* param) { auto json_object = nlohmann::json::parse(json_serialized); - CHECK(json_object.contains("column_dims")); - std::vector column_dims = json_object["column_dims"]; - const int32_t num_columns = column_dims.size(); - CHECK(json_object.contains("tables")); - auto tables = json_object["tables"]; - CHECK(tables.is_array()); - const int32_t num_tables = tables.size(); - initializer_index->resize(num_tables * line_size); - ParseAndSetStateInitializerIndex(state_initializer, num_tables, line_size, embedding_size, - initializer_params, initializer_index); - ParseAndSetModelInitializerIndex(tables, column_dims, num_tables, num_columns, line_size, - embedding_size, initializer_params, initializer_index); + CHECK(json_object.contains("columns")); + auto columns = json_object["columns"]; + CHECK(columns.is_array()); + CHECK_LE(columns.size(), kMaxColumns); + for (int32_t i = 0; i < columns.size(); ++i) { + auto column = columns.at(i); + CHECK(column.contains("initializer")); + ParseColumnFromJson(column["initializer"], &(param->columns[i])); + } + param->num_columns = columns.size(); } template @@ -189,45 +105,16 @@ class EmbeddingKernelState final : public user_op::OpKernelState { : device_index_(-1), generator_(CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA))) { OF_CUDA_CHECK(cudaGetDevice(&device_index_)); OF_CUDA_CHECK(cudaMallocHost(&host_num_keys_, sizeof(IDX))); + ParseEmbeddingColumns(ctx->Attr("embedding_columns"), &columns_param_); key_value_store_ = Global::Get()->GetKeyValueStore( ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); uint32_t max_query_length = ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); key_value_store_->ReserveQueryLength(max_query_length); - - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - const std::string& state_initializer = ctx->Attr("state_initializer"); - - std::vector initializer_param; - std::vector initializer_index; - ParseInitializers(line_size, embedding_size, state_initializer, - ctx->Attr("embedding_tables"), &initializer_param, - &initializer_index); - - const size_t param_size_bytes = initializer_param.size() * sizeof(EmbeddingInitializer); - 
OF_CUDA_CHECK(cudaMallocHost(&host_initializer_param_, param_size_bytes)); - std::memcpy(host_initializer_param_, initializer_param.data(), param_size_bytes); - OF_CUDA_CHECK(cudaMalloc(&device_initializer_param_, param_size_bytes)); - OF_CUDA_CHECK(cudaMemcpyAsync(device_initializer_param_, host_initializer_param_, - param_size_bytes, cudaMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - - const size_t index_size_bytes = initializer_index.size() * sizeof(int8_t); - OF_CUDA_CHECK(cudaMallocHost(&host_initializer_index_, index_size_bytes)); - std::memcpy(host_initializer_index_, initializer_index.data(), index_size_bytes); - OF_CUDA_CHECK(cudaMalloc(&device_initializer_index_, index_size_bytes)); - OF_CUDA_CHECK(cudaMemcpyAsync(device_initializer_index_, host_initializer_index_, - index_size_bytes, cudaMemcpyDefault, - ctx->stream()->As()->cuda_stream())); } ~EmbeddingKernelState() override { CudaCurrentDeviceGuard guard(device_index_); OF_CUDA_CHECK(cudaFreeHost(host_num_keys_)); - OF_CUDA_CHECK(cudaFreeHost(host_initializer_param_)); - OF_CUDA_CHECK(cudaFree(device_initializer_param_)); - OF_CUDA_CHECK(cudaFreeHost(host_initializer_index_)); - OF_CUDA_CHECK(cudaFree(device_initializer_index_)); } void* HostNumKeys() { return host_num_keys_; } @@ -236,19 +123,14 @@ class EmbeddingKernelState final : public user_op::OpKernelState { one::Generator* generator() { return generator_.get(); } - const int8_t* InitializerIndex() { return device_initializer_index_; } - const EmbeddingInitializer* Initializers() { return device_initializer_param_; } + const ColumnsParam& Columns() { return columns_param_; } private: int device_index_; void* host_num_keys_; std::shared_ptr generator_; embedding::KeyValueStore* key_value_store_; - - EmbeddingInitializer* host_initializer_param_; - EmbeddingInitializer* device_initializer_param_; - int8_t* host_initializer_index_; - int8_t* device_initializer_index_; + ColumnsParam columns_param_; }; template @@ -316,11 +198,9 @@ class EmbeddingTmpBufferManager final { template __global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, const int32_t line_size, - const int32_t embedding_size, - const EmbeddingInitializer* initializer_param, - const int8_t* initializer_index, const U* table_ids, - const uint32_t* num_missing_keys, const uint32_t* missing_indices, - T* values) { + const int32_t embedding_size, ColumnsParam param, + const U* column_ids, const uint32_t* num_missing_keys, + const uint32_t* missing_indices, T* values) { int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; curandStatePhilox4_32_10_t state; curand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); @@ -330,22 +210,22 @@ __global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen int col = i - row * line_size; const uint32_t index = missing_indices[row]; const int64_t offset = index * line_size + col; - const int32_t table_idx = table_ids[index]; - const int32_t initializer_idx = initializer_index[table_idx * line_size + col]; - EmbeddingInitializer initializer = initializer_param[initializer_idx]; - T value; - if (initializer.type == InitializerType::kUniform) { - const float low = initializer.uniform_param.low; - const float high = initializer.uniform_param.high; - value = curand_uniform(&state) * (high - low) + low; - } else if (initializer.type == InitializerType::kNormal) { - const float mean = initializer.normal_param.mean; - const float std = initializer.normal_param.std; - value = 
curand_normal(&state) * std + mean; - } else if (initializer.type == InitializerType::kConstant) { - value = initializer.constant_param.value; - } else { - __trap(); + const int32_t column_idx = column_ids[index]; + assert(column_idx < param.num_columns); + EmbeddingInitializer initializer = param.columns[column_idx].initializer; + T value = 0; + if (col < embedding_size) { + if (initializer.type == InitializerType::kUniform) { + const float low = initializer.uniform_param.low; + const float high = initializer.uniform_param.high; + value = curand_uniform(&state) * (high - low) + low; + } else if (initializer.type == InitializerType::kNormal) { + const float mean = initializer.normal_param.mean; + const float std = initializer.normal_param.std; + value = curand_normal(&state) * std + mean; + } else { + __trap(); + } } values[offset] = value; } @@ -361,9 +241,9 @@ __global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen template void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embedding_state, - const int64_t num_ids, const int64_t embedding_size, - const int64_t line_size, const void* num_unique_ptr, - const void* unique_ids, const void* table_ids, T* values_ptr, + const int64_t num_ids, const int32_t embedding_size, + const int32_t line_size, const void* num_unique_ptr, + const void* unique_ids, const void* column_ids, T* values_ptr, void* tmp_buffer_ptr, uint32_t* return_num_unique, const bool put_to_kv_store) { const auto& generator = embedding_state->generator(); @@ -373,8 +253,7 @@ void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embeddi uint64_t seed = cuda_generator->current_seed(); one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); embedding::KeyValueStore* store = embedding_state->KeyValueStore(); - const EmbeddingInitializer* initializer_param = embedding_state->Initializers(); - const int8_t* initializer_index = embedding_state->InitializerIndex(); + const ColumnsParam& param = embedding_state->Columns(); bool need_value_buffer = (values_ptr == nullptr); EmbeddingTmpBufferManager buffer_manager(tmp_buffer_ptr, num_ids, line_size * sizeof(T), need_value_buffer); @@ -402,9 +281,8 @@ void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embeddi const uint64_t inc_offset = std::ceil(elem_cnt / num_blocks / kCudaThreadsNumPerBlock); InitValueKernel <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, line_size, embedding_size, initializer_param, - initializer_index, reinterpret_cast(table_ids), num_missing_ptr, - missing_indices, store_values); + seed, cuda_gen_state, inc_offset, line_size, embedding_size, param, + reinterpret_cast(column_ids), num_missing_ptr, missing_indices, store_values); } if (put_to_kv_store) { store->Put(stream, num_unique, unique_ids, store_values); } *return_num_unique = num_unique; @@ -482,7 +360,7 @@ class EmbeddingPrefetchKernel final : public user_op::OpKernel { const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + const user_op::Tensor* column_ids = ctx->Tensor4ArgNameAndIndex("column_ids", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const int64_t embedding_size = ctx->Attr("embedding_size"); const int64_t line_size = ctx->Attr("line_size"); @@ -490,7 +368,7 @@ class EmbeddingPrefetchKernel final : public 
user_op::OpKernel { T* values_ptr = nullptr; LookupAndInitMissing(ctx->stream(), embedding_state, unique_ids->shape().elem_cnt(), embedding_size, line_size, num_unique_ids->dptr(), - unique_ids->dptr(), table_ids->dptr(), values_ptr, + unique_ids->dptr(), column_ids->dptr(), values_ptr, tmp_buffer->mut_dptr(), &num_unique, true); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } @@ -498,11 +376,9 @@ class EmbeddingPrefetchKernel final : public user_op::OpKernel { #define EMBEDDING_DATA_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) -#define TABLE_ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ +#define ID_DATA_TYPE_SEQ \ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) @@ -510,25 +386,25 @@ class EmbeddingPrefetchKernel final : public user_op::OpKernel { OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) -#define REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_prefetch") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ - EmbeddingTmpBufferManager buffer_manager( \ - nullptr, unique_ids.shape().elem_cnt(), \ - ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), true); \ - return buffer_manager.TotalBufferSize(); \ +#define REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL(t_dtype_pair, column_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_prefetch") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("column_ids", 0) == OF_PP_PAIR_SECOND(column_dtype_pair)) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ + EmbeddingTmpBufferManager buffer_manager( \ + nullptr, unique_ids.shape().elem_cnt(), \ + ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), true); \ + return buffer_manager.TotalBufferSize(); \ }); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL, EMBEDDING_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) template class EmbeddingLookupKernel final : public user_op::OpKernel { @@ -549,7 +425,7 @@ class EmbeddingLookupKernel final : public user_op::OpKernel { CHECK(embedding_state != nullptr); const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + const user_op::Tensor* column_ids = ctx->Tensor4ArgNameAndIndex("column_ids", 0); user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); const int64_t 
embedding_size = ctx->Attr("embedding_size"); @@ -557,8 +433,8 @@ class EmbeddingLookupKernel final : public user_op::OpKernel { uint32_t num_unique; LookupAndInitMissing( ctx->stream(), embedding_state, unique_ids->shape().elem_cnt(), embedding_size, line_size, - num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), unique_values->mut_dptr(), - tmp_buffer->mut_dptr(), &num_unique, false); + num_unique_ids->dptr(), unique_ids->dptr(), column_ids->dptr(), + unique_values->mut_dptr(), tmp_buffer->mut_dptr(), &num_unique, false); if (ctx->has_output("embeddings", 0)) { user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); CopyValuesToEmbeddings(ctx->stream(), num_unique, embedding_size, line_size, @@ -569,15 +445,15 @@ class EmbeddingLookupKernel final : public user_op::OpKernel { bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; -#define REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ +#define REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL(t_dtype_pair, column_dtype_pair, idx_dtype_pair) \ REGISTER_USER_KERNEL("embedding_lookup") \ .SetCreateFn>() \ .SetIsMatchedHob( \ (user_op::HobDeviceType() == DeviceType::kCUDA) \ && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ + && (user_op::HobDataType("column_ids", 0) == OF_PP_PAIR_SECOND(column_dtype_pair)) \ && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ @@ -588,7 +464,7 @@ class EmbeddingLookupKernel final : public user_op::OpKernel { }); OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL, EMBEDDING_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) template class EmbeddingPutKernel final : public user_op::OpKernel { diff --git a/oneflow/user/kernels/one_embedding_update_kernels.cu b/oneflow/user/kernels/one_embedding_update_kernels.cu index 8535ea442bb..4511800d9a2 100644 --- a/oneflow/user/kernels/one_embedding_update_kernels.cu +++ b/oneflow/user/kernels/one_embedding_update_kernels.cu @@ -15,15 +15,13 @@ limitations under the License. 
*/ #include "oneflow/core/framework/framework.h" #include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/model_update_kernel_util.h" namespace oneflow { namespace { template -__global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, float l1, float l2, - float weight_decay, const IDX* num_unique_ids, +__global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, const IDX* num_unique_ids, const float* learning_rate, const T* down_scale_by_ptr, const int64_t* skip_if, const G* model_diff, const T* model, T* updated_model) { @@ -35,84 +33,52 @@ __global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, float l1, float learning_rate_val = *learning_rate; const int64_t n = *num_unique_ids * embedding_size; CUDA_1D_KERNEL_LOOP(i, n) { - updated_model[i] = model[i]; - SGDUpdateFunctor()(model_diff + i, updated_model + i, scale, l1, l2, weight_decay, - learning_rate_val); + const T model_val = model[i]; + updated_model[i] = model_val - learning_rate_val * (scale * static_cast(model_diff[i])); } } } -__device__ void GetMomentumOffset(const int32_t line_size, const int32_t embedding_size, - int64_t model_diff_offset, int64_t* model_offset, - int64_t* momentum_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *momentum_offset = *model_offset + embedding_size; -} - template __global__ void MomentumUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, - float l1, float l2, float weight_decay, float beta, - const IDX* num_unique_ids, const float* learning_rate, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* unique_values, - T* updated_unique_values) { + float beta, const IDX* num_unique_ids, + const float* learning_rate, const T* down_scale_by_ptr, + const int64_t* skip_if, const G* model_diff, + const T* unique_values, T* updated_unique_values) { if (skip_if != nullptr && *skip_if != 0) { const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t momentum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &momentum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[momentum_offset] = unique_values[momentum_offset]; - } + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } } else { if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } float learning_rate_val = *learning_rate; const int64_t n = *num_unique_ids * embedding_size; CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t momentum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &momentum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[momentum_offset] = unique_values[momentum_offset]; - MomentumUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + momentum_offset, scale, l1, l2, beta, - weight_decay, learning_rate_val); + const int32_t row = i / embedding_size; + const int32_t col = i - row * embedding_size; + const int64_t model_offset = row * line_size + col; + const int64_t momentum_offset = model_offset + embedding_size; + const T model_val = unique_values[model_offset]; + const T momentum_val = unique_values[momentum_offset]; + const T model_diff_val = scale * static_cast(model_diff[i]); + const T next_momentum = beta * 
momentum_val - learning_rate_val * model_diff_val; + const T next_model = model_val + next_momentum; + updated_unique_values[model_offset] = next_model; + updated_unique_values[momentum_offset] = next_momentum; } } } -__device__ void GetAdamOffset(const int32_t line_size, const int32_t embedding_size, - int64_t model_diff_offset, int64_t* model_offset, int64_t* m_offset, - int64_t* v_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *m_offset = *model_offset + embedding_size; - *v_offset = *model_offset + 2 * embedding_size; -} - template __global__ void AdamUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, - float l1, float l2, float weight_decay, float beta1, float beta2, - float epsilon, const float* bias_correction1_ptr, + float beta1, float beta2, float epsilon, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const IDX* num_unique_ids, const float* learning_rate, const T* down_scale_by_ptr, const int64_t* skip_if, const G* model_diff, const T* unique_values, T* updated_unique_values) { if (skip_if != nullptr && *skip_if != 0) { const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t m_offset; - int64_t v_offset; - GetAdamOffset(line_size, embedding_size, i, &model_offset, &m_offset, &v_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[m_offset] = unique_values[m_offset]; - updated_unique_values[v_offset] = unique_values[v_offset]; - } + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } } else { if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } float bias_correction1_val = 1.0; @@ -122,53 +88,22 @@ __global__ void AdamUpdateKernel(const int32_t line_size, const int32_t embeddin float learning_rate_val = *learning_rate; const int64_t n = *num_unique_ids * embedding_size; CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t m_offset; - int64_t v_offset; - GetAdamOffset(line_size, embedding_size, i, &model_offset, &m_offset, &v_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[m_offset] = unique_values[m_offset]; - updated_unique_values[v_offset] = unique_values[v_offset]; - AdamUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + m_offset, updated_unique_values + v_offset, - nullptr, scale, l1, l2, beta1, beta2, epsilon, weight_decay, false, - bias_correction1_val, bias_correction2_val, learning_rate_val); - } - } -} - -template -__global__ void AdagradUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, - float l1, float l2, float weight_decay, float lr_decay, - float epsilon, const IDX* num_unique_ids, - const float* learning_rate, const int64_t* train_step_ptr, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* unique_values, - T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t sum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &sum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[sum_offset] = unique_values[sum_offset]; - } - } else { - int64_t train_step = *train_step_ptr + 1; - if 
(down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t sum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &sum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[sum_offset] = unique_values[sum_offset]; - AdagradUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + sum_offset, scale, l1, l2, epsilon, - weight_decay, learning_rate_val); + const int32_t row = i / embedding_size; + const int32_t col = i - row * embedding_size; + const int64_t model_offset = row * line_size + col; + const int64_t m_offset = model_offset + embedding_size; + const int64_t v_offset = model_offset + 2 * embedding_size; + const T model_val = unique_values[model_offset]; + const T m = unique_values[m_offset]; + const T v = unique_values[v_offset]; + const T model_diff_val = scale * static_cast(model_diff[i]); + const T next_m = beta1 * m + (1 - beta1) * model_diff_val; + const T next_v = beta2 * v + (1 - beta2) * model_diff_val * model_diff_val; + T denom = (sqrt(next_v) / sqrt(bias_correction2_val)) + epsilon; + const T step_size = learning_rate_val / bias_correction1_val; + updated_unique_values[model_offset] = model_val - step_size * (next_m / denom); + updated_unique_values[m_offset] = next_m; + updated_unique_values[v_offset] = next_v; } } } @@ -195,9 +130,7 @@ class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t embedding_size = embedding_grad->shape().At(1); CHECK_EQ(line_size, embedding_size); const auto scale = ctx->Attr("scale"); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); const float* learning_rate_ptr = learning_rate->dptr(); const T* down_scale_by_ptr = nullptr; @@ -218,10 +151,9 @@ class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { SGDUpdateKernel <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, ctx->stream()->As()->cuda_stream()>>>( - embedding_size, scale, l1, l2, weight_decay, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); + embedding_size, scale, reinterpret_cast(num_unique_ids->dptr()), + learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), + unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } }; @@ -264,9 +196,6 @@ class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t line_size = unique_embeddings->shape().At(1); const int64_t embedding_size = embedding_grad->shape().At(1); CHECK_EQ(line_size, embedding_size * 2); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); const auto beta = ctx->Attr("beta"); const auto scale = ctx->Attr("scale"); const T* down_scale_by_ptr = nullptr; @@ -289,7 +218,7 @@ class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { MomentumUpdateKernel <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, 
ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, scale, l1, l2, weight_decay, beta, + line_size, embedding_size, scale, beta, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); @@ -333,9 +262,6 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { const int64_t embedding_size = embedding_grad->shape().At(1); CHECK_EQ(line_size, embedding_size * 3); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); const auto beta1 = ctx->Attr("beta1"); const auto beta2 = ctx->Attr("beta2"); const auto epsilon = ctx->Attr("epsilon"); @@ -369,8 +295,8 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { AdamUpdateKernel <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, beta1, beta2, - epsilon, bias_correction1_ptr, bias_correction2_ptr, + line_size, embedding_size, static_cast(scale), beta1, beta2, epsilon, + bias_correction1_ptr, bias_correction2_ptr, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); @@ -391,76 +317,4 @@ class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - AdagradEmbeddingUpdateKernel() = default; - ~AdagradEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape().At(0); - const int64_t line_size = unique_embeddings->shape().At(1); - const int64_t embedding_size = embedding_grad->shape().At(1); - CHECK_EQ(line_size, embedding_size * 2); - - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto lr_decay = ctx->Attr("lr_decay"); - const auto epsilon = ctx->Attr("epsilon"); - const auto scale = ctx->Attr("scale"); - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* train_step_ptr = 
ctx->Tensor4ArgNameAndIndex("train_step", 0)->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - AdagradUpdateKernel - <<shape().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_decay, - epsilon, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, - train_step_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), - unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("adagrad_embedding_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL, - FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - IDX_DATA_TYPE_SEQ) - } // namespace oneflow diff --git a/oneflow/user/kernels/pool_gpu_kernel.cpp b/oneflow/user/kernels/pool_gpu_kernel.cpp index 4a6732b1560..3b0cbf7ecaf 100644 --- a/oneflow/user/kernels/pool_gpu_kernel.cpp +++ b/oneflow/user/kernels/pool_gpu_kernel.cpp @@ -371,7 +371,7 @@ class MaxPool3DGradGpuKernel final : public user_op::OpKernel, public user_op::C }; #define REGISTER_POOL_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tf_avg_pool_1d") \ + REGISTER_USER_KERNEL("avg_pool_1d") \ .SetCreateFn>() \ .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ && (user_op::HobDataType("x", 0) == GetDataType::value)); \ diff --git a/oneflow/user/kernels/prelu_kernel.cu b/oneflow/user/kernels/prelu_kernel.cu index 7e71bdb173b..360c3018b05 100644 --- a/oneflow/user/kernels/prelu_kernel.cu +++ b/oneflow/user/kernels/prelu_kernel.cu @@ -228,34 +228,23 @@ __global__ void PReluBackwardMultiAlphaGpu(const IndexType elem_cnt, const Index constexpr int32_t kBlockSize = 256; -template -int GetLaunchPackSize(const int64_t inner_size) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && inner_size % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template +template void DispatchPreluForwardPackSize(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, const int64_t inner_size, const T* x, const T* alpha, T* y) { - int grid_size; - const int pack_size = GetLaunchPackSize(inner_size); const int64_t pack_num = elem_cnt / pack_size; + int grid_size; cudaError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { + + if (pack_size >= 8 && inner_size % 8 == 0) { PReluForwardMultiAlphaGpu <<As()->cuda_stream()>>>( elem_cnt, alpha_size, inner_size, x, alpha, y); - } else if (pack_size == 4) { + } else if (pack_size >= 4 && inner_size % 4 == 0) { PReluForwardMultiAlphaGpu <<As()->cuda_stream()>>>( elem_cnt, alpha_size, inner_size, x, 
alpha, y); - } else if (pack_size == 2) { + } else if (pack_size >= 2 && inner_size % 2 == 0) { PReluForwardMultiAlphaGpu <<As()->cuda_stream()>>>( elem_cnt, alpha_size, inner_size, x, alpha, y); @@ -269,24 +258,27 @@ void DispatchPreluForwardPackSize(ep::Stream* stream, const int64_t elem_cnt, template void DispatchPreluForwardIndex(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, const int64_t inner_size, const T* x, const T* alpha, T* y) { + constexpr int pack_size = cuda::elementwise::PackSize(); + if (elem_cnt < GetMaxVal()) { - DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); + DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, + alpha, y); } else { - DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); + DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, + alpha, y); } } -template +template void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, const int64_t inner_size, const T* x, const T* alpha, const T* dy, T* dx, T* alpha_diff, const bool alpha_requires_grad) { - int grid_size; - const int pack_size = GetLaunchPackSize(inner_size); const int64_t pack_num = elem_cnt / pack_size; + int grid_size; cudaError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { + if (pack_size >= 8 && inner_size % 8 == 0) { if (alpha_requires_grad) { PReluBackwardMultiAlphaGpu <<As()->cuda_stream()>>>( @@ -296,7 +288,7 @@ void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, <<As()->cuda_stream()>>>( elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); } - } else if (pack_size == 4) { + } else if (pack_size >= 4 && inner_size % 4 == 0) { if (alpha_requires_grad) { PReluBackwardMultiAlphaGpu <<As()->cuda_stream()>>>( @@ -306,7 +298,7 @@ void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, <<As()->cuda_stream()>>>( elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); } - } else if (pack_size == 2) { + } else if (pack_size >= 2 && inner_size % 2 == 0) { if (alpha_requires_grad) { PReluBackwardMultiAlphaGpu <<As()->cuda_stream()>>>( @@ -335,12 +327,15 @@ void DispatchPreluBackwardIndex(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, const int64_t inner_size, const T* x, const T* alpha, const T* dy, T* dx, T* alpha_diff, const bool alpha_requires_grad) { + constexpr int pack_size = cuda::elementwise::PackSize(); if (elem_cnt < GetMaxVal()) { - DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, - dy, dx, alpha_diff, alpha_requires_grad); + DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, + x, alpha, dy, dx, alpha_diff, + alpha_requires_grad); } else { - DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, - dy, dx, alpha_diff, alpha_requires_grad); + DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, + x, alpha, dy, dx, alpha_diff, + alpha_requires_grad); } } diff --git a/oneflow/user/kernels/roc_auc_score_kernel.cpp b/oneflow/user/kernels/roc_auc_score_kernel.cpp deleted file mode 100644 index a536dfcf38a..00000000000 --- a/oneflow/user/kernels/roc_auc_score_kernel.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" - -namespace oneflow { - -namespace { - -template -double RocAucScore(size_t n, const L* label, const P* pred, float* buffer) { - size_t p_samples_count = 0; - for (size_t i = 0; i < n; ++i) { - if (label[i] == 0) { - buffer[i] = -pred[i]; - } else { - p_samples_count += 1; - buffer[i] = pred[i]; - } - } - const size_t n_samples_count = n - p_samples_count; - constexpr size_t kParallelSortThreshold = 1024; - auto comp = [](float a, float b) { return fabs(a) < fabs(b); }; - if (n < kParallelSortThreshold) { - std::sort(buffer, buffer + n, comp); - } else { - const size_t m2 = n / 2; - const size_t m1 = m2 / 2; - const size_t m3 = (m2 + n) / 2; - std::thread t0([&] { std::sort(buffer, buffer + m1, comp); }); - std::thread t1([&] { std::sort(buffer + m1, buffer + m2, comp); }); - std::thread t2([&] { std::sort(buffer + m2, buffer + m3, comp); }); - std::thread t3([&] { std::sort(buffer + m3, buffer + n, comp); }); - t0.join(); - t1.join(); - t2.join(); - t3.join(); - std::inplace_merge(buffer, buffer + m1, buffer + m2, comp); - std::inplace_merge(buffer + m2, buffer + m3, buffer + n, comp); - std::inplace_merge(buffer, buffer + m2, buffer + n, comp); - } - size_t tmp_n = 0; - double tmp_rank_sum = 0; - double rank_sum = 0; - size_t tmp_p_samples_count = 0; - for (size_t i = 0; i < n; ++i) { - if (i != 0 && fabs(buffer[i]) != fabs(buffer[i - 1])) { - rank_sum += tmp_p_samples_count * (tmp_rank_sum / tmp_n); - tmp_n = 0; - tmp_rank_sum = 0; - tmp_p_samples_count = 0; - } - if (buffer[i] > 0) { tmp_p_samples_count += 1; } - tmp_rank_sum += (i + 1); - tmp_n += 1; - } - rank_sum += tmp_p_samples_count * (tmp_rank_sum / tmp_n); - return (rank_sum - p_samples_count * (p_samples_count + 1) / 2) - / (p_samples_count * n_samples_count); -} - -template -class RocAucScoreKernel final : public user_op::OpKernel { - public: - OF_DISALLOW_COPY_AND_MOVE(RocAucScoreKernel); - RocAucScoreKernel() = default; - ~RocAucScoreKernel() override = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - const user_op::Tensor* pred = ctx->Tensor4ArgNameAndIndex("pred", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - P* out_ptr = out->mut_dptr
<P>(); - CHECK_EQ(label->shape().elem_cnt(), pred->shape().elem_cnt()); - CHECK_EQ(out->shape().elem_cnt(), 1); - out_ptr[0] = RocAucScore(label->shape().elem_cnt(), label->dptr<L>(), pred->dptr<P>
<P>
(), - tmp_buffer->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_ROC_AUC_SCORE_KERNEL(label_type, label_cpp_type, pred_type, pred_cpp_type) \ - REGISTER_USER_KERNEL("roc_auc_score") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU) \ - && (user_op::HobDataType("label", 0) == label_type) \ - && (user_op::HobDataType("pred", 0) == pred_type)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& pred_shape = ctx->InputShape("pred", 0); \ - size_t tmp_buffer_size = pred_shape.elem_cnt() * sizeof(float); \ - return tmp_buffer_size; \ - }) -REGISTER_ROC_AUC_SCORE_KERNEL(DataType::kDouble, double, DataType::kFloat, float); -REGISTER_ROC_AUC_SCORE_KERNEL(DataType::kFloat, float, DataType::kFloat, float); -REGISTER_ROC_AUC_SCORE_KERNEL(DataType::kInt32, int, DataType::kFloat, float); -REGISTER_ROC_AUC_SCORE_KERNEL(DataType::kInt64, int64_t, DataType::kFloat, float); -REGISTER_ROC_AUC_SCORE_KERNEL(DataType::kInt8, int8_t, DataType::kFloat, float); -REGISTER_ROC_AUC_SCORE_KERNEL(DataType::kUInt8, uint8_t, DataType::kFloat, float); - -} // namespace - -} // namespace oneflow diff --git a/oneflow/user/kernels/scalar_math_kernels.cu b/oneflow/user/kernels/scalar_math_kernels.cu index 3d9f605f149..5f9be508c8e 100644 --- a/oneflow/user/kernels/scalar_math_kernels.cu +++ b/oneflow/user/kernels/scalar_math_kernels.cu @@ -107,9 +107,10 @@ template<> struct ScalarPowGradFunctor { OF_DEVICE_FUNC explicit ScalarPowGradFunctor(half exponent) : exponent(exponent) {} __device__ half operator()(half x, half dy) const { - return __float2half(__half2float(exponent) - * (powf(__half2float(x), __half2float(exponent) - static_cast(1.0))) - * __half2float(dy)); + return __float2half( + __half2float(exponent) + * (__powf(__half2float(x), __half2float(exponent) - static_cast(1.0))) + * __half2float(dy)); } const half exponent; }; @@ -125,7 +126,7 @@ template<> struct ScalarReversePowGradFunctor { OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(float exponent) : exponent(exponent) {} __device__ float operator()(float x, float dy) const { - return powf(exponent, x) * logf(exponent) * dy; + return __powf(exponent, x) * __logf(exponent) * dy; } const float exponent; }; @@ -135,7 +136,7 @@ struct ScalarReversePowGradFunctor { OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(half exponent) : exponent(exponent) {} __device__ half operator()(half x, half dy) const { const float exp = __half2float(exponent); - return __float2half(exp * powf(exp, __half2float(x)) * logf(exp) * __half2float(dy)); + return __float2half(exp * __powf(exp, __half2float(x)) * __logf(exp) * __half2float(dy)); } const half exponent; }; diff --git a/oneflow/user/kernels/upsample_bicubic2d_kernel.cpp b/oneflow/user/kernels/upsample_bicubic2d_kernel.cpp index 88f4f2d22ba..c553ca31a59 100644 --- a/oneflow/user/kernels/upsample_bicubic2d_kernel.cpp +++ b/oneflow/user/kernels/upsample_bicubic2d_kernel.cpp @@ -121,7 +121,7 @@ class UpsampleBicubic2dGradCPUKernel final : public user_op::OpKernel { const int64_t out_width = dy_tensor->shape().At(3); if (in_height == out_height && in_width == out_width) { - memcpy(in_ptr, out_ptr, sizeof(T) * channels * in_height * in_width); + memcpy(in_ptr, out_ptr, sizeof(T) * nbatch * channels * in_height * in_width); } else { const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); const T scale_width = GetAreaPixelScale(in_width, out_width, 
align_corners, width_scale); diff --git a/oneflow/user/kernels/upsample_bicubic2d_kernel.cu b/oneflow/user/kernels/upsample_bicubic2d_kernel.cu index 21b7ec4ddac..4abddff91f4 100644 --- a/oneflow/user/kernels/upsample_bicubic2d_kernel.cu +++ b/oneflow/user/kernels/upsample_bicubic2d_kernel.cu @@ -46,11 +46,11 @@ __global__ void UpsampleBicubic2dForward(const int64_t elem_cnt, const T* in_dpt T* out = out_dptr; const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true); - int64_t input_x = floor(1.0 * real_x); + int64_t input_x = std::floor(1.0 * real_x); const T t_x = real_x - input_x; const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true); - int64_t input_y = floor(1.0 * real_y); + int64_t input_y = std::floor(1.0 * real_y); const T t_y = real_y - input_y; for (int64_t c = 0; c < channels * nbatch; c++) { @@ -92,11 +92,11 @@ __global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dp const T* out = dy_dptr; T real_x = GetAreaPixel(scale_width, output_x, align_corners, true); - int64_t input_x = floor(1.0 * real_x); + int64_t input_x = std::floor(1.0 * real_x); T t_x = real_x - input_x; T real_y = GetAreaPixel(scale_height, output_y, align_corners, true); - int64_t input_y = floor(1.0 * real_y); + int64_t input_y = std::floor(1.0 * real_y); T t_y = real_y - input_y; T x_coeffs[4]; @@ -105,7 +105,7 @@ __global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dp get_cubic_upsample_coefficients(x_coeffs, t_x); get_cubic_upsample_coefficients(y_coeffs, t_y); - for (int64_t c = 0; c < channels * nbatch; c++) { + for (int64_t c = 0; c < channels; c++) { T out_value = out[output_y * out_width + output_x]; for (int64_t i = 0; i < 4; i++) { diff --git a/oneflow/user/kernels/upsample_kernel.h b/oneflow/user/kernels/upsample_kernel.h index 5365d9a8b60..d5f11371135 100644 --- a/oneflow/user/kernels/upsample_kernel.h +++ b/oneflow/user/kernels/upsample_kernel.h @@ -128,13 +128,12 @@ OF_DEVICE_FUNC T upsample_get_value_bounded(const T* data, const int64_t width, template OF_DEVICE_FUNC T cubic_convolution1(const T x, const T A) { - return ((A + static_cast(2.0)) * x - (A + static_cast(3.0))) * x * x + static_cast(1.0); + return ((A + 2.0) * x - (A + 3.0)) * x * x + 1.0; } template OF_DEVICE_FUNC T cubic_convolution2(const T x, const T A) { - return ((A * x - static_cast(5.0) * A) * x + static_cast(8.0) * A) * x - - static_cast(4.0) * A; + return ((A * x - 5.0 * A) * x + 8.0 * A) * x - 4.0 * A; } template diff --git a/oneflow/user/ops/data_shuffle_op.cpp b/oneflow/user/ops/data_shuffle_op.cpp index f93f24a7346..4cb0c160119 100644 --- a/oneflow/user/ops/data_shuffle_op.cpp +++ b/oneflow/user/ops/data_shuffle_op.cpp @@ -20,15 +20,15 @@ namespace oneflow { /* static */ Maybe UniqueKeyValuePairOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const Shape& keys_shape = ctx->InputShape("keys", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - CHECK_GE_OR_RETURN(num_tables, 1) << "num_tables must greater than 1, but get " << num_tables; + const int32_t num_columns = ctx->Attr("num_columns"); + CHECK_GE_OR_RETURN(num_columns, 1); if (ctx->has_input("values", 0)) { const Shape& values_shape = ctx->InputShape("values", 0); - CHECK_EQ_OR_RETURN(keys_shape, values_shape) << "keys_shape must equal to values_shape"; + CHECK_EQ_OR_RETURN(keys_shape, values_shape); } else { - if (num_tables > 1) { + if (num_columns > 1) { CHECK_EQ_OR_RETURN(keys_shape.NumAxes(), 2); - CHECK_EQ_OR_RETURN(keys_shape.At(1), 
num_tables) << "keys cols must equal to num_tables"; + CHECK_EQ_OR_RETURN(keys_shape.At(1), num_columns); } } *ctx->OutputShape("num_unique", 0) = Shape({1}); @@ -60,15 +60,15 @@ namespace oneflow { /* static */ Maybe IdShuffleOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const Shape& ids_shape = ctx->InputShape("ids", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - CHECK_GE_OR_RETURN(num_tables, 1) << "num_tables must greater than 1, but get " << num_tables; - if (ctx->has_input("table_ids", 0)) { - const Shape& table_ids_shape = ctx->InputShape("table_ids", 0); - CHECK_EQ_OR_RETURN(ids_shape, table_ids_shape) << "ids_shape must equal to table_ids_shape"; + const int32_t num_columns = ctx->Attr("num_columns"); + CHECK_GE_OR_RETURN(num_columns, 1); + if (ctx->has_input("column_ids", 0)) { + const Shape& column_ids_shape = ctx->InputShape("column_ids", 0); + CHECK_EQ_OR_RETURN(ids_shape, column_ids_shape); } else { - if (num_tables > 1) { + if (num_columns > 1) { CHECK_EQ_OR_RETURN(ids_shape.NumAxes(), 2); - CHECK_EQ_OR_RETURN(ids_shape.At(1), num_tables) << "ids cols must equal to num_tables"; + CHECK_EQ_OR_RETURN(ids_shape.At(1), num_columns); } } const int64_t num_ids = ids_shape.elem_cnt(); @@ -78,7 +78,7 @@ namespace oneflow { *ctx->OutputShape("cur_rank_num_unique", 0) = Shape({1}); *ctx->OutputShape("cur_rank_unique_ids", 0) = Shape({num_ids * parallel_num}); *ctx->OutputShape("cur_rank_inverse_indices", 0) = Shape({num_ids * parallel_num}); - *ctx->OutputShape("cur_rank_unique_table_ids", 0) = Shape({num_ids * parallel_num}); + *ctx->OutputShape("cur_rank_unique_column_ids", 0) = Shape({num_ids * parallel_num}); return Maybe::Ok(); } @@ -102,10 +102,10 @@ namespace oneflow { *ctx->OutputDType("cur_rank_num_unique", 0) = DataType::kUInt32; *ctx->OutputDType("cur_rank_unique_ids", 0) = ctx->InputDType("ids", 0); *ctx->OutputDType("cur_rank_inverse_indices", 0) = DataType::kUInt32; - if (ctx->has_input("table_ids", 0)) { - *ctx->OutputDType("cur_rank_unique_table_ids", 0) = ctx->InputDType("table_ids", 0); + if (ctx->has_input("column_ids", 0)) { + *ctx->OutputDType("cur_rank_unique_column_ids", 0) = ctx->InputDType("column_ids", 0); } else { - *ctx->OutputDType("cur_rank_unique_table_ids", 0) = DataType::kUInt8; + *ctx->OutputDType("cur_rank_unique_column_ids", 0) = DataType::kUInt32; } return Maybe::Ok(); } diff --git a/oneflow/user/ops/deconv_op.cpp b/oneflow/user/ops/deconv_op.cpp index 43fd14bb16f..657098c6e7f 100644 --- a/oneflow/user/ops/deconv_op.cpp +++ b/oneflow/user/ops/deconv_op.cpp @@ -53,13 +53,11 @@ Maybe InferTensorDesc4DeConv(user_op::InferContext* ctx) { - 2 * padding_before.at(i) + output_padding.at(i) + effective_filter_size; } - if (in.shape().At(0) != 0) { - for (int i = 0; i < out_shape.size(); i++) { - CHECK_GT_OR_RETURN(out_shape[i], 0) - << "RuntimeError: Given input size per channel: (" << Shape(in.shape()) - << "). Calculated output size per channel: (" << Shape(out_shape) - << "). Output size is too small"; - } + for (int i = 0; i < out_shape.size(); i++) { + CHECK_GT_OR_RETURN(out_shape[i], 0) + << "RuntimeError: Given input size per channel: (" << Shape(in.shape()) + << "). Calculated output size per channel: (" << Shape(out_shape) + << "). 
Output size is too small"; } *out->mut_is_dynamic() = in.is_dynamic(); *out->mut_shape() = Shape(out_shape); diff --git a/oneflow/user/ops/dim_scatter_ops.cpp b/oneflow/user/ops/dim_scatter_ops.cpp index 99e090994e0..30afe697117 100644 --- a/oneflow/user/ops/dim_scatter_ops.cpp +++ b/oneflow/user/ops/dim_scatter_ops.cpp @@ -111,22 +111,31 @@ Maybe InputScalarArgModifierFn(const user_op::GetInputArgModifier& GetInpu } void _SetSbp(user_op::SbpContext* ctx, const char* like_or_input) { + const user_op::TensorDesc& index_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0); + int64_t index_num_axes = index_tensor.shape().NumAxes(); const int32_t dim = ctx->Attr("dim"); - const Shape& index_tensor_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0).shape(); - const Shape& src_tensor_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("src", 0).shape(); - const Shape& input_tensor_shape = - ctx->LogicalTensorDesc4InputArgNameAndIndex(like_or_input, 0).shape(); - - FOR_RANGE(int64_t, i, 0, index_tensor_shape.NumAxes()) { - if (i == dim) { continue; } - int64_t len = index_tensor_shape.At(i); - if (len == src_tensor_shape.At(i) && len == input_tensor_shape.At(i)) { + FOR_RANGE(int64_t, i, 0, index_num_axes) { + if (i != dim) { ctx->NewBuilder() .Split(user_op::OpArg("index", 0), i) .Split(user_op::OpArg("src", 0), i) - .Split(user_op::OpArg(like_or_input, 0), i) .Split(user_op::OpArg("output", 0), i) + .Split(user_op::OpArg(like_or_input, 0), i) + .Build(); + } else { + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("src", 0), i) + .PartialSum(user_op::OpArg("output", 0)) + .Broadcast(user_op::OpArg(like_or_input, 0)) + .Build(); + + ctx->NewBuilder() + .Split(user_op::OpArg("index", 0), i) + .Split(user_op::OpArg("src", 0), i) + .PartialSum(user_op::OpArg("output", 0)) + .PartialSum(user_op::OpArg(like_or_input, 0)) .Build(); } } @@ -149,29 +158,11 @@ Maybe SetSbpScatter(user_op::SbpContext* ctx) { return Maybe::Ok(); } -Maybe SetSbpScatterScalar(user_op::SbpContext* ctx) { - const int32_t dim = ctx->Attr("dim"); - - const Shape& index_tensor_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("index", 0).shape(); - const Shape& input_tensor_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("input", 0).shape(); - - FOR_RANGE(int64_t, i, 0, index_tensor_shape.NumAxes()) { - if (i == dim) { continue; } - if (index_tensor_shape.At(i) == input_tensor_shape.At(i)) { - ctx->NewBuilder() - .Split(user_op::OpArg("index", 0), i) - .Split(user_op::OpArg("input", 0), i) - .Split(user_op::OpArg("output", 0), i) - .Build(); - } - } - return Maybe::Ok(); -} - Maybe InferDtype(user_op::InferContext* ctx) { const user_op::TensorDesc& index = ctx->InputTensorDesc("index", 0); CHECK_OR_RETURN(IsIndexDataType(index.data_type())); if (ctx->has_input("input", 0)) { + const user_op::TensorDesc& input = ctx->InputTensorDesc("input", 0); CHECK_EQ_OR_RETURN(ctx->InputDType("input", 0), ctx->InputDType("src", 0)); } else { CHECK_EQ_OR_RETURN(ctx->InputDType("like", 0), ctx->InputDType("src", 0)); @@ -283,7 +274,7 @@ Maybe ScatterBackward(user_op::BackwardOpConfContext* ctx) { } \ \ /* static */ Maybe optypename::GetSbp(user_op::SbpContext* ctx) { \ - return SetSbpScatterScalar(ctx); \ + return SetSbpScatter(ctx); \ } \ \ /* static */ Maybe optypename::ModifyInputArg( \ diff --git a/oneflow/user/ops/distributions/normal_op.cpp b/oneflow/user/ops/distributions/normal_op.cpp index 5af64e0d3bb..ee71ec5f696 100644 --- a/oneflow/user/ops/distributions/normal_op.cpp 
+++ b/oneflow/user/ops/distributions/normal_op.cpp @@ -16,7 +16,6 @@ limitations under the License. #include "oneflow/core/framework/framework.h" #include "oneflow/core/framework/op_generated.h" -#include "oneflow/core/job/nd_sbp_util.h" namespace oneflow { @@ -28,15 +27,7 @@ namespace oneflow { } /*static*/ Maybe NormalOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); - const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); - const Shape& logical_shape = ctx->Attr("shape"); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); - - *ctx->OutputShape("out", 0) = physical_shape; - return Maybe::Ok(); + return InferLogicalTensorDesc(ctx); } /* static */ Maybe NormalOp::GetSbp(user_op::SbpContext* ctx) { diff --git a/oneflow/user/ops/hardshrink_op.cpp b/oneflow/user/ops/hardshrink_op.cpp deleted file mode 100644 index 21fdae26a17..00000000000 --- a/oneflow/user/ops/hardshrink_op.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/op_generated.h" - -namespace oneflow { - -/* static */ Maybe HardShrinkOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - *ctx->OutputShape("out", 0) = ctx->InputShape("in", 0); - return Maybe::Ok(); -} - -/* static */ Maybe HardShrinkOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} - -/* static */ Maybe HardShrinkOp::GetSbp(user_op::SbpContext* ctx) { - const user_op::TensorDesc& in_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("in", 0); - FOR_RANGE(int64_t, i, 0, in_tensor.shape().NumAxes()) { - ctx->NewBuilder().Split(user_op::OpArg("in", 0), i).Split(user_op::OpArg("out", 0), i).Build(); - } - return Maybe::Ok(); -} - -/* static */ Maybe HardShrinkOp::InferDataType(user_op::InferContext* ctx) { - *ctx->OutputDType("out", 0) = ctx->InputDType("in", 0); - return Maybe::Ok(); -} - -/* static */ Maybe HardShrinkGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - const Shape& y_shape = ctx->InputShape("y", 0); - const Shape& dy_shape = ctx->InputShape("dy", 0); - Shape* dx_shape = ctx->OutputShape("dx", 0); - CHECK_OR_RETURN(dy_shape == y_shape) << "The shape of y_grad and y must be same."; - *dx_shape = dy_shape; - return Maybe::Ok(); -} - -/* static */ Maybe HardShrinkGradOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} - -/* static */ Maybe HardShrinkGradOp::GetSbp(user_op::SbpContext* ctx) { - const user_op::TensorDesc& y_tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("y", 0); - FOR_RANGE(int64_t, i, 0, y_tensor.shape().NumAxes()) { - ctx->NewBuilder() - .Split(user_op::OpArg("y", 0), i) - .Split(user_op::OpArg("dy", 0), i) - .Split(user_op::OpArg("dx", 0), i) - 
.Build(); - } - return Maybe::Ok(); -} - -/* static */ Maybe HardShrinkGradOp::InferDataType(user_op::InferContext* ctx) { - CHECK_EQ_OR_RETURN(ctx->InputDType("dy", 0), ctx->InputDType("y", 0)) - << "The dtype of y_grad and y must be same."; - *ctx->OutputDType("dx", 0) = ctx->InputDType("y", 0); - return Maybe::Ok(); -} - -REGISTER_USER_OP_GRAD("hardshrink") - .SetBackwardOpConfGenFn([](user_op::BackwardOpConfContext* ctx) -> Maybe { - const auto hardshrink_grad_op_name = ctx->FwOp().op_name() + "_grad"; - ctx->DefineOp(hardshrink_grad_op_name, [&ctx](user_op::BackwardOpBuilder& builder) { - return builder.OpTypeName("hardshrink_grad") - .InputBind("y", ctx->FwOp().output("y", 0)) - .InputBind("dy", ctx->FwOp().output_grad("out", 0)) - .Attr("lambd", ctx->FwOp().attr("lambd")) - .Output("dx") - .Build(); - }); - ctx->FwOp().InputGradBind(user_op::OpArg("in", 0), - [&ctx, &hardshrink_grad_op_name]() -> const std::string& { - return ctx->GetOp(hardshrink_grad_op_name).output("dx", 0); - }); - return Maybe::Ok(); - }); -} // namespace oneflow diff --git a/oneflow/user/ops/math_unary_elementwise_seq.h b/oneflow/user/ops/math_unary_elementwise_seq.h index 27e2f95a26c..dc926e2ac56 100644 --- a/oneflow/user/ops/math_unary_elementwise_seq.h +++ b/oneflow/user/ops/math_unary_elementwise_seq.h @@ -53,8 +53,7 @@ namespace oneflow { OF_PP_MAKE_TUPLE_SEQ("sinh", Sinh) \ OF_PP_MAKE_TUPLE_SEQ("sqrt", Sqrt) \ OF_PP_MAKE_TUPLE_SEQ("square", Square) \ - OF_PP_MAKE_TUPLE_SEQ("tan", Tan) \ - OF_PP_MAKE_TUPLE_SEQ("not_equal_zero", NotEqualZero) + OF_PP_MAKE_TUPLE_SEQ("tan", Tan) #define MATH_UNARY_ELEMENTWISE_FUNC_SEQ_ODS \ OF_PP_MAKE_TUPLE_SEQ("abs", Abs) \ @@ -89,8 +88,7 @@ namespace oneflow { OF_PP_MAKE_TUPLE_SEQ("sinh", Sinh) \ OF_PP_MAKE_TUPLE_SEQ("sqrt", Sqrt) \ OF_PP_MAKE_TUPLE_SEQ("square", Square) \ - OF_PP_MAKE_TUPLE_SEQ("tan", Tan) \ - OF_PP_MAKE_TUPLE_SEQ("not_equal_zero", NotEqualZero) + OF_PP_MAKE_TUPLE_SEQ("tan", Tan) } // namespace oneflow diff --git a/oneflow/user/ops/model_update_ops.cpp b/oneflow/user/ops/model_update_ops.cpp index 186b08689f5..2e86c2af59d 100644 --- a/oneflow/user/ops/model_update_ops.cpp +++ b/oneflow/user/ops/model_update_ops.cpp @@ -35,9 +35,7 @@ Maybe CheckDataTypeLike(const user_op::TensorDesc* tensor_desc, } Maybe CheckScalarShape(const user_op::TensorDesc* tensor_desc) { - CHECK_OR_RETURN(tensor_desc->shape().NumAxes() == 0 - || (tensor_desc->shape().NumAxes() == 1 && tensor_desc->shape().At(0) == 1)) - << tensor_desc->shape().DebugStr(); + CHECK_EQ_OR_RETURN(tensor_desc->shape(), Shape({1})); return Maybe::Ok(); } Maybe CheckScalarDataType(const user_op::TensorDesc* tensor_desc, const DataType data_type) { diff --git a/oneflow/user/ops/multi_reduce_ops.cpp b/oneflow/user/ops/multi_reduce_ops.cpp deleted file mode 100644 index 58ceca4ff10..00000000000 --- a/oneflow/user/ops/multi_reduce_ops.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
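For reference on the removed hardshrink_op.cpp above: the op pair only registers shape, dtype, and SBP inference plus the backward op config for the elementwise hardshrink activation; the math itself is the usual shrinkage function. A NumPy sketch of that function and of a gradient computed from the saved output y, offered as a reference rather than as the OneFlow kernel code:

import numpy as np

def hardshrink(x, lambd=0.5):
    # y = x where |x| > lambd, else 0
    return np.where(np.abs(x) > lambd, x, 0.0)

def hardshrink_grad(y, dy):
    # the gradient passes through exactly where the forward output is non-zero
    return np.where(y != 0.0, dy, 0.0)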
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/op_generated.h" -#include "oneflow/core/framework/nd_sbp.h" - -namespace oneflow { - -namespace { - -Maybe InferMultiReduceOpShape(user_op::InferContext* ctx) { - CHECK_GT_OR_RETURN(ctx->input_size("x"), 0) << ctx->op_name() << "must have at least 1 input"; - *ctx->OutputShape("y", 0) = Shape({}); - return Maybe::Ok(); -} - -Maybe InferMultiReduceOpDataType(user_op::InferContext* ctx) { - const auto& x_0_dtype = ctx->InputDType("x", 0); - for (size_t i = 1; i < ctx->input_size("x"); ++i) { - CHECK_EQ_OR_RETURN(ctx->InputDType("x", i), x_0_dtype) - << ctx->op_name() << ": the " << i << " th input has the different data type with others"; - } - *ctx->OutputDType("y", 0) = x_0_dtype; - return Maybe::Ok(); -} - -Maybe GetMultiReduceOpSbp(user_op::SbpContext* ctx) { - const auto& x_0 = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - int64_t min_num_axes = x_0.shape().NumAxes(); - for (size_t i = 1; i < ctx->user_op_conf().input_size("x"); ++i) { - const auto& x_i = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", i); - min_num_axes = std::min(min_num_axes, x_i.shape().NumAxes()); - } - for (int64_t i = 0; i < min_num_axes; ++i) { - ctx->NewBuilder().Split(ctx->inputs(), i).PartialSum(user_op::OpArg("y", 0)).Build(); - } - return Maybe::Ok(); -} - -Maybe InferLocalMultiReduceOpLogicalShape(user_op::InferContext* ctx) { - CHECK_GT_OR_RETURN(ctx->input_size("x"), 0) << ctx->op_name() << "must have at least 1 input"; - const NdSbp& any_nd_sbp = ctx->NdSbp4ArgNameAndIndex("x", 0); - for (int32_t i = 1; i < ctx->input_size("x"); ++i) { - const NdSbp& input_i_sbp = ctx->NdSbp4ArgNameAndIndex("x", i); - CHECK_OR_RETURN(input_i_sbp == any_nd_sbp) - << ctx->op_name() << ": the " << i << " th arg has the different sbp with others, " - << NdSbpToString(input_i_sbp) << " vs. 
" << NdSbpToString(any_nd_sbp); - } - auto rank_mesh = ctx->parallel_desc().hierarchy(); - CHECK_EQ_OR_RETURN(rank_mesh->NumAxes(), any_nd_sbp.sbp_parallel_size()) - << ctx->op_name() << ": ndim of ranks of " << *JUST(PlacementToString(ctx->parallel_desc())) - << " is mismatched with the size of sbp " << NdSbpToString(any_nd_sbp); - int64_t split_num = 1; - for (int64_t i = 0; i < rank_mesh->NumAxes(); ++i) { - if (any_nd_sbp.sbp_parallel(i).has_split_parallel()) { split_num *= rank_mesh->At(i); } - } - *ctx->OutputShape("y", 0) = Shape({split_num}); - return Maybe::Ok(); -} - -Maybe InferLocalMultiReduceOpPhysicalShape(user_op::InferContext* ctx) { - CHECK_GT_OR_RETURN(ctx->input_size("x"), 0) << ctx->op_name() << "must have at least 1 input"; - *ctx->OutputShape("y", 0) = Shape({1}); - return Maybe::Ok(); -} - -Maybe GetLocalMultiReduceOpSbp(user_op::SbpContext* ctx) { - const auto& x_0 = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - int64_t min_num_axes = x_0.shape().NumAxes(); - for (size_t i = 1; i < ctx->user_op_conf().input_size("x"); ++i) { - const auto& x_i = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", i); - min_num_axes = std::min(min_num_axes, x_i.shape().NumAxes()); - } - for (int64_t i = 0; i < min_num_axes; ++i) { - ctx->NewBuilder().Split(ctx->inputs(), i).Split(user_op::OpArg("y", 0), 0).Build(); - } - return Maybe::Ok(); -} - -} // namespace - -#define DEFINE_MULTI_REDUCE_OP_METHODS(op) \ - Maybe op##Op::InferLogicalTensorDesc(user_op::InferContext* ctx) { \ - return InferMultiReduceOpShape(ctx); \ - } \ - Maybe op##Op::InferDataType(user_op::InferContext* ctx) { \ - return InferMultiReduceOpDataType(ctx); \ - } \ - Maybe op##Op::GetSbp(user_op::SbpContext* ctx) { return GetMultiReduceOpSbp(ctx); } - -DEFINE_MULTI_REDUCE_OP_METHODS(MultiReduceSumPowAbs) -DEFINE_MULTI_REDUCE_OP_METHODS(MultiReduceMaxAbs) -DEFINE_MULTI_REDUCE_OP_METHODS(MultiReduceMinAbs) -#undef DEFINE_MULTI_REDUCE_OP_METHODS - -#define DEFINE_LOCAL_MULTI_REDUCE_OP_METHODS(op) \ - Maybe op##Op::InferLogicalTensorDesc(user_op::InferContext* ctx) { \ - return InferLocalMultiReduceOpLogicalShape(ctx); \ - } \ - Maybe op##Op::InferPhysicalTensorDesc(user_op::InferContext* ctx) { \ - return InferLocalMultiReduceOpPhysicalShape(ctx); \ - } \ - Maybe op##Op::InferDataType(user_op::InferContext* ctx) { \ - return InferMultiReduceOpDataType(ctx); \ - } \ - Maybe op##Op::GetSbp(user_op::SbpContext* ctx) { return GetLocalMultiReduceOpSbp(ctx); } - -DEFINE_LOCAL_MULTI_REDUCE_OP_METHODS(LocalMultiReduceMaxAbs) -DEFINE_LOCAL_MULTI_REDUCE_OP_METHODS(LocalMultiReduceMinAbs) -#undef DEFINE_LOCAL_MULTI_REDUCE_OP_METHODS - -} // namespace oneflow diff --git a/oneflow/user/ops/one_embedding_ops.cpp b/oneflow/user/ops/one_embedding_ops.cpp index bf5511f467a..814502da2f4 100644 --- a/oneflow/user/ops/one_embedding_ops.cpp +++ b/oneflow/user/ops/one_embedding_ops.cpp @@ -22,9 +22,9 @@ namespace oneflow { /* static */ Maybe EmbeddingLookupPlaceholderOp::InferLogicalTensorDesc( user_op::InferContext* ctx) { const Shape& ids_shape = ctx->InputShape("ids", 0); - if (ctx->has_input("table_ids", 0)) { - const Shape& table_ids_shape = ctx->InputShape("table_ids", 0); - CHECK_EQ_OR_RETURN(ids_shape, table_ids_shape) << "table_ids shape must equal to ids shape"; + if (ctx->has_input("column_ids", 0)) { + const Shape& column_ids_shape = ctx->InputShape("column_ids", 0); + CHECK_EQ_OR_RETURN(ids_shape, column_ids_shape); } DimVector out_dim_vec = ids_shape.dim_vec(); const int64_t embedding_size = ctx->Attr("embedding_size"); 
@@ -43,8 +43,8 @@ namespace oneflow { .Broadcast(user_op::OpArg("shadow", 0)) .Split(user_op::OpArg("ids", 0), 0) .Split(user_op::OpArg("embeddings", 0), 0); - if (ctx->user_op_conf().has_input("table_ids", 0)) { - builder.Split(user_op::OpArg("table_ids", 0), 0); + if (ctx->user_op_conf().has_input("column_ids", 0)) { + builder.Split(user_op::OpArg("column_ids", 0), 0); } builder.Build(); return Maybe::Ok(); @@ -53,15 +53,15 @@ namespace oneflow { /* static */ Maybe EmbeddingLookupPlaceholderOp::ModifyInputArg( const GetInputArgModifier& GetInputArgModifierFn, const user_op::UserOpConfWrapper& conf) { user_op::InputArgModifier* shadow = GetInputArgModifierFn("shadow", 0); - CHECK_OR_RETURN(shadow != nullptr) << "shadow is nullptr"; + CHECK_OR_RETURN(shadow != nullptr); shadow->set_requires_grad(false); user_op::InputArgModifier* ids = GetInputArgModifierFn("ids", 0); CHECK_OR_RETURN(ids != nullptr); ids->set_requires_grad(false); - if (conf.has_input("table_ids", 0)) { - user_op::InputArgModifier* table_ids = GetInputArgModifierFn("table_ids", 0); - CHECK_OR_RETURN(table_ids != nullptr) << "table_ids is nullptr"; - table_ids->set_requires_grad(false); + if (conf.has_input("column_ids", 0)) { + user_op::InputArgModifier* column_ids = GetInputArgModifierFn("column_ids", 0); + CHECK_OR_RETURN(column_ids != nullptr); + column_ids->set_requires_grad(false); } return Maybe::Ok(); } @@ -111,9 +111,8 @@ REGISTER_USER_OP_GRAD("embedding_lookup_placeholder") /* static */ Maybe EmbeddingPrefetchOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const Shape& num_unique_ids_shape = ctx->InputShape("num_unique_ids", 0); const Shape& unique_ids_shape = ctx->InputShape("unique_ids", 0); - const Shape& table_ids_shape = ctx->InputShape("table_ids", 0); - CHECK_EQ_OR_RETURN(unique_ids_shape, table_ids_shape) - << "table_ids shape must equal to ids shape"; + const Shape& column_ids_shape = ctx->InputShape("column_ids", 0); + CHECK_EQ_OR_RETURN(unique_ids_shape, column_ids_shape); CHECK_EQ_OR_RETURN(num_unique_ids_shape.elem_cnt(), 1); *ctx->OutputShape("context", 0) = num_unique_ids_shape; return Maybe::Ok(); @@ -127,7 +126,7 @@ REGISTER_USER_OP_GRAD("embedding_lookup_placeholder") ctx->NewBuilder() .Broadcast(user_op::OpArg("num_unique_ids", 0)) .Split(user_op::OpArg("unique_ids", 0), 0) - .Split(user_op::OpArg("table_ids", 0), 0) + .Split(user_op::OpArg("column_ids", 0), 0) .Broadcast(user_op::OpArg("context", 0)) .Build(); return Maybe::Ok(); @@ -141,9 +140,8 @@ REGISTER_USER_OP_GRAD("embedding_lookup_placeholder") /* static */ Maybe EmbeddingLookupOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { const Shape& num_unique_ids_shape = ctx->InputShape("num_unique_ids", 0); const Shape& unique_ids_shape = ctx->InputShape("unique_ids", 0); - const Shape& table_ids_shape = ctx->InputShape("table_ids", 0); - CHECK_EQ_OR_RETURN(unique_ids_shape, table_ids_shape) - << "table_ids shape must equal to ids shape"; + const Shape& column_ids_shape = ctx->InputShape("column_ids", 0); + CHECK_EQ_OR_RETURN(unique_ids_shape, column_ids_shape); CHECK_EQ_OR_RETURN(num_unique_ids_shape.elem_cnt(), 1); const int64_t embedding_size = ctx->Attr("embedding_size"); const int64_t line_size = ctx->Attr("line_size"); @@ -170,7 +168,7 @@ REGISTER_USER_OP_GRAD("embedding_lookup_placeholder") auto builder = ctx->NewBuilder() .Broadcast(user_op::OpArg("num_unique_ids", 0)) .Split(user_op::OpArg("unique_ids", 0), 0) - .Split(user_op::OpArg("table_ids", 0), 0) + .Split(user_op::OpArg("column_ids", 0), 0) .Split(ctx->outputs(), 
0); if (ctx->user_op_conf().has_input("context", 0)) { builder.Broadcast(user_op::OpArg("context", 0)); @@ -323,36 +321,4 @@ Maybe CheckDataType(user_op::InferContext* ctx) { return Maybe::Ok(); } -/* static */ Maybe AdagradEmbeddingUpdateOp::InferLogicalTensorDesc( - user_op::InferContext* ctx) { - JUST(CheckDataShape(ctx)); - const Shape& unique_embeddings_shape = ctx->InputShape("unique_embeddings", 0); - CHECK_EQ_OR_RETURN(unique_embeddings_shape.At(1), 2 * ctx->InputShape("embedding_grad", 0).At(1)) - << "please adjust size_factor of MultiTableEmbedding's store_options to 2"; - *ctx->OutputShape("updated_unique_embeddings", 0) = unique_embeddings_shape; - return Maybe::Ok(); -} - -/*static*/ Maybe AdagradEmbeddingUpdateOp::InferPhysicalTensorDesc( - user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} - -/* static */ Maybe AdagradEmbeddingUpdateOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Broadcast(ctx->inputs()) - .Broadcast(user_op::OpArg("num_unique_ids", 0)) - .Split(user_op::OpArg("unique_embeddings", 0), 0) - .Split(user_op::OpArg("embedding_grad", 0), 0) - .Split(user_op::OpArg("updated_unique_embeddings", 0), 0) - .Build(); - return Maybe::Ok(); -} - -/* static */ Maybe AdagradEmbeddingUpdateOp::InferDataType(user_op::InferContext* ctx) { - JUST(CheckDataType(ctx)); - *ctx->OutputDType("updated_unique_embeddings", 0) = ctx->InputDType("unique_embeddings", 0); - return Maybe::Ok(); -} - } // namespace oneflow diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp index 9e32603f10d..58ddd0e836a 100644 --- a/oneflow/user/ops/pooling_op.cpp +++ b/oneflow/user/ops/pooling_op.cpp @@ -116,6 +116,7 @@ Maybe MaxPoolBackwardGetSbpFn(user_op::SbpContext* ctx) { FOR_RANGE(int64_t, i, 0, std::min(2, (int)tensor.shape().NumAxes())) { ctx->NewBuilder() .Split(user_op::OpArg("x", 0), i) + .Split(user_op::OpArg("y", 0), i) .Split(user_op::OpArg("indice", 0), i) .Split(user_op::OpArg("dy", 0), i) .Split(user_op::OpArg("dx", 0), i) @@ -128,6 +129,7 @@ Maybe AvgPoolBackwardGetSbpFn(user_op::SbpContext* ctx) { FOR_RANGE(int64_t, i, 0, 2) { ctx->NewBuilder() .Split(user_op::OpArg("x", 0), i) + .Split(user_op::OpArg("y", 0), i) .Split(user_op::OpArg("dy", 0), i) .Split(user_op::OpArg("dx", 0), i) .Build(); @@ -142,6 +144,7 @@ GenBackwardOpConfFn MaxPoolMakeBackwardOpConfFn(const std::string& mode, const i user_op::UserOpConfWrapper grad_op = builder.Op(mode + "pool_" + std::to_string(dim) + "d_grad") .Input("x", op.input("x", 0)) + .Input("y", op.output("y", 0)) .Input("indice", op.output("indice", 0)) .Input("dy", op.GetGradTensorWithOpOutput("y", 0)) .Output("dx") @@ -167,6 +170,7 @@ GenBackwardOpConfFn AvgPoolMakeBackwardOpConfFn(const int32_t dim) { user_op::UserOpConfWrapper grad_op = builder.Op("avgpool_" + std::to_string(dim) + "d_grad") .Input("x", op.input("x", 0)) + .Input("y", op.output("y", 0)) .Input("dy", op.GetGradTensorWithOpOutput("y", 0)) .Output("dx") .Attr("data_format", op.attr("data_format")) diff --git a/oneflow/user/ops/roc_auc_score_op.cpp b/oneflow/user/ops/roc_auc_score_op.cpp deleted file mode 100644 index 9a7e68ed524..00000000000 --- a/oneflow/user/ops/roc_auc_score_op.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
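In the embedding_lookup_placeholder hunks above, column_ids (when present) must have exactly the ids shape, and the embeddings output is built from the ids axes; the embedding_size attribute read right after suggests it is appended as the last axis, though the append itself is outside the visible hunk, so that part is an assumption. A small sketch of that shape rule, with the illustrative helper expected_embedding_shape (not an OneFlow API):

def expected_embedding_shape(ids_shape, embedding_size):
    # embeddings keep the ids axes and append embedding_size as the last axis
    return tuple(ids_shape) + (embedding_size,)

# e.g. ids of shape (batch=32, num_columns=26) with embedding_size=128
assert expected_embedding_shape((32, 26), 128) == (32, 26, 128)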
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/op_generated.h" - -namespace oneflow { - -/* static */ Maybe RocAucScoreOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { - user_op::TensorDesc* out_desc = ctx->OutputTensorDesc("out", 0); - const Shape& pred_shape = ctx->InputTensorDesc("pred", 0).shape(); - const Shape& label_shape = ctx->InputTensorDesc("label", 0).shape(); - CHECK_EQ_OR_RETURN(pred_shape.elem_cnt(), label_shape.elem_cnt()) - << "pred and label MUST have same element count."; - out_desc->set_is_dynamic(false); - *out_desc->mut_shape() = Shape({1}); - return Maybe::Ok(); -} - -/*static*/ Maybe RocAucScoreOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) { - return InferLogicalTensorDesc(ctx); -} - -/* static */ Maybe RocAucScoreOp::GetSbp(user_op::SbpContext* ctx) { - return user_op::GetSbpFnUtil::DefaultBroadcastToBroadcast(ctx); -} - -/* static */ Maybe RocAucScoreOp::InferDataType(user_op::InferContext* ctx) { - *ctx->OutputDType("out", 0) = DataType::kFloat; - const user_op::TensorDesc& label = ctx->InputTensorDesc("label", 0); - CHECK_OR_RETURN(IsFloatingDataType(label.data_type()) || IsIntegralDataType(label.data_type())) - << "Input `label` data type " << DataType_Name(label.data_type()) << " is not supported."; - const user_op::TensorDesc& pred = ctx->InputTensorDesc("pred", 0); - CHECK_OR_RETURN(pred.data_type() == DataType::kFloat) - << "Input `pred` data type " << DataType_Name(pred.data_type()) << " is not supported."; - return Maybe::Ok(); -} - -} // namespace oneflow diff --git a/oneflow/user/ops/upsample_op.cpp b/oneflow/user/ops/upsample_op.cpp index 1226fe304e2..0a48bbbfe1a 100644 --- a/oneflow/user/ops/upsample_op.cpp +++ b/oneflow/user/ops/upsample_op.cpp @@ -220,11 +220,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleLinear1DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleLinear1DGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { @@ -245,11 +241,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleNearest1DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleNearest1DGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { @@ -271,11 +263,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleNearest2DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe 
UpsampleNearest2DGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { @@ -297,11 +285,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleBilinear2DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleBilinear2DGradOp::InferLogicalTensorDesc( @@ -324,11 +308,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleBicubic2DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleBicubic2DGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { @@ -350,11 +330,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { @@ -375,11 +351,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleNearest3DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleNearest3DGradOp::InferLogicalTensorDesc(user_op::InferContext* ctx) { @@ -401,11 +373,7 @@ namespace oneflow { } /*static*/ Maybe UpsampleTrilinear3DGradOp::GetSbp(user_op::SbpContext* ctx) { - ctx->NewBuilder() - .Split(user_op::OpArg("dy", 0), 0) - .Split(user_op::OpArg("x", 0), 0) - .Split(user_op::OpArg("dx", 0), 0) - .Build(); + ctx->NewBuilder().Split(user_op::OpArg("dy", 0), 0).Split(user_op::OpArg("dx", 0), 0).Build(); return Maybe::Ok(); } /*static*/ Maybe UpsampleTrilinear3DGradOp::InferLogicalTensorDesc( diff --git a/oneflow/user/ops/where_op.cpp b/oneflow/user/ops/where_op.cpp index e49ffb19fe6..aa2b8fba629 100644 --- a/oneflow/user/ops/where_op.cpp +++ b/oneflow/user/ops/where_op.cpp @@ -20,62 +20,6 @@ namespace oneflow { namespace { -Maybe GetBroadcastShape(const Shape& a_shape, const Shape& b_shape) { - Shape broadcast_shape = Shape::Ones(std::max(a_shape.NumAxes(), b_shape.NumAxes())); - Shape a_extend_shape = CreateLeftExtendedShape(ShapeView(a_shape), broadcast_shape.NumAxes()); - Shape b_extend_shape = CreateLeftExtendedShape(ShapeView(b_shape), broadcast_shape.NumAxes()); - FOR_RANGE(int64_t, i, 0, broadcast_shape.NumAxes()) { - CHECK_OR_RETURN(a_extend_shape.At(i) == 1 || b_extend_shape.At(i) == 1 - || a_extend_shape.At(i) == b_extend_shape.At(i)) - << Error::RuntimeError() << "The size of tensor a (" << a_extend_shape.At(i) - << ") must match the size of tensor b (" << b_extend_shape.At(i) - << ") at non-singleton dimension " << i; - broadcast_shape.Set(i, std::max(a_extend_shape.At(i), b_extend_shape.At(i))); - } - return broadcast_shape; -} - -Maybe>> CalValidSplitDims( - const Shape& a_shape, const Shape& b_shape, const Shape& 
c_shape) { - std::shared_ptr>> vaild_split_dims = - std::make_shared>>(); - int32_t max_num_axes = - std::max(a_shape.NumAxes(), std::max(b_shape.NumAxes(), c_shape.NumAxes())); - Shape broadcast_shape = Shape::Ones(std::max(a_shape.NumAxes(), b_shape.NumAxes())); - Shape a_extend_shape = CreateLeftExtendedShape(ShapeView(a_shape), broadcast_shape.NumAxes()); - Shape b_extend_shape = CreateLeftExtendedShape(ShapeView(b_shape), broadcast_shape.NumAxes()); - Shape c_extend_shape = CreateLeftExtendedShape(ShapeView(c_shape), broadcast_shape.NumAxes()); - int64_t a_dim_offset = max_num_axes - a_shape.NumAxes(); - int64_t b_dim_offset = max_num_axes - b_shape.NumAxes(); - int64_t c_dim_offset = max_num_axes - c_shape.NumAxes(); - FOR_RANGE(int64_t, i, 0, max_num_axes) { - if (a_extend_shape.At(i) != 1 && a_extend_shape.At(i) == b_extend_shape.At(i) - && a_extend_shape.At(i) == c_extend_shape.At(i)) { - vaild_split_dims->emplace_back( - std::make_tuple(i - a_dim_offset, i - b_dim_offset, i - c_dim_offset, i)); - } - } - return vaild_split_dims; -} - -Maybe>> CalValidSplitDims(const Shape& a_shape, - const Shape& b_shape) { - std::shared_ptr>> vaild_split_dims = - std::make_shared>>(); - int32_t max_num_axes = std::max(a_shape.NumAxes(), b_shape.NumAxes()); - Shape broadcast_shape = Shape::Ones(std::max(a_shape.NumAxes(), b_shape.NumAxes())); - Shape a_extend_shape = CreateLeftExtendedShape(ShapeView(a_shape), broadcast_shape.NumAxes()); - Shape b_extend_shape = CreateLeftExtendedShape(ShapeView(b_shape), broadcast_shape.NumAxes()); - int64_t a_dim_offset = max_num_axes - a_shape.NumAxes(); - int64_t b_dim_offset = max_num_axes - b_shape.NumAxes(); - FOR_RANGE(int64_t, i, 0, max_num_axes) { - if (a_extend_shape.At(i) != 1 && a_extend_shape.At(i) == b_extend_shape.At(i)) { - vaild_split_dims->emplace_back(std::make_tuple(i - a_dim_offset, i - b_dim_offset, i)); - } - } - return vaild_split_dims; -} - Maybe InferWhereTensorDesc(user_op::InferContext* ctx) { const Shape& cond_shape = ctx->InputShape("condition", 0); const Shape& x_shape = ctx->InputShape("x", 0); @@ -83,8 +27,16 @@ Maybe InferWhereTensorDesc(user_op::InferContext* ctx) { if (x_shape == y_shape && y_shape == cond_shape) { *ctx->OutputShape("out", 0) = cond_shape; } else { - Shape max_shape = *JUST(GetBroadcastShape(cond_shape, x_shape)); - max_shape = *JUST(GetBroadcastShape(max_shape, y_shape)); + Shape max_shape = + Shape::Ones(std::max(x_shape.NumAxes(), std::max(y_shape.NumAxes(), cond_shape.NumAxes()))); + const Shape& x_extend_shape = CreateLeftExtendedShape(ShapeView(x_shape), max_shape.NumAxes()); + const Shape& y_extend_shape = CreateLeftExtendedShape(ShapeView(y_shape), max_shape.NumAxes()); + const Shape& cond_extend_shape = + CreateLeftExtendedShape(ShapeView(cond_shape), max_shape.NumAxes()); + FOR_RANGE(int64_t, i, 0, max_shape.NumAxes()) { + max_shape.Set(i, std::max(x_extend_shape.At(i), + std::max(y_extend_shape.At(i), cond_extend_shape.At(i)))); + } *ctx->OutputShape("out", 0) = max_shape; } return Maybe::Ok(); @@ -96,7 +48,13 @@ Maybe InferWhereXScalarTensorDesc(user_op::InferContext* ctx) { if (cond_shape == y_shape) { *ctx->OutputShape("out", 0) = cond_shape; } else { - Shape max_shape = *JUST(GetBroadcastShape(cond_shape, y_shape)); + Shape max_shape = Shape::Ones(std::max(y_shape.NumAxes(), cond_shape.NumAxes())); + const Shape& y_extend_shape = CreateLeftExtendedShape(ShapeView(y_shape), max_shape.NumAxes()); + const Shape& cond_extend_shape = + CreateLeftExtendedShape(ShapeView(cond_shape), 
max_shape.NumAxes()); + FOR_RANGE(int64_t, i, 0, max_shape.NumAxes()) { + max_shape.Set(i, std::max(y_extend_shape.At(i), cond_extend_shape.At(i))); + } *ctx->OutputShape("out", 0) = max_shape; } return Maybe::Ok(); @@ -108,7 +66,13 @@ Maybe InferWhereYScalarTensorDesc(user_op::InferContext* ctx) { if (cond_shape == x_shape) { *ctx->OutputShape("out", 0) = cond_shape; } else { - Shape max_shape = *JUST(GetBroadcastShape(cond_shape, x_shape)); + Shape max_shape = Shape::Ones(std::max(x_shape.NumAxes(), cond_shape.NumAxes())); + const Shape& x_extend_shape = CreateLeftExtendedShape(ShapeView(x_shape), max_shape.NumAxes()); + const Shape& cond_extend_shape = + CreateLeftExtendedShape(ShapeView(cond_shape), max_shape.NumAxes()); + FOR_RANGE(int64_t, i, 0, max_shape.NumAxes()) { + max_shape.Set(i, std::max(x_extend_shape.At(i), cond_extend_shape.At(i))); + } *ctx->OutputShape("out", 0) = max_shape; } return Maybe::Ok(); @@ -120,16 +84,14 @@ Maybe InferWhereXYScalarTensorDesc(user_op::InferContext* ctx) { } Maybe GetWhereSbpSignatures(user_op::SbpContext* ctx) { - const Shape& cond_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("condition", 0).shape(); - const Shape& x_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape(); - const Shape& y_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("y", 0).shape(); - const auto& vaild_split_dims = JUST(CalValidSplitDims(cond_shape, x_shape, y_shape)); - for (const auto& vaild_split_dim : *vaild_split_dims) { + const user_op::TensorDesc& condition_tensor = + ctx->LogicalTensorDesc4InputArgNameAndIndex("condition", 0); + FOR_RANGE(int64_t, i, 0, condition_tensor.shape().NumAxes()) { ctx->NewBuilder() - .Split(user_op::OpArg("condition", 0), std::get<0>(vaild_split_dim)) - .Split(user_op::OpArg("x", 0), std::get<1>(vaild_split_dim)) - .Split(user_op::OpArg("y", 0), std::get<2>(vaild_split_dim)) - .Split(user_op::OpArg("out", 0), std::get<3>(vaild_split_dim)) + .Split(user_op::OpArg("condition", 0), i) + .Split(user_op::OpArg("x", 0), i) + .Split(user_op::OpArg("y", 0), i) + .Split(user_op::OpArg("out", 0), i) .Build(); } ctx->NewBuilder() @@ -144,13 +106,57 @@ Maybe GetWhereSbpSignatures(user_op::SbpContext* ctx) { Maybe GetWhereXScalarSbpSignatures(user_op::SbpContext* ctx) { const Shape& cond_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("condition", 0).shape(); const Shape& y_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("y", 0).shape(); - const auto& vaild_split_dims = JUST(CalValidSplitDims(cond_shape, y_shape)); - for (const auto& vaild_split_dim : *vaild_split_dims) { - ctx->NewBuilder() - .Split(user_op::OpArg("condition", 0), std::get<0>(vaild_split_dim)) - .Split(user_op::OpArg("y", 0), std::get<1>(vaild_split_dim)) - .Split(user_op::OpArg("out", 0), std::get<2>(vaild_split_dim)) - .Build(); + if (cond_shape.NumAxes() < y_shape.NumAxes()) { + FOR_RANGE(int64_t, i, 0, y_shape.NumAxes() - cond_shape.NumAxes()) { + ctx->NewBuilder() + .Broadcast(user_op::OpArg("condition", 0)) + .Split(user_op::OpArg("y", 0), i) + .Split(user_op::OpArg("out", 0), i) + .Build(); + } + FOR_RANGE(int64_t, i, 0, cond_shape.NumAxes()) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), cond_shape.NumAxes() - 1 - i) + .Split(user_op::OpArg("y", 0), y_shape.NumAxes() - 1 - i) + .Split(ctx->outputs(), y_shape.NumAxes() - 1 - i) + .Build(); + } + } else if (cond_shape.NumAxes() > y_shape.NumAxes()) { + FOR_RANGE(int64_t, i, 0, cond_shape.NumAxes() - y_shape.NumAxes()) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), i) 
+ .Broadcast(user_op::OpArg("y", 0)) + .Split(user_op::OpArg("out", 0), i) + .Build(); + } + FOR_RANGE(int64_t, i, 0, y_shape.NumAxes()) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), cond_shape.NumAxes() - 1 - i) + .Split(user_op::OpArg("y", 0), y_shape.NumAxes() - 1 - i) + .Split(ctx->outputs(), cond_shape.NumAxes() - 1 - i) + .Build(); + } + } else { + FOR_RANGE(int64_t, i, 0, cond_shape.NumAxes()) { + if (cond_shape.At(i) == 1 && y_shape.At(i) == 1) { continue; } + if (cond_shape.At(i) == y_shape.At(i)) { + ctx->NewBuilder().Split(ctx->inputs(), i).Split(ctx->outputs(), i).Build(); + } else if (cond_shape.At(i) == 1) { + ctx->NewBuilder() + .Broadcast(user_op::OpArg("condition", 0)) + .Split(user_op::OpArg("y", 0), i) + .Split(ctx->outputs(), i) + .Build(); + } else if (y_shape.At(i) == 1) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), i) + .Broadcast(user_op::OpArg("y", 0)) + .Split(ctx->outputs(), i) + .Build(); + } else { + UNIMPLEMENTED(); + } + } } ctx->NewBuilder() .Broadcast(user_op::OpArg("condition", 0)) @@ -163,13 +169,57 @@ Maybe GetWhereXScalarSbpSignatures(user_op::SbpContext* ctx) { Maybe GetWhereYScalarSbpSignatures(user_op::SbpContext* ctx) { const Shape& cond_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("condition", 0).shape(); const Shape& x_shape = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape(); - const auto& vaild_split_dims = JUST(CalValidSplitDims(cond_shape, x_shape)); - for (const auto& vaild_split_dim : *vaild_split_dims) { - ctx->NewBuilder() - .Split(user_op::OpArg("condition", 0), std::get<0>(vaild_split_dim)) - .Split(user_op::OpArg("x", 0), std::get<1>(vaild_split_dim)) - .Split(user_op::OpArg("out", 0), std::get<2>(vaild_split_dim)) - .Build(); + if (cond_shape.NumAxes() < x_shape.NumAxes()) { + FOR_RANGE(int64_t, i, 0, x_shape.NumAxes() - cond_shape.NumAxes()) { + ctx->NewBuilder() + .Broadcast(user_op::OpArg("condition", 0)) + .Split(user_op::OpArg("x", 0), i) + .Split(user_op::OpArg("out", 0), i) + .Build(); + } + FOR_RANGE(int64_t, i, 0, cond_shape.NumAxes()) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), cond_shape.NumAxes() - 1 - i) + .Split(user_op::OpArg("x", 0), x_shape.NumAxes() - 1 - i) + .Split(ctx->outputs(), x_shape.NumAxes() - 1 - i) + .Build(); + } + } else if (cond_shape.NumAxes() > x_shape.NumAxes()) { + FOR_RANGE(int64_t, i, 0, cond_shape.NumAxes() - x_shape.NumAxes()) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), i) + .Broadcast(user_op::OpArg("x", 0)) + .Split(user_op::OpArg("out", 0), i) + .Build(); + } + FOR_RANGE(int64_t, i, 0, x_shape.NumAxes()) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), cond_shape.NumAxes() - 1 - i) + .Split(user_op::OpArg("x", 0), x_shape.NumAxes() - 1 - i) + .Split(ctx->outputs(), cond_shape.NumAxes() - 1 - i) + .Build(); + } + } else { + FOR_RANGE(int64_t, i, 0, cond_shape.NumAxes()) { + if (cond_shape.At(i) == 1 && x_shape.At(i) == 1) { continue; } + if (cond_shape.At(i) == x_shape.At(i)) { + ctx->NewBuilder().Split(ctx->inputs(), i).Split(ctx->outputs(), i).Build(); + } else if (cond_shape.At(i) == 1) { + ctx->NewBuilder() + .Broadcast(user_op::OpArg("condition", 0)) + .Split(user_op::OpArg("x", 0), i) + .Split(ctx->outputs(), i) + .Build(); + } else if (x_shape.At(i) == 1) { + ctx->NewBuilder() + .Split(user_op::OpArg("condition", 0), i) + .Broadcast(user_op::OpArg("x", 0)) + .Split(ctx->outputs(), i) + .Build(); + } else { + UNIMPLEMENTED(); + } + } } ctx->NewBuilder() 
.Broadcast(user_op::OpArg("condition", 0)) diff --git a/python/oneflow/__init__.py b/python/oneflow/__init__.py index 1d7d896ca22..89f5c8c419a 100755 --- a/python/oneflow/__init__.py +++ b/python/oneflow/__init__.py @@ -106,7 +106,6 @@ def is_deprecated(func_or_class): from oneflow._C import log1p from oneflow._C import add from oneflow._C import div, div_ -from oneflow._C import addcmul from oneflow._C import floor, floor_ from oneflow._C import floor_divide from oneflow._C import mul @@ -154,7 +153,6 @@ def is_deprecated(func_or_class): from oneflow._C import pad from oneflow._C import transpose from oneflow._C import relu -from oneflow._C import roc_auc_score from oneflow._C import softmax from oneflow._C import log_softmax from oneflow._C import argmax @@ -167,7 +165,6 @@ def is_deprecated(func_or_class): from oneflow._C import unsqueeze from oneflow._C import permute from oneflow._C import select -from oneflow._C import unbind from oneflow._C import tensor_split from oneflow._C import hsplit from oneflow._C import vsplit @@ -183,8 +180,6 @@ def is_deprecated(func_or_class): from oneflow._C import cumsum from oneflow._C import cumprod from oneflow._C import swapaxes -from oneflow._C import amax -from oneflow._C import swapdims from oneflow._C import t from oneflow._C import masked_fill from oneflow._C import equal @@ -193,7 +188,6 @@ def is_deprecated(func_or_class): from oneflow._C import not_equal as ne from oneflow._C import less as lt from oneflow._C import less_equal as le -from oneflow._C import index_select from oneflow._C import isnan from oneflow._C import isinf from oneflow._oneflow_internal import _set_num_threads as set_num_threads @@ -277,13 +271,7 @@ def atexit_hook(hook): from oneflow._C import tensor, batch_gather from oneflow._C import from_numpy -from oneflow.autograd import ( - enable_grad, - set_grad_enabled, - no_grad, - inference_mode, - is_grad_enabled, -) +from oneflow.autograd import grad_enable, no_grad, inference_mode, is_grad_enabled import oneflow.nn.image from oneflow.framework.check_point_v2 import load @@ -339,6 +327,7 @@ def atexit_hook(hook): from oneflow.nn.modules.logical_ops import logical_or_op as logical_or from oneflow.nn.modules.logical_ops import logical_xor_op as logical_xor from oneflow.nn.modules.tensor_ops import is_floating_point +from oneflow.nn.modules.index_select import index_select_op as index_select from oneflow.nn.modules.masked_select import masked_select_op as masked_select from oneflow.nn.modules.math_ops import addmm_op as addmm from oneflow.nn.modules.math_ops import topk_op as topk @@ -368,8 +357,6 @@ def atexit_hook(hook): ) from oneflow.nn.modules.as_tensor import as_tensor from oneflow.nn.modules.tensor_buffer import tensor_to_tensor_buffer -from oneflow.nn.modules.global_cast import local_to_global_op as local_to_global -from oneflow.nn.modules.global_cast import global_to_global_op as global_to_global from oneflow.nn.modules.global_cast import to_global_op as to_global from oneflow.nn.modules.global_cast import to_local_op as to_local from oneflow.nn.modules.where import where_op as where diff --git a/python/oneflow/autograd/__init__.py b/python/oneflow/autograd/__init__.py index ddee0ebf0db..5ab070f3b02 100644 --- a/python/oneflow/autograd/__init__.py +++ b/python/oneflow/autograd/__init__.py @@ -17,8 +17,7 @@ from oneflow.autograd.autograd import backward, grad from oneflow.autograd.autograd_function import Function from oneflow.autograd.autograd_mode import ( - set_grad_enabled, - enable_grad, + grad_enable, 
inference_mode, is_grad_enabled, no_grad, @@ -28,8 +27,7 @@ "backward", "grad", "Function", - "set_grad_enabled", - "enable_grad", + "grad_enable", "inference_mode", "is_grad_enabled", "no_grad", diff --git a/python/oneflow/autograd/autograd_mode.py b/python/oneflow/autograd/autograd_mode.py index 747134d38c4..8750ed04a4b 100644 --- a/python/oneflow/autograd/autograd_mode.py +++ b/python/oneflow/autograd/autograd_mode.py @@ -74,7 +74,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): pass -class enable_grad: +class grad_enable: r""" Context-manager that enabled gradient calculation. @@ -89,11 +89,11 @@ class enable_grad: >>> import oneflow as flow >>> x = flow.ones(2, 3, requires_grad=True) >>> with flow.no_grad(): - ... with flow.enable_grad(): + ... with flow.grad_enable(): ... y = x * x >>> y.requires_grad True - >>> @flow.enable_grad() + >>> @flow.grad_enable() ... def no_grad_func(x): ... return x * x >>> with flow.no_grad(): @@ -163,55 +163,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): pass -class set_grad_enabled: - r""" - Context-manager that enabled gradient calculation. - - Enables gradient calculation, if it has been disabled via no_grad. - - This context manager is thread local; it will not affect computation in other threads. - - Also functions as a decorator. (Make sure to instantiate with parenthesis.) - - - Args: - mode (bool): Flag whether to enable or disable gradient calculation. (default: True) - - .. code-block:: python - - >>> import oneflow as flow - >>> x = flow.ones(2, 3, requires_grad=True) - >>> with flow.set_grad_enabled(True): - ... y = x * x - >>> y.requires_grad - True - >>> @flow.set_grad_enabled(False) - ... def no_grad_func(x): - ... return x * x - >>> y = no_grad_func(x) - >>> y.requires_grad - False - - """ - - def __init__(self, is_train=True): - self.is_train = is_train - - def __call__(self, func): - def wrapper(*args, **kwargs): - with AutoGradMode(self.is_train): - return func(*args, **kwargs) - - return wrapper - - def __enter__(self): - self.grad_mode = AutoGradMode(self.is_train) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - if __name__ == "__main__": import doctest diff --git a/python/oneflow/framework/check_point_v2.py b/python/oneflow/framework/check_point_v2.py index f45a074b867..9b8d2e20111 100644 --- a/python/oneflow/framework/check_point_v2.py +++ b/python/oneflow/framework/check_point_v2.py @@ -358,25 +358,12 @@ def save( obj = {"protocol_version": PROTOCOL_VERSION, "data": obj} with tensor_pickling_context(path, global_dst_rank): pickled_bytes = pickle.dumps(obj) - - def write_to_path(path): + rank = flow.env.get_rank() + if global_dst_rank is None or global_dst_rank == rank: path.mkdir(exist_ok=True) pickle_path = path / PICKLE_FILENAME pickle_path.write_bytes(pickled_bytes) - if global_dst_rank is not None: - assert isinstance( - global_dst_rank, int - ), f"global_dst_rank expected type int, but got {type(global_dst_rank)}." - assert ( - global_dst_rank >= 0 and global_dst_rank < flow.env.get_world_size() - ), f"out of range (expected to be in range of [0, {flow.env.get_world_size()}), but got {global_dst_rank})." 
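grad_enable in the autograd_mode.py hunk above works both as a context manager and as a decorator: __call__ wraps the target function so its body runs inside the same AutoGradMode guard that __enter__ installs. A stripped-down sketch of that dual-use pattern, using a module-level flag as a stand-in for the real thread-local AutoGradMode binding:

_GRAD_ENABLED = False  # stand-in for the real autograd mode flag

class grad_enable_sketch:
    def __enter__(self):
        global _GRAD_ENABLED
        self._prev, _GRAD_ENABLED = _GRAD_ENABLED, True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        global _GRAD_ENABLED
        _GRAD_ENABLED = self._prev

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            with self:
                return func(*args, **kwargs)
        return wrapper

@grad_enable_sketch()
def square(x):
    return x * x  # body runs with _GRAD_ENABLED set to True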
- if flow.env.get_rank() == global_dst_rank: - write_to_path(path) - else: - # global_dst_rank is None - write_to_path(path) - save_load_path = None global_src_dsk_rank = None diff --git a/python/oneflow/framework/docstr/__init__.py b/python/oneflow/framework/docstr/__init__.py index e5dfd9154e9..bdc683843be 100644 --- a/python/oneflow/framework/docstr/__init__.py +++ b/python/oneflow/framework/docstr/__init__.py @@ -46,8 +46,6 @@ from .clamp import * from .erfinv import * from .swapaxes import * -from .amax import * -from .unbind import * from .repeat import * from .tile import * from .tensor_t import * @@ -62,11 +60,9 @@ from .index_select import * from .sort import * from .is_floating_point import * -from .swapdims import * from .where import * from .einsum import * from .oneflow import * from .argsort import * from .module import * from .util_ops import * -from .deconv import * diff --git a/python/oneflow/framework/docstr/amax.py b/python/oneflow/framework/docstr/amax.py deleted file mode 100644 index 31407270407..00000000000 --- a/python/oneflow/framework/docstr/amax.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import oneflow -from oneflow.framework.docstr.utils import add_docstr - -add_docstr( - oneflow.amax, - """ - oneflow.amax(input, dim=None, keepdim=False) -> Tensor - - This function is equivalent to PyTorch’s amax function. It returns the maximum along a dimension. - - Args: - input (oneflow.Tensor): the input Tensor. - dim (int or List of int, optional): the dimension or the dimensions to reduce. Dim is None by default. - keepdim (bool, optional): whether to retain the dimension. keepdim is False by default. - - Returns: - oneflow.Tensor: Maximum of the input tensor - - For example: - - .. code-block:: python - - >>> import oneflow as flow - - >>> x = flow.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) - >>> flow.amax(x, 1) - tensor([[2, 3], - [6, 7]], dtype=oneflow.int64) - >>> flow.amax(x, 0) - tensor([[4, 5], - [6, 7]], dtype=oneflow.int64) - >>> flow.amax(x) - tensor(7, dtype=oneflow.int64) - >>> flow.amax(x, 0, True) - tensor([[[4, 5], - [6, 7]]], dtype=oneflow.int64) - """, -) diff --git a/python/oneflow/framework/docstr/conv.py b/python/oneflow/framework/docstr/conv.py index 76183ae5417..3655e794b81 100644 --- a/python/oneflow/framework/docstr/conv.py +++ b/python/oneflow/framework/docstr/conv.py @@ -19,7 +19,7 @@ add_docstr( oneflow._C.conv1d, r""" - conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor + conv1d(input, weight, bias=None, stride=[1], padding=[0], dilation=[1], groups=1) -> Tensor The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.functional.conv1d.html?highlight=conv1d @@ -29,9 +29,9 @@ See :class:`~oneflow.nn.Conv1d` for details and output shape. 
Args: - input: input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iW)` - weight: filters of shape :math:`(\text{out_channels} , \frac{\text{in_channels}}{\text{groups}} , iW)` - bias: optional bias of shape :math:`(\text{out_channels})`. Default: None. + input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iW)` + weight: quantized filters of shape :math:`(\text{out_channels} , \frac{\text{in_channels}}{\text{groups}} , iW)` + bias: **non-quantized** bias tensor of shape :math:`(\text{out_channels})`. The tensor type must be `flow.float`. stride: the stride of the convolving kernel. Can be a single number or a tuple `(sW,)`. Default: 1 padding: implicit paddings on both sides of the input. Can be a @@ -46,17 +46,17 @@ .. code-block:: python >>> import oneflow as flow - >>> import oneflow.nn.functional as F + >>> import numpy as np - >>> inputs = flow.randn(33, 16, 30) - >>> filters = flow.randn(20, 16, 5) - >>> outputs = F.conv1d(inputs, filters) + >>> input = flow.tensor(np.random.randn(33, 16, 30), dtype=flow.float32) + >>> filters = flow.tensor(np.random.randn(20, 16, 5), dtype=flow.float32) + >>> out = flow._C.conv1d(input, filters,stride=[1], padding=[0], dilation=[1], channel_pos="channels_first") """, ) add_docstr( oneflow._C.conv2d, r""" - conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor + conv2d(input, weight, bias=None, stride=[1], padding=[0], dilation=[1], groups=1) -> Tensor The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.functional.conv2d.html?highlight=conv2d @@ -66,9 +66,9 @@ See :class:`~oneflow.nn.Conv2d` for details and output shape. Args: - input: input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iH , iW)` - weight: filters of shape :math:`(\text{out_channels} , \frac{\text{in_channels}}{\text{groups}} , kH , kW)` - bias: optional bias of shape :math:`(\text{out_channels})`. Default: None. + input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iH , iW)` + weight: quantized filters of shape :math:`(\text{out_channels} , \frac{\text{in_channels}}{\text{groups}} , kH , kW)` + bias: **non-quantized** bias tensor of shape :math:`(\text{out_channels})`. The tensor type must be `flow.float`. stride: the stride of the convolving kernel. Can be a single number or a tuple `(sH, sW)`. Default: 1 padding: implicit paddings on both sides of the input. Can be a @@ -77,24 +77,14 @@ a tuple `(dH, dW)`. Default: 1 groups: split input into groups, :math:`\text{in_channels}` should be divisible by the number of groups. Default: 1 - - For examples: - - .. code-block:: python - - >>> import oneflow as flow - >>> import oneflow.nn.functional as F - - >>> inputs = flow.randn(8, 4, 3, 3) - >>> filters = flow.randn(1, 4, 5, 5) - >>> outputs = F.conv2d(inputs, filters, padding=1) + """, ) add_docstr( oneflow._C.conv3d, r""" - conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor + conv3d(input, weight, bias=None, stride=[1], padding=[0], dilation=[1], groups=1) -> Tensor The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.functional.conv3d.html?highlight=conv3d @@ -105,11 +95,12 @@ See :class:`~oneflow.nn.Conv3d` for details and output shape. 
Args: - input: input tensor of shape + input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iD , iH , iW)` - weight: filters of shape + weight: quantized filters of shape :math:`(\text{out_channels} , \frac{\text{in_channels}}{\text{groups}} , kD , kH , kW)` - bias: optional bias of shape :math:`(\text{out_channels})`. Default: None. + bias: **non-quantized** bias tensor of shape + :math:`(\text{out_channels})`. The tensor type must be `flow.float`. stride: the stride of the convolving kernel. Can be a single number or a tuple `(sD, sH, sW)`. Default: 1 padding: implicit paddings on both sides of the input. Can be a @@ -118,17 +109,7 @@ a tuple `(dD, dH, dW)`. Default: 1 groups: split input into groups, :math:`\text{in_channels}` should be divisible by the number of groups. Default: 1 - - For examples: - - .. code-block:: python - - >>> import oneflow as flow - >>> import oneflow.nn.functional as F - >>> inputs = flow.randn(20, 16, 50, 10, 20) - >>> filters = flow.randn(33, 16, 3, 3, 3) - >>> outputs = F.conv3d(inputs, filters) """, ) diff --git a/python/oneflow/framework/docstr/deconv.py b/python/oneflow/framework/docstr/deconv.py deleted file mode 100644 index 2b2219595d9..00000000000 --- a/python/oneflow/framework/docstr/deconv.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import oneflow -from oneflow.framework.docstr.utils import add_docstr - -add_docstr( - oneflow._C.deconv1d, - r""" - conv_transpose1d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor - - The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.functional.conv_transpose1d.html - - Applies a 1D transposed convolution operator over an input signal composed of several input planes, sometimes also called “deconvolution”. - - See :class:`~oneflow.nn.ConvTranspose1d` for details and output shape. - - Args: - input: input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iW)` - weight: filters of shape :math:`(\text{in_channels} , \frac{\text{out_channels}}{\text{groups}} , kW)` - bias: optional bias of shape :math:`(\text{out_channels})`. Default: None. - stride: the stride of the convolving kernel. Can be a single number or a - tuple `(sW,)`. Default: 1 - padding: `dilation * (kernel_size - 1) - padding` zero-padding will be added to both sides of each dimension in the input. Can be a single number or a tuple `(padW,)`. Default: 0 - output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple `(out_padW)`. Default: 0 - groups: split input into groups, :math:`\text{in_channels}` should be divisible by the - number of groups. Default: 1 - dilation: the spacing between kernel elements. Can be a single number or - a tuple `(dW,)`. Default: 1 - - For examples: - - .. 
code-block:: python - - >>> import oneflow as flow - >>> import oneflow.nn.functional as F - - >>> inputs = flow.randn(20, 16, 50) - >>> weights = flow.randn(16, 33, 5) - >>> outputs = F.conv_transpose1d(inputs, weights) - """, -) -add_docstr( - oneflow._C.deconv2d, - r""" - conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor - - The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.functional.conv_transpose3d.html - - Applies a 2D transposed convolution operator over an input image composed of several input planes, sometimes also called “deconvolution”. - - See :class:`~oneflow.nn.ConvTranspose2d` for details and output shape. - - Args: - input: input tensor of shape :math:`(\text{minibatch} , \text{in_channels} , iH , iW)` - weight: filters of shape :math:`(\text{in_channels} , \frac{\text{out_channels}}{\text{groups}} , kH , kW)` - bias: optional bias of shape :math:`(\text{out_channels})`. Default: None. - stride: the stride of the convolving kernel. Can be a single number or a - tuple `(sH, sW)`. Default: 1 - padding: `dilation * (kernel_size - 1) - padding` zero-padding will be added to both sides of each dimension in the input. Can be a single number or a tuple `(padH, padW)`. Default: 0 - output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple `(out_padH, out_padW)`. Default: 0 - groups: split input into groups, :math:`\text{in_channels}` should be divisible by the - number of groups. Default: 1 - dilation: the spacing between kernel elements. Can be a single number or - a tuple `(dH, dW)`. Default: 1 - - For examples: - - .. code-block:: python - - >>> import oneflow as flow - >>> import oneflow.nn.functional as F - - >>> inputs = flow.randn(1, 4, 5, 5) - >>> weights = flow.randn(4, 8, 3, 3) - >>> outputs = F.conv_transpose2d(inputs, weights, padding=1) - """, -) -add_docstr( - oneflow._C.deconv3d, - r""" - conv_transpose3d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) -> Tensor - - The documentation is referenced from: https://pytorch.org/docs/stable/generated/torch.nn.functional.conv_transpose3d.html - - Applies a 3D transposed convolution operator over an input image composed of several input planes, sometimes also called “deconvolution”. - - See :class:`~oneflow.nn.ConvTranspose3d` for details and output shape. - - Args: - input: input tensor of shape - :math:`(\text{minibatch} , \text{in_channels} , iT , iH , iW)` - weight: filters of shape - :math:`(\text{in_channels} , \frac{\text{out_channels}}{\text{groups}} , kT , kH , kW)` - bias: optional bias of shape :math:`(\text{out_channels})`. Default: None. - stride: the stride of the convolving kernel. Can be a single number or a - tuple `(sD, sH, sW)`. Default: 1 - padding: `dilation * (kernel_size - 1) - padding` zero-padding will be added to both sides of each dimension in the input. Can be a single number or a tuple `(padT, padH, padW)`. Default: 0 - output_padding: additional size added to one side of each dimension in the output shape. Can be a single number or a tuple `(out_padT, out_padH, out_padW)`. Default: 0 - groups: split input into groups, :math:`\text{in_channels}` should be - divisible by the number of groups. Default: 1 - dilation: the spacing between kernel elements. Can be a single number or - a tuple `(dT, dH, dW)`. Default: 1 - - For examples: - - .. 
code-block:: python - - >>> import oneflow as flow - >>> import oneflow.nn.functional as F - - >>> inputs = flow.randn(20, 16, 50, 10, 20) - >>> weights = flow.randn(16, 33, 3, 3, 3) - >>> outputs = F.conv_transpose3d(inputs, weights) - """, -) diff --git a/python/oneflow/framework/docstr/index_select.py b/python/oneflow/framework/docstr/index_select.py index 7ecaf5e8288..4c494c9bae4 100644 --- a/python/oneflow/framework/docstr/index_select.py +++ b/python/oneflow/framework/docstr/index_select.py @@ -22,7 +22,7 @@ input.index_select(dim, index) -> Tensor The interface is consistent with PyTorch. - The documentation is referenced from: https://pytorch.org/docs/1.11/generated/torch.index_select.html#torch.index_select + The documentation is referenced from: https://pytorch-cn.readthedocs.io/zh/latest/package_references/torch/#torchindex_select Select values along an axis specified by `dim`. @@ -54,56 +54,5 @@ >>> output tensor([[1, 2], [4, 5]], dtype=oneflow.int32) - - .. - Feature Stage of Operator [index_select]. - - Maintainer List [@QiangX-man, @hjchen2, @strint] - - Current Stage [ ] - - Alpha Stage Check List [ ] - - API(Compatible with PyTorch 1.11, anything incompatible must be noted in API Doc.)[Yes] - - Doc(API Doc must be provided and showed normally on the web page.)[Yes] - - Functionality and its' Test [ ] - - Functionality is highly compatiable with PyTorch 1.11. [Yes] - - eager local [Yes] [@QiangX-man, @hjchen2] - - forward [Yes] - - backward [Yes] - - gpu [Yes] - - cpu [Yes] - - graph local [ ] [@BBuf, @strint, @hjchen2] - - forward [Yes] - - backward [ ] - - gpu [Yes] - - cpu [Yes] - - Exception Handling - - Exception Message and Hint must be provided [ ] - - Beta Stage Check List [ ] - - API(High compatibility with PyTorch 1.11, shouldn't have anything incompatible for a naive reason.)[ ] - - Doc(Same standard as Alpha Stage)[ ] - - Functionality and its' Test [ ] - - eager global [ ] - - forward [ ] - - backward [ ] - - gpu [ ] - - cpu [ ] - - graph gloal [ ] - - forward [ ] - - backward [ ] - - gpu [ ] - - cpu [ ] - - Performance and Scalability(Must be evaluated.)[ ] - - CUDA kernel [ ] - - CPU kernel [ ] - - N nodes M devices [ ] - - Exception Handling [ ] - - Exception Message and Hint must be provided [ ] - - Try you best to do Exception Recovery [ ] - - Stable Stage Check List [ ] - - API(Same standard as Beta Stage)[ ] - - Doc(Same standard as Beta Stage)[ ] - - Functionality and its' Test [ ] - - fp16 and AMP [ ] - - NHWC [ ] - - Performance and Scalability(Must be evaluated.)[ ] - - Exception Handling [ ] """, ) diff --git a/python/oneflow/framework/docstr/math_ops.py b/python/oneflow/framework/docstr/math_ops.py index 0c0a7fcc6dd..1958e6cdd0b 100644 --- a/python/oneflow/framework/docstr/math_ops.py +++ b/python/oneflow/framework/docstr/math_ops.py @@ -1378,45 +1378,6 @@ """, ) -add_docstr( - oneflow.addcmul, - r""" - oneflow.addcmul(input, tensor1, tensor2, *, value=1) -> Tensor - - Performs the element-wise multiplication of tensor1 by tensor2, multiply the result - by the scalar value and add it to input. - The documentation is referenced from: - https://pytorch.org/docs/stable/generated/torch.addcmul.html - - .. math:: - \text{out}_i = \text{input}_i + value \times\ \text{tensor1}_i \times\ \text{tensor2}_i - - Args: - input (Tensor): the tensor to be added. - tensor1 (Tensor): the tensor to be multiplied. - tensor2 (Tensor): the tensor to be multiplied. - - Keyword args: - value (Number, optional): multiplier for :math:`tensor1 * tensor2`. 
- - Returns: - oneflow.Tensor: the output Tensor. - - For example: - - .. code-block:: python - - >>> import oneflow as flow - - >>> input = flow.rand(2, 3, 4) - >>> tensor1 = flow.rand(2, 3, 4) - >>> tensor2 = flow.rand(2, 3, 4) - >>> out = flow.addcmul(input, tensor1, tensor2, value=2) - >>> out.size() - oneflow.Size([2, 3, 4]) - """, -) - add_docstr( oneflow.eye, """oneflow.eye(n, m, *, device=None, requires_grad=False, placement=None, sbp) -> Tensor diff --git a/python/oneflow/framework/docstr/roc_auc_score.py b/python/oneflow/framework/docstr/roc_auc_score.py deleted file mode 100644 index 58cfb41ec56..00000000000 --- a/python/oneflow/framework/docstr/roc_auc_score.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import oneflow -from oneflow.framework.docstr.utils import add_docstr - -add_docstr( - oneflow.roc_auc_score, - """ - oneflow.roc_auc_score(label, pred) -> Tensor - - Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. - - Note: Currently this implementation can only be used on CPU. - - Args: - label (Tensor[N, 1]): True lable of the samples - pred (Tensor[N, 1]): Predicted probability value to be true - - Returns: - Tensor[1, ]: float32 tensor of auc score - - For example: - - .. code-block:: python - - - >>> import numpy as np - >>> import oneflow as flow - - >>> label = flow.Tensor([0, 0, 1, 1]) - >>> pred = flow.Tensor([0.1, 0.4, 0.35, 0.8]) - - >>> score = flow.roc_auc_score(label, pred) - >>> score - tensor([0.7500], dtype=oneflow.float32) - - - """, -) diff --git a/python/oneflow/framework/docstr/swapaxes.py b/python/oneflow/framework/docstr/swapaxes.py index c126ecab6ad..bae56948f5a 100644 --- a/python/oneflow/framework/docstr/swapaxes.py +++ b/python/oneflow/framework/docstr/swapaxes.py @@ -17,10 +17,8 @@ from oneflow.framework.docstr.utils import add_docstr add_docstr( - oneflow._C.swapaxes, - """swapaxes(input, axis0, axis1) -> Tensor - - This function is equivalent to NumPy’s swapaxes function. + oneflow.swapaxes, + """This function is equivalent to NumPy’s swapaxes function. For example: diff --git a/python/oneflow/framework/docstr/swapdims.py b/python/oneflow/framework/docstr/swapdims.py deleted file mode 100644 index ccb5478aaf3..00000000000 --- a/python/oneflow/framework/docstr/swapdims.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import oneflow -from oneflow.framework.docstr.utils import add_docstr - -add_docstr( - oneflow._C.swapdims, - """ - swapdims(input, dim0, dim1) -> Tensor - - This function is equivalent to torch’s swapdims function. - - For example: - - .. code-block:: python - - >>> import oneflow as flow - - >>> x = flow.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) - >>> x - tensor([[[0, 1], - [2, 3]], - - [[4, 5], - [6, 7]]], dtype=oneflow.int64) - >>> flow.swapdims(x, 0, 1) - tensor([[[0, 1], - [4, 5]], - - [[2, 3], - [6, 7]]], dtype=oneflow.int64) - >>> flow.swapdims(x, 0, 2) - tensor([[[0, 4], - [2, 6]], - - [[1, 5], - [3, 7]]], dtype=oneflow.int64) - - """, -) diff --git a/python/oneflow/framework/docstr/tensor.py b/python/oneflow/framework/docstr/tensor.py index 941addbc12f..7bee7297faf 100644 --- a/python/oneflow/framework/docstr/tensor.py +++ b/python/oneflow/framework/docstr/tensor.py @@ -253,103 +253,10 @@ """, ) -add_docstr( - oneflow.Tensor.local_to_global, - """ - Tensor.local_to_global(placement=None, sbp=None, *, check_meta=Ture) -> Tensor - - Creates a global tensor from a local tensor. - - Note: - This tensor must be local tensor. - - Both placement and sbp are required. - - The returned global tensor takes this tensor as its local component in the current rank. - - There is no data communication usually, but when sbp is ``oneflow.sbp.broadcast``, the data on rank 0 will be broadcast to other ranks. - - Args: - placement (flow.placement, optional): the desired placement of returned global tensor. Default: None - sbp (flow.sbp.sbp or tuple of flow.sbp.sbp, optional): the desired sbp of returned global tensor. Default: None - Keyword Args: - check_meta (bool, optional): indicates whether to check meta information when createing global tensor from local - tensor. Only can be set to False when the shape and dtype of the input local tensor on each rank are the same. If set to False, the - execution of local_to_global can be accelerated. Default: True - - .. code-block:: python - - >>> # Run on 2 ranks respectively - >>> import oneflow as flow - >>> input = flow.tensor([0., 1.], dtype=flow.float32) # doctest: +SKIP - >>> output = input.local_to_global(placement=flow.placement("cpu", ranks=[0, 1]), sbp=[flow.sbp.split(0)], check_meta=False) # doctest: +SKIP - >>> print(output.size()) # doctest: +SKIP - >>> print(output) # doctest: +SKIP - - .. code-block:: python - - >>> # results on rank 0 - oneflow.Size([4]) - tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) - - .. code-block:: python - - >>> # results on rank 1 - oneflow.Size([4]) - tensor([0., 1., 0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) - """, -) - -add_docstr( - oneflow.Tensor.global_to_global, - """ - Tensor.global_to_global(placement=None, sbp=None, *, grad_sbp=None, check_meta=False) -> Tensor - - Performs Tensor placement and/or sbp conversion. - - Note: - This tensor must be global tensor. - - At least one of placement and sbp is required. - - If placement and sbp are all the same as this tensor's own placement and sbp, then returns this tensor own. - - Args: - placement (flow.placement, optional): the desired placement of returned global tensor. Default: None - sbp (flow.sbp.sbp or tuple of flow.sbp.sbp, optional): the desired sbp of returned global tensor. 
Default: None - Keyword Args: - grad_sbp (flow.sbp.sbp or tuple of flow.sbp.sbp, optional): manually specify the sbp of this tensor's grad - tensor in the backward pass. If None, the grad tensor sbp will be infered automatically. Default: None - check_meta (bool, optional): indicates whether to check meta information. If set to True, check the consistency - of the input meta information (placement and sbp) on each rank. Default: False - - .. code-block:: python - - >>> # Run on 2 ranks respectively - >>> import oneflow as flow - >>> input = flow.tensor([0., 1.], dtype=flow.float32, placement=flow.placement("cpu", ranks=[0, 1]), sbp=[flow.sbp.broadcast]) # doctest: +SKIP - >>> output = input.global_to_global(placement=flow.placement("cpu", ranks=[0, 1]), sbp=[flow.sbp.split(0)]) # doctest: +SKIP - >>> print(output.size()) # doctest: +SKIP - >>> print(output) # doctest: +SKIP - - .. code-block:: python - - >>> # results on rank 0 - oneflow.Size([2]) - tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) - - .. code-block:: python - - >>> # results on rank 1 - oneflow.Size([2]) - tensor([0., 1.], placement=oneflow.placement(type="cpu", ranks=[0, 1]), sbp=(oneflow.sbp.split(axis=0),), dtype=oneflow.float32) - """, -) - add_docstr( oneflow.Tensor.to_global, """ - Tensor.to_global(placement=None, sbp=None, **kwargs) -> Tensor + Tensor.to_global(placement=None, sbp=None, grad_sbp=None) -> Tensor Creates a global tensor if this tensor is a local tensor, otherwise performs Tensor placement and/or sbp conversion. @@ -373,12 +280,7 @@ Args: placement (flow.placement, optional): the desired placement of returned global tensor. Default: None sbp (flow.sbp.sbp or tuple of flow.sbp.sbp, optional): the desired sbp of returned global tensor. Default: None - Keyword Args: - grad_sbp (flow.sbp.sbp or tuple of flow.sbp.sbp, optional): manually specify the sbp of this tensor's grad - tensor in the backward pass. If None, the grad tensor sbp will be infered automatically. It is only used if this tensor is a - global tensor. Default: None - check_meta (bool, optional): indicates whether to check meta information. If set to True, check the input meta - information on each rank. Default: True if this tensor is a local tensor, False if this tensor is a global tensor + grad_sbp (flow.sbp.sbp or tuple of flow.sbp.sbp, optional): manually specify the sbp of this tensor's grad tensor in the backward pass. If None, the grad tensor sbp will be infered automatically. It is only used if this tensor is a global tensor. 
Default: None For local tensor: @@ -387,7 +289,7 @@ >>> # Run on 2 ranks respectively >>> import oneflow as flow >>> input = flow.tensor([0., 1.], dtype=flow.float32) # doctest: +SKIP - >>> output = input.to_global(placement=flow.placement("cpu", ranks=[0, 1]), sbp=[flow.sbp.split(0)], check_meta=False) # doctest: +SKIP + >>> output = input.to_global(placement=flow.placement("cpu", ranks=[0, 1]), sbp=[flow.sbp.split(0)]) # doctest: +SKIP >>> print(output.size()) # doctest: +SKIP >>> print(output) # doctest: +SKIP @@ -840,13 +742,6 @@ """, ) -add_docstr( - oneflow.Tensor.unbind, - """ - See :func:`oneflow.unbind` - """, -) - add_docstr( oneflow.Tensor.swapaxes, """ @@ -854,20 +749,6 @@ """, ) -add_docstr( - oneflow.Tensor.amax, - """ - See :func:`oneflow.amax` - """, -) - -add_docstr( - oneflow.Tensor.swapdims, - """ - See :func:`oneflow.swapdims` - """, -) - add_docstr( oneflow.Tensor.cast, """ @@ -1151,20 +1032,6 @@ """, ) -add_docstr( - oneflow.Tensor.addcmul, - """ - See :func:`oneflow.addcmul` - """, -) - -add_docstr( - oneflow.Tensor.addcmul_, - """ - In-place version of :func:`oneflow.Tensor.addcmul`. - """, -) - add_docstr( oneflow.Tensor.asin, """ @@ -1415,18 +1282,6 @@ """, ) - -add_docstr( - oneflow.Tensor.half, - """ - self.half() is equivalent to self.to(dtype=oneflow.float16). - - See :func:`oneflow.Tensor.to` - - """, -) - - add_docstr( oneflow.Tensor.gather, """ @@ -1700,22 +1555,6 @@ """, ) -add_docstr( - oneflow.Tensor.reshape_as, - """ - Tensor.reshape_as(other) -> Tensor - Returns this tensor as the same shape as other. - self.reshape_as(other) is equivalent to self.reshape(other.sizes()). - This method returns a view if other.sizes() is compatible with the current shape. - See :func:`oneflow.Tensor.view` on when it is possible to return a view. - - Please see reshape() for more information about reshape. See :func:`oneflow.reshape` - - Parameters - other (oneflow.Tensor) – The result tensor has the same shape as other. - """, -) - add_docstr( oneflow.Tensor.view, """ diff --git a/python/oneflow/framework/docstr/unbind.py b/python/oneflow/framework/docstr/unbind.py deleted file mode 100644 index 76313f7e542..00000000000 --- a/python/oneflow/framework/docstr/unbind.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import oneflow -from oneflow.framework.docstr.utils import add_docstr - -add_docstr( - oneflow.unbind, - """ - This function is equivalent to PyTorch's unbind function. - Removes a tensor dimension. - - Returns a tuple of all slices along a given dimension, already without it. - - Args: - x(Tensor): the tensor to unbind - dim(int): dimension to remove - - For example: - - .. 
code-block:: python - - >>> import oneflow as flow - - >>> x = flow.tensor(range(12)).reshape([3,4]) - >>> flow.unbind(x) - (tensor([0, 1, 2, 3], dtype=oneflow.int64), tensor([4, 5, 6, 7], dtype=oneflow.int64), tensor([ 8, 9, 10, 11], dtype=oneflow.int64)) - >>> flow.unbind(x, 1) - (tensor([0, 4, 8], dtype=oneflow.int64), tensor([1, 5, 9], dtype=oneflow.int64), tensor([ 2, 6, 10], dtype=oneflow.int64), tensor([ 3, 7, 11], dtype=oneflow.int64)) - - """, -) diff --git a/python/oneflow/framework/env_util.py b/python/oneflow/framework/env_util.py index d216dfe88e3..ef6b80ca327 100644 --- a/python/oneflow/framework/env_util.py +++ b/python/oneflow/framework/env_util.py @@ -23,6 +23,8 @@ import oneflow.core.job.env_pb2 as env_pb import oneflow.core.job.resource_pb2 as resource_util import oneflow.framework.c_api_util as c_api_util +import oneflow.framework.scope_util as scope_util +import oneflow.framework.session_context as session_ctx def api_all_device_placement(device_type: str) -> oneflow._oneflow_internal.placement: @@ -40,7 +42,7 @@ def api_all_device_placement(device_type: str) -> oneflow._oneflow_internal.plac # Runs on 4 ranks import oneflow as flow - + p = flow.env.all_device_placement("cuda") # oneflow.placement(type="cuda", ranks=[0, 1, 2, 3]) p = flow.env.all_device_placement("cpu") # oneflow.placement(type="cpu", ranks=[0, 1, 2, 3]) diff --git a/python/oneflow/framework/graph_build_util.py b/python/oneflow/framework/graph_build_util.py index 917de985a04..b5e8ecc137b 100644 --- a/python/oneflow/framework/graph_build_util.py +++ b/python/oneflow/framework/graph_build_util.py @@ -105,6 +105,7 @@ def __init__(self, s_level, v_level=0, mode=False, max_py_stack_depth=2): max_py_stack_depth, self._prev_max_py_stack_depth ) + def __enter__(self): oneflow._oneflow_internal.SetFLAGS_v(self._v) oneflow._oneflow_internal.SetGraphDebugMode(self._mode) @@ -112,6 +113,7 @@ def __enter__(self): oneflow._oneflow_internal.SetFLAGS_alsologtostderr(True) oneflow._oneflow_internal.SetGraphDebugMaxPyStackDepth(self._max_py_stack_depth) + def __exit__(self, exc_type, exc_val, exc_tb): if self._s == 0 and self._v >= 1: oneflow._oneflow_internal.SetFLAGS_alsologtostderr(self._prev_logtostderr) diff --git a/python/oneflow/framework/tensor.py b/python/oneflow/framework/tensor.py index 30ed10b8342..9ba5b5a2102 100644 --- a/python/oneflow/framework/tensor.py +++ b/python/oneflow/framework/tensor.py @@ -71,6 +71,10 @@ def _backward(self, gradient=None, retain_graph=False, create_graph=False): flow._oneflow_internal.nn.graph.AddTensorAsGraphLoss(self) +def _getitem(self, key): + return flow._C.tensor_getitem(self, key) + + def _setitem(self, key, value): if self.is_global: if isinstance(value, (int, float)): @@ -385,14 +389,6 @@ def _swapaxes(self, dim0, dim1): return flow._C.swapaxes(self, dim0, dim1) -def _amax(self, dim=None, keepdim=False): - return flow._C.amax(self, dim=dim, keepdim=keepdim) - - -def _swapdims(self, dim0, dim1): - return flow._C.swapdims(self, dim0, dim1) - - def _cast(self, dtype): return flow.cast(self, dtype) @@ -473,14 +469,6 @@ def _cosh(self): return flow.cosh(self) -def _addcmul(self, tensor1, tensor2, *, value=1): - return flow._C.addcmul(self, tensor1, tensor2, value=value) - - -def _addcmul_(self, tensor1, tensor2, *, value=1): - return flow._C.addcmul_(self, tensor1, tensor2, value=value) - - def _erf(self): return flow.erf(self) @@ -505,10 +493,6 @@ def _fmod(self, other): return flow.fmod(self, other) -def _half(self): - return flow._C.to(self, flow.float16) - - def 
_index(self): assert self.numel() == 1 and self.dtype in ( flow.uint8, @@ -699,10 +683,6 @@ def _split(self, split_size_or_sections=None, dim=0): return flow._C.split(self, split_size_or_sections, dim) -def _unbind(self, dim=0): - return flow._C.unbind(self, dim) - - def _all(self, dim=None, keepdim=False): return flow.all(self, dim, keepdim) @@ -711,6 +691,12 @@ def _any(self, dim=None, keepdim=False): return flow.any(self, dim, keepdim) +def _len(self): + if self.dim() == 0: + raise TypeError("len() of a 0-d tensor") + return self.shape[0] + + def _uniform(self, a=0, b=1): if isinstance(a, Tensor): assert a.ndim == 0 and a.nelement() == 1, "a must be a number or scalar tensor!" @@ -795,7 +781,7 @@ def _copy_from_numpy_to_eager_local_tensor(eager_local_tensor, np_arr): def _init_by_initializer_conf(tensor, initializer_conf, random_seed=None): if random_seed is None: - random_seed = flow.default_generator.initial_seed() + random_seed = flow.default_generator.seed() shape = tuple(tensor.shape) initializer = initializer_util.GetInitializer(initializer_conf, random_seed, shape) @@ -873,18 +859,8 @@ def _to(self, *args, **kwargs): return flow._C.to(self, *new_args, **kwargs) -def _local_to_global(self, placement=None, sbp=None, *, check_meta=True): - return flow.local_to_global(self, placement, sbp, check_meta) - - -def _global_to_global( - self, placement=None, sbp=None, *, grad_sbp=None, check_meta=False -): - return flow.global_to_global(self, placement, sbp, grad_sbp, check_meta) - - -def _to_global(self, placement=None, sbp=None, **kwargs): - return flow.to_global(self, placement, sbp, **kwargs) +def _to_global(self, placement=None, sbp=None, grad_sbp=None): + return flow.to_global(self, placement, sbp, grad_sbp) def _to_local(self): @@ -979,10 +955,6 @@ def _reshape(self, *shape): return flow._C.reshape(self, new_shape) -def _reshape_as(self, other): - return _reshape(self, other.size()) - - def _view(self, *shape): if len(shape) == 1: new_shape = shape[0] @@ -1043,6 +1015,10 @@ def _numpy(self): return self.to_numpy() +def _zero_(self): + return self.zeros_() + + def zero_(self): self.zero_() return self @@ -1064,39 +1040,6 @@ def _isinf(self): return flow.isinf(self) -def _new_tensor( - self, data, dtype=None, device=None, requires_grad=False, placement=None, sbp=None -): - if dtype is None: - dtype = self.dtype - if self.is_local: - assert ( - placement is None and sbp is None - ), "self is local tensor, placement and sbp are expected to be None." - if device is None: - device = self.device - return flow.tensor( - data, dtype=dtype, device=device, requires_grad=requires_grad - ) - else: - assert device is None, "self is global tensor, device is expected to be None." 
- if placement is None: - placement = self.placement - if sbp is None: - sbp = self.sbp - return flow.tensor( - data, dtype=dtype, placement=placement, sbp=sbp, requires_grad=requires_grad - ) - - -def _cumsum(self, dim, dtype=None): - return flow._C.cumsum(self, dim, dtype=dtype) - - -def _cumprod(self, dim, dtype=None): - return flow._C.cumprod(self, dim, dtype=dtype) - - def RegisterMethods(): Tensor.__mul__ = lambda self, other: self.mul(other) Tensor.__rmul__ = lambda self, other: self.mul(other) @@ -1112,6 +1055,7 @@ def RegisterMethods(): Tensor.numel = _numel Tensor.element_size = _element_size Tensor.backward = _backward + Tensor.__getitem__ = _getitem Tensor.__setitem__ = _setitem Tensor.__str__ = _str Tensor.__repr__ = _repr @@ -1140,6 +1084,7 @@ def RegisterMethods(): Tensor.__rpow__ = _rpow Tensor.__format__ = _format Tensor.__floordiv__ = _floor_divide + Tensor.__len__ = _len Tensor.__mod__ = _fmod Tensor.__index__ = _index Tensor.__invert__ = _invert @@ -1194,8 +1139,6 @@ def RegisterMethods(): Tensor.log2 = _log2 Tensor.add = _add Tensor.add_ = _add_inplace - Tensor.addcmul = _addcmul - Tensor.addcmul_ = _addcmul_ Tensor.div = _truediv Tensor.div_ = _truediv_inplace Tensor.mul = _mul @@ -1249,8 +1192,6 @@ def RegisterMethods(): Tensor.where = _where Tensor.norm = _norm Tensor.transpose = _transpose - Tensor.local_to_global = _local_to_global - Tensor.global_to_global = _global_to_global Tensor.to_global = _to_global Tensor.relu = _relu Tensor.softmax = _softmax @@ -1265,17 +1206,13 @@ def RegisterMethods(): Tensor.repeat = _repeat Tensor.tile = _tile Tensor.split = _split - Tensor.unbind = _unbind Tensor.squeeze = _squeeze Tensor.swapaxes = _swapaxes - Tensor.amax = _amax - Tensor.swapdims = _swapdims Tensor.unfold = _unfold Tensor.narrow = _narrow Tensor.unsqueeze = _unsqueeze Tensor.permute = _permute Tensor.to = _to - Tensor.half = _half Tensor.gather = _gather Tensor.all = _all Tensor.any = _any @@ -1290,7 +1227,6 @@ def RegisterMethods(): Tensor.le = _le Tensor.to_local = _to_local Tensor.reshape = _reshape - Tensor.reshape_as = _reshape_as Tensor.view = _view Tensor.sort = _sort Tensor.type_as = _type_as @@ -1310,13 +1246,11 @@ def RegisterMethods(): Tensor.prod = _prod Tensor.sin = _sin Tensor.sin_ = _sin_inplace + Tensor.zero_ = _zero_ Tensor.is_consistent = _is_consistent Tensor.to_consistent = _to_consistent Tensor.isnan = _isnan Tensor.isinf = _isinf - Tensor.new_tensor = _new_tensor - Tensor.cumsum = _cumsum - Tensor.cumprod = _cumprod def register_tensor_op(op_name): diff --git a/python/oneflow/nn/__init__.py b/python/oneflow/nn/__init__.py index 9ec182ad245..6b0e0ec917d 100644 --- a/python/oneflow/nn/__init__.py +++ b/python/oneflow/nn/__init__.py @@ -21,7 +21,6 @@ GELU, GLU, Hardsigmoid, - Hardshrink, Hardswish, Hardtanh, LeakyReLU, diff --git a/python/oneflow/nn/functional/__init__.py b/python/oneflow/nn/functional/__init__.py index 2fbfa2fa7ec..2f2c035ae6c 100644 --- a/python/oneflow/nn/functional/__init__.py +++ b/python/oneflow/nn/functional/__init__.py @@ -20,9 +20,6 @@ from oneflow._C import conv1d from oneflow._C import conv2d from oneflow._C import conv3d -from oneflow._C import deconv1d as conv_transpose1d -from oneflow._C import deconv2d as conv_transpose2d -from oneflow._C import deconv3d as conv_transpose3d from oneflow._C import avg_pool1d from oneflow._C import avg_pool2d from oneflow._C import avg_pool3d @@ -35,7 +32,6 @@ from oneflow._C import relu from oneflow._C import hardtanh from oneflow._C import hardsigmoid -from oneflow._C import 
hardshrink from oneflow._C import hardswish from oneflow._C import leaky_relu from oneflow._C import elu @@ -55,7 +51,7 @@ from oneflow._C import threshold from oneflow._C import silu from oneflow._C import mish -from oneflow.nn.modules.normalization import layer_norm +from oneflow._C import layer_norm from oneflow._C import dropout from oneflow._C import smooth_l1_loss from oneflow._C import pad diff --git a/python/oneflow/nn/graph/block.py b/python/oneflow/nn/graph/block.py index b5ff1d35b4d..3f6cd277366 100644 --- a/python/oneflow/nn/graph/block.py +++ b/python/oneflow/nn/graph/block.py @@ -107,7 +107,7 @@ def __init__( ): assert not isinstance(origin, Block) super().__init__(prefix, name) - self._debug = False + self._debug = False self._debug_min_s_level = 2 self._debug_max_v_level = 0 self._debug_max_py_stack_depth = 2 @@ -163,7 +163,7 @@ def debug( my_rank = get_rank() if -1 in rank_list or my_rank in rank_list: - self._debug = v_level >= 0 + self._debug = (v_level >= 0) if self._debug: self._debug_min_s_level = 0 self._debug_max_v_level = max(0, v_level) diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py index 599e96dea26..90093f04877 100644 --- a/python/oneflow/nn/graph/graph.py +++ b/python/oneflow/nn/graph/graph.py @@ -430,7 +430,7 @@ def debug( Each nn.Module inside a nn.Graph also has a debug() method to enable debug mode. - Use ``v_level`` to choose verbose debug info level, default level is 0, max level is 3. + Use ``v_level`` to choose verbose debug info level, default level is 0, max level is 3. ``v_level`` -1 will disable the debug mode of the graph (i.e. no info will be printed). ``v_level`` 0 will print warning and graph building stages. ``v_level`` 1 will additionally print graph build info of each nn.Module. ``v_level`` 2 will additionally print graph build @@ -439,7 +439,7 @@ def debug( Use ``ranks`` to choose which rank to print the debug information. - Use ``max_py_stack_depth`` to specify the max Python stack depth for the debug information. + Use ``max_py_stack_depth`` to specify the max Python stack depth for the debug information. For example: @@ -472,7 +472,7 @@ def debug( my_rank = get_rank() if -1 in rank_list or my_rank in rank_list: - self._debug = v_level >= 0 + self._debug = (v_level >= 0) if self._debug: self._debug_min_s_level = 0 self._debug_max_v_level = max(0, v_level) diff --git a/python/oneflow/nn/module.py b/python/oneflow/nn/module.py index 5ccb525ecfb..11d42c4bf14 100644 --- a/python/oneflow/nn/module.py +++ b/python/oneflow/nn/module.py @@ -341,7 +341,7 @@ def zero_grad(self, set_to_none: bool = False) -> None: p.grad.detach_() else: p.grad.requires_grad_(False) - p.grad.zero_() + p.grad.zeros_() def _save_to_state_dict(self, destination, prefix, keep_vars): for (name, param) in self._parameters.items(): @@ -628,17 +628,6 @@ def double(self: T) -> T: """ return self._apply(lambda t: t.double() if t.is_floating_point() else t) - def half(self: T) -> T: - r"""Casts all floating point parameters and buffers to ``half`` datatype. - - .. note:: - This method modifies the module in-place. 
- - Returns: - Module: self - """ - return self._apply(lambda t: t.half() if t.is_floating_point() else t) - def _get_name(self): return self.__class__.__name__ diff --git a/python/oneflow/nn/modules/activation.py b/python/oneflow/nn/modules/activation.py index def67abe631..947f9c339bd 100644 --- a/python/oneflow/nn/modules/activation.py +++ b/python/oneflow/nn/modules/activation.py @@ -451,57 +451,6 @@ def extra_repr(self): return inplace_str -class Hardshrink(Module): - r""" - The Hardshrink activation. - - The formula is: - - .. math:: - \text{Hardshrink}(x) = - \begin{cases} - x, & \text{ if } x > \lambda \\ - x, & \text{ if } x < -\lambda \\ - 0, & \text{ otherwise } - \end{cases} - - Args: - lambd: the :math:`\lambda` value for the Hardshrink formulation. Default: 0.5 - inplace: can optionally do the operation in-place. Default: ``False`` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional - dimensions - - Output: :math:`(N, *)`, same shape as the input - - For example: - - .. code-block:: python - - >>> import numpy as np - >>> import oneflow as flow - >>> x = np.array([-1.1, 0, 0.2, 0.5]).astype(np.float32) - >>> input = flow.Tensor(x) - >>> hardshrink = flow.nn.Hardshrink(lambd=0.5) - >>> out = hardshrink(input) - >>> out - tensor([-1.1000, 0.0000, 0.0000, 0.0000], dtype=oneflow.float32) - """ - - def __init__(self, lambd: float = 0.5, inplace: bool = False): - super().__init__() - self.inplace = inplace - self.lambd = lambd - - def forward(self, x): - return flow._C.hardshrink(x, lambd=self.lambd, inplace=self.inplace) - - def extra_repr(self) -> str: - param_str = f"lambd={self.lambd}" - param_str += ", inplace=True" if self.inplace else "" - return param_str - - class Softmax(Module): """Applies the Softmax function to an n-dimensional input Tensor rescaling them so that the elements of the n-dimensional output Tensor @@ -860,7 +809,9 @@ def __init__(self, negative_slope: float = 0.01, inplace: bool = False): self.inplace = inplace def forward(self, x): - return flow._C.leaky_relu(x, alpha=self.negative_slope, inplace=self.inplace) + if self.inplace: + warnings.warn("LeakyReLU module do not support inplace now") + return flow._C.leaky_relu(x, alpha=self.negative_slope) def extra_repr(self): param_str = f"negative_slope={self.negative_slope}" diff --git a/python/oneflow/nn/modules/batchnorm.py b/python/oneflow/nn/modules/batchnorm.py index d8be694ff55..00474bb9b31 100644 --- a/python/oneflow/nn/modules/batchnorm.py +++ b/python/oneflow/nn/modules/batchnorm.py @@ -44,21 +44,17 @@ def __init__( self.register_parameter("weight", None) self.register_parameter("bias", None) if self.track_running_stats: - self.register_buffer("running_mean", flow.zeros(num_features)) - self.register_buffer("running_var", flow.ones(num_features)) - self.register_buffer("num_batches_tracked", flow.tensor(0, dtype=flow.long)) + self.register_buffer("running_mean", flow.Tensor(num_features)) + self.register_buffer("running_var", flow.Tensor(num_features)) else: - self.register_buffer("running_mean", None) - self.register_buffer("running_var", None) - self.register_buffer("num_batches_tracked", None) - + self.register_parameter("running_mean", None) + self.register_parameter("running_var", None) self.reset_parameters() def reset_running_stats(self) -> None: if self.track_running_stats: self.running_mean.fill_(0) self.running_var.fill_(1) - self.num_batches_tracked.zero_() def reset_parameters(self) -> None: self.reset_running_stats() @@ -79,9 +75,6 @@ def _load_from_state_dict( 
unexpected_keys, error_msgs, ): - if self.track_running_stats: - num_batches_tracked_key = prefix + "num_batches_tracked" - state_dict[num_batches_tracked_key] = flow.tensor(0, dtype=flow.long) super(_NormBase, self)._load_from_state_dict( state_dict, prefix, @@ -112,9 +105,6 @@ def __init__( def forward(self, x): self._check_input_dim(x) - if self.training and self.track_running_stats: - if self.num_batches_tracked is not None: - self.num_batches_tracked.add_(1) if self.training: is_training = True else: diff --git a/python/oneflow/nn/modules/conv.py b/python/oneflow/nn/modules/conv.py index d08287a9c00..5769ca31c97 100644 --- a/python/oneflow/nn/modules/conv.py +++ b/python/oneflow/nn/modules/conv.py @@ -22,8 +22,6 @@ from oneflow.nn.module import Module from oneflow.nn.modules.utils import _pair, _single, _triple -from typing import Union - def slice(x, begin, size): ndim = len(x.shape) @@ -73,27 +71,6 @@ def split(cls, x, axis, split_num): return result_list -def get_padding(padding, kernel_size, dilation, stride): - valid_padding_strings = {"same", "valid"} - if isinstance(padding, str): - if padding not in valid_padding_strings: - raise ValueError( - "Invalid padding string {!r}, should be one of {}".format( - padding, valid_padding_strings - ) - ) - if padding == "same" and any(s != 1 for s in list(stride)): - raise ValueError("padding='same' is not supported for strided convolutions") - - out_padding = [0] * len(kernel_size) - if padding == "same": - for d, k, i in zip(dilation, kernel_size, range(len(kernel_size) - 1, -1, -1)): - total_padding = d * (k - 1) - left_pad = total_padding // 2 - out_padding[i] = left_pad - return out_padding - - class Conv1d(Module): """The interface is consistent with PyTorch. The documentation is referenced from: https://pytorch.org/docs/master/generated/torch.nn.Conv1d.html#conv1d @@ -191,7 +168,7 @@ def __init__( out_channels: int, kernel_size: _size_1_t, stride: _size_1_t = 1, - padding: Union[str, _size_1_t] = 0, + padding: _size_1_t = 0, dilation: _size_1_t = 1, groups: int = 1, bias: bool = True, @@ -202,12 +179,8 @@ def __init__( self.padding_mode = padding_mode self.kernel_size = _single(kernel_size) self.stride = _single(stride) + self.padding = _single(padding) self.dilation = _single(dilation) - self.padding = ( - get_padding(padding, self.kernel_size, self.dilation, self.stride) - if isinstance(padding, str) - else _single(padding) - ) self.groups = groups self.channel_pos = "channels_first" assert in_channels % groups == 0 @@ -380,7 +353,7 @@ def __init__( out_channels: int, kernel_size: _size_2_t, stride: _size_2_t = 1, - padding: Union[str, _size_2_t] = 0, + padding: _size_2_t = 0, dilation: _size_2_t = 1, groups: int = 1, bias: bool = True, @@ -391,12 +364,8 @@ def __init__( self.padding_mode = padding_mode self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) + self.padding = _pair(padding) self.dilation = _pair(dilation) - self.padding = ( - get_padding(padding, self.kernel_size, self.dilation, self.stride) - if isinstance(padding, str) - else _pair(padding) - ) self.groups = groups if os.getenv("ONEFLOW_ENABLE_NHWC") == "1": @@ -566,7 +535,7 @@ def __init__( out_channels: int, kernel_size: _size_3_t, stride: _size_3_t = 1, - padding: Union[str, _size_3_t] = 0, + padding: _size_3_t = 0, dilation: _size_3_t = 1, groups: int = 1, bias: bool = True, @@ -578,12 +547,8 @@ def __init__( self.padding_mode = padding_mode self.kernel_size = _triple(kernel_size) self.stride = _triple(stride) + self.padding = _triple(padding) 
self.dilation = _triple(dilation) - self.padding = ( - get_padding(padding, self.kernel_size, self.dilation, self.stride) - if isinstance(padding, str) - else _triple(padding) - ) self.groups = groups self.channel_pos = "channels_first" assert in_channels % groups == 0, "in_channels must be divisible by groups" @@ -765,12 +730,14 @@ def forward(self, x): x, self.weight, self.bias, - self.stride, + self.filters, self.padding, + "channels_first", + self.kernel_size, self.output_padding, - self.groups, + self.stride, self.dilation, - "channels_first", + self.groups, ) @@ -890,12 +857,14 @@ def forward(self, x): x, self.weight, self.bias, - self.stride, + self.filters, self.padding, + "channels_first", + self.kernel_size, self.output_padding, - self.groups, + self.stride, self.dilation, - "channels_first", + self.groups, ) return res @@ -1051,12 +1020,14 @@ def forward(self, x): x, self.weight, self.bias, - self.stride, + self.filters, self.padding, + "channels_first", + self.kernel_size, self.output_padding, - self.groups, + self.stride, self.dilation, - "channels_first", + self.groups, ) diff --git a/python/oneflow/nn/modules/fused_mlp.py b/python/oneflow/nn/modules/fused_mlp.py index 34ace37aab7..706594bc785 100644 --- a/python/oneflow/nn/modules/fused_mlp.py +++ b/python/oneflow/nn/modules/fused_mlp.py @@ -77,9 +77,6 @@ def __init__( self.reset_parameters() def add_parameters(self) -> None: - """Register parameter in FusedMLP module. - - """ if self.hidden_layer_num != 0: # First layer. self.register_parameter( @@ -132,33 +129,18 @@ def add_parameters(self) -> None: ) def weight(self, i): - """Returns the ith weight. - - """ return getattr(self, f"weight_{i}") def weights(self): - """Returns the weight list in FusedMLP module. - - """ return [self.weight(i) for i in range(self.hidden_layer_num + 1)] def bias(self, i): - """Return the ith bias. - - """ return getattr(self, f"bias_{i}") def biases(self): - """Returns the bias list in FusedMLP module. - - """ return [self.bias(i) for i in range(self.hidden_layer_num + 1)] def reset_parameters(self) -> None: - """Reset the parameters in FusedMLP module. 
- - """ for layer_idx in range(self.hidden_layer_num + 1): flow.nn.init.kaiming_uniform_(self.weight(layer_idx), a=math.sqrt(5)) (fan_in, _) = _calculate_fan_in_and_fan_out(self.weight(layer_idx)) diff --git a/python/oneflow/nn/modules/global_cast.py b/python/oneflow/nn/modules/global_cast.py index 30be5876578..050338098e6 100644 --- a/python/oneflow/nn/modules/global_cast.py +++ b/python/oneflow/nn/modules/global_cast.py @@ -18,71 +18,73 @@ from oneflow.nn.module import Module -def _check_sbp(sbp): - if sbp is None: - pass - elif isinstance(sbp, (tuple, list)): - if not all(isinstance(sbp_item, flow.sbp.sbp) for sbp_item in sbp): - raise TypeError( - "sbp parameter must be type of oneflow.sbp.sbp or list/tuple of oneflow.sbp.sbp" - ) - elif isinstance(sbp, flow.sbp.sbp): - sbp = (sbp,) - else: - raise TypeError(f"Invalid parameter sbp with type {type(sbp)}") - - return sbp - - -def local_to_global_op(input, placement=None, sbp=None, *, check_meta=True): +class ToGlobal(Module): + def __init__(self, placement, sbp): + super().__init__() + self.placement = placement + if isinstance(sbp, flow.sbp.sbp): + sbp = [sbp] + for elem in sbp: + assert isinstance( + elem, flow.sbp.sbp + ), "element %s is not an sbp instance" % (sbp) + self.sbp = sbp + + def forward(self, x, sbp, placement): + return flow._C.to_global(x, placement=placement, sbp=sbp) + + +def to_global_op(input, placement=None, sbp=None, grad_sbp=None): assert isinstance(input, Tensor) - assert input.is_local, "input must be a local tensor" - if placement is None or sbp is None: - raise ValueError( - "Converting a local tensor to global tensor must have placement and sbp parameters." - ) - assert isinstance( - placement, flow.placement - ), f"Invalid parameter placement with type {type(placement)}" + def _check_sbp(sbp): + if sbp is None: + pass + elif isinstance(sbp, (tuple, list)): + if not all(isinstance(sbp_item, flow.sbp.sbp) for sbp_item in sbp): + raise TypeError( + "sbp parameter must be type of oneflow.sbp.sbp or list/tuple of oneflow.sbp.sbp" + ) + elif isinstance(sbp, flow.sbp.sbp): + sbp = (sbp,) + else: + raise TypeError(f"Invalid parameter sbp with type {type(sbp)}") + + return sbp sbp = _check_sbp(sbp) - grad_sbp = tuple() - return flow._C.to_global(input, placement, sbp, grad_sbp, check_meta) + if input.is_global: + # convert global tensor to another global tensor with different placement or sbp + if placement is None: + placement = input.placement -def global_to_global_op( - input, placement=None, sbp=None, *, grad_sbp=None, check_meta=False -): - assert isinstance(input, Tensor) - assert input.is_global, "input must be a global tensor" + if sbp is None: + sbp = input.sbp - sbp = _check_sbp(sbp) - if placement is None: - placement = input.placement + grad_sbp = _check_sbp(grad_sbp) - if sbp is None: - sbp = input.sbp + else: + # local tensor to global tensor + if placement is None or sbp is None: + raise ValueError( + "Converting a local tensor to global tensor must have placement and sbp parameters." 
+ ) - assert isinstance( - placement, flow.placement - ), f"Invalid parameter placement with type {type(placement)}" + if not isinstance(placement, flow.placement): + raise ValueError(f"Invalid parameter placement with type {type(placement)}") - grad_sbp = _check_sbp(grad_sbp) if grad_sbp is None: grad_sbp = tuple() - return flow._C.to_global(input, placement, sbp, grad_sbp, check_meta) + return flow._C.to_global(input, placement, sbp, grad_sbp) -def to_global_op(input, placement=None, sbp=None, **kwargs): - assert isinstance(input, Tensor) +class ToLocal(Module): + def __init__(self): + super().__init__() - if input.is_global: - return global_to_global_op(input=input, placement=placement, sbp=sbp, **kwargs) - else: - if "grad_sbp" in kwargs: - del kwargs["grad_sbp"] - return local_to_global_op(input=input, placement=placement, sbp=sbp, **kwargs) + def forward(self, x): + return flow._C.to_local(x) def to_local_op(input): diff --git a/python/oneflow/nn/modules/index_select.py b/python/oneflow/nn/modules/index_select.py index e69de29bb2d..b4277472ba5 100644 --- a/python/oneflow/nn/modules/index_select.py +++ b/python/oneflow/nn/modules/index_select.py @@ -0,0 +1,53 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import oneflow as flow +from oneflow.framework.tensor import Tensor, register_tensor_op + + +def _input_args_is_int(args): + return all((isinstance(x, int) for x in args)) + + +def index_select_op(input, dim, index): + assert len(index.shape) == 1, "Dimensions of index should be 1-D" + assert ( + dim < len(input.shape) and dim >= 0 + ), "Value of dim is out of range(dim should be in the range of [0, input dimensions) )" + # TODO(): tolist call numpy to do tolist which raise error in graph. + #assert _input_args_is_int( + # index.tolist() + #), "input index parameter is not legal!(index should be an 1-D int tensor)" + index_rshp = list(input.shape) + + #for index_i in index: + # assert ( + # index_i < index_rshp[dim] + # ), "index is out of range(index shuold be lower than the dim-th dimension of input)" + + index_rshp[dim] = 1 + index_gather = index[0].expand(*index_rshp) + if index.size()[0] > 1: + for index_i in index[1:]: + x = index_i.expand(*index_rshp) + index_gather = flow.cat((index_gather, x), dim) + + return flow.gather(input, dim, index_gather) + + +if __name__ == "__main__": + import doctest + + doctest.testmod(raise_on_error=True) diff --git a/python/oneflow/nn/modules/masked_select.py b/python/oneflow/nn/modules/masked_select.py index 2d213af38d4..98153df18bd 100644 --- a/python/oneflow/nn/modules/masked_select.py +++ b/python/oneflow/nn/modules/masked_select.py @@ -42,6 +42,9 @@ def masked_select_op(input, mask): tensor([0.3139, 0.3898], dtype=oneflow.float32) """ + assert len(input.shape) == len( + mask.shape + ), f"The dim of masked_select module's inputs can not match, please check!" 
assert input.is_global == mask.is_global, ( f"input tensor is %s tensor, but mask is %s tensor" % ( diff --git a/python/oneflow/nn/modules/normalization.py b/python/oneflow/nn/modules/normalization.py index 2473ab65cc5..b9f6b45322f 100644 --- a/python/oneflow/nn/modules/normalization.py +++ b/python/oneflow/nn/modules/normalization.py @@ -133,69 +133,6 @@ def extra_repr(self) -> str: ) -def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05): - assert len(input.shape) > len( - normalized_shape - ), "Input tensor dim must greater than normalized dim!" - begin_norm_axis = len(input.shape) - len(normalized_shape) - begin_params_axis = len(input.shape) - len(normalized_shape) - - elementwise_affine = True if (weight is not None and bias is not None) else False - - for i in range(0, len(normalized_shape)): - if input.shape[i + begin_params_axis] != normalized_shape[i]: - raise RuntimeError( - f"Given normalized_shape={normalized_shape}, expected input with shape [*, {str(normalized_shape)[1:-1]}], but got input of size {x.shape}" - ) - - if not input.is_cuda: - reduce_axis = [] - for dim in range(len(input.shape)): - if dim >= begin_norm_axis: - reduce_axis.append(dim) - mean = input.mean(dim=reduce_axis, keepdim=True) - variance = input.var(dim=reduce_axis, unbiased=False, keepdim=True) - params_shape = input.shape[begin_params_axis:] - if len(mean.shape) == 1: - nd_params_shape = [1] * len(input.shape) - nd_params_shape[begin_norm_axis] = params_shape[0] - mean = flow.reshape(mean, shape=nd_params_shape) - variance = flow.reshape(variance, nd_params_shape) - if weight is not None and params_shape[0] == weight.nelement(): - weight = flow.reshape(weight, shape=nd_params_shape) - if bias is not None and params_shape[0] == bias.nelement(): - bias = flow.reshape(bias, shape=nd_params_shape) - elif len(mean.shape) == len(input.shape): - pass - else: - raise ValueError( - "shape of mean and variance should be 1D or has number of axes and x's" - ) - variance += eps - normalized = (input - mean) * variance.rsqrt() - if elementwise_affine: - normalized = normalized * weight + bias - return normalized - else: - if elementwise_affine: - res = flow._C.layer_norm_affine( - input, - weight, - bias, - begin_norm_axis=begin_norm_axis, - begin_params_axis=begin_params_axis, - epsilon=eps, - ) - else: - res = flow._C.layer_norm( - input, - begin_norm_axis=begin_norm_axis, - begin_params_axis=begin_params_axis, - epsilon=eps, - ) - return res - - class LayerNorm(Module): """Applies Layer Normalization over a mini-batch of inputs as described in the paper `Layer Normalization `__ @@ -302,6 +239,8 @@ def __init__( self.register_parameter("weight", None) self.register_parameter("bias", None) self.reset_parameters() + self.begin_norm_axis = 1 + self.begin_params_axis = 1 def reset_parameters(self) -> None: if self.elementwise_affine: @@ -309,7 +248,69 @@ def reset_parameters(self) -> None: init.zeros_(self.bias) def forward(self, x): - return layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + assert len(x.shape) > len( + self.normalized_shape + ), "Input tensor dim must greater than normalized dim!" 
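For reference, a small sketch of how the reverted forward derives its axes from `normalized_shape` (shapes assumed for illustration); the CPU branch below then reduces mean and variance over exactly these trailing dims.

    x_shape = (8, 16, 32, 32)        # assumed (N, C, H, W) input
    normalized_shape = (16, 32, 32)  # (C, H, W)

    begin_norm_axis = len(x_shape) - len(normalized_shape)    # 1
    begin_params_axis = len(x_shape) - len(normalized_shape)  # 1
    reduce_axis = [d for d in range(len(x_shape)) if d >= begin_norm_axis]  # [1, 2, 3]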
+ self.begin_norm_axis = len(x.shape) - len(self.normalized_shape) + self.begin_params_axis = len(x.shape) - len(self.normalized_shape) + + for i in range(0, len(self.normalized_shape)): + if x.shape[i + self.begin_params_axis] != self.normalized_shape[i]: + raise RuntimeError( + f"Given normalized_shape={self.normalized_shape}, expected input with shape [*, {str(self.normalized_shape)[1:-1]}], but got input of size {x.shape}" + ) + + if not x.is_cuda: + reduce_axis = [] + for dim in range(len(x.shape)): + if dim >= self.begin_norm_axis: + reduce_axis.append(dim) + mean = x.mean(dim=reduce_axis, keepdim=True) + variance = x.var(dim=reduce_axis, unbiased=False, keepdim=True) + params_shape = x.shape[self.begin_params_axis :] + weight = self.weight + bias = self.bias + if len(mean.shape) == 1: + nd_params_shape = [1] * len(x.shape) + nd_params_shape[self.begin_norm_axis] = params_shape[0] + mean = flow.reshape(mean, shape=nd_params_shape) + variance = flow.reshape(variance, nd_params_shape) + if ( + self.weight is not None + and params_shape[0] == self.weight.nelement() + ): + weight = flow.reshape(self.weight, shape=nd_params_shape) + if self.bias is not None and params_shape[0] == self.bias.nelement(): + bias = flow.reshape(self.bias, shape=nd_params_shape) + elif len(mean.shape) == len(x.shape): + pass + else: + raise ValueError( + "shape of mean and variance should be 1D or has number of axes and x's" + ) + variance += self.eps + normalized = (x - mean) * variance.rsqrt() + if self.elementwise_affine: + normalized = normalized * weight + bias + return normalized + else: + if self.elementwise_affine: + res = flow._C.layer_norm_affine( + x, + self.weight, + self.bias, + begin_norm_axis=self.begin_norm_axis, + begin_params_axis=self.begin_params_axis, + epsilon=self.eps, + ) + else: + res = flow._C.layer_norm( + x, + begin_norm_axis=self.begin_norm_axis, + begin_params_axis=self.begin_params_axis, + epsilon=self.eps, + ) + return res def extra_repr(self) -> str: return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format( diff --git a/python/oneflow/nn/modules/reduce_ops.py b/python/oneflow/nn/modules/reduce_ops.py index d56bdaba897..47e57f38b57 100644 --- a/python/oneflow/nn/modules/reduce_ops.py +++ b/python/oneflow/nn/modules/reduce_ops.py @@ -36,11 +36,11 @@ def mean_op(input, dim=None, keepdim=False): return flow._C.reduce_mean(input, axis=axis_checked, keepdims=keepdim) -def prod_op(input, dim=None, keepdim=False, *, dtype=None): +def prod_op(input, dim=None, keepdim=False): axis_checked = _check_axis(dim, input.shape) if len(axis_checked) == 0: return input - return flow._C.reduce_prod(input, axis_checked, keepdim, dtype=dtype) + return flow._C.reduce_prod(input, axis_checked, keepdim) def all_op(input, dim=None, keepdim=False): diff --git a/python/oneflow/nn/optimizer/adamw.py b/python/oneflow/nn/optimizer/adamw.py index 1a559fa823e..b64300dd588 100644 --- a/python/oneflow/nn/optimizer/adamw.py +++ b/python/oneflow/nn/optimizer/adamw.py @@ -269,7 +269,4 @@ def _generate_conf_for_graph(self, train_conf, vars_conf): @property def support_sparse(self): - """Whether AdamW Optimizer support sparse update. 
- - """ return True diff --git a/python/oneflow/nn/optimizer/lambda_lr.py b/python/oneflow/nn/optimizer/lambda_lr.py index ad1d526e6ac..9bf4e21d47e 100644 --- a/python/oneflow/nn/optimizer/lambda_lr.py +++ b/python/oneflow/nn/optimizer/lambda_lr.py @@ -94,9 +94,6 @@ def load_state_dict(self, state_dict): self.lr_lambdas[idx].__dict__.update(fn) def step(self): - """Performs a single learning rate schedule step. - - """ self.last_step += 1 lrs = [] for (lmbda, base_lr) in zip(self.lr_lambdas, self.base_lrs): diff --git a/python/oneflow/nn/optimizer/optimizer.py b/python/oneflow/nn/optimizer/optimizer.py index 726cc9a2e14..28c727edeba 100644 --- a/python/oneflow/nn/optimizer/optimizer.py +++ b/python/oneflow/nn/optimizer/optimizer.py @@ -232,14 +232,6 @@ def pack_group(group): } def step(self, closure: Union[Callable, None] = None) -> Union[Tensor, None]: - """Performs a single optimization step (parameter update). - - Args: - closure (Union[Callable, None], optional): A closure that reevaluates the model and returns the loss. Optional for most optimizers. - - Returns: - Union[Tensor, None]: The loss. - """ raise NotImplementedError() def clip_grad(self): @@ -291,7 +283,7 @@ def zero_grad(self, set_to_none: bool = False): if set_to_none: param.grad = None else: - param.grad.zero_() + param.grad.zeros_() def _parse_input_parameters(self, parameters): """ @@ -322,24 +314,21 @@ def _parse_input_parameters(self, parameters): ) def _generate_grad_clip_conf_for_optim_conf(self, param_group, optimizer_conf): - if not param_group._enable_clip_grad: - return - - assert "clip_grad_max_norm" in param_group - assert "clip_grad_norm_type" in param_group - max_norm = float(param_group["clip_grad_max_norm"]) - norm_type = float(param_group["clip_grad_norm_type"]) - clip_grad_norm = ( - optimizer_conf.mutable_clip_conf().mutable_clip_by_global_norm() - ) - clip_grad_norm.set_max_norm(max_norm) - clip_grad_norm.set_norm_type(norm_type) + if param_group._enable_clip_grad: + if ( + param_group["clip_grad_max_norm"] == 1.0 + and param_group["clip_grad_norm_type"] == 2.0 + ): + optimizer_conf.mutable_clip_conf().mutable_clip_by_global_norm().set_clip_norm( + param_group["clip_grad_max_norm"] + ) + else: + warnings.warn( + "For now, nn.Graph only support clip grad with `clip_grad_max_norm == 1.0` and `clip_grad_norm_type == 2.0`." + ) @property def support_sparse(self): - """Whether the Optimizer support sparse update. - - """ return False def _check_variables_in_graph(self, vars_conf): diff --git a/python/oneflow/nn/optimizer/reduce_lr_on_plateau.py b/python/oneflow/nn/optimizer/reduce_lr_on_plateau.py index 22d41066a7d..2686b2fb816 100644 --- a/python/oneflow/nn/optimizer/reduce_lr_on_plateau.py +++ b/python/oneflow/nn/optimizer/reduce_lr_on_plateau.py @@ -121,7 +121,7 @@ def __init__( self._reset() def step(self, metrics): - """Performs a single learning rate schedule step. + """Step forward once. Arguments: metrics (float): a metrics quantity of Measuring the effect of model training. @@ -149,15 +149,9 @@ def step(self, metrics): @property def in_cooldown(self): - """Whether the learning rate scheduler in cooldown phase. - - """ return self.cooldown_counter > 0 def is_better(self, a, best): - """Whether the metric has improvement. 
- - """ if self.mode == "min" and self.threshold_mode == "rel": rel_epsilon = 1.0 - self.threshold return a < best * rel_epsilon diff --git a/python/oneflow/nn/optimizer/sgd.py b/python/oneflow/nn/optimizer/sgd.py index 8e0df13fdd9..27b567b05eb 100644 --- a/python/oneflow/nn/optimizer/sgd.py +++ b/python/oneflow/nn/optimizer/sgd.py @@ -128,12 +128,6 @@ def __init__( ) def step(self, closure: Callable = None): - """Performs a single optimization step. - - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ with flow.no_grad(): loss = None if closure is not None: @@ -193,7 +187,4 @@ def _generate_conf_for_graph(self, train_conf, vars_conf): @property def support_sparse(self): - """Whether SGD Optimizer support sparse update. - - """ return True diff --git a/python/oneflow/nn/utils/clip_grad.py b/python/oneflow/nn/utils/clip_grad.py index ec0aadbdf3a..28acb31be8c 100644 --- a/python/oneflow/nn/utils/clip_grad.py +++ b/python/oneflow/nn/utils/clip_grad.py @@ -120,8 +120,9 @@ def clip_grad_norm_( ), norm_type, ) - if error_if_nonfinite and flow.logical_or( - total_norm.isnan(), total_norm.isinf() + if error_if_nonfinite and ( + np.isnan(total_norm.to_local().numpy()).all() + or np.isinf(total_norm.to_local().numpy()).all() ): raise RuntimeError( f"The total norm of order {norm_type} for gradients from " @@ -151,8 +152,8 @@ def clip_grad_norm_( ), norm_type, ) - if error_if_nonfinite and flow.logical_or( - total_norm.isnan(), total_norm.isinf() + if error_if_nonfinite and ( + np.isnan(total_norm.numpy()).all() or np.isinf(total_norm.numpy()).all() ): raise RuntimeError( f"The total norm of order {norm_type} for gradients from " diff --git a/python/oneflow/one_embedding.py b/python/oneflow/one_embedding.py index abc5c4c40c5..502e9fd2364 100644 --- a/python/oneflow/one_embedding.py +++ b/python/oneflow/one_embedding.py @@ -18,8 +18,6 @@ import json import datetime from oneflow._oneflow_internal import OneEmbeddingHandler -from oneflow._oneflow_internal import PersistentTableReader -from oneflow._oneflow_internal import PersistentTableWriter import numpy as np import traceback @@ -54,110 +52,82 @@ def _check_cache(cache): assert cache["value_memory_kind"] in ["device", "host"] -def _init( - name, embedding_dims, dtype, key_type, tables, store_options, default_initializer -): - default_initializer = default_initializer or { - "type": "normal", - "mean": 0, - "std": 0.05, - } - key_value_store_options = {} - embedding_tables = {} - key_value_store_options["name"] = name +class MultiTableEmbedding(Module): + r"""MultiTableEmbedding represent multi Embedding tables with same embedding_dim, dtype, and key_type. + + Args: + name (str): The name of Embedding + embedding_dim (int): the size of each embedding vector + dtype (flow.dtype): the data type of embeddings + key_type (flow.dtype): the data type of feature ids + tables (list): list of table param which can be made by flow.one_embedding.make_table_options + store_options (dict): store option of Embedding + default_initializer (dict, optional): if tables param is None, use default_initializer to initialize table. Defaults to None. + + For example: + + .. 
code-block:: python + + >>> import oneflow as flow + >>> import numpy as np + >>> import oneflow.nn as nn + >>> # a simple example with 3 table + >>> table_size_array = [39884407, 39043, 17289] + >>> vocab_size = sum(table_size_array) + >>> num_tables = len(table_size_array) + >>> embedding_size = 128 + >>> scales = np.sqrt(1 / np.array(table_size_array)) + >>> tables = [ + >>> flow.one_embedding.make_table_options( + >>> flow.one_embedding.make_uniform_initializer(low=-scale, high=scale) + >>> ) + >>> for scale in scales + >>> ] + >>> store_options = flow.one_embedding.make_cached_ssd_store_options( + >>> cache_budget_mb=8192, persistent_path="/your_path_to_ssd", capacity=vocab_size, + >>> ) + >>> embedding = flow.one_embedding.MultiTableEmbedding( + >>> name="my_embedding", + >>> embedding_dim=embedding_size, + >>> dtype=flow.float, + >>> key_type=flow.int64, + >>> tables=tables, + >>> store_options=store_options, + >>> ) + >>> embedding.to("cuda") + >>> mlp = flow.nn.FusedMLP( + >>> in_features=embedding_size * num_tables, + >>> hidden_features=[512, 256, 128], + >>> out_features=1, + >>> skip_final_activation=True, + >>> ) + >>> mlp.to("cuda") + >>> + >>> class TrainGraph(flow.nn.Graph): + >>> def __init__(self,): + >>> super().__init__() + >>> self.embedding_lookup = embedding + >>> self.mlp = mlp + >>> self.add_optimizer( + >>> flow.optim.SGD(self.embedding_lookup.parameters(), lr=0.1, momentum=0.0) + >>> ) + >>> self.add_optimizer( + >>> flow.optim.SGD(self.mlp.parameters(), lr=0.1, momentum=0.0) + >>> ) + >>> def build(self, ids): + >>> embedding = self.embedding_lookup(ids) + >>> loss = self.mlp(flow.reshape(embedding, (-1, num_tables * embedding_size))) + >>> loss = loss.sum() + >>> loss.backward() + >>> return loss + >>> ids = np.random.randint(0, 1000, (100, num_tables), dtype=np.int64) + >>> ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") + >>> graph = TrainGraph() + >>> loss = graph(ids_tensor) + >>> print(loss) + + """ - if isinstance(embedding_dims, (list, tuple)): - column_dims = embedding_dims - embedding_dim = sum(embedding_dims) - else: - assert embedding_dims > 0 - column_dims = [embedding_dims] - embedding_dim = embedding_dims - parallel_num = flow.env.get_world_size() - key_type_size = np.dtype( - flow.convert_oneflow_dtype_to_numpy_dtype(key_type) - ).itemsize - assert key_type_size > 0 - key_value_store_options["key_type_size"] = key_type_size - value_type_size = np.dtype( - flow.convert_oneflow_dtype_to_numpy_dtype(dtype) - ).itemsize - assert value_type_size > 0 - key_value_store_options["value_type_size"] = value_type_size - scale_factor = store_options["size_factor"] - key_value_store_options["storage_dim"] = scale_factor * embedding_dim - # kv store - assert store_options.__contains__("kv_store") - kv_store = store_options["kv_store"] - assert isinstance(kv_store, dict) - if kv_store.__contains__("caches"): - caches = kv_store["caches"] - assert isinstance(caches, (dict, list, tuple)) - if isinstance(caches, dict): - _check_cache(caches) - caches = [caches] - else: - assert len(caches) <= 2 - for i in range(len(caches)): - assert isinstance(caches[i], dict) - _check_cache(caches[i]) - for i in range(len(caches)): - if caches[i].__contains__("capacity"): - caches[i]["capacity"] = caches[i]["capacity"] // parallel_num - assert kv_store.__contains__("persistent_table") - persistent_table = kv_store["persistent_table"] - assert isinstance(persistent_table, dict) - assert persistent_table.__contains__("path") - persistent_table_path = 
persistent_table["path"] - assert isinstance(persistent_table_path, (str, list, tuple)) - if isinstance(persistent_table_path, (list, tuple)): - assert len(persistent_table_path) == parallel_num - if persistent_table.__contains__("physical_block_size"): - assert persistent_table["physical_block_size"] in [512, 4096] - else: - persistent_table["physical_block_size"] = 4096 - if persistent_table.__contains__("capacity_hint"): - assert persistent_table["capacity_hint"] >= 0 - persistent_table["capacity_hint"] = ( - persistent_table["capacity_hint"] // parallel_num - ) - key_value_store_options["kv_store"] = kv_store - # initializer - if tables is not None: - assert isinstance(tables, (list, tuple)) - for i in range(len(tables)): - table = tables[i] - if table.__contains__("columns"): - assert not table.__contains__("initializer") - columns = table["columns"] - assert len(columns) == len(column_dims) - for column in columns: - assert isinstance(column, dict) - assert column.__contains__("initializer") - _check_initializer(column["initializer"]) - else: - assert isinstance(table, dict) - assert table.__contains__("initializer") - _check_initializer(table["initializer"]) - columns = [] - for j in range(len(column_dims)): - columns.append(make_column_options(table["initializer"])) - table["columns"] = columns - del table["initializer"] - embedding_tables["tables"] = tables - else: - assert default_initializer is not None - _check_initializer(default_initializer) - columns = [] - for j in range(len(column_dims)): - columns.append(make_column_options(default_initializer)) - embedding_tables["tables"] = [{"columns": columns}] - embedding_tables["column_dims"] = column_dims - key_value_store_options["parallel_num"] = parallel_num - return embedding_dim, embedding_tables, key_value_store_options - - -class Embedding(Module): def __init__( self, name, @@ -169,24 +139,92 @@ def __init__( default_initializer=None, ): super().__init__() + default_initializer = default_initializer or { + "type": "normal", + "mean": 0, + "std": 0.05, + } + key_value_store_options = {} + embedding_tables = {} + key_value_store_options["name"] = name + assert embedding_dim > 0 + self.embedding_dim = embedding_dim self.dtype = dtype self.key_type = key_type parallel_num = flow.env.get_world_size() - self.embedding_dim, embedding_tables, key_value_store_options = _init( - name, - embedding_dim, - dtype, - key_type, - tables, - store_options, - default_initializer, - ) + + key_type_size = np.dtype( + flow.convert_oneflow_dtype_to_numpy_dtype(key_type) + ).itemsize + assert key_type_size > 0 + key_value_store_options["key_type_size"] = key_type_size + value_type_size = np.dtype( + flow.convert_oneflow_dtype_to_numpy_dtype(dtype) + ).itemsize + assert value_type_size > 0 + key_value_store_options["value_type_size"] = value_type_size + + scale_factor = store_options["size_factor"] + key_value_store_options["storage_dim"] = scale_factor * embedding_dim + + # kv store + assert store_options.__contains__("kv_store") + kv_store = store_options["kv_store"] + assert isinstance(kv_store, dict) + if kv_store.__contains__("caches"): + caches = kv_store["caches"] + assert isinstance(caches, (dict, list, tuple)) + if isinstance(caches, dict): + _check_cache(caches) + caches = [caches] + else: + assert len(caches) <= 2 + for i in range(len(caches)): + assert isinstance(caches[i], dict) + _check_cache(caches[i]) + for i in range(len(caches)): + if caches[i].__contains__("capacity"): + caches[i]["capacity"] = caches[i]["capacity"] // 
parallel_num + assert kv_store.__contains__("persistent_table") + persistent_table = kv_store["persistent_table"] + assert isinstance(persistent_table, dict) + assert persistent_table.__contains__("path") + persistent_table_path = persistent_table["path"] + assert isinstance(persistent_table_path, (str, list, tuple)) + if isinstance(persistent_table_path, (list, tuple)): + assert len(persistent_table_path) == parallel_num + if persistent_table.__contains__("physical_block_size"): + assert persistent_table["physical_block_size"] in [512, 4096] + else: + persistent_table["physical_block_size"] = 4096 + if persistent_table.__contains__("capacity_hint"): + assert persistent_table["capacity_hint"] >= 0 + persistent_table["capacity_hint"] = ( + persistent_table["capacity_hint"] // parallel_num + ) + + key_value_store_options["kv_store"] = kv_store + + # initializer + if tables is not None: + assert isinstance(tables, (list, tuple)) + for table in tables: + assert isinstance(table, dict) + assert table.__contains__("initializer") + _check_initializer(table["initializer"]) + # TODO(guoran): change "columns" to "tables" and modify c++ code + embedding_tables["columns"] = tables + else: + assert default_initializer is not None + _check_initializer(default_initializer) + embedding_tables["columns"] = [{"initializer": default_initializer}] + key_value_store_options["parallel_num"] = parallel_num self.key_value_store_options = json.dumps(key_value_store_options) self.embedding_tables = json.dumps(embedding_tables) - self.num_tables = len(embedding_tables["tables"]) + self.num_tables = len(embedding_tables["columns"]) self.local_rank = flow.env.get_local_rank() self.rank_id = flow.env.get_rank() - self.world_size = flow.env.get_world_size() + self.world_size = parallel_num self.handler = OneEmbeddingHandler( self.key_value_store_options, self.local_rank, self.rank_id, self.world_size ) @@ -333,9 +371,8 @@ def make_cached_ssd_store_options( capacity=None, size_factor=1, physical_block_size=512, - host_cache_budget_mb=0, ): - """make SSD use GPU and host as cache store_options param of MultiTableEmbedding. If cache_budget_mb > 0 and host_cache_budget_mb > 0, use GPU and host memory as multi-level cache. + """make SSD use GPU as cache store_options param of MultiTableEmbedding Args: cache_budget_mb (int): the MB budget of per GPU as cache. @@ -343,10 +380,9 @@ def make_cached_ssd_store_options( capacity (int): total capacity of Embedding size_factor (int, optional): store size factor of embedding_dim, if SGD update, and momentum = 0, should be 1, if momentum > 0, it should be 2. if Adam, should be 3. Defaults to 1. physical_block_size (int, optional): physical_block_size should be sector size. Defaults to 512. - host_cache_budget_mb (int): the MB budget of host memory as cache per rank. Defaults to 0. Returns: - dict: SSD use GPU and host as cache store_options param of MultiTableEmbedding + dict: SSD use GPU as cache store_options param of MultiTableEmbedding For example: @@ -360,33 +396,20 @@ def make_cached_ssd_store_options( >>> # ... 
""" assert isinstance(persistent_path, (str, list, tuple)) - assert cache_budget_mb > 0 or host_cache_budget_mb > 0 + assert cache_budget_mb > 0 if capacity is not None: assert capacity > 0 else: capacity = 0 - - cache_list = [] - if cache_budget_mb > 0: - cache_list.append( - { - "policy": "lru", - "cache_memory_budget_mb": cache_budget_mb, - "value_memory_kind": "device", - } - ) - if host_cache_budget_mb > 0: - cache_list.append( - { - "policy": "lru", - "cache_memory_budget_mb": host_cache_budget_mb, - "value_memory_kind": "host", - } - ) - options = { "kv_store": { - "caches": cache_list, + "caches": [ + { + "policy": "lru", + "cache_memory_budget_mb": cache_budget_mb, + "value_memory_kind": "device", + } + ], "persistent_table": { "path": persistent_path, "physical_block_size": physical_block_size, @@ -487,14 +510,14 @@ def make_normal_initializer(mean, std): return {"type": "normal", "mean": mean, "std": std} -def make_table_options(param): - """make table param of Embedding tables +def make_table_options(initializer): + """make table param of MultiTableEmbedding tables Args: - param (dict or list): param can be initializer or list of column_option. initializer can be made by make_uniform_initializer or make_normal_initializer, column options can be made by make_column_options + initializer (dict): initializer param, make by make_uniform_initializer or make_normal_initializer Returns: - dict: table param of Embedding tables + dict: table param of MultiTableEmbedding tables For example: @@ -505,277 +528,16 @@ def make_table_options(param): >>> table1 = flow.one_embedding.make_table_options(initializer) >>> table2 = flow.one_embedding.make_table_options(initializer) >>> tables = [table1, table2] - >>> # pass the tables to the "tables" param of flow.one_embedding.MultiTableEmbedding or flow.one_embedding.MultiTableMultiColumnEmbedding + >>> # pass the tables to the "tables" param of flow.one_embedding.MultiTableEmbedding >>> # ... """ - if isinstance(param, dict): - table = {"initializer": param} - elif isinstance(param, (list, tuple)): - table = {"columns": param} - else: - raise ValueError("param must be initializer or columns") - return table - - -def make_column_options(initializer): return {"initializer": initializer} -def make_table(param): +def make_table(initializer): """alias of `oneflow.one_embedding.make_table_options` See also :func:`oneflow.one_embedding.make_table_options` """ - return make_table_options(param) - - -class MultiTableEmbedding(Embedding): - r"""MultiTableEmbedding represent multi Embedding tables with same embedding_dim, dtype, and key_type. - - Args: - name (str): The name of Embedding - embedding_dim (int): the size of each embedding vector - dtype (flow.dtype): the data type of embeddings - key_type (flow.dtype): the data type of feature ids - tables (list): list of table param which can be made by flow.one_embedding.make_table_options - store_options (dict): store option of Embedding - default_initializer (dict, optional): if tables param is None, use default_initializer to initialize table. Defaults to None. - - For example: - - .. 
code-block:: python - - >>> import oneflow as flow - >>> import numpy as np - >>> import oneflow.nn as nn - >>> # a simple example with 3 table - >>> table_size_array = [39884407, 39043, 17289] - >>> vocab_size = sum(table_size_array) - >>> num_tables = len(table_size_array) - >>> embedding_size = 128 - >>> scales = np.sqrt(1 / np.array(table_size_array)) - >>> tables = [ - >>> flow.one_embedding.make_table_options( - >>> flow.one_embedding.make_uniform_initializer(low=-scale, high=scale) - >>> ) - >>> for scale in scales - >>> ] - >>> store_options = flow.one_embedding.make_cached_ssd_store_options( - >>> cache_budget_mb=8192, persistent_path="/your_path_to_ssd", capacity=vocab_size, - >>> ) - >>> embedding = flow.one_embedding.MultiTableEmbedding( - >>> name="my_embedding", - >>> embedding_dim=embedding_size, - >>> dtype=flow.float, - >>> key_type=flow.int64, - >>> tables=tables, - >>> store_options=store_options, - >>> ) - >>> embedding.to("cuda") - >>> mlp = flow.nn.FusedMLP( - >>> in_features=embedding_size * num_tables, - >>> hidden_features=[512, 256, 128], - >>> out_features=1, - >>> skip_final_activation=True, - >>> ) - >>> mlp.to("cuda") - >>> - >>> class TrainGraph(flow.nn.Graph): - >>> def __init__(self,): - >>> super().__init__() - >>> self.embedding_lookup = embedding - >>> self.mlp = mlp - >>> self.add_optimizer( - >>> flow.optim.SGD(self.embedding_lookup.parameters(), lr=0.1, momentum=0.0) - >>> ) - >>> self.add_optimizer( - >>> flow.optim.SGD(self.mlp.parameters(), lr=0.1, momentum=0.0) - >>> ) - >>> def build(self, ids): - >>> embedding = self.embedding_lookup(ids) - >>> loss = self.mlp(flow.reshape(embedding, (-1, num_tables * embedding_size))) - >>> loss = loss.sum() - >>> loss.backward() - >>> return loss - >>> ids = np.random.randint(0, 1000, (100, num_tables), dtype=np.int64) - >>> ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") - >>> graph = TrainGraph() - >>> loss = graph(ids_tensor) - >>> print(loss) - - """ - - def __init__( - self, - name, - embedding_dim, - dtype, - key_type, - tables, - store_options, - default_initializer=None, - ): - assert isinstance(embedding_dim, int) - super().__init__( - name, - embedding_dim, - dtype, - key_type, - tables, - store_options, - default_initializer, - ) - - -class MultiTableMultiColumnEmbedding(Embedding): - r"""MultiTableMultiColumnEmbedding represent multi Embedding tables with multi embedding_dim, same dtype, and key_type. - - Args: - name (str): The name of Embedding - embedding_dim (list): list of the size of each embedding vector - dtype (flow.dtype): the data type of embeddings - key_type (flow.dtype): the data type of feature ids - tables (list): list of table param which can be made by flow.one_embedding.make_table_options - store_options (dict): store option of Embedding - default_initializer (dict, optional): if tables param is None, use default_initializer to initialize table. Defaults to None. - - For example: - - .. code-block:: python - - >>> import oneflow as flow - >>> import numpy as np - >>> import oneflow.nn as nn - >>> # a simple example with 3 table, every table has two column, the first column embedding_size is 10 and the second is 1. 
- >>> # every table's first column initialize with uniform(-1/sqrt(table_size), 1/sqrt(table_size)), second column initialize with normal(0, 1/sqrt(table_size)) - >>> table_size_array = [39884407, 39043, 17289] - >>> vocab_size = sum(table_size_array) - >>> num_tables = len(table_size_array) - >>> embedding_size_list = [10, 1] - >>> scales = np.sqrt(1 / np.array(table_size_array)) - >>> tables = [ - >>> flow.one_embedding.make_table_options( - >>> [flow.one_embedding.make_column_options( - >>> flow.one_embedding.make_uniform_initializer(low=-scale, high=scale)), - >>> flow.one_embedding.make_column_options( - >>> flow.one_embedding.make_normal_initializer(mean=0, std=scale))] - >>> ) - >>> for scale in scales - >>> ] - >>> store_options = flow.one_embedding.make_cached_ssd_store_options( - >>> cache_budget_mb=8192, persistent_path="/your_path_to_ssd", capacity=vocab_size, - >>> ) - >>> embedding = flow.one_embedding.MultiTableMultiColumnEmbedding( - >>> name="my_embedding", - >>> embedding_dim=embedding_size_list, - >>> dtype=flow.float, - >>> key_type=flow.int64, - >>> tables=tables, - >>> store_options=store_options, - >>> ) - >>> embedding.to("cuda") - >>> mlp = flow.nn.FusedMLP( - >>> in_features=sum(embedding_size_list) * num_tables, - >>> hidden_features=[512, 256, 128], - >>> out_features=1, - >>> skip_final_activation=True, - >>> ) - >>> mlp.to("cuda") - >>> - >>> class TrainGraph(flow.nn.Graph): - >>> def __init__(self,): - >>> super().__init__() - >>> self.embedding_lookup = embedding - >>> self.mlp = mlp - >>> self.add_optimizer( - >>> flow.optim.SGD(self.embedding_lookup.parameters(), lr=0.1, momentum=0.0) - >>> ) - >>> self.add_optimizer( - >>> flow.optim.SGD(self.mlp.parameters(), lr=0.1, momentum=0.0) - >>> ) - >>> def build(self, ids): - >>> embedding = self.embedding_lookup(ids) - >>> loss = self.mlp(flow.reshape(embedding, (-1, num_tables * sum(embedding_size_list)))) - >>> loss = loss.sum() - >>> loss.backward() - >>> return loss - >>> ids = np.random.randint(0, 1000, (100, num_tables), dtype=np.int64) - >>> ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") - >>> graph = TrainGraph() - >>> loss = graph(ids_tensor) - >>> print(loss) - - """ - - def __init__( - self, - name, - embedding_dim, - dtype, - key_type, - tables, - store_options, - default_initializer=None, - ): - if isinstance(embedding_dim, (list, tuple)): - for dim in embedding_dim: - assert isinstance(dim, int) - else: - assert isinstance(embedding_dim, int) - - super().__init__( - name, - embedding_dim, - dtype, - key_type, - tables, - store_options, - default_initializer, - ) - - -def make_persistent_table_reader( - paths, snapshot_name, key_type, value_type, storage_dim, physical_block_size=512, -): - r"""Creates a reader for reading persistent table. - Args: - paths (list): paths of tables to read - snapshot_name (str): name of the snapshot to read - key_type (flow.dtype): the data type of key - value_type (flow.dtype): the data type of value - storage_dim (int): number of elements in each value - physical_block_size (int, optional): physical_block_size should be sector size. Defaults to 512 - """ - return PersistentTableReader( - paths, - snapshot_name, - key_type, - value_type, - storage_dim, - 4 * 1024, - physical_block_size, - ) - - -def make_persistent_table_writer( - paths, snapshot_name, key_type, value_type, storage_dim, physical_block_size=512, -): - r"""Creates a writer for writing persistent table. 
- Args: - paths (list): paths of tables to write - snapshot_name (str): name of the snapshot to write - key_type (flow.dtype): the data type of key - value_type (flow.dtype): the data type of value - storage_dim (int): number of elements in each value - physical_block_size (int, optional): physical_block_size should be sector size. Defaults to 512 - """ - return PersistentTableWriter( - paths, - snapshot_name, - key_type, - value_type, - storage_dim, - 4 * 1024, - physical_block_size, - ) + return make_table_options(initializer) diff --git a/python/oneflow/test/exceptions/test_error_msg.py b/python/oneflow/test/exceptions/test_error_msg.py index 1c59bc6c724..3e9cbff99c4 100644 --- a/python/oneflow/test/exceptions/test_error_msg.py +++ b/python/oneflow/test/exceptions/test_error_msg.py @@ -23,14 +23,14 @@ @flow.unittest.skip_unless_1n1d() class TestErrorMsg(flow.unittest.TestCase): def test_torch_error_msg(test_case): - with test_case.assertRaises(RuntimeError) as exp: + with test_case.assertRaises(flow._oneflow_internal.exception.Exception) as exp: F.pad(torch.randn(2, 2)) test_case.assertTrue("torch.Tensor" in str(exp.exception)) def test_numpy_error_msg(test_case): import numpy as np - with test_case.assertRaises(RuntimeError) as exp: + with test_case.assertRaises(flow._oneflow_internal.exception.Exception) as exp: F.pad(np.random.randn(2, 2)) test_case.assertTrue("numpy" in str(exp.exception)) diff --git a/python/oneflow/test/exceptions/test_reshape.py b/python/oneflow/test/exceptions/test_reshape.py index cb36028dc9b..57df5003101 100644 --- a/python/oneflow/test/exceptions/test_reshape.py +++ b/python/oneflow/test/exceptions/test_reshape.py @@ -30,7 +30,7 @@ def test_reshape_exception_only_one_dim_infered(test_case): x = flow.tensor((2, 2)) with test_case.assertRaises(RuntimeError) as ctx: y = x.reshape((-1, -1)) - test_case.assertTrue("only one dimension can be inferred" in str(ctx.exception)) + test_case.assertEqual("only one dimension can be inferred", str(ctx.exception)) if __name__ == "__main__": diff --git a/python/oneflow/test/expensive/pytorch_alexnet.py b/python/oneflow/test/expensive/pytorch_alexnet.py index a2831c02ef6..0f140f124e0 100644 --- a/python/oneflow/test/expensive/pytorch_alexnet.py +++ b/python/oneflow/test/expensive/pytorch_alexnet.py @@ -15,12 +15,18 @@ """ import torch import torch.nn as nn +from _internally_replaced_utils import load_state_dict_from_url from typing import Any __all__ = ["AlexNet", "alexnet"] +model_urls = { + "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth", +} + + class AlexNet(nn.Module): def __init__(self, num_classes: int = 1000) -> None: super(AlexNet, self).__init__() @@ -67,4 +73,7 @@ def alexnet(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> A progress (bool): If True, displays a progress bar of the download to stderr """ model = AlexNet(**kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls["alexnet"], progress=progress) + model.load_state_dict(state_dict) return model diff --git a/python/oneflow/test/expensive/pytorch_convmixer.py b/python/oneflow/test/expensive/pytorch_convmixer.py deleted file mode 100644 index 6c6e9d5dc68..00000000000 --- a/python/oneflow/test/expensive/pytorch_convmixer.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch.nn as nn - -__all__ = ["ConvMixer", "convmixer_768_32_relu"] - - -class Residual(nn.Module): - def __init__(self, fn): - super().__init__() - self.fn = fn - - def forward(self, x): - return self.fn(x) + x - - -def ConvMixer(dim, depth, kernel_size=9, patch_size=7, n_classes=1000): - return nn.Sequential( - nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size), - nn.GELU(), - nn.BatchNorm2d(dim), - *[ - nn.Sequential( - Residual( - nn.Sequential( - nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"), - nn.GELU(), - nn.BatchNorm2d(dim), - ) - ), - nn.Conv2d(dim, dim, kernel_size=1), - nn.GELU(), - nn.BatchNorm2d(dim), - ) - for i in range(depth) - ], - nn.AdaptiveAvgPool2d((1, 1)), - nn.Flatten(), - nn.Linear(dim, n_classes) - ) - - -def convmixer_768_32_relu(pretrained: bool = False, progress: bool = True, **kwargs): - """ - Constructs the ConvMixer model with 32 depth and 768 hidden size and ReLU activation layer. - .. note:: - ConvMixer model with 32 depth and 768 hidden size and ReLU activation layer from the `Patched Are All You Need? `_ paper. - Args: - pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False`` - progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True`` - """ - model = ConvMixer(768, 32, kernel_size=7, patch_size=7, n_classes=1000) - return model diff --git a/python/oneflow/test/expensive/pytorch_convnext.py b/python/oneflow/test/expensive/pytorch_convnext.py deleted file mode 100644 index 389d702e366..00000000000 --- a/python/oneflow/test/expensive/pytorch_convnext.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from timm.models.layers import trunc_normal_, DropPath - -__all__ = ["ConvNeXt", "convnext_tiny"] - - -class Block(nn.Module): - r""" ConvNeXt Block. There are two equivalent implementations: - (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) - (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back - We use (2) as we find it slightly faster in PyTorch - - Args: - dim (int): Number of input channels. - drop_path (float): Stochastic depth rate. Default: 0.0 - layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
- """ - - def __init__(self, dim, drop_path=0.0, layer_scale_init_value=1e-6): - super().__init__() - self.dwconv = nn.Conv2d( - dim, dim, kernel_size=7, padding=3, groups=dim - ) # depthwise conv - self.norm = LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear( - dim, 4 * dim - ) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(4 * dim, dim) - self.gamma = ( - nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True) - if layer_scale_init_value > 0 - else None - ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - - def forward(self, x): - input = x - x = self.dwconv(x) - x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) - - x = input + self.drop_path(x) - return x - - -class ConvNeXt(nn.Module): - r""" ConvNeXt - A PyTorch impl of : `A ConvNet for the 2020s` - - https://arxiv.org/pdf/2201.03545.pdf - Args: - in_chans (int): Number of input image channels. Default: 3 - num_classes (int): Number of classes for classification head. Default: 1000 - depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] - dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] - drop_path_rate (float): Stochastic depth rate. Default: 0. - layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. - head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. - """ - - def __init__( - self, - in_chans=3, - num_classes=1000, - depths=[3, 3, 9, 3], - dims=[96, 192, 384, 768], - drop_path_rate=0.0, - layer_scale_init_value=1e-6, - head_init_scale=1.0, - ): - super().__init__() - - self.downsample_layers = ( - nn.ModuleList() - ) # stem and 3 intermediate downsampling conv layers - stem = nn.Sequential( - nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), - LayerNorm(dims[0], eps=1e-6, data_format="channels_first"), - ) - self.downsample_layers.append(stem) - for i in range(3): - downsample_layer = nn.Sequential( - LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), - nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), - ) - self.downsample_layers.append(downsample_layer) - - self.stages = ( - nn.ModuleList() - ) # 4 feature resolution stages, each consisting of multiple residual blocks - dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] - cur = 0 - for i in range(4): - stage = nn.Sequential( - *[ - Block( - dim=dims[i], - drop_path=dp_rates[cur + j], - layer_scale_init_value=layer_scale_init_value, - ) - for j in range(depths[i]) - ] - ) - self.stages.append(stage) - cur += depths[i] - - self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer - self.head = nn.Linear(dims[-1], num_classes) - - self.apply(self._init_weights) - self.head.weight.data.mul_(head_init_scale) - self.head.bias.data.mul_(head_init_scale) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv2d, nn.Linear)): - trunc_normal_(m.weight, std=0.02) - nn.init.constant_(m.bias, 0) - - def forward_features(self, x): - for i in range(4): - x = self.downsample_layers[i](x) - x = self.stages[i](x) - return self.norm( - x.mean([-2, -1]) - ) # global average pooling, (N, C, H, W) -> (N, C) - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -class LayerNorm(nn.Module): - r""" 
LayerNorm that supports two data formats: channels_last (default) or channels_first. - The ordering of the dimensions in the inputs. channels_last corresponds to inputs with - shape (batch_size, height, width, channels) while channels_first corresponds to inputs - with shape (batch_size, channels, height, width). - """ - - def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): - super().__init__() - self.weight = nn.Parameter(torch.ones(normalized_shape)) - self.bias = nn.Parameter(torch.zeros(normalized_shape)) - self.eps = eps - self.data_format = data_format - if self.data_format not in ["channels_last", "channels_first"]: - raise NotImplementedError - self.normalized_shape = (normalized_shape,) - - def forward(self, x): - if self.data_format == "channels_last": - return F.layer_norm( - x, self.normalized_shape, self.weight, self.bias, self.eps - ) - elif self.data_format == "channels_first": - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -def convnext_tiny(pretrained=False, in_22k=False, **kwargs): - model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) - return model diff --git a/python/oneflow/test/expensive/pytorch_crossformer.py b/python/oneflow/test/expensive/pytorch_crossformer.py deleted file mode 100644 index 24982427ab7..00000000000 --- a/python/oneflow/test/expensive/pytorch_crossformer.py +++ /dev/null @@ -1,818 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import torch -import torch.nn as nn -import torch.utils.checkpoint as checkpoint -from timm.models.layers import DropPath, to_2tuple, trunc_normal_ - -__all__ = ["CrossFormer", "crossformer_tiny_patch4_group7_224"] - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class DynamicPosBias(nn.Module): - def __init__(self, dim, num_heads, residual): - super().__init__() - self.residual = residual - self.num_heads = num_heads - self.pos_dim = dim // 4 - self.pos_proj = nn.Linear(2, self.pos_dim) - self.pos1 = nn.Sequential( - nn.LayerNorm(self.pos_dim), - nn.ReLU(inplace=True), - nn.Linear(self.pos_dim, self.pos_dim), - ) - self.pos2 = nn.Sequential( - nn.LayerNorm(self.pos_dim), - nn.ReLU(inplace=True), - nn.Linear(self.pos_dim, self.pos_dim), - ) - self.pos3 = nn.Sequential( - nn.LayerNorm(self.pos_dim), - nn.ReLU(inplace=True), - nn.Linear(self.pos_dim, self.num_heads), - ) - - def forward(self, biases): - if self.residual: - pos = self.pos_proj(biases) # 2Wh-1 * 2Ww-1, heads - pos = pos + self.pos1(pos) - pos = pos + self.pos2(pos) - pos = self.pos3(pos) - else: - pos = self.pos3(self.pos2(self.pos1(self.pos_proj(biases)))) - return pos - - def flops(self, N): - flops = N * 2 * self.pos_dim - flops += N * self.pos_dim * self.pos_dim - flops += N * self.pos_dim * self.pos_dim - flops += N * self.pos_dim * self.num_heads - return flops - - -class Attention(nn.Module): - r""" Multi-head self attention module with dynamic position bias. - Args: - dim (int): Number of input channels. - group_size (tuple[int]): The height and width of the group. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 - """ - - def __init__( - self, - dim, - group_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - position_bias=True, - ): - - super().__init__() - self.dim = dim - self.group_size = group_size # Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - self.position_bias = position_bias - - if position_bias: - self.pos = DynamicPosBias(self.dim // 4, self.num_heads, residual=False) - - # generate mother-set - position_bias_h = torch.arange(1 - self.group_size[0], self.group_size[0]) - position_bias_w = torch.arange(1 - self.group_size[1], self.group_size[1]) - biases = torch.stack( - torch.meshgrid([position_bias_h, position_bias_w]) - ) # 2, 2Wh-1, 2W2-1 - biases = biases.flatten(1).transpose(0, 1).float() - self.register_buffer("biases", biases) - - # get pair-wise relative position index for each token inside the group - coords_h = torch.arange(self.group_size[0]) - coords_w = torch.arange(self.group_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute( - 1, 2, 0 - ).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.group_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.group_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.group_size[1] - 1 - relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - self.register_buffer("relative_position_index", relative_position_index) - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, mask=None): - """ - Args: - x: input features with shape of (num_groups*B, N, C) - mask: (0/-inf) mask with shape of (num_groups, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B_, N, 3, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - q, k, v = ( - qkv[0], - qkv[1], - qkv[2], - ) # make torchscript happy (cannot use tensor as tuple) - - q = q * self.scale - attn = q @ k.transpose(-2, -1) - - if self.position_bias: - pos = self.pos(self.biases) # 2Wh-1 * 2Ww-1, heads - # select position bias - relative_position_bias = pos[self.relative_position_index.view(-1)].view( - self.group_size[0] * self.group_size[1], - self.group_size[0] * self.group_size[1], - -1, - ) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute( - 2, 0, 1 - ).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze( - 1 - ).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - def extra_repr(self) -> str: - return ( - f"dim={self.dim}, group_size={self.group_size}, num_heads={self.num_heads}" - ) - - def flops(self, N): - # calculate flops for 1 group with token length of N - flops = 0 - # qkv = self.qkv(x) - flops += N * self.dim * 3 * self.dim - # attn = (q @ k.transpose(-2, -1)) - flops += self.num_heads * N * 
(self.dim // self.num_heads) * N - # x = (attn @ v) - flops += self.num_heads * N * N * (self.dim // self.num_heads) - # x = self.proj(x) - flops += N * self.dim * self.dim - if self.position_bias: - flops += self.pos.flops(N) - return flops - - -class CrossFormerBlock(nn.Module): - r""" CrossFormer Block. - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resulotion. - num_heads (int): Number of attention heads. - group_size (int): Group size. - lsda_flag (int): use SDA or LDA, 0 for SDA and 1 for LDA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Module, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__( - self, - dim, - input_resolution, - num_heads, - group_size=7, - lsda_flag=0, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - num_patch_size=1, - ): - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.num_heads = num_heads - self.group_size = group_size - self.lsda_flag = lsda_flag - self.mlp_ratio = mlp_ratio - self.num_patch_size = num_patch_size - if min(self.input_resolution) <= self.group_size: - # if group size is larger than input resolution, we don't partition groups - self.lsda_flag = 0 - self.group_size = min(self.input_resolution) - - self.norm1 = norm_layer(dim) - - self.attn = Attention( - dim, - group_size=to_2tuple(self.group_size), - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - position_bias=True, - ) - - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - - attn_mask = None - self.register_buffer("attn_mask", attn_mask) - - def forward(self, x): - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size %d, %d, %d" % (L, H, W) - - shortcut = x - x = self.norm1(x) - x = x.view(B, H, W, C) - - # group embeddings - G = self.group_size - if self.lsda_flag == 0: # 0 for SDA - x = x.reshape(B, H // G, G, W // G, G, C).permute(0, 1, 3, 2, 4, 5) - else: # 1 for LDA - x = x.reshape(B, G, H // G, G, W // G, C).permute(0, 2, 4, 1, 3, 5) - x = x.reshape(B * H * W // G ** 2, G ** 2, C) - - # multi-head self-attention - x = self.attn(x, mask=self.attn_mask) # nW*B, G*G, C - - # ungroup embeddings - x = x.reshape(B, H // G, W // G, G, G, C) - if self.lsda_flag == 0: - x = x.permute(0, 1, 3, 2, 4, 5).reshape(B, H, W, C) - else: - x = x.permute(0, 3, 1, 4, 2, 5).reshape(B, H, W, C) - x = x.view(B, H * W, C) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - def extra_repr(self) -> str: - return ( - f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " - f"group_size={self.group_size}, lsda_flag={self.lsda_flag}, 
mlp_ratio={self.mlp_ratio}" - ) - - def flops(self): - flops = 0 - H, W = self.input_resolution - # norm1 - flops += self.dim * H * W - # LSDA - nW = H * W / self.group_size / self.group_size - flops += nW * self.attn.flops(self.group_size * self.group_size) - # mlp - flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio - # norm2 - flops += self.dim * H * W - return flops - - -class PatchMerging(nn.Module): - r""" Patch Merging Layer. - Args: - input_resolution (tuple[int]): Resolution of input feature. - dim (int): Number of input channels. - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__( - self, - input_resolution, - dim, - norm_layer=nn.LayerNorm, - patch_size=[2], - num_input_patch_size=1, - ): - super().__init__() - self.input_resolution = input_resolution - self.dim = dim - self.reductions = nn.ModuleList() - self.patch_size = patch_size - self.norm = norm_layer(dim) - - for i, ps in enumerate(patch_size): - if i == len(patch_size) - 1: - out_dim = 2 * dim // 2 ** i - else: - out_dim = 2 * dim // 2 ** (i + 1) - stride = 2 - padding = (ps - stride) // 2 - self.reductions.append( - nn.Conv2d(dim, out_dim, kernel_size=ps, stride=stride, padding=padding) - ) - - def forward(self, x): - """ - x: B, H*W, C - """ - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." - - x = self.norm(x) - x = x.view(B, H, W, C).permute(0, 3, 1, 2) - - xs = [] - for i in range(len(self.reductions)): - tmp_x = self.reductions[i](x).flatten(2).transpose(1, 2) - xs.append(tmp_x) - x = torch.cat(xs, dim=2) - return x - - def extra_repr(self) -> str: - return f"input_resolution={self.input_resolution}, dim={self.dim}" - - def flops(self): - H, W = self.input_resolution - flops = H * W * self.dim - for i, ps in enumerate(self.patch_size): - if i == len(self.patch_size) - 1: - out_dim = 2 * self.dim // 2 ** i - else: - out_dim = 2 * self.dim // 2 ** (i + 1) - flops += (H // 2) * (W // 2) * ps * ps * out_dim * self.dim - return flops - - -class Stage(nn.Module): - """ CrossFormer blocks for one stage. - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resolution. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - group_size (int): variable G in the paper, one group has GxG embeddings - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
- """ - - def __init__( - self, - dim, - input_resolution, - depth, - num_heads, - group_size, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - norm_layer=nn.LayerNorm, - downsample=None, - use_checkpoint=False, - patch_size_end=[4], - num_patch_size=None, - ): - - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.ModuleList() - for i in range(depth): - lsda_flag = 0 if (i % 2 == 0) else 1 - self.blocks.append( - CrossFormerBlock( - dim=dim, - input_resolution=input_resolution, - num_heads=num_heads, - group_size=group_size, - lsda_flag=lsda_flag, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] - if isinstance(drop_path, list) - else drop_path, - norm_layer=norm_layer, - num_patch_size=num_patch_size, - ) - ) - - # patch merging layer - if downsample is not None: - self.downsample = downsample( - input_resolution, - dim=dim, - norm_layer=norm_layer, - patch_size=patch_size_end, - num_input_patch_size=num_patch_size, - ) - else: - self.downsample = None - - def forward(self, x): - for blk in self.blocks: - # if self.use_checkpoint: - # x = checkpoint.checkpoint(blk, x) - # else: - x = blk(x) - if self.downsample is not None: - x = self.downsample(x) - return x - - def extra_repr(self) -> str: - return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" - - def flops(self): - flops = 0 - for blk in self.blocks: - flops += blk.flops() - if self.downsample is not None: - flops += self.downsample.flops() - return flops - - -class PatchEmbed(nn.Module): - r""" Image to Patch Embedding - Args: - img_size (int): Image size. Default: 224. - patch_size (int): Patch token size. Default: [4]. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Module, optional): Normalization layer. Default: None - """ - - def __init__( - self, img_size=224, patch_size=[4], in_chans=3, embed_dim=96, norm_layer=None - ): - super().__init__() - img_size = to_2tuple(img_size) - # patch_size = to_2tuple(patch_size) - patches_resolution = [ - img_size[0] // patch_size[0], - img_size[0] // patch_size[0], - ] - self.img_size = img_size - self.patch_size = patch_size - self.patches_resolution = patches_resolution - self.num_patches = patches_resolution[0] * patches_resolution[1] - - self.in_chans = in_chans - self.embed_dim = embed_dim - - self.projs = nn.ModuleList() - for i, ps in enumerate(patch_size): - if i == len(patch_size) - 1: - dim = embed_dim // 2 ** i - else: - dim = embed_dim // 2 ** (i + 1) - stride = patch_size[0] - padding = (ps - patch_size[0]) // 2 - self.projs.append( - nn.Conv2d(in_chans, dim, kernel_size=ps, stride=stride, padding=padding) - ) - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
- xs = [] - for i in range(len(self.projs)): - tx = self.projs[i](x).flatten(2).transpose(1, 2) - xs.append(tx) # B Ph*Pw C - x = torch.cat(xs, dim=2) - if self.norm is not None: - x = self.norm(x) - return x - - def flops(self): - Ho, Wo = self.patches_resolution - flops = 0 - for i, ps in enumerate(self.patch_size): - if i == len(self.patch_size) - 1: - dim = self.embed_dim // 2 ** i - else: - dim = self.embed_dim // 2 ** (i + 1) - flops += ( - Ho - * Wo - * dim - * self.in_chans - * (self.patch_size[i] * self.patch_size[i]) - ) - if self.norm is not None: - flops += Ho * Wo * self.embed_dim - return flops - - -class CrossFormer(nn.Module): - r""" CrossFormer - A PyTorch impl of : `CrossFormer: A Versatile Vision Transformer Based on Cross-scale Attention` - - Args: - img_size (int | tuple(int)): Input image size. Default 224 - patch_size (int | tuple(int)): Patch size. Default: 4 - in_chans (int): Number of input image channels. Default: 3 - num_classes (int): Number of classes for classification head. Default: 1000 - embed_dim (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each stage. - num_heads (tuple(int)): Number of attention heads in different layers. - group_size (int): Group size. Default: 7 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. Default: True - use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False - """ - - def __init__( - self, - img_size=224, - patch_size=[4], - in_chans=3, - num_classes=1000, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - group_size=7, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.1, - norm_layer=nn.LayerNorm, - ape=False, - patch_norm=True, - use_checkpoint=False, - merge_size=[[2], [2], [2]], - **kwargs, - ): - super().__init__() - - self.num_classes = num_classes - self.num_layers = len(depths) - self.embed_dim = embed_dim - self.ape = ape - self.patch_norm = patch_norm - self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) - self.mlp_ratio = mlp_ratio - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None, - ) - num_patches = self.patch_embed.num_patches - patches_resolution = self.patch_embed.patches_resolution - self.patches_resolution = patches_resolution - - # absolute position embedding - if self.ape: - self.absolute_pos_embed = nn.Parameter( - torch.zeros(1, num_patches, embed_dim) - ) - trunc_normal_(self.absolute_pos_embed, std=0.02) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) - ] # stochastic depth decay rule - - # build layers - self.layers = nn.ModuleList() - - num_patch_sizes = [len(patch_size)] + [len(m) for m in merge_size] - for i_layer in range(self.num_layers): - patch_size_end = ( - merge_size[i_layer] if i_layer < self.num_layers - 1 else None - ) - num_patch_size = num_patch_sizes[i_layer] - layer = Stage( - dim=int(embed_dim * 2 ** i_layer), - input_resolution=( - patches_resolution[0] // (2 ** i_layer), - patches_resolution[1] // (2 ** i_layer), - ), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - group_size=group_size[i_layer], - mlp_ratio=self.mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, - use_checkpoint=use_checkpoint, - patch_size_end=patch_size_end, - num_patch_size=num_patch_size, - ) - self.layers.append(layer) - - self.norm = norm_layer(self.num_features) - self.avgpool = nn.AdaptiveAvgPool1d(1) - self.head = ( - nn.Linear(self.num_features, num_classes) - if num_classes > 0 - else nn.Identity() - ) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def no_weight_decay(self): - return {"absolute_pos_embed"} - - def no_weight_decay_keywords(self): - return {"relative_position_bias_table"} - - def forward_features(self, x): - x = self.patch_embed(x) - if self.ape: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) - - for layer in self.layers: - x = layer(x) - - x = self.norm(x) # B L C - x = self.avgpool(x.transpose(1, 2)) # B C 1 - x = torch.flatten(x, 1) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - def flops(self): - flops = 0 - flops += self.patch_embed.flops() - for i, layer in 
enumerate(self.layers): - flops += layer.flops() - flops += ( - self.num_features - * self.patches_resolution[0] - * self.patches_resolution[1] - // (2 ** self.num_layers) - ) - flops += self.num_features * self.num_classes - return flops - - -def _create_cross_former(arch, pretrained=False, progress=True, **model_kwargs): - model = CrossFormer(**model_kwargs) - return model - - -def crossformer_tiny_patch4_group7_224(pretrained=False, progress=True, **kwargs): - """ - Constructs CrossFormer-T 224x224 model. - .. note:: - CrossFormer-T 224x224 model from `"CrossFormer: A Versatile Vision Transformer Based on Cross-scale Attention" `_. - Args: - pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False`` - progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True`` - For example: - .. code-block:: python - >>> import flowvision - >>> crossformer_tiny_patch4_group7_224 = flowvision.models.crossformer_tiny_patch4_group7_224(pretrained=False, progress=True) - """ - model_kwargs = dict( - img_size=224, - patch_size=(4, 8, 16, 32), - embed_dim=64, - depths=(1, 1, 8, 6), - num_heads=(2, 4, 8, 16), - group_size=(7, 7, 7, 7), - merge_size=((2, 4), (2, 4), (2, 4)), - drop_path_rate=0.1, - **kwargs, - ) - return _create_cross_former( - "crossformer_tiny_patch4_group7_224", - pretrained=pretrained, - progress=progress, - **model_kwargs, - ) diff --git a/python/oneflow/test/expensive/pytorch_densenet.py b/python/oneflow/test/expensive/pytorch_densenet.py deleted file mode 100644 index 49553ee9727..00000000000 --- a/python/oneflow/test/expensive/pytorch_densenet.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
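# A dependency-free trace (not part of the original files) of how the
# CrossFormer constructor above derives per-stage widths and resolutions for
# the crossformer_tiny_patch4_group7_224 configuration shown above
# (img_size=224, patch_size[0]=4, embed_dim=64, four stages):
img_size, base_patch, embed_dim, num_layers = 224, 4, 64, 4
res0 = img_size // base_patch                          # 56 x 56 patch grid
stage_dims = [embed_dim * 2 ** i for i in range(num_layers)]
stage_res = [res0 // 2 ** i for i in range(num_layers)]
num_features = embed_dim * 2 ** (num_layers - 1)
assert stage_dims == [64, 128, 256, 512]
assert stage_res == [56, 28, 14, 7]
assert num_features == 512                             # width fed to the classification head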
-""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor - -from collections import OrderedDict -from typing import Any, List, Tuple - -__all__ = [ - "DenseNet", - "densenet121", -] - - -class _DenseLayer(nn.Module): - def __init__( - self, - num_input_features: int, - growth_rate: int, - bn_size: int, - drop_rate: float, - memory_efficient: bool = False, - ) -> None: - super().__init__() - self.norm1: nn.BatchNorm2d - self.add_module("norm1", nn.BatchNorm2d(num_input_features)) - self.relu1: nn.ReLU - self.add_module("relu1", nn.ReLU(inplace=True)) - self.conv1: nn.Conv2d - self.add_module( - "conv1", - nn.Conv2d( - num_input_features, - bn_size * growth_rate, - kernel_size=1, - stride=1, - bias=False, - ), - ) - self.norm2: nn.BatchNorm2d - self.add_module("norm2", nn.BatchNorm2d(bn_size * growth_rate)) - self.relu2: nn.ReLU - self.add_module("relu2", nn.ReLU(inplace=True)) - self.conv2: nn.Conv2d - self.add_module( - "conv2", - nn.Conv2d( - bn_size * growth_rate, - growth_rate, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ), - ) - self.drop_rate = float(drop_rate) - self.memory_efficient = memory_efficient - - def bn_function(self, inputs: List[Tensor]) -> Tensor: - concated_features = torch.cat(inputs, 1) - bottleneck_output = self.conv1( - self.relu1(self.norm1(concated_features)) - ) # noqa: T484 - return bottleneck_output - - # todo: rewrite when torchscript supports any - def any_requires_grad(self, input: List[Tensor]) -> bool: - for tensor in input: - if tensor.requires_grad: - return True - return False - - # torchscript does not yet support *args, so we overload method - # allowing it to take either a List[Tensor] or single Tensor - def forward(self, input: Tensor) -> Tensor: # noqa: F811 - if isinstance(input, Tensor): - prev_features = [input] - else: - prev_features = input - - if self.memory_efficient and self.any_requires_grad(prev_features): - if torch.jit.is_scripting(): - raise Exception("Memory Efficient not supported in JIT") - - bottleneck_output = self.call_checkpoint_bottleneck(prev_features) - else: - bottleneck_output = self.bn_function(prev_features) - - new_features = self.conv2(self.relu2(self.norm2(bottleneck_output))) - if self.drop_rate > 0: - new_features = F.dropout( - new_features, p=self.drop_rate, training=self.training - ) - return new_features - - -class _DenseBlock(nn.ModuleDict): - _version = 2 - - def __init__( - self, - num_layers: int, - num_input_features: int, - bn_size: int, - growth_rate: int, - drop_rate: float, - memory_efficient: bool = False, - ) -> None: - super().__init__() - for i in range(num_layers): - layer = _DenseLayer( - num_input_features + i * growth_rate, - growth_rate=growth_rate, - bn_size=bn_size, - drop_rate=drop_rate, - memory_efficient=memory_efficient, - ) - self.add_module("denselayer%d" % (i + 1), layer) - - def forward(self, init_features: Tensor) -> Tensor: - features = [init_features] - for name, layer in self.items(): - new_features = layer(features) - features.append(new_features) - return torch.cat(features, 1) - - -class _Transition(nn.Sequential): - def __init__(self, num_input_features: int, num_output_features: int) -> None: - super().__init__() - self.add_module("norm", nn.BatchNorm2d(num_input_features)) - self.add_module("relu", nn.ReLU(inplace=True)) - self.add_module( - "conv", - nn.Conv2d( - num_input_features, - num_output_features, - kernel_size=1, - stride=1, - bias=False, - ), - ) - self.add_module("pool", nn.AvgPool2d(kernel_size=2, 
stride=2)) - - -class DenseNet(nn.Module): - r"""Densenet-BC model class, based on - `"Densely Connected Convolutional Networks" `_. - Args: - growth_rate (int) - how many filters to add each layer (`k` in paper) - block_config (list of 4 ints) - how many layers in each pooling block - num_init_features (int) - the number of filters to learn in the first convolution layer - bn_size (int) - multiplicative factor for number of bottle neck layers - (i.e. bn_size * k features in the bottleneck layer) - drop_rate (float) - dropout rate after each dense layer - num_classes (int) - number of classification classes - memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, - but slower. Default: *False*. See `"paper" `_. - """ - - def __init__( - self, - growth_rate: int = 32, - block_config: Tuple[int, int, int, int] = (6, 12, 24, 16), - num_init_features: int = 64, - bn_size: int = 4, - drop_rate: float = 0, - num_classes: int = 1000, - memory_efficient: bool = False, - ) -> None: - - super().__init__() - - # First convolution - self.features = nn.Sequential( - OrderedDict( - [ - ( - "conv0", - nn.Conv2d( - 3, - num_init_features, - kernel_size=7, - stride=2, - padding=3, - bias=False, - ), - ), - ("norm0", nn.BatchNorm2d(num_init_features)), - ("relu0", nn.ReLU(inplace=True)), - ("pool0", nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), - ] - ) - ) - - # Each denseblock - num_features = num_init_features - for i, num_layers in enumerate(block_config): - block = _DenseBlock( - num_layers=num_layers, - num_input_features=num_features, - bn_size=bn_size, - growth_rate=growth_rate, - drop_rate=drop_rate, - memory_efficient=memory_efficient, - ) - self.features.add_module("denseblock%d" % (i + 1), block) - num_features = num_features + num_layers * growth_rate - if i != len(block_config) - 1: - trans = _Transition( - num_input_features=num_features, - num_output_features=num_features // 2, - ) - self.features.add_module("transition%d" % (i + 1), trans) - num_features = num_features // 2 - - # Final batch norm - self.features.add_module("norm5", nn.BatchNorm2d(num_features)) - - # Linear layer - self.classifier = nn.Linear(num_features, num_classes) - - # Official init from torch repo. - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.constant_(m.bias, 0) - - def forward(self, x: Tensor) -> Tensor: - features = self.features(x) - out = F.relu(features, inplace=True) - out = F.adaptive_avg_pool2d(out, (1, 1)) - out = torch.flatten(out, 1) - out = self.classifier(out) - return out - - -def _densenet( - growth_rate: int, - block_config: Tuple[int, int, int, int], - num_init_features: int, - progress: bool, - **kwargs: Any, -) -> DenseNet: - model = DenseNet(growth_rate, block_config, num_init_features, **kwargs) - return model - - -def densenet121(progress: bool = True, **kwargs: Any) -> DenseNet: - r"""Densenet-121 model from - `"Densely Connected Convolutional Networks" `_. - The required minimum input size of the model is 29x29. - Args: - weights (DenseNet121_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, - but slower. Default: *False*. See `"paper" `_. 
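# An illustrative sketch (not part of the original file) of the channel
# bookkeeping behind the densenet121 configuration below (growth_rate=32,
# block_config=(6, 12, 24, 16), num_init_features=64); it shows why the
# final classifier is nn.Linear(1024, num_classes):
def _densenet_feature_count(growth_rate, block_config, num_init_features):
    num_features = num_init_features
    for i, num_layers in enumerate(block_config):
        num_features += num_layers * growth_rate   # each dense block adds k channels per layer
        if i != len(block_config) - 1:
            num_features //= 2                     # each transition halves channels
    return num_features

assert _densenet_feature_count(32, (6, 12, 24, 16), 64) == 1024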
- """ - - return _densenet(32, (6, 12, 24, 16), 64, progress, **kwargs) diff --git a/python/oneflow/test/expensive/pytorch_efficientnet.py b/python/oneflow/test/expensive/pytorch_efficientnet.py deleted file mode 100644 index 12b254abc09..00000000000 --- a/python/oneflow/test/expensive/pytorch_efficientnet.py +++ /dev/null @@ -1,617 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -from torch import nn, Tensor -from torchvision.ops import StochasticDepth - -import copy -import math -import warnings -from dataclasses import dataclass -from functools import partial -from typing import Any, Callable, Optional, List, Sequence, Tuple, Union - -__all__ = [ - "EfficientNet", - "efficientnet_b0", -] - - -class SqueezeExcitation(torch.nn.Module): - """ - This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). - Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. - Args: - input_channels (int): Number of channels in the input image - squeeze_channels (int): Number of squeeze channels - activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU`` - scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. 
Default: ``torch.nn.Sigmoid`` - """ - - def __init__( - self, - input_channels: int, - squeeze_channels: int, - activation: Callable[..., torch.nn.Module] = torch.nn.ReLU, - scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid, - ) -> None: - super().__init__() - self.avgpool = torch.nn.AdaptiveAvgPool2d(1) - self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1) - self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1) - self.activation = activation() - self.scale_activation = scale_activation() - - def _scale(self, input: Tensor) -> Tensor: - scale = self.avgpool(input) - scale = self.fc1(scale) - scale = self.activation(scale) - scale = self.fc2(scale) - return self.scale_activation(scale) - - def forward(self, input: Tensor) -> Tensor: - scale = self._scale(input) - return scale * input - - -class ConvNormActivation(torch.nn.Sequential): - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int = 3, - stride: int = 1, - padding: Optional[int] = None, - groups: int = 1, - norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d, - activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU, - dilation: int = 1, - inplace: Optional[bool] = True, - bias: Optional[bool] = None, - conv_layer: Callable[..., torch.nn.Module] = torch.nn.Conv2d, - ) -> None: - - if padding is None: - padding = (kernel_size - 1) // 2 * dilation - if bias is None: - bias = norm_layer is None - - layers = [ - conv_layer( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=dilation, - groups=groups, - bias=bias, - ) - ] - - if norm_layer is not None: - layers.append(norm_layer(out_channels)) - - if activation_layer is not None: - params = {} if inplace is None else {"inplace": inplace} - layers.append(activation_layer(**params)) - super().__init__(*layers) - self.out_channels = out_channels - - if self.__class__ == ConvNormActivation: - warnings.warn( - "Don't use ConvNormActivation directly, please use Conv2dNormActivation and Conv3dNormActivation instead." - ) - - -class Conv2dNormActivation(ConvNormActivation): - """ - Configurable block used for Convolution2d-Normalization-Activation blocks. - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block - kernel_size: (int, optional): Size of the convolving kernel. Default: 3 - stride (int, optional): Stride of the convolution. Default: 1 - padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` - groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 - norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm2d`` - activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU`` - dilation (int): Spacing between kernel elements. Default: 1 - inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True`` - bias (bool, optional): Whether to use bias in the convolution layer. 
By default, biases are included if ``norm_layer is None``. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int = 3, - stride: int = 1, - padding: Optional[int] = None, - groups: int = 1, - norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d, - activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU, - dilation: int = 1, - inplace: Optional[bool] = True, - bias: Optional[bool] = None, - ) -> None: - - super().__init__( - in_channels, - out_channels, - kernel_size, - stride, - padding, - groups, - norm_layer, - activation_layer, - dilation, - inplace, - bias, - torch.nn.Conv2d, - ) - - -def _make_divisible(v, divisor=8, min_value=None, round_limit=0.9): - min_value = min_value or divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < round_limit * v: - new_v += divisor - return new_v - - -@dataclass -class _MBConvConfig: - expand_ratio: float - kernel: int - stride: int - input_channels: int - out_channels: int - num_layers: int - block: Callable[..., nn.Module] - - @staticmethod - def adjust_channels( - channels: int, width_mult: float, min_value: Optional[int] = None - ) -> int: - return _make_divisible(channels * width_mult, 8, min_value) - - -class MBConvConfig(_MBConvConfig): - # Stores information listed at Table 1 of the EfficientNet paper & Table 4 of the EfficientNetV2 paper - def __init__( - self, - expand_ratio: float, - kernel: int, - stride: int, - input_channels: int, - out_channels: int, - num_layers: int, - width_mult: float = 1.0, - depth_mult: float = 1.0, - block: Optional[Callable[..., nn.Module]] = None, - ) -> None: - input_channels = self.adjust_channels(input_channels, width_mult) - out_channels = self.adjust_channels(out_channels, width_mult) - num_layers = self.adjust_depth(num_layers, depth_mult) - if block is None: - block = MBConv - super().__init__( - expand_ratio, - kernel, - stride, - input_channels, - out_channels, - num_layers, - block, - ) - - @staticmethod - def adjust_depth(num_layers: int, depth_mult: float): - return int(math.ceil(num_layers * depth_mult)) - - -class FusedMBConvConfig(_MBConvConfig): - # Stores information listed at Table 4 of the EfficientNetV2 paper - def __init__( - self, - expand_ratio: float, - kernel: int, - stride: int, - input_channels: int, - out_channels: int, - num_layers: int, - block: Optional[Callable[..., nn.Module]] = None, - ) -> None: - if block is None: - block = FusedMBConv - super().__init__( - expand_ratio, - kernel, - stride, - input_channels, - out_channels, - num_layers, - block, - ) - - -class MBConv(nn.Module): - def __init__( - self, - cnf: MBConvConfig, - stochastic_depth_prob: float, - norm_layer: Callable[..., nn.Module], - se_layer: Callable[..., nn.Module] = SqueezeExcitation, - ) -> None: - super().__init__() - - if not (1 <= cnf.stride <= 2): - raise ValueError("illegal stride value") - - self.use_res_connect = ( - cnf.stride == 1 and cnf.input_channels == cnf.out_channels - ) - - layers: List[nn.Module] = [] - activation_layer = nn.SiLU - - # expand - expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio) - if expanded_channels != cnf.input_channels: - layers.append( - Conv2dNormActivation( - cnf.input_channels, - expanded_channels, - kernel_size=1, - norm_layer=norm_layer, - activation_layer=activation_layer, - ) - ) - - # depthwise - layers.append( - Conv2dNormActivation( - expanded_channels, - 
expanded_channels, - kernel_size=cnf.kernel, - stride=cnf.stride, - groups=expanded_channels, - norm_layer=norm_layer, - activation_layer=activation_layer, - ) - ) - - # squeeze and excitation - squeeze_channels = max(1, cnf.input_channels // 4) - layers.append( - se_layer( - expanded_channels, - squeeze_channels, - activation=partial(nn.SiLU, inplace=True), - ) - ) - - # project - layers.append( - Conv2dNormActivation( - expanded_channels, - cnf.out_channels, - kernel_size=1, - norm_layer=norm_layer, - activation_layer=None, - ) - ) - - self.block = nn.Sequential(*layers) - self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") - self.out_channels = cnf.out_channels - - def forward(self, input: Tensor) -> Tensor: - result = self.block(input) - if self.use_res_connect: - result = self.stochastic_depth(result) - result += input - return result - - -class FusedMBConv(nn.Module): - def __init__( - self, - cnf: FusedMBConvConfig, - stochastic_depth_prob: float, - norm_layer: Callable[..., nn.Module], - ) -> None: - super().__init__() - - if not (1 <= cnf.stride <= 2): - raise ValueError("illegal stride value") - - self.use_res_connect = ( - cnf.stride == 1 and cnf.input_channels == cnf.out_channels - ) - - layers: List[nn.Module] = [] - activation_layer = nn.SiLU - - expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio) - if expanded_channels != cnf.input_channels: - # fused expand - layers.append( - Conv2dNormActivation( - cnf.input_channels, - expanded_channels, - kernel_size=cnf.kernel, - stride=cnf.stride, - norm_layer=norm_layer, - activation_layer=activation_layer, - ) - ) - - # project - layers.append( - Conv2dNormActivation( - expanded_channels, - cnf.out_channels, - kernel_size=1, - norm_layer=norm_layer, - activation_layer=None, - ) - ) - else: - layers.append( - Conv2dNormActivation( - cnf.input_channels, - cnf.out_channels, - kernel_size=cnf.kernel, - stride=cnf.stride, - norm_layer=norm_layer, - activation_layer=activation_layer, - ) - ) - - self.block = nn.Sequential(*layers) - self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row") - self.out_channels = cnf.out_channels - - def forward(self, input: Tensor) -> Tensor: - result = self.block(input) - if self.use_res_connect: - result = self.stochastic_depth(result) - result += input - return result - - -class EfficientNet(nn.Module): - def __init__( - self, - inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]], - dropout: float, - stochastic_depth_prob: float = 0.2, - num_classes: int = 1000, - norm_layer: Optional[Callable[..., nn.Module]] = None, - last_channel: Optional[int] = None, - **kwargs: Any, - ) -> None: - """ - EfficientNet V1 and V2 main class - Args: - inverted_residual_setting (Sequence[Union[MBConvConfig, FusedMBConvConfig]]): Network structure - dropout (float): The droupout probability - stochastic_depth_prob (float): The stochastic depth probability - num_classes (int): Number of classes - norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use - last_channel (int): The number of channels on the penultimate layer - """ - super().__init__() - if not inverted_residual_setting: - raise ValueError("The inverted_residual_setting should not be empty") - elif not ( - isinstance(inverted_residual_setting, Sequence) - and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting]) - ): - raise TypeError( - "The inverted_residual_setting should be List[MBConvConfig]" - ) - - if "block" in 
kwargs: - warnings.warn( - "The parameter 'block' is deprecated since 0.13 and will be removed 0.15. " - "Please pass this information on 'MBConvConfig.block' instead." - ) - if kwargs["block"] is not None: - for s in inverted_residual_setting: - if isinstance(s, MBConvConfig): - s.block = kwargs["block"] - - if norm_layer is None: - norm_layer = nn.BatchNorm2d - - layers: List[nn.Module] = [] - - # building first layer - firstconv_output_channels = inverted_residual_setting[0].input_channels - layers.append( - Conv2dNormActivation( - 3, - firstconv_output_channels, - kernel_size=3, - stride=2, - norm_layer=norm_layer, - activation_layer=nn.SiLU, - ) - ) - - # building inverted residual blocks - total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting) - stage_block_id = 0 - for cnf in inverted_residual_setting: - stage: List[nn.Module] = [] - for _ in range(cnf.num_layers): - # copy to avoid modifications. shallow copy is enough - block_cnf = copy.copy(cnf) - - # overwrite info if not the first conv in the stage - if stage: - block_cnf.input_channels = block_cnf.out_channels - block_cnf.stride = 1 - - # adjust stochastic depth probability based on the depth of the stage block - sd_prob = ( - stochastic_depth_prob * float(stage_block_id) / total_stage_blocks - ) - - stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer)) - stage_block_id += 1 - - layers.append(nn.Sequential(*stage)) - - # building last several layers - lastconv_input_channels = inverted_residual_setting[-1].out_channels - lastconv_output_channels = ( - last_channel if last_channel is not None else 4 * lastconv_input_channels - ) - layers.append( - Conv2dNormActivation( - lastconv_input_channels, - lastconv_output_channels, - kernel_size=1, - norm_layer=norm_layer, - activation_layer=nn.SiLU, - ) - ) - - self.features = nn.Sequential(*layers) - self.avgpool = nn.AdaptiveAvgPool2d(1) - self.classifier = nn.Sequential( - nn.Dropout(p=dropout, inplace=True), - nn.Linear(lastconv_output_channels, num_classes), - ) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - init_range = 1.0 / math.sqrt(m.out_features) - nn.init.uniform_(m.weight, -init_range, init_range) - nn.init.zeros_(m.bias) - - def _forward_impl(self, x: Tensor) -> Tensor: - x = self.features(x) - - x = self.avgpool(x) - x = torch.flatten(x, 1) - - x = self.classifier(x) - - return x - - def forward(self, x: Tensor) -> Tensor: - return self._forward_impl(x) - - -def _efficientnet( - inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]], - dropout: float, - last_channel: Optional[int], - progress: bool, - **kwargs: Any, -) -> EfficientNet: - model = EfficientNet( - inverted_residual_setting, dropout, last_channel=last_channel, **kwargs - ) - return model - - -def _efficientnet_conf( - arch: str, **kwargs: Any, -) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]: - inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]] - if arch.startswith("efficientnet_b"): - bneck_conf = partial( - MBConvConfig, - width_mult=kwargs.pop("width_mult"), - depth_mult=kwargs.pop("depth_mult"), - ) - inverted_residual_setting = [ - bneck_conf(1, 3, 1, 32, 16, 1), - bneck_conf(6, 3, 2, 16, 24, 2), - bneck_conf(6, 5, 2, 24, 40, 2), - bneck_conf(6, 3, 
2, 40, 80, 3), - bneck_conf(6, 5, 1, 80, 112, 3), - bneck_conf(6, 5, 2, 112, 192, 4), - bneck_conf(6, 3, 1, 192, 320, 1), - ] - last_channel = None - elif arch.startswith("efficientnet_v2_s"): - inverted_residual_setting = [ - FusedMBConvConfig(1, 3, 1, 24, 24, 2), - FusedMBConvConfig(4, 3, 2, 24, 48, 4), - FusedMBConvConfig(4, 3, 2, 48, 64, 4), - MBConvConfig(4, 3, 2, 64, 128, 6), - MBConvConfig(6, 3, 1, 128, 160, 9), - MBConvConfig(6, 3, 2, 160, 256, 15), - ] - last_channel = 1280 - elif arch.startswith("efficientnet_v2_m"): - inverted_residual_setting = [ - FusedMBConvConfig(1, 3, 1, 24, 24, 3), - FusedMBConvConfig(4, 3, 2, 24, 48, 5), - FusedMBConvConfig(4, 3, 2, 48, 80, 5), - MBConvConfig(4, 3, 2, 80, 160, 7), - MBConvConfig(6, 3, 1, 160, 176, 14), - MBConvConfig(6, 3, 2, 176, 304, 18), - MBConvConfig(6, 3, 1, 304, 512, 5), - ] - last_channel = 1280 - elif arch.startswith("efficientnet_v2_l"): - inverted_residual_setting = [ - FusedMBConvConfig(1, 3, 1, 32, 32, 4), - FusedMBConvConfig(4, 3, 2, 32, 64, 7), - FusedMBConvConfig(4, 3, 2, 64, 96, 7), - MBConvConfig(4, 3, 2, 96, 192, 10), - MBConvConfig(6, 3, 1, 192, 224, 19), - MBConvConfig(6, 3, 2, 224, 384, 25), - MBConvConfig(6, 3, 1, 384, 640, 7), - ] - last_channel = 1280 - else: - raise ValueError(f"Unsupported model type {arch}") - - return inverted_residual_setting, last_channel - - -def efficientnet_b0(progress: bool = True, **kwargs: Any) -> EfficientNet: - """ - Constructs a EfficientNet B0 architecture from - `"EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" `_. - Args: - weights (EfficientNet_B0_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - """ - - inverted_residual_setting, last_channel = _efficientnet_conf( - "efficientnet_b0", width_mult=1.0, depth_mult=1.0 - ) - return _efficientnet( - inverted_residual_setting, 0.2, last_channel, progress, **kwargs - ) diff --git a/python/oneflow/test/expensive/pytorch_ghostnet.py b/python/oneflow/test/expensive/pytorch_ghostnet.py deleted file mode 100644 index 2160891e4db..00000000000 --- a/python/oneflow/test/expensive/pytorch_ghostnet.py +++ /dev/null @@ -1,225 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -import math - - -__all__ = ["ghost_net"] - - -def _make_divisible(v, divisor, min_value=None): - """ - This function is taken from the original tf repo. - It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
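    # Worked example (added for clarity, not in the original file): with the
    # divisor=4 used throughout GhostNet, _make_divisible(16 * 1.0, 4) gives
    # int(16 + 2) // 4 * 4 = 16; the guard below only bumps the result up by
    # one divisor when flooring would lose more than 10% of v (e.g. v=17.9
    # floors to 16 < 0.9 * 17.9, so it becomes 20). The EfficientNet helper
    # of the same name earlier in this patch applies the same rule with
    # divisor=8.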
- if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class SELayer(nn.Module): - def __init__(self, channel, reduction=4): - super(SELayer, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Sequential( - nn.Linear(channel, channel // reduction), - nn.ReLU(inplace=True), - nn.Linear(channel // reduction, channel), - ) - - def forward(self, x): - b, c, _, _ = x.size() - y = self.avg_pool(x).view(b, c) - y = self.fc(y).view(b, c, 1, 1) - y = torch.clamp(y, 0, 1) - return x * y - - -def depthwise_conv(inp, oup, kernel_size=3, stride=1, relu=False): - return nn.Sequential( - nn.Conv2d( - inp, oup, kernel_size, stride, kernel_size // 2, groups=inp, bias=False - ), - nn.BatchNorm2d(oup), - nn.ReLU(inplace=True) if relu else nn.Sequential(), - ) - - -class GhostModule(nn.Module): - def __init__( - self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True - ): - super(GhostModule, self).__init__() - self.oup = oup - init_channels = math.ceil(oup / ratio) - new_channels = init_channels * (ratio - 1) - - self.primary_conv = nn.Sequential( - nn.Conv2d( - inp, init_channels, kernel_size, stride, kernel_size // 2, bias=False - ), - nn.BatchNorm2d(init_channels), - nn.ReLU(inplace=True) if relu else nn.Sequential(), - ) - - self.cheap_operation = nn.Sequential( - nn.Conv2d( - init_channels, - new_channels, - dw_size, - 1, - dw_size // 2, - groups=init_channels, - bias=False, - ), - nn.BatchNorm2d(new_channels), - nn.ReLU(inplace=True) if relu else nn.Sequential(), - ) - - def forward(self, x): - x1 = self.primary_conv(x) - x2 = self.cheap_operation(x1) - out = torch.cat([x1, x2], dim=1) - return out[:, : self.oup, :, :] - - -class GhostBottleneck(nn.Module): - def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se): - super(GhostBottleneck, self).__init__() - assert stride in [1, 2] - - self.conv = nn.Sequential( - # pw - GhostModule(inp, hidden_dim, kernel_size=1, relu=True), - # dw - depthwise_conv(hidden_dim, hidden_dim, kernel_size, stride, relu=False) - if stride == 2 - else nn.Sequential(), - # Squeeze-and-Excite - SELayer(hidden_dim) if use_se else nn.Sequential(), - # pw-linear - GhostModule(hidden_dim, oup, kernel_size=1, relu=False), - ) - - if stride == 1 and inp == oup: - self.shortcut = nn.Sequential() - else: - self.shortcut = nn.Sequential( - depthwise_conv(inp, inp, kernel_size, stride, relu=False), - nn.Conv2d(inp, oup, 1, 1, 0, bias=False), - nn.BatchNorm2d(oup), - ) - - def forward(self, x): - return self.conv(x) + self.shortcut(x) - - -class GhostNet(nn.Module): - def __init__(self, cfgs, num_classes=1000, width_mult=1.0): - super(GhostNet, self).__init__() - # setting of inverted residual blocks - self.cfgs = cfgs - - # building first layer - output_channel = _make_divisible(16 * width_mult, 4) - layers = [ - nn.Sequential( - nn.Conv2d(3, output_channel, 3, 2, 1, bias=False), - nn.BatchNorm2d(output_channel), - nn.ReLU(inplace=True), - ) - ] - input_channel = output_channel - - # building inverted residual blocks - block = GhostBottleneck - for k, exp_size, c, use_se, s in self.cfgs: - output_channel = _make_divisible(c * width_mult, 4) - hidden_channel = _make_divisible(exp_size * width_mult, 4) - layers.append( - block(input_channel, hidden_channel, output_channel, k, s, use_se) - ) - input_channel = output_channel - self.features = nn.Sequential(*layers) - - # building last several layers - output_channel = _make_divisible(exp_size * width_mult, 4) - self.squeeze = nn.Sequential( - nn.Conv2d(input_channel, output_channel, 
1, 1, 0, bias=False), - nn.BatchNorm2d(output_channel), - nn.ReLU(inplace=True), - nn.AdaptiveAvgPool2d((1, 1)), - ) - input_channel = output_channel - - output_channel = 1280 - self.classifier = nn.Sequential( - nn.Linear(input_channel, output_channel, bias=False), - nn.BatchNorm1d(output_channel), - nn.ReLU(inplace=True), - nn.Dropout(0.2), - nn.Linear(output_channel, num_classes), - ) - - self._initialize_weights() - - def forward(self, x): - x = self.features(x) - x = self.squeeze(x) - x = x.view(x.size(0), -1) - x = self.classifier(x) - return x - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - -def ghost_net(**kwargs): - """ - Constructs a GhostNet model - """ - cfgs = [ - # k, t, c, SE, s - [3, 16, 16, 0, 1], - [3, 48, 24, 0, 2], - [3, 72, 24, 0, 1], - [5, 72, 40, 1, 2], - [5, 120, 40, 1, 1], - [3, 240, 80, 0, 2], - [3, 200, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 480, 112, 1, 1], - [3, 672, 112, 1, 1], - [5, 672, 160, 1, 2], - [5, 960, 160, 0, 1], - [5, 960, 160, 1, 1], - [5, 960, 160, 0, 1], - [5, 960, 160, 1, 1], - ] - return GhostNet(cfgs, **kwargs) diff --git a/python/oneflow/test/expensive/pytorch_googlenet.py b/python/oneflow/test/expensive/pytorch_googlenet.py deleted file mode 100644 index a98054862e1..00000000000 --- a/python/oneflow/test/expensive/pytorch_googlenet.py +++ /dev/null @@ -1,276 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor - -import warnings -from typing import Optional, Tuple, List, Callable, Any - -__all__ = ["GoogLeNet", "googlenet"] - - -class GoogLeNet(nn.Module): - __constants__ = ["aux_logits", "transform_input"] - - def __init__( - self, - num_classes: int = 1000, - aux_logits: bool = True, - transform_input: bool = False, - init_weights: Optional[bool] = None, - blocks: Optional[List[Callable[..., nn.Module]]] = None, - dropout: float = 0.2, - dropout_aux: float = 0.7, - ) -> None: - super().__init__() - if blocks is None: - blocks = [BasicConv2d, Inception, InceptionAux] - if init_weights is None: - warnings.warn( - "The default weight initialization of GoogleNet will be changed in future releases of " - "torchvision. 
If you wish to keep the old behavior (which leads to long initialization times" - " due to scipy/scipy#11299), please set init_weights=True.", - FutureWarning, - ) - init_weights = True - if len(blocks) != 3: - raise ValueError(f"blocks length should be 3 instead of {len(blocks)}") - conv_block = blocks[0] - inception_block = blocks[1] - inception_aux_block = blocks[2] - - self.aux_logits = aux_logits - self.transform_input = transform_input - - self.conv1 = conv_block(3, 64, kernel_size=7, stride=2, padding=3) - self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True) - self.conv2 = conv_block(64, 64, kernel_size=1) - self.conv3 = conv_block(64, 192, kernel_size=3, padding=1) - self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True) - - self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32) - self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64) - self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True) - - self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64) - self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64) - self.inception4c = inception_block(512, 128, 128, 256, 24, 64, 64) - self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64) - self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128) - self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) - - self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128) - self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128) - - if aux_logits: - self.aux1 = inception_aux_block(512, num_classes, dropout=dropout_aux) - self.aux2 = inception_aux_block(528, num_classes, dropout=dropout_aux) - else: - self.aux1 = None # type: ignore[assignment] - self.aux2 = None # type: ignore[assignment] - - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.dropout = nn.Dropout(p=dropout) - self.fc = nn.Linear(1024, num_classes) - - if init_weights: - for m in self.modules(): - if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): - torch.nn.init.trunc_normal_(m.weight, mean=0.0, std=0.01, a=-2, b=2) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def _transform_input(self, x: Tensor) -> Tensor: - if self.transform_input: - x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 - x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 - x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 - x = torch.cat((x_ch0, x_ch1, x_ch2), 1) - return x - - def _forward(self, x: Tensor) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: - # N x 3 x 224 x 224 - x = self.conv1(x) - # N x 64 x 112 x 112 - x = self.maxpool1(x) - # N x 64 x 56 x 56 - x = self.conv2(x) - # N x 64 x 56 x 56 - x = self.conv3(x) - # N x 192 x 56 x 56 - x = self.maxpool2(x) - - # N x 192 x 28 x 28 - x = self.inception3a(x) - # N x 256 x 28 x 28 - x = self.inception3b(x) - # N x 480 x 28 x 28 - x = self.maxpool3(x) - # N x 480 x 14 x 14 - x = self.inception4a(x) - # N x 512 x 14 x 14 - aux1: Optional[Tensor] = None - if self.aux1 is not None: - if self.training: - aux1 = self.aux1(x) - - x = self.inception4b(x) - # N x 512 x 14 x 14 - x = self.inception4c(x) - # N x 512 x 14 x 14 - x = self.inception4d(x) - # N x 528 x 14 x 14 - aux2: Optional[Tensor] = None - if self.aux2 is not None: - if self.training: - aux2 = self.aux2(x) - - x = self.inception4e(x) - # N x 832 x 14 x 14 - x = self.maxpool4(x) - # N x 832 x 7 x 7 - x = self.inception5a(x) - # N x 832 x 7 x 7 - x = 
self.inception5b(x) - # N x 1024 x 7 x 7 - - x = self.avgpool(x) - # N x 1024 x 1 x 1 - x = torch.flatten(x, 1) - # N x 1024 - x = self.dropout(x) - x = self.fc(x) - # N x 1000 (num_classes) - return x, aux2, aux1 - - def forward(self, x: Tensor): - x = self._transform_input(x) - x, aux1, aux2 = self._forward(x) - return x - - -class Inception(nn.Module): - def __init__( - self, - in_channels: int, - ch1x1: int, - ch3x3red: int, - ch3x3: int, - ch5x5red: int, - ch5x5: int, - pool_proj: int, - conv_block: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1 = conv_block(in_channels, ch1x1, kernel_size=1) - - self.branch2 = nn.Sequential( - conv_block(in_channels, ch3x3red, kernel_size=1), - conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1), - ) - - self.branch3 = nn.Sequential( - conv_block(in_channels, ch5x5red, kernel_size=1), - # Here, kernel_size=3 instead of kernel_size=5 is a known bug. - # Please see https://github.com/pytorch/vision/issues/906 for details. - conv_block(ch5x5red, ch5x5, kernel_size=3, padding=1), - ) - - self.branch4 = nn.Sequential( - nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True), - conv_block(in_channels, pool_proj, kernel_size=1), - ) - - def _forward(self, x: Tensor) -> List[Tensor]: - branch1 = self.branch1(x) - branch2 = self.branch2(x) - branch3 = self.branch3(x) - branch4 = self.branch4(x) - - outputs = [branch1, branch2, branch3, branch4] - return outputs - - def forward(self, x: Tensor) -> Tensor: - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionAux(nn.Module): - def __init__( - self, - in_channels: int, - num_classes: int, - conv_block: Optional[Callable[..., nn.Module]] = None, - dropout: float = 0.7, - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.conv = conv_block(in_channels, 128, kernel_size=1) - - self.fc1 = nn.Linear(2048, 1024) - self.fc2 = nn.Linear(1024, num_classes) - self.dropout = nn.Dropout(p=dropout) - - def forward(self, x: Tensor) -> Tensor: - # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14 - x = F.adaptive_avg_pool2d(x, (4, 4)) - # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4 - x = self.conv(x) - # N x 128 x 4 x 4 - x = torch.flatten(x, 1) - # N x 2048 - x = F.relu(self.fc1(x), inplace=True) - # N x 1024 - x = self.dropout(x) - # N x 1024 - x = self.fc2(x) - # N x 1000 (num_classes) - - return x - - -class BasicConv2d(nn.Module): - def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None: - super().__init__() - self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) - self.bn = nn.BatchNorm2d(out_channels, eps=0.001) - - def forward(self, x: Tensor) -> Tensor: - x = self.conv(x) - x = self.bn(x) - return F.relu(x, inplace=True) - - -def googlenet(progress: bool = True, **kwargs: Any) -> GoogLeNet: - r"""GoogLeNet (Inception v1) model architecture from - `"Going Deeper with Convolutions" `_. - The required minimum input size of the model is 15x15. - Args: - weights (GoogLeNet_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - aux_logits (bool): If True, adds two auxiliary branches that can improve training. - Default: *False* when pretrained is True otherwise *True* - transform_input (bool): If True, preprocesses the input according to the method with which it - was trained on ImageNet. 
Default: True if ``weights=GoogLeNet_Weights.IMAGENET1K_V1``, else False. - """ - model = GoogLeNet(**kwargs) - return model diff --git a/python/oneflow/test/expensive/pytorch_inception_v3.py b/python/oneflow/test/expensive/pytorch_inception_v3.py deleted file mode 100644 index 133dac51269..00000000000 --- a/python/oneflow/test/expensive/pytorch_inception_v3.py +++ /dev/null @@ -1,437 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn.functional as F -from torch import nn, Tensor - -import warnings -from typing import Callable, Any, Optional, Tuple, List - -__all__ = ["Inception3", "inception_v3"] - - -class Inception3(nn.Module): - def __init__( - self, - num_classes: int = 1000, - aux_logits: bool = True, - transform_input: bool = False, - inception_blocks: Optional[List[Callable[..., nn.Module]]] = None, - init_weights: Optional[bool] = None, - dropout: float = 0.5, - ) -> None: - super().__init__() - if inception_blocks is None: - inception_blocks = [ - BasicConv2d, - InceptionA, - InceptionB, - InceptionC, - InceptionD, - InceptionE, - InceptionAux, - ] - if init_weights is None: - warnings.warn( - "The default weight initialization of inception_v3 will be changed in future releases of " - "torchvision. 
If you wish to keep the old behavior (which leads to long initialization times" - " due to scipy/scipy#11299), please set init_weights=True.", - FutureWarning, - ) - init_weights = True - if len(inception_blocks) != 7: - raise ValueError( - f"lenght of inception_blocks should be 7 instead of {len(inception_blocks)}" - ) - conv_block = inception_blocks[0] - inception_a = inception_blocks[1] - inception_b = inception_blocks[2] - inception_c = inception_blocks[3] - inception_d = inception_blocks[4] - inception_e = inception_blocks[5] - inception_aux = inception_blocks[6] - - self.aux_logits = aux_logits - self.transform_input = transform_input - self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2) - self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3) - self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1) - self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2) - self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1) - self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3) - self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2) - self.Mixed_5b = inception_a(192, pool_features=32) - self.Mixed_5c = inception_a(256, pool_features=64) - self.Mixed_5d = inception_a(288, pool_features=64) - self.Mixed_6a = inception_b(288) - self.Mixed_6b = inception_c(768, channels_7x7=128) - self.Mixed_6c = inception_c(768, channels_7x7=160) - self.Mixed_6d = inception_c(768, channels_7x7=160) - self.Mixed_6e = inception_c(768, channels_7x7=192) - self.AuxLogits: Optional[nn.Module] = None - if aux_logits: - self.AuxLogits = inception_aux(768, num_classes) - self.Mixed_7a = inception_d(768) - self.Mixed_7b = inception_e(1280) - self.Mixed_7c = inception_e(2048) - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.dropout = nn.Dropout(p=dropout) - self.fc = nn.Linear(2048, num_classes) - if init_weights: - for m in self.modules(): - if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): - stddev = float(m.stddev) if hasattr(m, "stddev") else 0.1 # type: ignore - torch.nn.init.trunc_normal_( - m.weight, mean=0.0, std=stddev, a=-2, b=2 - ) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def _transform_input(self, x: Tensor) -> Tensor: - if self.transform_input: - x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 - x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 - x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 - x = torch.cat((x_ch0, x_ch1, x_ch2), 1) - return x - - def _forward(self, x: Tensor) -> Tuple[Tensor, Optional[Tensor]]: - # N x 3 x 299 x 299 - x = self.Conv2d_1a_3x3(x) - # N x 32 x 149 x 149 - x = self.Conv2d_2a_3x3(x) - # N x 32 x 147 x 147 - x = self.Conv2d_2b_3x3(x) - # N x 64 x 147 x 147 - x = self.maxpool1(x) - # N x 64 x 73 x 73 - x = self.Conv2d_3b_1x1(x) - # N x 80 x 73 x 73 - x = self.Conv2d_4a_3x3(x) - # N x 192 x 71 x 71 - x = self.maxpool2(x) - # N x 192 x 35 x 35 - x = self.Mixed_5b(x) - # N x 256 x 35 x 35 - x = self.Mixed_5c(x) - # N x 288 x 35 x 35 - x = self.Mixed_5d(x) - # N x 288 x 35 x 35 - x = self.Mixed_6a(x) - # N x 768 x 17 x 17 - x = self.Mixed_6b(x) - # N x 768 x 17 x 17 - x = self.Mixed_6c(x) - # N x 768 x 17 x 17 - x = self.Mixed_6d(x) - # N x 768 x 17 x 17 - x = self.Mixed_6e(x) - # N x 768 x 17 x 17 - aux: Optional[Tensor] = None - if self.AuxLogits is not None: - if self.training: - aux = self.AuxLogits(x) - # N x 768 x 17 x 17 - x = self.Mixed_7a(x) - # N x 1280 x 8 x 8 - x = self.Mixed_7b(x) - # N x 2048 x 8 x 8 
- x = self.Mixed_7c(x) - # N x 2048 x 8 x 8 - # Adaptive average pooling - x = self.avgpool(x) - # N x 2048 x 1 x 1 - x = self.dropout(x) - # N x 2048 x 1 x 1 - x = torch.flatten(x, 1) - # N x 2048 - x = self.fc(x) - # N x 1000 (num_classes) - return x, aux - - def forward(self, x: Tensor): - x = self._transform_input(x) - x, aux = self._forward(x) - return x - - -class InceptionA(nn.Module): - def __init__( - self, - in_channels: int, - pool_features: int, - conv_block: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1x1 = conv_block(in_channels, 64, kernel_size=1) - - self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1) - self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2) - - self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) - self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) - self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1) - - self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1) - - def _forward(self, x: Tensor) -> List[Tensor]: - branch1x1 = self.branch1x1(x) - - branch5x5 = self.branch5x5_1(x) - branch5x5 = self.branch5x5_2(branch5x5) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] - return outputs - - def forward(self, x: Tensor) -> Tensor: - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionB(nn.Module): - def __init__( - self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2) - - self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) - self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) - self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2) - - def _forward(self, x: Tensor) -> List[Tensor]: - branch3x3 = self.branch3x3(x) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) - - outputs = [branch3x3, branch3x3dbl, branch_pool] - return outputs - - def forward(self, x: Tensor) -> Tensor: - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionC(nn.Module): - def __init__( - self, - in_channels: int, - channels_7x7: int, - conv_block: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1x1 = conv_block(in_channels, 192, kernel_size=1) - - c7 = channels_7x7 - self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1) - self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0)) - - self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1) - self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 
7), padding=(0, 3)) - - self.branch_pool = conv_block(in_channels, 192, kernel_size=1) - - def _forward(self, x: Tensor) -> List[Tensor]: - branch1x1 = self.branch1x1(x) - - branch7x7 = self.branch7x7_1(x) - branch7x7 = self.branch7x7_2(branch7x7) - branch7x7 = self.branch7x7_3(branch7x7) - - branch7x7dbl = self.branch7x7dbl_1(x) - branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] - return outputs - - def forward(self, x: Tensor) -> Tensor: - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionD(nn.Module): - def __init__( - self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1) - self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2) - - self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1) - self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2) - - def _forward(self, x: Tensor) -> List[Tensor]: - branch3x3 = self.branch3x3_1(x) - branch3x3 = self.branch3x3_2(branch3x3) - - branch7x7x3 = self.branch7x7x3_1(x) - branch7x7x3 = self.branch7x7x3_2(branch7x7x3) - branch7x7x3 = self.branch7x7x3_3(branch7x7x3) - branch7x7x3 = self.branch7x7x3_4(branch7x7x3) - - branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) - outputs = [branch3x3, branch7x7x3, branch_pool] - return outputs - - def forward(self, x: Tensor) -> Tensor: - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionE(nn.Module): - def __init__( - self, in_channels: int, conv_block: Optional[Callable[..., nn.Module]] = None - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1x1 = conv_block(in_channels, 320, kernel_size=1) - - self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1) - self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) - self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) - - self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1) - self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1) - self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) - self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) - - self.branch_pool = conv_block(in_channels, 192, kernel_size=1) - - def _forward(self, x: Tensor) -> List[Tensor]: - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = torch.cat(branch3x3, 1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = torch.cat(branch3x3dbl, 1) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch3x3, branch3x3dbl, 
branch_pool] - return outputs - - def forward(self, x: Tensor) -> Tensor: - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionAux(nn.Module): - def __init__( - self, - in_channels: int, - num_classes: int, - conv_block: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - if conv_block is None: - conv_block = BasicConv2d - self.conv0 = conv_block(in_channels, 128, kernel_size=1) - self.conv1 = conv_block(128, 768, kernel_size=5) - self.conv1.stddev = 0.01 # type: ignore[assignment] - self.fc = nn.Linear(768, num_classes) - self.fc.stddev = 0.001 # type: ignore[assignment] - - def forward(self, x: Tensor) -> Tensor: - # N x 768 x 17 x 17 - x = F.avg_pool2d(x, kernel_size=5, stride=3) - # N x 768 x 5 x 5 - x = self.conv0(x) - # N x 128 x 5 x 5 - x = self.conv1(x) - # N x 768 x 1 x 1 - # Adaptive average pooling - x = F.adaptive_avg_pool2d(x, (1, 1)) - # N x 768 x 1 x 1 - x = torch.flatten(x, 1) - # N x 768 - x = self.fc(x) - # N x 1000 - return x - - -class BasicConv2d(nn.Module): - def __init__(self, in_channels: int, out_channels: int, **kwargs: Any) -> None: - super().__init__() - self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) - self.bn = nn.BatchNorm2d(out_channels, eps=0.001) - - def forward(self, x: Tensor) -> Tensor: - x = self.conv(x) - x = self.bn(x) - return F.relu(x, inplace=True) - - -def inception_v3(progress: bool = True, **kwargs: Any) -> Inception3: - r"""Inception v3 model architecture from - `"Rethinking the Inception Architecture for Computer Vision" `_. - The required minimum input size of the model is 75x75. - .. note:: - **Important**: In contrast to the other models the inception_v3 expects tensors with a size of - N x 3 x 299 x 299, so ensure your images are sized accordingly. - Args: - weights (Inception_V3_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - aux_logits (bool): If True, add an auxiliary branch that can improve training. - Default: *True* - transform_input (bool): If True, preprocesses the input according to the method with which it - was trained on ImageNet. Default: True if ``weights=Inception_V3_Weights.IMAGENET1K_V1``, else False. - """ - model = Inception3(**kwargs) - return model diff --git a/python/oneflow/test/expensive/pytorch_levit.py b/python/oneflow/test/expensive/pytorch_levit.py deleted file mode 100644 index 7fef7d45ffc..00000000000 --- a/python/oneflow/test/expensive/pytorch_levit.py +++ /dev/null @@ -1,535 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
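(Aside: the deleted pytorch_levit.py that follows builds Conv2d_BN and Linear_BN blocks whose fuse() methods fold the BatchNorm statistics into the preceding convolution or linear layer for inference. As a quick reference, here is a minimal sketch of that standard folding algebra; the helper name is illustrative only and, like the code below, it assumes the convolution was created with bias=False.)

    import torch
    import torch.nn as nn

    @torch.no_grad()
    def fold_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
        # Scale each output channel by gamma / sqrt(running_var + eps);
        # the folded bias is beta - running_mean * that same scale.
        scale = bn.weight / (bn.running_var + bn.eps).sqrt()
        fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                          conv.stride, conv.padding, conv.dilation, conv.groups,
                          bias=True)
        fused.weight.copy_(conv.weight * scale[:, None, None, None])
        fused.bias.copy_(bn.bias - bn.running_mean * scale)
        return fused

    conv, bn = nn.Conv2d(3, 8, 3, bias=False), nn.BatchNorm2d(8).eval()
    x = torch.randn(1, 3, 16, 16)
    assert torch.allclose(bn(conv(x)), fold_conv_bn(conv, bn)(x), atol=1e-5)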
-""" -import torch -import itertools -from timm.models.vision_transformer import trunc_normal_ - -specification = { - "LeViT_128S": { - "C": "128_256_384", - "D": 16, - "N": "4_6_8", - "X": "2_3_4", - "drop_path": 0, - "weights": "https://dl.fbaipublicfiles.com/LeViT/LeViT-128S-96703c44.pth", - } -} - -__all__ = ["LeViT_128S"] - - -def LeViT_128S(num_classes=1000, distillation=False, pretrained=False, fuse=False): - return model_factory( - **specification["LeViT_128S"], - num_classes=num_classes, - distillation=distillation, - pretrained=pretrained, - fuse=fuse - ) - - -FLOPS_COUNTER = 0 - - -class Conv2d_BN(torch.nn.Sequential): - def __init__( - self, - a, - b, - ks=1, - stride=1, - pad=0, - dilation=1, - groups=1, - bn_weight_init=1, - resolution=-10000, - ): - super().__init__() - self.add_module( - "c", torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False) - ) - bn = torch.nn.BatchNorm2d(b) - torch.nn.init.constant_(bn.weight, bn_weight_init) - torch.nn.init.constant_(bn.bias, 0) - self.add_module("bn", bn) - - global FLOPS_COUNTER - output_points = ( - (resolution + 2 * pad - dilation * (ks - 1) - 1) // stride + 1 - ) ** 2 - FLOPS_COUNTER += a * b * output_points * (ks ** 2) // groups - - @torch.no_grad() - def fuse(self): - c, bn = self._modules.values() - w = bn.weight / (bn.running_var + bn.eps) ** 0.5 - w = c.weight * w[:, None, None, None] - b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 - m = torch.nn.Conv2d( - w.size(1) * self.c.groups, - w.size(0), - w.shape[2:], - stride=self.c.stride, - padding=self.c.padding, - dilation=self.c.dilation, - groups=self.c.groups, - ) - m.weight.data.copy_(w) - m.bias.data.copy_(b) - return m - - -class Linear_BN(torch.nn.Sequential): - def __init__(self, a, b, bn_weight_init=1, resolution=-100000): - super().__init__() - self.add_module("c", torch.nn.Linear(a, b, bias=False)) - bn = torch.nn.BatchNorm1d(b) - torch.nn.init.constant_(bn.weight, bn_weight_init) - torch.nn.init.constant_(bn.bias, 0) - self.add_module("bn", bn) - - global FLOPS_COUNTER - output_points = resolution ** 2 - FLOPS_COUNTER += a * b * output_points - - @torch.no_grad() - def fuse(self): - l, bn = self._modules.values() - w = bn.weight / (bn.running_var + bn.eps) ** 0.5 - w = l.weight * w[:, None] - b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 - m = torch.nn.Linear(w.size(1), w.size(0)) - m.weight.data.copy_(w) - m.bias.data.copy_(b) - return m - - def forward(self, x): - l, bn = self._modules.values() - x = l(x) - return bn(x.flatten(0, 1)).reshape_as(x) - - -class BN_Linear(torch.nn.Sequential): - def __init__(self, a, b, bias=True, std=0.02): - super().__init__() - self.add_module("bn", torch.nn.BatchNorm1d(a)) - l = torch.nn.Linear(a, b, bias=bias) - trunc_normal_(l.weight, std=std) - if bias: - torch.nn.init.constant_(l.bias, 0) - self.add_module("l", l) - global FLOPS_COUNTER - FLOPS_COUNTER += a * b - - @torch.no_grad() - def fuse(self): - bn, l = self._modules.values() - w = bn.weight / (bn.running_var + bn.eps) ** 0.5 - b = ( - bn.bias - - self.bn.running_mean * self.bn.weight / (bn.running_var + bn.eps) ** 0.5 - ) - w = l.weight * w[None, :] - if l.bias is None: - b = b @ self.l.weight.T - else: - b = (l.weight @ b[:, None]).view(-1) + self.l.bias - m = torch.nn.Linear(w.size(1), w.size(0)) - m.weight.data.copy_(w) - m.bias.data.copy_(b) - return m - - -def b16(n, activation, resolution=224): - return torch.nn.Sequential( - Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution), - 
activation(), - Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2), - activation(), - Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4), - activation(), - Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8), - ) - - -class Residual(torch.nn.Module): - def __init__(self, m, drop): - super().__init__() - self.m = m - self.drop = drop - - def forward(self, x): - if self.training and self.drop > 0: - return ( - x - + self.m(x) - * torch.rand(x.size(0), 1, 1, device=x.device) - .ge_(self.drop) - .div(1 - self.drop) - .detach() - ) - else: - return x + self.m(x) - - -class Attention(torch.nn.Module): - def __init__( - self, dim, key_dim, num_heads=8, attn_ratio=4, activation=None, resolution=14 - ): - super().__init__() - self.num_heads = num_heads - self.scale = key_dim ** -0.5 - self.key_dim = key_dim - self.nh_kd = nh_kd = key_dim * num_heads - self.d = int(attn_ratio * key_dim) - self.dh = int(attn_ratio * key_dim) * num_heads - self.attn_ratio = attn_ratio - h = self.dh + nh_kd * 2 - self.qkv = Linear_BN(dim, h, resolution=resolution) - self.proj = torch.nn.Sequential( - activation(), - Linear_BN(self.dh, dim, bn_weight_init=0, resolution=resolution), - ) - - points = list(itertools.product(range(resolution), range(resolution))) - N = len(points) - attention_offsets = {} - idxs = [] - for p1 in points: - for p2 in points: - offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) - if offset not in attention_offsets: - attention_offsets[offset] = len(attention_offsets) - idxs.append(attention_offsets[offset]) - self.attention_biases = torch.nn.Parameter( - torch.zeros(num_heads, len(attention_offsets)) - ) - self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(N, N)) - - global FLOPS_COUNTER - # queries * keys - FLOPS_COUNTER += num_heads * (resolution ** 4) * key_dim - # softmax - FLOPS_COUNTER += num_heads * (resolution ** 4) - # attention * v - FLOPS_COUNTER += num_heads * self.d * (resolution ** 4) - - @torch.no_grad() - def train(self, mode=True): - super().train(mode) - if mode and hasattr(self, "ab"): - del self.ab - else: - self.ab = self.attention_biases[:, self.attention_bias_idxs] - - def forward(self, x): # x (B,N,C) - B, N, C = x.shape - qkv = self.qkv(x) - q, k, v = qkv.view(B, N, self.num_heads, -1).split( - [self.key_dim, self.key_dim, self.d], dim=3 - ) - q = q.permute(0, 2, 1, 3) - k = k.permute(0, 2, 1, 3) - v = v.permute(0, 2, 1, 3) - - attn = (q @ k.transpose(-2, -1)) * self.scale + ( - self.attention_biases[:, self.attention_bias_idxs] - if self.training - else self.ab - ) - attn = attn.softmax(dim=-1) - x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh) - x = self.proj(x) - return x - - -class Subsample(torch.nn.Module): - def __init__(self, stride, resolution): - super().__init__() - self.stride = stride - self.resolution = resolution - - def forward(self, x): - B, N, C = x.shape - x = x.view(B, self.resolution, self.resolution, C)[ - :, :: self.stride, :: self.stride - ].reshape(B, -1, C) - return x - - -class AttentionSubsample(torch.nn.Module): - def __init__( - self, - in_dim, - out_dim, - key_dim, - num_heads=8, - attn_ratio=2, - activation=None, - stride=2, - resolution=14, - resolution_=7, - ): - super().__init__() - self.num_heads = num_heads - self.scale = key_dim ** -0.5 - self.key_dim = key_dim - self.nh_kd = nh_kd = key_dim * num_heads - self.d = int(attn_ratio * key_dim) - self.dh = int(attn_ratio * key_dim) * self.num_heads - self.attn_ratio = attn_ratio - self.resolution_ = resolution_ - self.resolution_2 = 
resolution_ ** 2 - h = self.dh + nh_kd - self.kv = Linear_BN(in_dim, h, resolution=resolution) - - self.q = torch.nn.Sequential( - Subsample(stride, resolution), - Linear_BN(in_dim, nh_kd, resolution=resolution_), - ) - self.proj = torch.nn.Sequential( - activation(), Linear_BN(self.dh, out_dim, resolution=resolution_) - ) - - self.stride = stride - self.resolution = resolution - points = list(itertools.product(range(resolution), range(resolution))) - points_ = list(itertools.product(range(resolution_), range(resolution_))) - N = len(points) - N_ = len(points_) - attention_offsets = {} - idxs = [] - for p1 in points_: - for p2 in points: - size = 1 - offset = ( - abs(p1[0] * stride - p2[0] + (size - 1) / 2), - abs(p1[1] * stride - p2[1] + (size - 1) / 2), - ) - if offset not in attention_offsets: - attention_offsets[offset] = len(attention_offsets) - idxs.append(attention_offsets[offset]) - self.attention_biases = torch.nn.Parameter( - torch.zeros(num_heads, len(attention_offsets)) - ) - self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(N_, N)) - - global FLOPS_COUNTER - # queries * keys - FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) * key_dim - # softmax - FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) - # attention * v - FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) * self.d - - @torch.no_grad() - def train(self, mode=True): - super().train(mode) - if mode and hasattr(self, "ab"): - del self.ab - else: - self.ab = self.attention_biases[:, self.attention_bias_idxs] - - def forward(self, x): - B, N, C = x.shape - k, v = ( - self.kv(x) - .view(B, N, self.num_heads, -1) - .split([self.key_dim, self.d], dim=3) - ) - k = k.permute(0, 2, 1, 3) # BHNC - v = v.permute(0, 2, 1, 3) # BHNC - q = ( - self.q(x) - .view(B, self.resolution_2, self.num_heads, self.key_dim) - .permute(0, 2, 1, 3) - ) - - attn = (q @ k.transpose(-2, -1)) * self.scale + ( - self.attention_biases[:, self.attention_bias_idxs] - if self.training - else self.ab - ) - attn = attn.softmax(dim=-1) - - x = (attn @ v).transpose(1, 2).reshape(B, -1, self.dh) - x = self.proj(x) - return x - - -class LeViT(torch.nn.Module): - """ Vision Transformer with support for patch or hybrid CNN input stage - """ - - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - embed_dim=[192], - key_dim=[64], - depth=[12], - num_heads=[3], - attn_ratio=[2], - mlp_ratio=[2], - hybrid_backbone=None, - down_ops=[], - attention_activation=torch.nn.Hardswish, - mlp_activation=torch.nn.Hardswish, - distillation=True, - drop_path=0, - ): - super().__init__() - global FLOPS_COUNTER - - self.num_classes = num_classes - self.num_features = embed_dim[-1] - self.embed_dim = embed_dim - self.distillation = distillation - - self.patch_embed = hybrid_backbone - - self.blocks = [] - down_ops.append([""]) - resolution = img_size // patch_size - for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate( - zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops) - ): - for _ in range(dpth): - self.blocks.append( - Residual( - Attention( - ed, - kd, - nh, - attn_ratio=ar, - activation=attention_activation, - resolution=resolution, - ), - drop_path, - ) - ) - if mr > 0: - h = int(ed * mr) - self.blocks.append( - Residual( - torch.nn.Sequential( - Linear_BN(ed, h, resolution=resolution), - mlp_activation(), - Linear_BN( - h, ed, bn_weight_init=0, resolution=resolution - ), - ), - drop_path, - ) - ) - if do[0] == "Subsample": - # 
('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) - resolution_ = (resolution - 1) // do[5] + 1 - self.blocks.append( - AttentionSubsample( - *embed_dim[i : i + 2], - key_dim=do[1], - num_heads=do[2], - attn_ratio=do[3], - activation=attention_activation, - stride=do[5], - resolution=resolution, - resolution_=resolution_ - ) - ) - resolution = resolution_ - if do[4] > 0: # mlp_ratio - h = int(embed_dim[i + 1] * do[4]) - self.blocks.append( - Residual( - torch.nn.Sequential( - Linear_BN(embed_dim[i + 1], h, resolution=resolution), - mlp_activation(), - Linear_BN( - h, - embed_dim[i + 1], - bn_weight_init=0, - resolution=resolution, - ), - ), - drop_path, - ) - ) - self.blocks = torch.nn.Sequential(*self.blocks) - - # Classifier head - self.head = ( - BN_Linear(embed_dim[-1], num_classes) - if num_classes > 0 - else torch.nn.Identity() - ) - if distillation: - self.head_dist = ( - BN_Linear(embed_dim[-1], num_classes) - if num_classes > 0 - else torch.nn.Identity() - ) - - self.FLOPS = FLOPS_COUNTER - FLOPS_COUNTER = 0 - - def no_weight_decay(self): - return {x for x in self.state_dict().keys() if "attention_biases" in x} - - def forward(self, x): - x = self.patch_embed(x) - x = x.flatten(2).transpose(1, 2) - x = self.blocks(x) - x = x.mean(1) - if self.distillation: - x = self.head(x), self.head_dist(x) - if not self.training: - x = (x[0] + x[1]) / 2 - else: - x = self.head(x) - return x - - -def model_factory( - C, D, X, N, drop_path, weights, num_classes, distillation, pretrained, fuse -): - embed_dim = [int(x) for x in C.split("_")] - num_heads = [int(x) for x in N.split("_")] - depth = [int(x) for x in X.split("_")] - act = torch.nn.Hardswish - model = LeViT( - patch_size=16, - embed_dim=embed_dim, - num_heads=num_heads, - key_dim=[D] * 3, - depth=depth, - attn_ratio=[2, 2, 2], - mlp_ratio=[2, 2, 2], - down_ops=[ - # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) - ["Subsample", D, embed_dim[0] // D, 4, 2, 2], - ["Subsample", D, embed_dim[1] // D, 4, 2, 2], - ], - attention_activation=act, - mlp_activation=act, - hybrid_backbone=b16(embed_dim[0], activation=act), - num_classes=num_classes, - drop_path=drop_path, - distillation=distillation, - ) - return model diff --git a/python/oneflow/test/expensive/pytorch_mnasnet.py b/python/oneflow/test/expensive/pytorch_mnasnet.py deleted file mode 100644 index d33d5414b6c..00000000000 --- a/python/oneflow/test/expensive/pytorch_mnasnet.py +++ /dev/null @@ -1,219 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -from torch import Tensor - -import warnings -from typing import Any, Dict, List - -__all__ = [ - "MNASNet", - "mnasnet1_0", -] - - -# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is -# 1.0 - tensorflow. 
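(Aside: to make the comment above concrete, TensorFlow's moving-average decay and PyTorch's BatchNorm momentum parameterize the same running-statistics update from opposite ends, which is why the constant defined next is simply 1 - 0.9997 = 0.0003. A small sketch of the two update rules, with illustrative variable names:)

    # PyTorch:     running = (1 - momentum) * running + momentum * batch_stat
    # TensorFlow:  moving  = decay * moving + (1 - decay) * batch_stat
    tf_decay = 0.9997
    torch_momentum = 1.0 - tf_decay          # == 0.0003, i.e. _BN_MOMENTUM below

    running, batch_stat = 1.0, 2.0
    pytorch_update = (1 - torch_momentum) * running + torch_momentum * batch_stat
    tf_update = tf_decay * running + (1 - tf_decay) * batch_stat
    assert abs(pytorch_update - tf_update) < 1e-12  # identical behaviour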
-_BN_MOMENTUM = 1 - 0.9997 - - -class _InvertedResidual(nn.Module): - def __init__( - self, - in_ch: int, - out_ch: int, - kernel_size: int, - stride: int, - expansion_factor: int, - bn_momentum: float = 0.1, - ) -> None: - super().__init__() - if stride not in [1, 2]: - raise ValueError(f"stride should be 1 or 2 instead of {stride}") - if kernel_size not in [3, 5]: - raise ValueError(f"kernel_size should be 3 or 5 instead of {kernel_size}") - mid_ch = in_ch * expansion_factor - self.apply_residual = in_ch == out_ch and stride == 1 - self.layers = nn.Sequential( - # Pointwise - nn.Conv2d(in_ch, mid_ch, 1, bias=False), - nn.BatchNorm2d(mid_ch, momentum=bn_momentum), - nn.ReLU(inplace=True), - # Depthwise - nn.Conv2d( - mid_ch, - mid_ch, - kernel_size, - padding=kernel_size // 2, - stride=stride, - groups=mid_ch, - bias=False, - ), - nn.BatchNorm2d(mid_ch, momentum=bn_momentum), - nn.ReLU(inplace=True), - # Linear pointwise. Note that there's no activation. - nn.Conv2d(mid_ch, out_ch, 1, bias=False), - nn.BatchNorm2d(out_ch, momentum=bn_momentum), - ) - - def forward(self, input: Tensor) -> Tensor: - if self.apply_residual: - return self.layers(input) + input - else: - return self.layers(input) - - -def _stack( - in_ch: int, - out_ch: int, - kernel_size: int, - stride: int, - exp_factor: int, - repeats: int, - bn_momentum: float, -) -> nn.Sequential: - """Creates a stack of inverted residuals.""" - if repeats < 1: - raise ValueError(f"repeats should be >= 1, instead got {repeats}") - # First one has no skip, because feature map size changes. - first = _InvertedResidual( - in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum=bn_momentum - ) - remaining = [] - for _ in range(1, repeats): - remaining.append( - _InvertedResidual( - out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum=bn_momentum - ) - ) - return nn.Sequential(first, *remaining) - - -def _round_to_multiple_of(val: float, divisor: int, round_up_bias: float = 0.9) -> int: - """Asymmetric rounding to make `val` divisible by `divisor`. With default - bias, will round up, unless the number is no more than 10% greater than the - smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88.""" - if not 0.0 < round_up_bias < 1.0: - raise ValueError( - f"round_up_bias should be greater than 0.0 and smaller than 1.0 instead of {round_up_bias}" - ) - new_val = max(divisor, int(val + divisor / 2) // divisor * divisor) - return new_val if new_val >= round_up_bias * val else new_val + divisor - - -def _get_depths(alpha: float) -> List[int]: - """Scales tensor depths as in reference MobileNet code, prefers rouding up - rather than down.""" - depths = [32, 16, 24, 40, 80, 96, 192, 320] - return [_round_to_multiple_of(depth * alpha, 8) for depth in depths] - - -class MNASNet(torch.nn.Module): - """MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This - implements the B1 variant of the model. - >>> model = MNASNet(1.0, num_classes=1000) - >>> x = torch.rand(1, 3, 224, 224) - >>> y = model(x) - >>> y.dim() - 2 - >>> y.nelement() - 1000 - """ - - # Version 2 adds depth scaling in the initial stages of the network. - _version = 2 - - def __init__( - self, alpha: float, num_classes: int = 1000, dropout: float = 0.2 - ) -> None: - super().__init__() - if alpha <= 0.0: - raise ValueError(f"alpha should be greater than 0.0 instead of {alpha}") - self.alpha = alpha - self.num_classes = num_classes - depths = _get_depths(alpha) - layers = [ - # First layer: regular conv. 
- nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False), - nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True), - # Depthwise separable, no skip. - nn.Conv2d( - depths[0], - depths[0], - 3, - padding=1, - stride=1, - groups=depths[0], - bias=False, - ), - nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True), - nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1, bias=False), - nn.BatchNorm2d(depths[1], momentum=_BN_MOMENTUM), - # MNASNet blocks: stacks of inverted residuals. - _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM), - _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM), - _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM), - _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM), - _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM), - _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM), - # Final mapping to classifier input. - nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False), - nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True), - ] - self.layers = nn.Sequential(*layers) - self.classifier = nn.Sequential( - nn.Dropout(p=dropout, inplace=True), nn.Linear(1280, num_classes) - ) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.kaiming_uniform_( - m.weight, mode="fan_out", nonlinearity="sigmoid" - ) - nn.init.zeros_(m.bias) - - def forward(self, x: Tensor) -> Tensor: - x = self.layers(x) - # Equivalent to global avgpool and removing H and W dimensions. - x = x.mean([2, 3]) - return self.classifier(x) - - -def _mnasnet(alpha: float, progress: bool, **kwargs: Any) -> MNASNet: - model = MNASNet(alpha, **kwargs) - return model - - -def mnasnet1_0(progress: bool = True, **kwargs: Any) -> MNASNet: - r"""MNASNet with depth multiplier of 1.0 from - `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" - `_. - Args: - weights (MNASNet1_0_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _mnasnet(1.0, progress, **kwargs) diff --git a/python/oneflow/test/expensive/pytorch_poolformer.py b/python/oneflow/test/expensive/pytorch_poolformer.py deleted file mode 100644 index 0e263c20198..00000000000 --- a/python/oneflow/test/expensive/pytorch_poolformer.py +++ /dev/null @@ -1,437 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -from timm.models.layers import DropPath, trunc_normal_ -from timm.models.layers.helpers import to_2tuple - -import os -import copy - - -class PatchEmbed(nn.Module): - """ - Patch Embedding that is implemented by a layer of conv. 
- Input: tensor in shape [B, C, H, W] - Output: tensor in shape [B, C, H/stride, W/stride] - """ - - def __init__( - self, - patch_size=16, - stride=16, - padding=0, - in_chans=3, - embed_dim=768, - norm_layer=None, - ): - super().__init__() - patch_size = to_2tuple(patch_size) - stride = to_2tuple(stride) - padding = to_2tuple(padding) - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding - ) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def forward(self, x): - x = self.proj(x) - x = self.norm(x) - return x - - -class LayerNormChannel(nn.Module): - """ - LayerNorm only for Channel Dimension. - Input: tensor in shape [B, C, H, W] - """ - - def __init__(self, num_channels, eps=1e-05): - super().__init__() - self.weight = nn.Parameter(torch.ones(num_channels)) - self.bias = nn.Parameter(torch.zeros(num_channels)) - self.eps = eps - - def forward(self, x): - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.eps) - x = self.weight.unsqueeze(-1).unsqueeze(-1) * x + self.bias.unsqueeze( - -1 - ).unsqueeze(-1) - return x - - -class GroupNorm(nn.GroupNorm): - """ - Group Normalization with 1 group. - Input: tensor in shape [B, C, H, W] - """ - - def __init__(self, num_channels, **kwargs): - super().__init__(1, num_channels, **kwargs) - - -class Pooling(nn.Module): - """ - Implementation of pooling for PoolFormer - --pool_size: pooling size - """ - - def __init__(self, pool_size=3): - super().__init__() - self.pool = nn.AvgPool2d( - pool_size, stride=1, padding=pool_size // 2, count_include_pad=False - ) - - def forward(self, x): - return self.pool(x) - x - - -class Mlp(nn.Module): - """ - Implementation of MLP with 1*1 convolutions. - Input: tensor with shape [B, C, H, W] - """ - - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Conv2d(in_features, hidden_features, 1) - self.act = act_layer() - self.fc2 = nn.Conv2d(hidden_features, out_features, 1) - self.drop = nn.Dropout(drop) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Conv2d): - trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class PoolFormerBlock(nn.Module): - """ - Implementation of one PoolFormer block. 
- --dim: embedding dim - --pool_size: pooling size - --mlp_ratio: mlp expansion ratio - --act_layer: activation - --norm_layer: normalization - --drop: dropout rate - --drop path: Stochastic Depth, - refer to https://arxiv.org/abs/1603.09382 - --use_layer_scale, --layer_scale_init_value: LayerScale, - refer to https://arxiv.org/abs/2103.17239 - """ - - def __init__( - self, - dim, - pool_size=3, - mlp_ratio=4.0, - act_layer=nn.GELU, - norm_layer=GroupNorm, - drop=0.0, - drop_path=0.0, - use_layer_scale=True, - layer_scale_init_value=1e-5, - ): - - super().__init__() - - self.norm1 = norm_layer(dim) - self.token_mixer = Pooling(pool_size=pool_size) - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - - # The following two techniques are useful to train deep PoolFormers. - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.use_layer_scale = use_layer_scale - if use_layer_scale: - self.layer_scale_1 = nn.Parameter( - layer_scale_init_value * torch.ones((dim)), requires_grad=True - ) - self.layer_scale_2 = nn.Parameter( - layer_scale_init_value * torch.ones((dim)), requires_grad=True - ) - - def forward(self, x): - if self.use_layer_scale: - x = x + self.drop_path( - self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) - * self.token_mixer(self.norm1(x)) - ) - x = x + self.drop_path( - self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(self.norm2(x)) - ) - else: - x = x + self.drop_path(self.token_mixer(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -def basic_blocks( - dim, - index, - layers, - pool_size=3, - mlp_ratio=4.0, - act_layer=nn.GELU, - norm_layer=GroupNorm, - drop_rate=0.0, - drop_path_rate=0.0, - use_layer_scale=True, - layer_scale_init_value=1e-5, -): - """ - generate PoolFormer blocks for a stage - return: PoolFormer blocks - """ - blocks = [] - for block_idx in range(layers[index]): - block_dpr = ( - drop_path_rate * (block_idx + sum(layers[:index])) / (sum(layers) - 1) - ) - blocks.append( - PoolFormerBlock( - dim, - pool_size=pool_size, - mlp_ratio=mlp_ratio, - act_layer=act_layer, - norm_layer=norm_layer, - drop=drop_rate, - drop_path=block_dpr, - use_layer_scale=use_layer_scale, - layer_scale_init_value=layer_scale_init_value, - ) - ) - blocks = nn.Sequential(*blocks) - - return blocks - - -class PoolFormer(nn.Module): - """ - PoolFormer, the main class of our model - --layers: [x,x,x,x], number of blocks for the 4 stages - --embed_dims, --mlp_ratios, --pool_size: the embedding dims, mlp ratios and - pooling size for the 4 stages - --downsamples: flags to apply downsampling or not - --norm_layer, --act_layer: define the types of normalization and activation - --num_classes: number of classes for the image classification - --in_patch_size, --in_stride, --in_pad: specify the patch embedding - for the input image - --down_patch_size --down_stride --down_pad: - specify the downsample (patch embed.) 
- --fork_feat: whether output features of the 4 stages, for dense prediction - --init_cfg, --pretrained: - for mmdetection and mmsegmentation to load pretrained weights - """ - - def __init__( - self, - layers, - embed_dims=None, - mlp_ratios=None, - downsamples=None, - pool_size=3, - norm_layer=GroupNorm, - act_layer=nn.GELU, - num_classes=1000, - in_patch_size=7, - in_stride=4, - in_pad=2, - down_patch_size=3, - down_stride=2, - down_pad=1, - drop_rate=0.0, - drop_path_rate=0.0, - use_layer_scale=True, - layer_scale_init_value=1e-5, - fork_feat=False, - init_cfg=None, - pretrained=None, - **kwargs, - ): - - super().__init__() - - if not fork_feat: - self.num_classes = num_classes - self.fork_feat = fork_feat - - self.patch_embed = PatchEmbed( - patch_size=in_patch_size, - stride=in_stride, - padding=in_pad, - in_chans=3, - embed_dim=embed_dims[0], - ) - - # set the main block in network - network = [] - for i in range(len(layers)): - stage = basic_blocks( - embed_dims[i], - i, - layers, - pool_size=pool_size, - mlp_ratio=mlp_ratios[i], - act_layer=act_layer, - norm_layer=norm_layer, - drop_rate=drop_rate, - drop_path_rate=drop_path_rate, - use_layer_scale=use_layer_scale, - layer_scale_init_value=layer_scale_init_value, - ) - network.append(stage) - if i >= len(layers) - 1: - break - if downsamples[i] or embed_dims[i] != embed_dims[i + 1]: - # downsampling between two stages - network.append( - PatchEmbed( - patch_size=down_patch_size, - stride=down_stride, - padding=down_pad, - in_chans=embed_dims[i], - embed_dim=embed_dims[i + 1], - ) - ) - - self.network = nn.ModuleList(network) - - if self.fork_feat: - # add a norm layer for each output - self.out_indices = [0, 2, 4, 6] - for i_emb, i_layer in enumerate(self.out_indices): - if i_emb == 0 and os.environ.get("FORK_LAST3", None): - # TODO: more elegant way - """For RetinaNet, `start_level=1`. The first norm layer will not used. 
- cmd: `FORK_LAST3=1 python -m torch.distributed.launch ...` - """ - layer = nn.Identity() - else: - layer = norm_layer(embed_dims[i_emb]) - layer_name = f"norm{i_layer}" - self.add_module(layer_name, layer) - else: - # Classifier head - self.norm = norm_layer(embed_dims[-1]) - self.head = ( - nn.Linear(embed_dims[-1], num_classes) - if num_classes > 0 - else nn.Identity() - ) - - self.apply(self.cls_init_weights) - - self.init_cfg = copy.deepcopy(init_cfg) - # load pre-trained model - if self.fork_feat and (self.init_cfg is not None or pretrained is not None): - self.init_weights() - - # init for classification - def cls_init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes): - self.num_classes = num_classes - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - def forward_embeddings(self, x): - x = self.patch_embed(x) - return x - - def forward_tokens(self, x): - outs = [] - for idx, block in enumerate(self.network): - x = block(x) - if self.fork_feat and idx in self.out_indices: - norm_layer = getattr(self, f"norm{idx}") - x_out = norm_layer(x) - outs.append(x_out) - if self.fork_feat: - # output the features of four stages for dense prediction - return outs - # output only the features of last layer for image classification - return x - - def forward(self, x): - # input embedding - x = self.forward_embeddings(x) - # through backbone - x = self.forward_tokens(x) - if self.fork_feat: - # otuput features of four stages for dense prediction - return x - x = self.norm(x) - cls_out = self.head(x.mean([-2, -1])) - # for image classification - return cls_out - - -def poolformer_s12(pretrained=False, **kwargs): - """ - PoolFormer-S12 model, Params: 12M - --layers: [x,x,x,x], numbers of layers for the four stages - --embed_dims, --mlp_ratios: - embedding dims and mlp ratios for the four stages - --downsamples: flags to apply downsampling or not in four blocks - """ - layers = [2, 2, 6, 2] - embed_dims = [64, 128, 320, 512] - mlp_ratios = [4, 4, 4, 4] - downsamples = [True, True, True, True] - model = PoolFormer( - layers, - embed_dims=embed_dims, - mlp_ratios=mlp_ratios, - downsamples=downsamples, - **kwargs, - ) - return model diff --git a/python/oneflow/test/expensive/pytorch_pvt.py b/python/oneflow/test/expensive/pytorch_pvt.py deleted file mode 100644 index 2c117379742..00000000000 --- a/python/oneflow/test/expensive/pytorch_pvt.py +++ /dev/null @@ -1,365 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
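(Aside: the deleted pytorch_pvt.py that follows centres on spatial-reduction attention, controlled by the sr_ratio argument of its Attention class: keys and values are downsampled with a strided convolution before attention, so the cost drops from O(N^2) to roughly O(N^2 / sr_ratio^2). A minimal sketch of that reduction step, with illustrative names rather than the file's exact API:)

    import torch
    import torch.nn as nn

    class SpatialReduction(nn.Module):
        """Downsample the token grid before computing keys/values (sketch only)."""
        def __init__(self, dim: int, sr_ratio: int):
            super().__init__()
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

        def forward(self, x: torch.Tensor, H: int, W: int) -> torch.Tensor:
            B, N, C = x.shape                      # tokens: N == H * W
            x = x.permute(0, 2, 1).reshape(B, C, H, W)
            x = self.sr(x)                         # B x C x H/r x W/r
            x = x.reshape(B, C, -1).permute(0, 2, 1)
            return self.norm(x)                    # B x (N / r^2) x C

    tokens = torch.randn(2, 56 * 56, 64)
    reduced = SpatialReduction(64, sr_ratio=8)(tokens, 56, 56)
    print(reduced.shape)  # torch.Size([2, 49, 64])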
-""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from timm.models.layers import DropPath, to_2tuple, trunc_normal_ - -from functools import partial - -__all__ = ["pvt_tiny"] - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - sr_ratio=1, - ): - super().__init__() - assert ( - dim % num_heads == 0 - ), f"dim {dim} should be divided by num_heads {num_heads}." - - self.dim = dim - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - self.q = nn.Linear(dim, dim, bias=qkv_bias) - self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - self.sr_ratio = sr_ratio - if sr_ratio > 1: - self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) - self.norm = nn.LayerNorm(dim) - - def forward(self, x, H, W): - B, N, C = x.shape - q = ( - self.q(x) - .reshape(B, N, self.num_heads, C // self.num_heads) - .permute(0, 2, 1, 3) - ) - - if self.sr_ratio > 1: - x_ = x.permute(0, 2, 1).reshape(B, C, H, W) - x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) - x_ = self.norm(x_) - kv = ( - self.kv(x_) - .reshape(B, -1, 2, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - else: - kv = ( - self.kv(x) - .reshape(B, -1, 2, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - k, v = kv[0], kv[1] - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - - return x - - -class Block(nn.Module): - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - sr_ratio=1, - ): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - sr_ratio=sr_ratio, - ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - - def forward(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - - self.img_size = img_size - self.patch_size = 
patch_size - # assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \ - # f"img_size {img_size} should be divided by patch_size {patch_size}." - self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] - self.num_patches = self.H * self.W - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size - ) - self.norm = nn.LayerNorm(embed_dim) - - def forward(self, x): - B, C, H, W = x.shape - - x = self.proj(x).flatten(2).transpose(1, 2) - x = self.norm(x) - H, W = H // self.patch_size[0], W // self.patch_size[1] - - return x, (H, W) - - -class PyramidVisionTransformer(nn.Module): - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - embed_dims=[64, 128, 256, 512], - num_heads=[1, 2, 4, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=False, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - norm_layer=nn.LayerNorm, - depths=[3, 4, 6, 3], - sr_ratios=[8, 4, 2, 1], - num_stages=4, - ): - super().__init__() - self.num_classes = num_classes - self.depths = depths - self.num_stages = num_stages - - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) - ] # stochastic depth decay rule - cur = 0 - - for i in range(num_stages): - patch_embed = PatchEmbed( - img_size=img_size if i == 0 else img_size // (2 ** (i + 1)), - patch_size=patch_size if i == 0 else 2, - in_chans=in_chans if i == 0 else embed_dims[i - 1], - embed_dim=embed_dims[i], - ) - num_patches = ( - patch_embed.num_patches - if i != num_stages - 1 - else patch_embed.num_patches + 1 - ) - pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dims[i])) - pos_drop = nn.Dropout(p=drop_rate) - - block = nn.ModuleList( - [ - Block( - dim=embed_dims[i], - num_heads=num_heads[i], - mlp_ratio=mlp_ratios[i], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + j], - norm_layer=norm_layer, - sr_ratio=sr_ratios[i], - ) - for j in range(depths[i]) - ] - ) - cur += depths[i] - - setattr(self, f"patch_embed{i + 1}", patch_embed) - setattr(self, f"pos_embed{i + 1}", pos_embed) - setattr(self, f"pos_drop{i + 1}", pos_drop) - setattr(self, f"block{i + 1}", block) - - self.norm = norm_layer(embed_dims[3]) - - # cls_token - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims[3])) - - # classification head - self.head = ( - nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity() - ) - - # init weights - for i in range(num_stages): - pos_embed = getattr(self, f"pos_embed{i + 1}") - trunc_normal_(pos_embed, std=0.02) - trunc_normal_(self.cls_token, std=0.02) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def no_weight_decay(self): - # return {'pos_embed', 'cls_token'} # has pos_embed may be better - return {"cls_token"} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=""): - self.num_classes = num_classes - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - def _get_pos_embed(self, pos_embed, patch_embed, H, W): - if H * W == self.patch_embed1.num_patches: - return pos_embed - else: - return ( - F.interpolate( - pos_embed.reshape(1, 
patch_embed.H, patch_embed.W, -1).permute( - 0, 3, 1, 2 - ), - size=(H, W), - mode="bilinear", - ) - .reshape(1, -1, H * W) - .permute(0, 2, 1) - ) - - def forward_features(self, x): - B = x.shape[0] - - for i in range(self.num_stages): - patch_embed = getattr(self, f"patch_embed{i + 1}") - pos_embed = getattr(self, f"pos_embed{i + 1}") - pos_drop = getattr(self, f"pos_drop{i + 1}") - block = getattr(self, f"block{i + 1}") - x, (H, W) = patch_embed(x) - - if i == self.num_stages - 1: - cls_tokens = self.cls_token.expand(B, -1, -1) - x = torch.cat((cls_tokens, x), dim=1) - pos_embed_ = self._get_pos_embed(pos_embed[:, 1:], patch_embed, H, W) - pos_embed = torch.cat((pos_embed[:, 0:1], pos_embed_), dim=1) - else: - pos_embed = self._get_pos_embed(pos_embed, patch_embed, H, W) - - x = pos_drop(x + pos_embed) - for blk in block: - x = blk(x, H, W) - if i != self.num_stages - 1: - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - - x = self.norm(x) - - return x[:, 0] - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - - return x - - -def pvt_tiny(pretrained=False, **kwargs): - model = PyramidVisionTransformer( - patch_size=4, - embed_dims=[64, 128, 320, 512], - num_heads=[1, 2, 5, 8], - mlp_ratios=[8, 8, 4, 4], - qkv_bias=True, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - **kwargs, - ) - return model diff --git a/python/oneflow/test/expensive/pytorch_res2net.py b/python/oneflow/test/expensive/pytorch_res2net.py deleted file mode 100644 index 7f790abc1fc..00000000000 --- a/python/oneflow/test/expensive/pytorch_res2net.py +++ /dev/null @@ -1,201 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch.nn as nn -import torch -import torch.nn.functional as F -import math - -__all__ = ["Res2Net", "res2net50"] - - -class Bottle2neck(nn.Module): - expansion = 4 - - def __init__( - self, - inplanes, - planes, - stride=1, - downsample=None, - baseWidth=26, - scale=4, - stype="normal", - ): - """ Constructor - Args: - inplanes: input channel dimensionality - planes: output channel dimensionality - stride: conv stride. Replaces pooling layer. - downsample: None when stride = 1 - baseWidth: basic width of conv3x3 - scale: number of scale. - type: 'normal': normal set. 'stage': first block of a new stage. 
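(Aside: the forward pass of Bottle2neck further below is the core Res2Net idea: the feature map is split along channels into `scale` groups of `width` channels, and each group after the first is added to the previous group's output before its own 3x3 convolution, giving multi-scale receptive fields inside one block. A toy sketch of that hierarchical split for the "normal" case, using illustrative names:)

    import torch
    import torch.nn as nn

    def res2net_split_forward(x: torch.Tensor, convs: nn.ModuleList, width: int) -> torch.Tensor:
        """Hierarchical per-group 3x3 convs over channel chunks (sketch only)."""
        chunks = torch.split(x, width, dim=1)      # 'scale' groups of 'width' channels
        outs, prev = [], None
        for i, conv in enumerate(convs):
            sp = chunks[i] if prev is None else chunks[i] + prev
            prev = conv(sp)
            outs.append(prev)
        outs.append(chunks[-1])                    # last chunk passes through untouched
        return torch.cat(outs, dim=1)

    width, scale = 16, 4
    convs = nn.ModuleList([nn.Conv2d(width, width, 3, padding=1) for _ in range(scale - 1)])
    y = res2net_split_forward(torch.randn(1, width * scale, 8, 8), convs, width)
    print(y.shape)  # torch.Size([1, 64, 8, 8])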
- """ - super(Bottle2neck, self).__init__() - - width = int(math.floor(planes * (baseWidth / 64.0))) - self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(width * scale) - - if scale == 1: - self.nums = 1 - else: - self.nums = scale - 1 - if stype == "stage": - self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) - convs = [] - bns = [] - for i in range(self.nums): - convs.append( - nn.Conv2d( - width, width, kernel_size=3, stride=stride, padding=1, bias=False - ) - ) - bns.append(nn.BatchNorm2d(width)) - self.convs = nn.ModuleList(convs) - self.bns = nn.ModuleList(bns) - - self.conv3 = nn.Conv2d( - width * scale, planes * self.expansion, kernel_size=1, bias=False - ) - self.bn3 = nn.BatchNorm2d(planes * self.expansion) - - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stype = stype - self.scale = scale - self.width = width - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - spx = torch.split(out, self.width, 1) - for i in range(self.nums): - if i == 0 or self.stype == "stage": - sp = spx[i] - else: - sp = sp + spx[i] - sp = self.convs[i](sp) - sp = self.relu(self.bns[i](sp)) - if i == 0: - out = sp - else: - out = torch.cat((out, sp), 1) - if self.scale != 1 and self.stype == "normal": - out = torch.cat((out, spx[self.nums]), 1) - elif self.scale != 1 and self.stype == "stage": - out = torch.cat((out, self.pool(spx[self.nums])), 1) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out - - -class Res2Net(nn.Module): - def __init__(self, block, layers, baseWidth=26, scale=4, num_classes=1000): - self.inplanes = 64 - super(Res2Net, self).__init__() - self.baseWidth = baseWidth - self.scale = scale - self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2) - self.avgpool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def _make_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d( - self.inplanes, - planes * block.expansion, - kernel_size=1, - stride=stride, - bias=False, - ), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append( - block( - self.inplanes, - planes, - stride, - downsample=downsample, - stype="stage", - baseWidth=self.baseWidth, - scale=self.scale, - ) - ) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append( - block(self.inplanes, planes, baseWidth=self.baseWidth, scale=self.scale) - ) - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = 
self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - - return x - - -def res2net50(pretrained=False, **kwargs): - """Constructs a Res2Net-50 model. - Res2Net-50 refers to the Res2Net-50_26w_4s. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = Res2Net(Bottle2neck, [3, 4, 6, 3], baseWidth=26, scale=4, **kwargs) - return model diff --git a/python/oneflow/test/expensive/pytorch_resmlp.py b/python/oneflow/test/expensive/pytorch_resmlp.py deleted file mode 100644 index e8f7e2d35a9..00000000000 --- a/python/oneflow/test/expensive/pytorch_resmlp.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -from timm.models.layers import trunc_normal_, DropPath, to_2tuple - - -__all__ = ["resmlp_12"] - - -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - drop_probs = to_2tuple(drop) - - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - embed_dim=768, - norm_layer=None, - flatten=True, - ): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.num_patches = self.grid_size[0] * self.grid_size[1] - self.flatten = flatten - - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size - ) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def forward(self, x): - B, C, H, W = x.shape - assert ( - H == self.img_size[0], - f"Input image height ({H}) doesn't match model ({self.img_size[0]}).", - ) - assert ( - W == self.img_size[1], - f"Input image width ({W}) doesn't match model ({self.img_size[1]}).", - ) - x = self.proj(x) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # BCHW -> BNC - x = self.norm(x) - return x - - -class Affine(nn.Module): - def __init__(self, dim): - super().__init__() - self.alpha = nn.Parameter(torch.ones(dim)) - self.beta = nn.Parameter(torch.zeros(dim)) - - def forward(self, x): - return self.alpha * x + self.beta - - -class layers_scale_mlp_blocks(nn.Module): - def __init__( - self, - dim, - drop=0.0, - 
drop_path=0.0, - act_layer=nn.GELU, - init_values=1e-4, - num_patches=196, - ): - super().__init__() - self.norm1 = Affine(dim) - self.attn = nn.Linear(num_patches, num_patches) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = Affine(dim) - self.mlp = Mlp( - in_features=dim, - hidden_features=int(4.0 * dim), - act_layer=act_layer, - drop=drop, - ) - self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True) - - def forward(self, x): - x = x + self.drop_path( - self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2) - ) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class resmlp_models(nn.Module): - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - num_classes=1000, - embed_dim=768, - depth=12, - drop_rate=0.0, - Patch_layer=PatchEmbed, - act_layer=nn.GELU, - drop_path_rate=0.0, - init_scale=1e-4, - ): - super().__init__() - - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim - - self.patch_embed = Patch_layer( - img_size=img_size, - patch_size=patch_size, - in_chans=int(in_chans), - embed_dim=embed_dim, - ) - num_patches = self.patch_embed.num_patches - dpr = [drop_path_rate for i in range(depth)] - - self.blocks = nn.ModuleList( - [ - layers_scale_mlp_blocks( - dim=embed_dim, - drop=drop_rate, - drop_path=dpr[i], - act_layer=act_layer, - init_values=init_scale, - num_patches=num_patches, - ) - for i in range(depth) - ] - ) - - self.norm = Affine(embed_dim) - - self.feature_info = [dict(num_chs=embed_dim, reduction=0, module="head")] - self.head = ( - nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=""): - self.num_classes = num_classes - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - def forward_features(self, x): - B = x.shape[0] - - x = self.patch_embed(x) - - for i, blk in enumerate(self.blocks): - x = blk(x) - - x = self.norm(x) - x = x.mean(dim=1).reshape(B, 1, -1) - - return x[:, 0] - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def resmlp_12(pretrained=False, dist=False, **kwargs): - model = resmlp_models( - patch_size=16, - embed_dim=384, - depth=12, - Patch_layer=PatchEmbed, - init_scale=0.1, - **kwargs, - ) - return model diff --git a/python/oneflow/test/expensive/pytorch_resnet.py b/python/oneflow/test/expensive/pytorch_resnet.py index eebd35ad491..76847f5e483 100644 --- a/python/oneflow/test/expensive/pytorch_resnet.py +++ b/python/oneflow/test/expensive/pytorch_resnet.py @@ -16,6 +16,7 @@ import torch from torch import Tensor import torch.nn as nn +from _internally_replaced_utils import load_state_dict_from_url from typing import Type, Any, Callable, Union, List, Optional @@ -33,6 +34,19 @@ ] +model_urls = { + "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", + "resnet50": 
"https://download.pytorch.org/models/resnet50-0676ba61.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", + "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", + "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", +} + + def conv3x3( in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1 ) -> nn.Conv2d: @@ -306,6 +320,9 @@ def _resnet( **kwargs: Any ) -> ResNet: model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) return model diff --git a/python/oneflow/test/expensive/pytorch_rexnet.py b/python/oneflow/test/expensive/pytorch_rexnet.py deleted file mode 100644 index 98506d25c9f..00000000000 --- a/python/oneflow/test/expensive/pytorch_rexnet.py +++ /dev/null @@ -1,244 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import torch -import torch.nn as nn -import math - - -__all__ = [ - "ReXNetV1", - "rexnetv1_1_0", -] - - -def silu(x, inplace=False): - return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid()) - - -class SiLU(nn.Module): - def __init__(self, inplace=True): - super(SiLU, self).__init__() - self.inplace = inplace - - def forward(self, x): - return silu(x, self.inplace) - - -def ConvBNAct( - out, - in_channels, - channels, - kernel=1, - stride=1, - pad=0, - num_group=1, - active=True, - relu6=False, -): - out.append( - nn.Conv2d( - in_channels, channels, kernel, stride, pad, groups=num_group, bias=False - ) - ) - out.append(nn.BatchNorm2d(channels)) - if active: - out.append(nn.ReLU6(inplace=True) if relu6 else nn.ReLU(inplace=True)) - - -def ConvBNSiLU(out, in_channels, channels, kernel=1, stride=1, pad=0, num_group=1): - out.append( - nn.Conv2d( - in_channels, channels, kernel, stride, pad, groups=num_group, bias=False - ) - ) - out.append(nn.BatchNorm2d(channels)) - out.append(SiLU(inplace=True)) - - -class SE(nn.Module): - def __init__(self, in_channels, channels, se_ratio=12): - super(SE, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Sequential( - nn.Conv2d(in_channels, channels // se_ratio, kernel_size=1, padding=0), - nn.BatchNorm2d(channels // se_ratio), - nn.ReLU(inplace=True), - nn.Conv2d(channels // se_ratio, channels, kernel_size=1, padding=0), - nn.Sigmoid(), - ) - - def forward(self, x): - y = self.avg_pool(x) - y = self.fc(y) - return x * y - - -class LinearBottleneck(nn.Module): - def __init__( - self, in_channels, channels, t, stride, use_se=True, se_ratio=12, **kwargs - ): - super(LinearBottleneck, self).__init__(**kwargs) - self.use_shortcut = stride == 1 and in_channels <= channels - self.in_channels = in_channels - self.out_channels = channels - - out = [] - if t != 1: - dw_channels = in_channels * t - ConvBNSiLU(out, in_channels=in_channels, channels=dw_channels) - else: - dw_channels = in_channels - - ConvBNAct( - out, - in_channels=dw_channels, - channels=dw_channels, - kernel=3, - stride=stride, - pad=1, - num_group=dw_channels, - active=False, - ) - - if use_se: - out.append(SE(dw_channels, dw_channels, se_ratio)) - - out.append(nn.ReLU6()) - ConvBNAct( - out, in_channels=dw_channels, channels=channels, active=False, relu6=True - ) - self.out = nn.Sequential(*out) - - def forward(self, x): - out = self.out(x) - if self.use_shortcut: - out[:, 0 : self.in_channels] += x - - return out - - -class ReXNetV1(nn.Module): - def __init__( - self, - input_ch=16, - final_ch=180, - width_mult=1.0, - depth_mult=1.0, - classes=1000, - use_se=True, - se_ratio=12, - dropout_ratio=0.2, - bn_momentum=0.9, - ): - super(ReXNetV1, self).__init__() - - layers = [1, 2, 2, 3, 3, 5] - strides = [1, 2, 2, 2, 1, 2] - use_ses = [False, False, True, True, True, True] - - layers = [math.ceil(element * depth_mult) for element in layers] - strides = sum( - [ - [element] + [1] * (layers[idx] - 1) - for idx, element in enumerate(strides) - ], - [], - ) - if use_se: - use_ses = sum( - [[element] * layers[idx] for idx, element in enumerate(use_ses)], [] - ) - else: - use_ses = [False] * sum(layers[:]) - ts = [1] * layers[0] + [6] * sum(layers[1:]) - - self.depth = sum(layers[:]) * 3 - stem_channel = 32 / width_mult if width_mult < 1.0 else 32 - inplanes = input_ch / width_mult if width_mult < 1.0 else input_ch - - features = [] - in_channels_group = [] - channels_group = [] - - # The following channel configuration is a simple instance to make each layer become an 
expand layer. - for i in range(self.depth // 3): - if i == 0: - in_channels_group.append(int(round(stem_channel * width_mult))) - channels_group.append(int(round(inplanes * width_mult))) - else: - in_channels_group.append(int(round(inplanes * width_mult))) - inplanes += final_ch / (self.depth // 3 * 1.0) - channels_group.append(int(round(inplanes * width_mult))) - - ConvBNSiLU( - features, - 3, - int(round(stem_channel * width_mult)), - kernel=3, - stride=2, - pad=1, - ) - - for block_idx, (in_c, c, t, s, se) in enumerate( - zip(in_channels_group, channels_group, ts, strides, use_ses) - ): - features.append( - LinearBottleneck( - in_channels=in_c, - channels=c, - t=t, - stride=s, - use_se=se, - se_ratio=se_ratio, - ) - ) - - pen_channels = int(1280 * width_mult) - ConvBNSiLU(features, c, pen_channels) - - features.append(nn.AdaptiveAvgPool2d(1)) - self.features = nn.Sequential(*features) - self.output = nn.Sequential( - nn.Dropout(dropout_ratio), nn.Conv2d(pen_channels, classes, 1, bias=True) - ) - - def extract_features(self, x): - return self.features[:-1](x) - - def forward(self, x): - x = self.features(x) - x = self.output(x).flatten(1) - return x - - -def _create_rexnetv1(arch, pretrained=False, progress=True, **model_kwargs): - model = ReXNetV1(**model_kwargs) - return model - - -def rexnetv1_1_0(pretrained=False, progress=True, **kwargs): - """ - Constructs the ReXNet model with width multiplier of 1.0. - .. note:: - ReXNet model with width multiplier of 1.0 from the `Rethinking Channel Dimensions for Efficient Model Design `_ paper. - Args: - pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False`` - progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True`` - """ - model_kwargs = dict(width_mult=1.0, **kwargs) - return _create_rexnetv1( - "rexnetv1_1_0", pretrained=pretrained, progress=progress, **model_kwargs - ) diff --git a/python/oneflow/test/expensive/pytorch_rexnetv1_lite.py b/python/oneflow/test/expensive/pytorch_rexnetv1_lite.py deleted file mode 100644 index 913d69fbf2f..00000000000 --- a/python/oneflow/test/expensive/pytorch_rexnetv1_lite.py +++ /dev/null @@ -1,259 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -from math import ceil - -__all__ = [ - "ReXNetV1_lite", - "rexnet_lite_1_0", -] - - -def _make_divisible(channel_size, divisor=None, min_value=None): - """ - This function is taken from the original tf repo. - It ensures that all layers have a channel number that is divisible by 8 - It can be seen here: - https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py - """ - if not divisor: - return channel_size - - if min_value is None: - min_value = divisor - new_channel_size = max( - min_value, int(channel_size + divisor / 2) // divisor * divisor - ) - # Make sure that round down does not go down by more than 10%. 
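The deleted ReXNet code above flattens its per-stage configuration into per-block lists with a `sum([...], [])` idiom: the stage stride goes to the first block of each stage and 1 to the remaining blocks. A small worked example of that expansion, with the values copied from the code above:

layers = [1, 2, 2, 3, 3, 5]    # blocks per stage
strides = [1, 2, 2, 2, 1, 2]   # stride of the first block in each stage

per_block_strides = sum(
    [[s] + [1] * (n - 1) for n, s in zip(layers, strides)], []
)
assert len(per_block_strides) == sum(layers) == 16
assert per_block_strides == [1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]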
- if new_channel_size < 0.9 * channel_size: - new_channel_size += divisor - return new_channel_size - - -def _add_conv( - out, - in_channels, - channels, - kernel=1, - stride=1, - pad=0, - num_group=1, - active=True, - relu6=True, - bn_momentum=0.1, - bn_eps=1e-5, -): - out.append( - nn.Conv2d( - in_channels, channels, kernel, stride, pad, groups=num_group, bias=False - ) - ) - out.append(nn.BatchNorm2d(channels, momentum=bn_momentum, eps=bn_eps)) - if active: - out.append(nn.ReLU6(inplace=True) if relu6 else nn.ReLU(inplace=True)) - - -class LinearBottleneck(nn.Module): - def __init__( - self, - in_channels, - channels, - t, - kernel_size=3, - stride=1, - bn_momentum=0.1, - bn_eps=1e-5, - **kwargs - ): - super(LinearBottleneck, self).__init__(**kwargs) - self.conv_shortcut = None - self.use_shortcut = stride == 1 and in_channels <= channels - self.in_channels = in_channels - self.out_channels = channels - out = [] - if t != 1: - dw_channels = in_channels * t - _add_conv( - out, - in_channels=in_channels, - channels=dw_channels, - bn_momentum=bn_momentum, - bn_eps=bn_eps, - ) - else: - dw_channels = in_channels - - _add_conv( - out, - in_channels=dw_channels, - channels=dw_channels * 1, - kernel=kernel_size, - stride=stride, - pad=(kernel_size // 2), - num_group=dw_channels, - bn_momentum=bn_momentum, - bn_eps=bn_eps, - ) - - _add_conv( - out, - in_channels=dw_channels, - channels=channels, - active=False, - bn_momentum=bn_momentum, - bn_eps=bn_eps, - ) - - self.out = nn.Sequential(*out) - - def forward(self, x): - out = self.out(x) - - if self.use_shortcut: - out[:, 0 : self.in_channels] += x - return out - - -class ReXNetV1_lite(nn.Module): - def __init__( - self, - fix_head_stem=False, - divisible_value=8, - input_ch=16, - final_ch=164, - multiplier=1.0, - classes=1000, - dropout_ratio=0.2, - bn_momentum=0.1, - bn_eps=1e-5, - kernel_conf="333333", - ): - super(ReXNetV1_lite, self).__init__() - - layers = [1, 2, 2, 3, 3, 5] - strides = [1, 2, 2, 2, 1, 2] - kernel_sizes = [int(element) for element in kernel_conf] - - strides = sum( - [ - [element] + [1] * (layers[idx] - 1) - for idx, element in enumerate(strides) - ], - [], - ) - ts = [1] * layers[0] + [6] * sum(layers[1:]) - kernel_sizes = sum( - [[element] * layers[idx] for idx, element in enumerate(kernel_sizes)], [] - ) - self.num_convblocks = sum(layers[:]) - - features = [] - inplanes = input_ch / multiplier if multiplier < 1.0 else input_ch - first_channel = 32 / multiplier if multiplier < 1.0 or fix_head_stem else 32 - first_channel = _make_divisible( - int(round(first_channel * multiplier)), divisible_value - ) - - in_channels_group = [] - channels_group = [] - - _add_conv( - features, - 3, - first_channel, - kernel=3, - stride=2, - pad=1, - bn_momentum=bn_momentum, - bn_eps=bn_eps, - ) - - for i in range(self.num_convblocks): - inplanes_divisible = _make_divisible( - int(round(inplanes * multiplier)), divisible_value - ) - if i == 0: - in_channels_group.append(first_channel) - channels_group.append(inplanes_divisible) - else: - in_channels_group.append(inplanes_divisible) - inplanes += final_ch / (self.num_convblocks - 1 * 1.0) - inplanes_divisible = _make_divisible( - int(round(inplanes * multiplier)), divisible_value - ) - channels_group.append(inplanes_divisible) - - for block_idx, (in_c, c, t, k, s) in enumerate( - zip(in_channels_group, channels_group, ts, kernel_sizes, strides) - ): - features.append( - LinearBottleneck( - in_channels=in_c, - channels=c, - t=t, - kernel_size=k, - stride=s, - bn_momentum=bn_momentum, - 
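`_make_divisible` above rounds a channel count to the nearest multiple of `divisor` while never rounding the result down by more than 10%. A standalone sketch of the same rule with a few worked values:

def make_divisible(channel_size, divisor=8, min_value=None):
    # Standalone sketch of the rounding rule documented above.
    if not divisor:
        return channel_size
    if min_value is None:
        min_value = divisor
    new_size = max(min_value, int(channel_size + divisor / 2) // divisor * divisor)
    if new_size < 0.9 * channel_size:  # never round down by more than 10%
        new_size += divisor
    return new_size

assert make_divisible(37) == 40   # rounds to the nearest multiple of 8
assert make_divisible(135) == 136
assert make_divisible(10) == 16   # 8 would be more than 10% below 10, so bump up by a divisor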
bn_eps=bn_eps, - ) - ) - - pen_channels = ( - int(1280 * multiplier) if multiplier > 1 and not fix_head_stem else 1280 - ) - _add_conv(features, c, pen_channels, bn_momentum=bn_momentum, bn_eps=bn_eps) - - self.features = nn.Sequential(*features) - self.avgpool = nn.AdaptiveAvgPool2d(1) - - self.output = nn.Sequential( - nn.Conv2d(pen_channels, 1024, 1, bias=True), - nn.BatchNorm2d(1024, momentum=bn_momentum, eps=bn_eps), - nn.ReLU6(inplace=True), - nn.Dropout(dropout_ratio), - nn.Conv2d(1024, classes, 1, bias=True), - ) - - def forward(self, x): - x = self.features(x) - x = self.avgpool(x) - x = self.output(x).flatten(1) - return x - - -def _create_rexnet_lite(arch, pretrained=False, progress=True, **model_kwargs): - model = ReXNetV1_lite(**model_kwargs) - return model - - -def rexnet_lite_1_0(pretrained=False, progress=True, **kwargs): - """ - Constructs the ReXNet-lite model with width multiplier of 1.0. - .. note:: - ReXNet-lite model with width multiplier of 1.0 from the `Rethinking Channel Dimensions for Efficient Model Design `_ paper. - Args: - pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False`` - progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True`` - For example: - .. code-block:: python - >>> import flowvision - >>> rexnet_lite_1_0 = flowvision.models.rexnet_lite_1_0(pretrained=False, progress=True) - """ - model_kwargs = dict(multiplier=1.0, **kwargs) - return _create_rexnet_lite( - "rexnet_lite_1_0", pretrained=pretrained, progress=progress, **model_kwargs - ) diff --git a/python/oneflow/test/expensive/pytorch_senet.py b/python/oneflow/test/expensive/pytorch_senet.py deleted file mode 100644 index 3fb33703385..00000000000 --- a/python/oneflow/test/expensive/pytorch_senet.py +++ /dev/null @@ -1,361 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from __future__ import print_function, division, absolute_import -from collections import OrderedDict -import math -import torch.nn as nn - -__all__ = ["SENet", "senet154"] - - -class SEModule(nn.Module): - def __init__(self, channels, reduction): - super(SEModule, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1, padding=0) - self.relu = nn.ReLU(inplace=True) - self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1, padding=0) - self.sigmoid = nn.Sigmoid() - - def forward(self, x): - module_input = x - x = self.avg_pool(x) - x = self.fc1(x) - x = self.relu(x) - x = self.fc2(x) - x = self.sigmoid(x) - return module_input * x - - -class Bottleneck(nn.Module): - """ - Base class for bottlenecks that implements `forward()` method. 
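The SENet code above gates every bottleneck output with an `SEModule`: global-average-pool to a per-channel descriptor, squeeze through a reduction bottleneck, and rescale the input channels with a sigmoid. A minimal self-contained sketch of that pattern (`SqueezeExcite` is an illustrative name, reduction=16 as in the deleted code):

import torch
import torch.nn as nn

class SqueezeExcite(nn.Module):
    """Minimal sketch of the SEModule channel-reweighting shown above."""
    def __init__(self, channels: int, reduction: int = 16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1)
        self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.avg_pool(x)                                   # B x C x 1 x 1 channel descriptor
        w = torch.sigmoid(self.fc2(torch.relu(self.fc1(w))))   # per-channel gate in (0, 1)
        return x * w                                           # rescale each channel

print(SqueezeExcite(64)(torch.randn(2, 64, 7, 7)).shape)  # torch.Size([2, 64, 7, 7])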
- """ - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out = self.se_module(out) + residual - out = self.relu(out) - - return out - - -class SEBottleneck(Bottleneck): - """ - Bottleneck for SENet154. - """ - - expansion = 4 - - def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None): - super(SEBottleneck, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes * 2, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes * 2) - self.conv2 = nn.Conv2d( - planes * 2, - planes * 4, - kernel_size=3, - stride=stride, - padding=1, - groups=groups, - bias=False, - ) - self.bn2 = nn.BatchNorm2d(planes * 4) - self.conv3 = nn.Conv2d(planes * 4, planes * 4, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * 4) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes * 4, reduction=reduction) - self.downsample = downsample - self.stride = stride - - -class SEResNetBottleneck(Bottleneck): - """ - ResNet bottleneck with a Squeeze-and-Excitation module. It follows Caffe - implementation and uses `stride=stride` in `conv1` and not in `conv2` - (the latter is used in the torchvision implementation of ResNet). - """ - - expansion = 4 - - def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None): - super(SEResNetBottleneck, self).__init__() - self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=1, bias=False, stride=stride - ) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, groups=groups, bias=False - ) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * 4) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes * 4, reduction=reduction) - self.downsample = downsample - self.stride = stride - - -class SEResNeXtBottleneck(Bottleneck): - """ - ResNeXt bottleneck type C with a Squeeze-and-Excitation module. - """ - - expansion = 4 - - def __init__( - self, - inplanes, - planes, - groups, - reduction, - stride=1, - downsample=None, - base_width=4, - ): - super(SEResNeXtBottleneck, self).__init__() - width = math.floor(planes * (base_width / 64)) * groups - self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False, stride=1) - self.bn1 = nn.BatchNorm2d(width) - self.conv2 = nn.Conv2d( - width, - width, - kernel_size=3, - stride=stride, - padding=1, - groups=groups, - bias=False, - ) - self.bn2 = nn.BatchNorm2d(width) - self.conv3 = nn.Conv2d(width, planes * 4, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * 4) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes * 4, reduction=reduction) - self.downsample = downsample - self.stride = stride - - -class SENet(nn.Module): - def __init__( - self, - block, - layers, - groups, - reduction, - dropout_p=0.2, - inplanes=128, - input_3x3=True, - downsample_kernel_size=3, - downsample_padding=1, - num_classes=1000, - ): - """ - Parameters - ---------- - block (nn.Module): Bottleneck class. - - For SENet154: SEBottleneck - - For SE-ResNet models: SEResNetBottleneck - - For SE-ResNeXt models: SEResNeXtBottleneck - layers (list of ints): Number of residual blocks for 4 layers of the - network (layer1...layer4). 
- groups (int): Number of groups for the 3x3 convolution in each - bottleneck block. - - For SENet154: 64 - - For SE-ResNet models: 1 - - For SE-ResNeXt models: 32 - reduction (int): Reduction ratio for Squeeze-and-Excitation modules. - - For all models: 16 - dropout_p (float or None): Drop probability for the Dropout layer. - If `None` the Dropout layer is not used. - - For SENet154: 0.2 - - For SE-ResNet models: None - - For SE-ResNeXt models: None - inplanes (int): Number of input channels for layer1. - - For SENet154: 128 - - For SE-ResNet models: 64 - - For SE-ResNeXt models: 64 - input_3x3 (bool): If `True`, use three 3x3 convolutions instead of - a single 7x7 convolution in layer0. - - For SENet154: True - - For SE-ResNet models: False - - For SE-ResNeXt models: False - downsample_kernel_size (int): Kernel size for downsampling convolutions - in layer2, layer3 and layer4. - - For SENet154: 3 - - For SE-ResNet models: 1 - - For SE-ResNeXt models: 1 - downsample_padding (int): Padding for downsampling convolutions in - layer2, layer3 and layer4. - - For SENet154: 1 - - For SE-ResNet models: 0 - - For SE-ResNeXt models: 0 - num_classes (int): Number of outputs in `last_linear` layer. - - For all models: 1000 - """ - super(SENet, self).__init__() - self.inplanes = inplanes - if input_3x3: - layer0_modules = [ - ("conv1", nn.Conv2d(3, 64, 3, stride=2, padding=1, bias=False)), - ("bn1", nn.BatchNorm2d(64)), - ("relu1", nn.ReLU(inplace=True)), - ("conv2", nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False)), - ("bn2", nn.BatchNorm2d(64)), - ("relu2", nn.ReLU(inplace=True)), - ("conv3", nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)), - ("bn3", nn.BatchNorm2d(inplanes)), - ("relu3", nn.ReLU(inplace=True)), - ] - else: - layer0_modules = [ - ( - "conv1", - nn.Conv2d( - 3, inplanes, kernel_size=7, stride=2, padding=3, bias=False - ), - ), - ("bn1", nn.BatchNorm2d(inplanes)), - ("relu1", nn.ReLU(inplace=True)), - ] - # To preserve compatibility with Caffe weights `ceil_mode=True` - # is used instead of `padding=1`. 
- layer0_modules.append(("pool", nn.MaxPool2d(3, stride=2, ceil_mode=True))) - self.layer0 = nn.Sequential(OrderedDict(layer0_modules)) - self.layer1 = self._make_layer( - block, - planes=64, - blocks=layers[0], - groups=groups, - reduction=reduction, - downsample_kernel_size=1, - downsample_padding=0, - ) - self.layer2 = self._make_layer( - block, - planes=128, - blocks=layers[1], - stride=2, - groups=groups, - reduction=reduction, - downsample_kernel_size=downsample_kernel_size, - downsample_padding=downsample_padding, - ) - self.layer3 = self._make_layer( - block, - planes=256, - blocks=layers[2], - stride=2, - groups=groups, - reduction=reduction, - downsample_kernel_size=downsample_kernel_size, - downsample_padding=downsample_padding, - ) - self.layer4 = self._make_layer( - block, - planes=512, - blocks=layers[3], - stride=2, - groups=groups, - reduction=reduction, - downsample_kernel_size=downsample_kernel_size, - downsample_padding=downsample_padding, - ) - self.avg_pool = nn.AvgPool2d(7, stride=1) - self.dropout = nn.Dropout(dropout_p) if dropout_p is not None else None - self.last_linear = nn.Linear(512 * block.expansion, num_classes) - - def _make_layer( - self, - block, - planes, - blocks, - groups, - reduction, - stride=1, - downsample_kernel_size=1, - downsample_padding=0, - ): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d( - self.inplanes, - planes * block.expansion, - kernel_size=downsample_kernel_size, - stride=stride, - padding=downsample_padding, - bias=False, - ), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append( - block(self.inplanes, planes, groups, reduction, stride, downsample) - ) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes, groups, reduction)) - - return nn.Sequential(*layers) - - def features(self, x): - x = self.layer0(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - return x - - def logits(self, x): - x = self.avg_pool(x) - if self.dropout is not None: - x = self.dropout(x) - x = x.view(x.size(0), -1) - x = self.last_linear(x) - return x - - def forward(self, x): - x = self.features(x) - x = self.logits(x) - return x - - -def senet154(num_classes=1000, pretrained="imagenet"): - model = SENet( - SEBottleneck, - [3, 8, 12, 3], - groups=64, - reduction=16, - dropout_p=0.2, - num_classes=num_classes, - ) - return model diff --git a/python/oneflow/test/expensive/pytorch_shufflenetv2.py b/python/oneflow/test/expensive/pytorch_shufflenetv2.py deleted file mode 100644 index 94a5c484eba..00000000000 --- a/python/oneflow/test/expensive/pytorch_shufflenetv2.py +++ /dev/null @@ -1,205 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import torch -import torch.nn as nn -from torch import Tensor - -from typing import Callable, Any, List - -__all__ = [ - "ShuffleNetV2", - "shufflenet_v2_x2_0", -] - - -def channel_shuffle(x: Tensor, groups: int) -> Tensor: - batchsize, num_channels, height, width = x.size() - channels_per_group = num_channels // groups - - # reshape - x = x.view(batchsize, groups, channels_per_group, height, width) - - x = torch.transpose(x, 1, 2).contiguous() - - # flatten - x = x.view(batchsize, -1, height, width) - - return x - - -class InvertedResidual(nn.Module): - def __init__(self, inp: int, oup: int, stride: int) -> None: - super().__init__() - - if not (1 <= stride <= 3): - raise ValueError("illegal stride value") - self.stride = stride - - branch_features = oup // 2 - if (self.stride == 1) and (inp != branch_features << 1): - raise ValueError( - f"Invalid combination of stride {stride}, inp {inp} and oup {oup} values. If stride == 1 then inp should be equal to oup // 2 << 1." - ) - - if self.stride > 1: - self.branch1 = nn.Sequential( - self.depthwise_conv( - inp, inp, kernel_size=3, stride=self.stride, padding=1 - ), - nn.BatchNorm2d(inp), - nn.Conv2d( - inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False - ), - nn.BatchNorm2d(branch_features), - nn.ReLU(inplace=True), - ) - else: - self.branch1 = nn.Sequential() - - self.branch2 = nn.Sequential( - nn.Conv2d( - inp if (self.stride > 1) else branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - bias=False, - ), - nn.BatchNorm2d(branch_features), - nn.ReLU(inplace=True), - self.depthwise_conv( - branch_features, - branch_features, - kernel_size=3, - stride=self.stride, - padding=1, - ), - nn.BatchNorm2d(branch_features), - nn.Conv2d( - branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - bias=False, - ), - nn.BatchNorm2d(branch_features), - nn.ReLU(inplace=True), - ) - - @staticmethod - def depthwise_conv( - i: int, - o: int, - kernel_size: int, - stride: int = 1, - padding: int = 0, - bias: bool = False, - ) -> nn.Conv2d: - return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) - - def forward(self, x: Tensor) -> Tensor: - if self.stride == 1: - x1, x2 = x.chunk(2, dim=1) - out = torch.cat((x1, self.branch2(x2)), dim=1) - else: - out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) - - out = channel_shuffle(out, 2) - - return out - - -class ShuffleNetV2(nn.Module): - def __init__( - self, - stages_repeats: List[int], - stages_out_channels: List[int], - num_classes: int = 1000, - inverted_residual: Callable[..., nn.Module] = InvertedResidual, - ) -> None: - super().__init__() - if len(stages_repeats) != 3: - raise ValueError("expected stages_repeats as list of 3 positive ints") - if len(stages_out_channels) != 5: - raise ValueError("expected stages_out_channels as list of 5 positive ints") - self._stage_out_channels = stages_out_channels - - input_channels = 3 - output_channels = self._stage_out_channels[0] - self.conv1 = nn.Sequential( - nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), - nn.BatchNorm2d(output_channels), - nn.ReLU(inplace=True), - ) - input_channels = output_channels - - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - # Static annotations for mypy - self.stage2: nn.Sequential - self.stage3: nn.Sequential - self.stage4: nn.Sequential - stage_names = [f"stage{i}" for i in [2, 3, 4]] - for name, repeats, output_channels in zip( - stage_names, stages_repeats, self._stage_out_channels[1:] - ): - seq 
= [inverted_residual(input_channels, output_channels, 2)] - for i in range(repeats - 1): - seq.append(inverted_residual(output_channels, output_channels, 1)) - setattr(self, name, nn.Sequential(*seq)) - input_channels = output_channels - - output_channels = self._stage_out_channels[-1] - self.conv5 = nn.Sequential( - nn.Conv2d(input_channels, output_channels, 1, 1, 0, bias=False), - nn.BatchNorm2d(output_channels), - nn.ReLU(inplace=True), - ) - - self.fc = nn.Linear(output_channels, num_classes) - - def _forward_impl(self, x: Tensor) -> Tensor: - # See note [TorchScript super()] - x = self.conv1(x) - x = self.maxpool(x) - x = self.stage2(x) - x = self.stage3(x) - x = self.stage4(x) - x = self.conv5(x) - x = x.mean([2, 3]) # globalpool - x = self.fc(x) - return x - - def forward(self, x: Tensor) -> Tensor: - return self._forward_impl(x) - - -def _shufflenetv2(progress: bool, *args: Any, **kwargs: Any,) -> ShuffleNetV2: - model = ShuffleNetV2(*args, **kwargs) - return model - - -def shufflenet_v2_x2_0(progress: bool = True, **kwargs: Any) -> ShuffleNetV2: - """ - Constructs a ShuffleNetV2 with 2.0x output channels, as described in - `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" - `_. - Args: - weights (ShuffleNet_V2_X2_0_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _shufflenetv2(progress, [4, 8, 4], [24, 244, 488, 976, 2048], **kwargs) diff --git a/python/oneflow/test/expensive/pytorch_squeezenet.py b/python/oneflow/test/expensive/pytorch_squeezenet.py deleted file mode 100644 index 6117ba20274..00000000000 --- a/python/oneflow/test/expensive/pytorch_squeezenet.py +++ /dev/null @@ -1,140 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import torch -import torch.nn as nn -import torch.nn.init as init - -from typing import Any - -__all__ = ["SqueezeNet", "squeezenet1_1"] - - -class Fire(nn.Module): - def __init__( - self, - inplanes: int, - squeeze_planes: int, - expand1x1_planes: int, - expand3x3_planes: int, - ) -> None: - super().__init__() - self.inplanes = inplanes - self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) - self.squeeze_activation = nn.ReLU(inplace=True) - self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, kernel_size=1) - self.expand1x1_activation = nn.ReLU(inplace=True) - self.expand3x3 = nn.Conv2d( - squeeze_planes, expand3x3_planes, kernel_size=3, padding=1 - ) - self.expand3x3_activation = nn.ReLU(inplace=True) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.squeeze_activation(self.squeeze(x)) - return torch.cat( - [ - self.expand1x1_activation(self.expand1x1(x)), - self.expand3x3_activation(self.expand3x3(x)), - ], - 1, - ) - - -class SqueezeNet(nn.Module): - def __init__( - self, version: str = "1_0", num_classes: int = 1000, dropout: float = 0.5 - ) -> None: - super().__init__() - self.num_classes = num_classes - if version == "1_0": - self.features = nn.Sequential( - nn.Conv2d(3, 96, kernel_size=7, stride=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(96, 16, 64, 64), - Fire(128, 16, 64, 64), - Fire(128, 32, 128, 128), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(256, 32, 128, 128), - Fire(256, 48, 192, 192), - Fire(384, 48, 192, 192), - Fire(384, 64, 256, 256), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(512, 64, 256, 256), - ) - elif version == "1_1": - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=3, stride=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(64, 16, 64, 64), - Fire(128, 16, 64, 64), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(128, 32, 128, 128), - Fire(256, 32, 128, 128), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(256, 48, 192, 192), - Fire(384, 48, 192, 192), - Fire(384, 64, 256, 256), - Fire(512, 64, 256, 256), - ) - else: - # FIXME: Is this needed? SqueezeNet should only be called from the - # FIXME: squeezenet1_x() functions - # FIXME: This checking is not done for the other models - raise ValueError( - f"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected" - ) - - # Final convolution is initialized differently from the rest - final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) - self.classifier = nn.Sequential( - nn.Dropout(p=dropout), - final_conv, - nn.ReLU(inplace=True), - nn.AdaptiveAvgPool2d((1, 1)), - ) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - if m is final_conv: - init.normal_(m.weight, mean=0.0, std=0.01) - else: - init.kaiming_uniform_(m.weight) - if m.bias is not None: - init.constant_(m.bias, 0) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.features(x) - x = self.classifier(x) - return torch.flatten(x, 1) - - -def _squeezenet(version: str, progress: bool, **kwargs: Any,) -> SqueezeNet: - model = SqueezeNet(version, **kwargs) - return model - - -def squeezenet1_1(progress: bool = True, **kwargs: Any) -> SqueezeNet: - r"""SqueezeNet 1.1 model from the `official SqueezeNet repo - `_. - SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters - than SqueezeNet 1.0, without sacrificing accuracy. - The required minimum input size of the model is 17x17. 
- Args: - weights (SqueezeNet1_1_Weights, optional): The pretrained weights for the model - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _squeezenet("1_1", progress, **kwargs) diff --git a/python/oneflow/test/expensive/pytorch_swin_transformer.py b/python/oneflow/test/expensive/pytorch_swin_transformer.py deleted file mode 100644 index 6f41f71806f..00000000000 --- a/python/oneflow/test/expensive/pytorch_swin_transformer.py +++ /dev/null @@ -1,783 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import torch -import torch.nn as nn -from timm.models.layers import DropPath, to_2tuple, trunc_normal_ - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -def window_partition(x, window_size): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) - windows = ( - x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - ) - return windows - - -def window_reverse(windows, window_size, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view( - B, H // window_size, W // window_size, window_size, window_size, -1 - ) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - -class WindowAttention(nn.Module): - r""" Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both of shifted and non-shifted window. - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. 
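`window_partition` and `window_reverse` above tile a `(B, H, W, C)` feature map into non-overlapping windows and undo that tiling; the two are exact inverses whenever `H` and `W` are multiples of the window size. A quick round-trip check using the same reshape logic:

import torch

def window_partition(x, window_size):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)

def window_reverse(windows, window_size, H, W):
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)

x = torch.randn(2, 56, 56, 96)
windows = window_partition(x, 7)               # 2 * 8 * 8 windows of 7 x 7 patches
assert windows.shape == (128, 7, 7, 96)
assert torch.equal(window_reverse(windows, 7, 56, 56), x)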
Default: 0.0 - """ - - def __init__( - self, - dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - ): - - super().__init__() - self.dim = dim - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) - ) # 2*Wh-1 * 2*Ww-1, nH - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(self.window_size[0]) - coords_w = torch.arange(self.window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute( - 1, 2, 0 - ).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - self.register_buffer("relative_position_index", relative_position_index) - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - trunc_normal_(self.relative_position_bias_table, std=0.02) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, mask=None): - """ - Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B_, N, 3, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - q, k, v = ( - qkv[0], - qkv[1], - qkv[2], - ) # make torchscript happy (cannot use tensor as tuple) - - q = q * self.scale - attn = q @ k.transpose(-2, -1) - - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.view(-1) - ].view( - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1, - ) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute( - 2, 0, 1 - ).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze( - 1 - ).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - def extra_repr(self) -> str: - return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}" - - def flops(self, N): - # calculate flops for 1 window with token length of N - flops = 0 - # qkv = self.qkv(x) - flops += N * self.dim * 3 * self.dim - # attn = (q @ k.transpose(-2, -1)) - flops += self.num_heads * N * (self.dim // self.num_heads) * N - # x = (attn @ v) - flops += self.num_heads * N * N * (self.dim // self.num_heads) - # x = self.proj(x) - flops += N * self.dim * self.dim - return flops - - -class SwinTransformerBlock(nn.Module): - r""" Swin Transformer Block. - Args: - dim (int): Number of input channels. 
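The relative-position-index construction in `WindowAttention.__init__` above maps every pair of positions inside a window to one of `(2*Wh - 1) * (2*Ww - 1)` bias-table entries. A worked example for a tiny 2x2 window, following the same steps:

import torch

Wh = Ww = 2
coords = torch.stack(torch.meshgrid([torch.arange(Wh), torch.arange(Ww)]))   # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1)                                     # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]     # 2, 4, 4
relative_coords = relative_coords.permute(1, 2, 0).contiguous()
relative_coords[:, :, 0] += Wh - 1        # shift row offsets to start from 0
relative_coords[:, :, 1] += Ww - 1
relative_coords[:, :, 0] *= 2 * Ww - 1
relative_position_index = relative_coords.sum(-1)                             # 4, 4
assert relative_position_index.max().item() == (2 * Wh - 1) * (2 * Ww - 1) - 1  # 8
assert relative_position_index[0, 0].item() == 4   # zero offset maps to the centre of the table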
- input_resolution (tuple[int]): Input resulotion. - num_heads (int): Number of attention heads. - window_size (int): Window size. - shift_size (int): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Module, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__( - self, - dim, - input_resolution, - num_heads, - window_size=7, - shift_size=0, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.num_heads = num_heads - self.window_size = window_size - self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - if min(self.input_resolution) <= self.window_size: - # if window size is larger than input resolution, we don't partition windows - self.shift_size = 0 - self.window_size = min(self.input_resolution) - assert ( - 0 <= self.shift_size < self.window_size - ), "shift_size must in 0-window_size" - - self.norm1 = norm_layer(dim) - self.attn = WindowAttention( - dim, - window_size=to_2tuple(self.window_size), - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - ) - - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - - if self.shift_size > 0: - # calculate attention mask for SW-MSA - H, W = self.input_resolution - img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 - h_slices = ( - slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), - ) - w_slices = ( - slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), - ) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - cnt += 1 - - mask_windows = window_partition( - img_mask, self.window_size - ) # nW, window_size, window_size, 1 - mask_windows = mask_windows.view(-1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill( - attn_mask != 0, float(-100.0) - ).masked_fill(attn_mask == 0, float(0.0)) - else: - attn_mask = None - - self.register_buffer("attn_mask", attn_mask) - - def forward(self, x): - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - shortcut = x - x = self.norm1(x) - x = x.view(B, H, W, C) - - # cyclic shift - if self.shift_size > 0: - shifted_x = torch.roll( - x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) - ) - else: - shifted_x = x - - # partition windows - x_windows = window_partition( - shifted_x, self.window_size - ) # nW*B, window_size, window_size, C - x_windows = x_windows.view( - -1, self.window_size * self.window_size, C - ) # nW*B, window_size*window_size, C - - # W-MSA/SW-MSA - 
attn_windows = self.attn( - x_windows, mask=self.attn_mask - ) # nW*B, window_size*window_size, C - - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) - shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C - - # reverse cyclic shift - if self.shift_size > 0: - x = torch.roll( - shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2) - ) - else: - x = shifted_x - x = x.view(B, H * W, C) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - def extra_repr(self) -> str: - return ( - f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " - f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" - ) - - def flops(self): - flops = 0 - H, W = self.input_resolution - # norm1 - flops += self.dim * H * W - # W-MSA/SW-MSA - nW = H * W / self.window_size / self.window_size - flops += nW * self.attn.flops(self.window_size * self.window_size) - # mlp - flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio - # norm2 - flops += self.dim * H * W - return flops - - -class PatchMerging(nn.Module): - r""" Patch Merging Layer. - Args: - input_resolution (tuple[int]): Resolution of input feature. - dim (int): Number of input channels. - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): - super().__init__() - self.input_resolution = input_resolution - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) - self.norm = norm_layer(4 * dim) - - def forward(self, x): - """ - x: B, H*W, C - """ - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." - - x = x.view(B, H, W, C) - - x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C - x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C - x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C - x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C - x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - def extra_repr(self) -> str: - return f"input_resolution={self.input_resolution}, dim={self.dim}" - - def flops(self): - H, W = self.input_resolution - flops = H * W * self.dim - flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim - return flops - - -class BasicLayer(nn.Module): - """ A basic Swin Transformer layer for one stage. - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resolution. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - window_size (int): Local window size. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
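`PatchMerging` above downsamples by gathering the four spatial neighbours of every 2x2 patch group into the channel dimension and then projecting 4C -> 2C with `LayerNorm` + `Linear`. A small sketch of the gather step only (`merge_patches` is an illustrative name):

import torch

def merge_patches(x: torch.Tensor) -> torch.Tensor:
    # x: (B, H, W, C) with even H and W; returns (B, H/2 * W/2, 4C),
    # the tensor the deleted PatchMerging feeds into its norm and reduction layers.
    B, H, W, C = x.shape
    x0 = x[:, 0::2, 0::2, :]
    x1 = x[:, 1::2, 0::2, :]
    x2 = x[:, 0::2, 1::2, :]
    x3 = x[:, 1::2, 1::2, :]
    return torch.cat([x0, x1, x2, x3], dim=-1).view(B, -1, 4 * C)

print(merge_patches(torch.randn(1, 56, 56, 96)).shape)  # torch.Size([1, 784, 384])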
- """ - - def __init__( - self, - dim, - input_resolution, - depth, - num_heads, - window_size, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - norm_layer=nn.LayerNorm, - downsample=None, - use_checkpoint=False, - ): - - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.ModuleList( - [ - SwinTransformerBlock( - dim=dim, - input_resolution=input_resolution, - num_heads=num_heads, - window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] - if isinstance(drop_path, list) - else drop_path, - norm_layer=norm_layer, - ) - for i in range(depth) - ] - ) - - # patch merging layer - if downsample is not None: - self.downsample = downsample( - input_resolution, dim=dim, norm_layer=norm_layer - ) - else: - self.downsample = None - - def forward(self, x): - for blk in self.blocks: - x = blk(x) - if self.downsample is not None: - x = self.downsample(x) - return x - - def extra_repr(self) -> str: - return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" - - def flops(self): - flops = 0 - for blk in self.blocks: - flops += blk.flops() - if self.downsample is not None: - flops += self.downsample.flops() - return flops - - -class PatchEmbed(nn.Module): - r""" Image to Patch Embedding - Args: - img_size (int): Image size. Default: 224. - patch_size (int): Patch token size. Default: 4. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Module, optional): Normalization layer. Default: None - """ - - def __init__( - self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None - ): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - patches_resolution = [ - img_size[0] // patch_size[0], - img_size[1] // patch_size[1], - ] - self.img_size = img_size - self.patch_size = patch_size - self.patches_resolution = patches_resolution - self.num_patches = patches_resolution[0] * patches_resolution[1] - - self.in_chans = in_chans - self.embed_dim = embed_dim - - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size - ) - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." - x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C - if self.norm is not None: - x = self.norm(x) - return x - - def flops(self): - Ho, Wo = self.patches_resolution - flops = ( - Ho - * Wo - * self.embed_dim - * self.in_chans - * (self.patch_size[0] * self.patch_size[1]) - ) - if self.norm is not None: - flops += Ho * Wo * self.embed_dim - return flops - - -class SwinTransformer(nn.Module): - r""" Swin Transformer - A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/pdf/2103.14030 - Args: - img_size (int | tuple(int)): Input image size. Default 224 - patch_size (int | tuple(int)): Patch size. Default: 4 - in_chans (int): Number of input image channels. 
Default: 3 - num_classes (int): Number of classes for classification head. Default: 1000 - embed_dim (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each Swin Transformer layer. - num_heads (tuple(int)): Number of attention heads in different layers. - window_size (int): Window size. Default: 7 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. Default: True - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False - """ - - def __init__( - self, - img_size=224, - patch_size=4, - in_chans=3, - num_classes=1000, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.1, - norm_layer=nn.LayerNorm, - ape=False, - patch_norm=True, - use_checkpoint=False, - **kwargs, - ): - super().__init__() - - self.num_classes = num_classes - self.num_layers = len(depths) - self.embed_dim = embed_dim - self.ape = ape - self.patch_norm = patch_norm - self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) - self.mlp_ratio = mlp_ratio - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None, - ) - num_patches = self.patch_embed.num_patches - patches_resolution = self.patch_embed.patches_resolution - self.patches_resolution = patches_resolution - - # absolute position embedding - if self.ape: - self.absolute_pos_embed = nn.Parameter( - torch.zeros(1, num_patches, embed_dim) - ) - trunc_normal_(self.absolute_pos_embed, std=0.02) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) - ] # stochastic depth decay rule - - # build layers - self.layers = nn.ModuleList() - for i_layer in range(self.num_layers): - layer = BasicLayer( - dim=int(embed_dim * 2 ** i_layer), - input_resolution=( - patches_resolution[0] // (2 ** i_layer), - patches_resolution[1] // (2 ** i_layer), - ), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - window_size=window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, - use_checkpoint=use_checkpoint, - ) - self.layers.append(layer) - - self.norm = norm_layer(self.num_features) - self.avgpool = nn.AdaptiveAvgPool1d(1) - self.head = ( - nn.Linear(self.num_features, num_classes) - if num_classes > 0 - else nn.Identity() - ) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - 
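The stochastic-depth schedule above draws one drop-path rate per block with `torch.linspace(0, drop_path_rate, sum(depths))` and hands each `BasicLayer` its contiguous slice. A worked example with the Swin-T configuration used below (depths `(2, 2, 6, 2)`, `drop_path_rate=0.2`):

import torch

depths = [2, 2, 6, 2]
drop_path_rate = 0.2
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

per_layer = [dpr[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]
assert len(dpr) == 12 and [len(p) for p in per_layer] == depths
assert dpr[0] == 0.0 and abs(dpr[-1] - 0.2) < 1e-6   # rates grow linearly with block depth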
nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def no_weight_decay(self): - return {"absolute_pos_embed"} - - def no_weight_decay_keywords(self): - return {"relative_position_bias_table"} - - def forward_features(self, x): - x = self.patch_embed(x) - if self.ape: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) - - for layer in self.layers: - x = layer(x) - - x = self.norm(x) # B L C - x = self.avgpool(x.transpose(1, 2)) # B C 1 - x = torch.flatten(x, 1) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - def flops(self): - flops = 0 - flops += self.patch_embed.flops() - for i, layer in enumerate(self.layers): - flops += layer.flops() - flops += ( - self.num_features - * self.patches_resolution[0] - * self.patches_resolution[1] - // (2 ** self.num_layers) - ) - flops += self.num_features * self.num_classes - return flops - - -def _create_swin_transformer(arch, pretrained=False, progress=True, **model_kwargs): - model = SwinTransformer(**model_kwargs) - return model - - -def swin_tiny_patch4_window7_224(pretrained=False, progress=True, **kwargs): - """ - Constructs Swin-T 224x224 model trained on ImageNet-1k. - .. note:: - Swin-T 224x224 model from `"Swin Transformer: Hierarchical Vision Transformer using Shifted Windows" `_. - Args: - pretrained (bool): Whether to download the pre-trained model on ImageNet. Default: ``False`` - progress (bool): If True, displays a progress bar of the download to stderr. Default: ``True`` - For example: - .. code-block:: python - >>> import flowvision - >>> swin_tiny_patch4_window7_224 = flowvision.models.swin_tiny_patch4_window7_224(pretrained=False, progress=True) - """ - model_kwargs = dict( - img_size=224, - patch_size=4, - window_size=7, - embed_dim=96, - depths=(2, 2, 6, 2), - num_heads=(3, 6, 12, 24), - drop_path_rate=0.2, - **kwargs, - ) - return _create_swin_transformer( - "swin_tiny_patch4_window7_224", - pretrained=pretrained, - progress=progress, - **model_kwargs, - ) diff --git a/python/oneflow/test/expensive/pytorch_uniformer.py b/python/oneflow/test/expensive/pytorch_uniformer.py deleted file mode 100644 index 78a3a2b61dd..00000000000 --- a/python/oneflow/test/expensive/pytorch_uniformer.py +++ /dev/null @@ -1,536 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -from collections import OrderedDict -import torch -import torch.nn as nn -from functools import partial -from timm.models.layers import trunc_normal_, DropPath, to_2tuple - -layer_scale = False -init_value = 1e-6 - - -class Mlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class CMlp(nn.Module): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Conv2d(in_features, hidden_features, 1) - self.act = act_layer() - self.fc2 = nn.Conv2d(hidden_features, out_features, 1) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - ): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights - self.scale = qk_scale or head_dim ** -0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - B, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, C // self.num_heads) - .permute(2, 0, 3, 1, 4) - ) - q, k, v = ( - qkv[0], - qkv[1], - qkv[2], - ) # make torchscript happy (cannot use tensor as tuple) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class CBlock(nn.Module): - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) - self.norm1 = nn.BatchNorm2d(dim) - self.conv1 = nn.Conv2d(dim, dim, 1) - self.conv2 = nn.Conv2d(dim, dim, 1) - self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = nn.BatchNorm2d(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = CMlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - - def forward(self, x): - x = x + self.pos_embed(x) - x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x))))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class SABlock(nn.Module): - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - 
drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - ): - super().__init__() - self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - ) - global layer_scale - self.ls = layer_scale - if self.ls: - global init_value - print(f"Use layer_scale: {layer_scale}, init_values: {init_value}") - self.gamma_1 = nn.Parameter( - init_value * torch.ones((dim)), requires_grad=True - ) - self.gamma_2 = nn.Parameter( - init_value * torch.ones((dim)), requires_grad=True - ) - - def forward(self, x): - x = x + self.pos_embed(x) - B, N, H, W = x.shape - x = x.flatten(2).transpose(1, 2) - if self.ls: - x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - else: - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - x = x.transpose(1, 2).reshape(B, N, H, W) - return x - - -class head_embedding(nn.Module): - def __init__(self, in_channels, out_channels): - super(head_embedding, self).__init__() - - self.proj = nn.Sequential( - nn.Conv2d( - in_channels, - out_channels // 2, - kernel_size=(3, 3), - stride=(2, 2), - padding=(1, 1), - ), - nn.BatchNorm2d(out_channels // 2), - nn.GELU(), - nn.Conv2d( - out_channels // 2, - out_channels, - kernel_size=(3, 3), - stride=(2, 2), - padding=(1, 1), - ), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x): - x = self.proj(x) - return x - - -class middle_embedding(nn.Module): - def __init__(self, in_channels, out_channels): - super(middle_embedding, self).__init__() - - self.proj = nn.Sequential( - nn.Conv2d( - in_channels, - out_channels, - kernel_size=(3, 3), - stride=(2, 2), - padding=(1, 1), - ), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x): - x = self.proj(x) - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - self.norm = nn.LayerNorm(embed_dim) - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size - ) - - def forward(self, x): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
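        # Strided conv projection maps (B, in_chans, H, W) -> (B, embed_dim, H/patch, W/patch);
        # tokens are then flattened to (B, N, C) so LayerNorm normalizes each patch embedding,
        # and finally reshaped back to NCHW for the following convolutional stages.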
- x = self.proj(x) - B, C, H, W = x.shape - x = x.flatten(2).transpose(1, 2) - x = self.norm(x) - x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() - return x - - -class UniFormer(nn.Module): - """ Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - - https://arxiv.org/abs/2010.11929 - """ - - def __init__( - self, - depth=[3, 4, 8, 3], - img_size=224, - in_chans=3, - num_classes=1000, - embed_dim=[64, 128, 320, 512], - head_dim=64, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - representation_size=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - norm_layer=None, - conv_stem=False, - ): - """ - Args: - depth (list): depth of each stage - img_size (int, tuple): input image size - in_chans (int): number of input channels - num_classes (int): number of classes for classification head - embed_dim (list): embedding dimension of each stage - head_dim (int): head dimension - mlp_ratio (int): ratio of mlp hidden dim to embedding dim - qkv_bias (bool): enable bias for qkv if True - qk_scale (float): override default qk scale of head_dim ** -0.5 if set - representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set - drop_rate (float): dropout rate - attn_drop_rate (float): attention dropout rate - drop_path_rate (float): stochastic depth rate - norm_layer (nn.Module): normalization layer - conv_stem (bool): whether use overlapped patch stem - """ - super().__init__() - self.num_classes = num_classes - self.num_features = ( - self.embed_dim - ) = embed_dim # num_features for consistency with other models - norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) - if conv_stem: - self.patch_embed1 = head_embedding( - in_channels=in_chans, out_channels=embed_dim[0] - ) - self.patch_embed2 = middle_embedding( - in_channels=embed_dim[0], out_channels=embed_dim[1] - ) - self.patch_embed3 = middle_embedding( - in_channels=embed_dim[1], out_channels=embed_dim[2] - ) - self.patch_embed4 = middle_embedding( - in_channels=embed_dim[2], out_channels=embed_dim[3] - ) - else: - self.patch_embed1 = PatchEmbed( - img_size=img_size, - patch_size=4, - in_chans=in_chans, - embed_dim=embed_dim[0], - ) - self.patch_embed2 = PatchEmbed( - img_size=img_size // 4, - patch_size=2, - in_chans=embed_dim[0], - embed_dim=embed_dim[1], - ) - self.patch_embed3 = PatchEmbed( - img_size=img_size // 8, - patch_size=2, - in_chans=embed_dim[1], - embed_dim=embed_dim[2], - ) - self.patch_embed4 = PatchEmbed( - img_size=img_size // 16, - patch_size=2, - in_chans=embed_dim[2], - embed_dim=embed_dim[3], - ) - - self.pos_drop = nn.Dropout(p=drop_rate) - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depth)) - ] # stochastic depth decay rule - num_heads = [dim // head_dim for dim in embed_dim] - self.blocks1 = nn.ModuleList( - [ - CBlock( - dim=embed_dim[0], - num_heads=num_heads[0], - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - ) - for i in range(depth[0]) - ] - ) - self.blocks2 = nn.ModuleList( - [ - CBlock( - dim=embed_dim[1], - num_heads=num_heads[1], - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i + depth[0]], - norm_layer=norm_layer, - ) - for i in range(depth[1]) - ] - ) - self.blocks3 = nn.ModuleList( - [ - SABlock( - dim=embed_dim[2], - num_heads=num_heads[2], - 
mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i + depth[0] + depth[1]], - norm_layer=norm_layer, - ) - for i in range(depth[2]) - ] - ) - self.blocks4 = nn.ModuleList( - [ - SABlock( - dim=embed_dim[3], - num_heads=num_heads[3], - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i + depth[0] + depth[1] + depth[2]], - norm_layer=norm_layer, - ) - for i in range(depth[3]) - ] - ) - self.norm = nn.BatchNorm2d(embed_dim[-1]) - - # Representation layer - if representation_size: - self.num_features = representation_size - self.pre_logits = nn.Sequential( - OrderedDict( - [ - ("fc", nn.Linear(embed_dim, representation_size)), - ("act", nn.Tanh()), - ] - ) - ) - else: - self.pre_logits = nn.Identity() - - # Classifier head - self.head = ( - nn.Linear(embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity() - ) - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - def no_weight_decay(self): - return {"pos_embed", "cls_token"} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=""): - self.num_classes = num_classes - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - def forward_features(self, x): - x = self.patch_embed1(x) - x = self.pos_drop(x) - for blk in self.blocks1: - x = blk(x) - x = self.patch_embed2(x) - for blk in self.blocks2: - x = blk(x) - x = self.patch_embed3(x) - for blk in self.blocks3: - x = blk(x) - x = self.patch_embed4(x) - for blk in self.blocks4: - x = blk(x) - x = self.norm(x) - x = self.pre_logits(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = x.flatten(2).mean(-1) - x = self.head(x) - return x - - -def uniformer_small(pretrained=True, **kwargs): - model = UniFormer( - depth=[3, 4, 8, 3], - embed_dim=[64, 128, 320, 512], - head_dim=64, - mlp_ratio=4, - qkv_bias=True, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - **kwargs, - ) - return model diff --git a/python/oneflow/test/expensive/pytroch_mlp_mixer.py b/python/oneflow/test/expensive/pytroch_mlp_mixer.py deleted file mode 100644 index a75c5c0ca48..00000000000 --- a/python/oneflow/test/expensive/pytroch_mlp_mixer.py +++ /dev/null @@ -1,431 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import math -import torch -import torch.nn as nn -from timm.models.layers import DropPath, lecun_normal_, to_2tuple - -from functools import partial -from typing import Callable - - -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - drop_probs = to_2tuple(drop) - - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - - def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - embed_dim=768, - norm_layer=None, - flatten=True, - ): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.num_patches = self.grid_size[0] * self.grid_size[1] - self.flatten = flatten - - self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size - ) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def forward(self, x): - B, C, H, W = x.shape - assert ( - H == self.img_size[0], - f"Input image height ({H}) doesn't match model ({self.img_size[0]}).", - ) - assert ( - W == self.img_size[1], - f"Input image width ({W}) doesn't match model ({self.img_size[1]}).", - ) - x = self.proj(x) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # BCHW -> BNC - x = self.norm(x) - return x - - -def named_apply( - fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False -) -> nn.Module: - if not depth_first and include_root: - fn(module=module, name=name) - for child_name, child_module in module.named_children(): - child_name = ".".join((name, child_name)) if name else child_name - named_apply( - fn=fn, - module=child_module, - name=child_name, - depth_first=depth_first, - include_root=True, - ) - if depth_first and include_root: - fn(module=module, name=name) - return module - - -class GatedMlp(nn.Module): - """ MLP as used in gMLP - """ - - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - gate_layer=None, - drop=0.0, - ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - drop_probs = to_2tuple(drop) - - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.drop1 = nn.Dropout(drop_probs[0]) - if gate_layer is not None: - assert hidden_features % 2 == 0 - self.gate = gate_layer(hidden_features) - hidden_features = ( - hidden_features // 2 - ) # FIXME base reduction on gate property? 
- else: - self.gate = nn.Identity() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop2 = nn.Dropout(drop_probs[1]) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop1(x) - x = self.gate(x) - x = self.fc2(x) - x = self.drop2(x) - return x - - -class MixerBlock(nn.Module): - """ Residual Block w/ token mixing and channel MLPs - Based on: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 - """ - - def __init__( - self, - dim, - seq_len, - mlp_ratio=(0.5, 4.0), - mlp_layer=Mlp, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - act_layer=nn.GELU, - drop=0.0, - drop_path=0.0, - ): - super().__init__() - tokens_dim, channels_dim = [int(x * dim) for x in to_2tuple(mlp_ratio)] - self.norm1 = norm_layer(dim) - self.mlp_tokens = mlp_layer(seq_len, tokens_dim, act_layer=act_layer, drop=drop) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - self.mlp_channels = mlp_layer(dim, channels_dim, act_layer=act_layer, drop=drop) - - def forward(self, x): - x = x + self.drop_path( - self.mlp_tokens(self.norm1(x).transpose(1, 2)).transpose(1, 2) - ) - x = x + self.drop_path(self.mlp_channels(self.norm2(x))) - return x - - -class Affine(nn.Module): - def __init__(self, dim): - super().__init__() - self.alpha = nn.Parameter(torch.ones((1, 1, dim))) - self.beta = nn.Parameter(torch.zeros((1, 1, dim))) - - def forward(self, x): - return torch.addcmul(self.beta, self.alpha, x) - - -class ResBlock(nn.Module): - """ Residual MLP block w/ LayerScale and Affine 'norm' - Based on: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 - """ - - def __init__( - self, - dim, - seq_len, - mlp_ratio=4, - mlp_layer=Mlp, - norm_layer=Affine, - act_layer=nn.GELU, - init_values=1e-4, - drop=0.0, - drop_path=0.0, - ): - super().__init__() - channel_dim = int(dim * mlp_ratio) - self.norm1 = norm_layer(dim) - self.linear_tokens = nn.Linear(seq_len, seq_len) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - self.norm2 = norm_layer(dim) - self.mlp_channels = mlp_layer(dim, channel_dim, act_layer=act_layer, drop=drop) - self.ls1 = nn.Parameter(init_values * torch.ones(dim)) - self.ls2 = nn.Parameter(init_values * torch.ones(dim)) - - def forward(self, x): - x = x + self.drop_path( - self.ls1 * self.linear_tokens(self.norm1(x).transpose(1, 2)).transpose(1, 2) - ) - x = x + self.drop_path(self.ls2 * self.mlp_channels(self.norm2(x))) - return x - - -class SpatialGatingUnit(nn.Module): - """ Spatial Gating Unit - Based on: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 - """ - - def __init__(self, dim, seq_len, norm_layer=nn.LayerNorm): - super().__init__() - gate_dim = dim // 2 - self.norm = norm_layer(gate_dim) - self.proj = nn.Linear(seq_len, seq_len) - - def init_weights(self): - # special init for the projection gate, called as override by base model init - nn.init.normal_(self.proj.weight, std=1e-6) - nn.init.ones_(self.proj.bias) - - def forward(self, x): - u, v = x.chunk(2, dim=-1) - v = self.norm(v) - v = self.proj(v.transpose(-1, -2)) - return u * v.transpose(-1, -2) - - -class SpatialGatingBlock(nn.Module): - """ Residual Block w/ Spatial Gating - Based on: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 - """ - - def __init__( - self, - dim, - seq_len, - mlp_ratio=4, - mlp_layer=GatedMlp, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - act_layer=nn.GELU, - drop=0.0, - drop_path=0.0, - ): - 
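        # Pre-norm gMLP block: LayerNorm -> GatedMlp whose gate is a SpatialGatingUnit
        # acting over the sequence dimension -> DropPath on the residual branch.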
super().__init__() - channel_dim = int(dim * mlp_ratio) - self.norm = norm_layer(dim) - sgu = partial(SpatialGatingUnit, seq_len=seq_len) - self.mlp_channels = mlp_layer( - dim, channel_dim, act_layer=act_layer, gate_layer=sgu, drop=drop - ) - self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() - - def forward(self, x): - x = x + self.drop_path(self.mlp_channels(self.norm(x))) - return x - - -class MlpMixer(nn.Module): - def __init__( - self, - num_classes=1000, - img_size=224, - in_chans=3, - patch_size=16, - num_blocks=8, - embed_dim=512, - mlp_ratio=(0.5, 4.0), - block_layer=MixerBlock, - mlp_layer=Mlp, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - act_layer=nn.GELU, - drop_rate=0.0, - drop_path_rate=0.0, - nlhb=False, - stem_norm=False, - global_pool="avg", - ): - super().__init__() - self.num_classes = num_classes - self.global_pool = global_pool - self.num_features = ( - self.embed_dim - ) = embed_dim # num_features for consistency with other models - self.grad_checkpointing = False - - self.stem = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if stem_norm else None, - ) - # FIXME drop_path (stochastic depth scaling rule or all the same?) - self.blocks = nn.Sequential( - *[ - block_layer( - embed_dim, - self.stem.num_patches, - mlp_ratio, - mlp_layer=mlp_layer, - norm_layer=norm_layer, - act_layer=act_layer, - drop=drop_rate, - drop_path=drop_path_rate, - ) - for _ in range(num_blocks) - ] - ) - self.norm = norm_layer(embed_dim) - self.head = ( - nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() - ) - - self.init_weights(nlhb=nlhb) - - def init_weights(self, nlhb=False): - head_bias = -math.log(self.num_classes) if nlhb else 0.0 - named_apply( - partial(_init_weights, head_bias=head_bias), module=self - ) # depth-first - - def group_matcher(self, coarse=False): - return dict( - stem=r"^stem", # stem and embed - blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))], - ) - - def set_grad_checkpointing(self, enable=True): - self.grad_checkpointing = enable - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=None): - self.num_classes = num_classes - if global_pool is not None: - assert global_pool in ("", "avg") - self.global_pool = global_pool - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - def forward_features(self, x): - x = self.stem(x) - x = self.blocks(x) - x = self.norm(x) - return x - - def forward(self, x): - x = self.forward_features(x) - if self.global_pool == "avg": - x = x.mean(dim=1) - x = self.head(x) - return x - - -def _init_weights(module: nn.Module, name: str, head_bias: float = 0.0, flax=False): - """ Mixer weight initialization (trying to match Flax defaults) - """ - if isinstance(module, nn.Linear): - if name.startswith("head"): - nn.init.zeros_(module.weight) - nn.init.constant_(module.bias, head_bias) - else: - if flax: - # Flax defaults - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - else: - # like MLP init in vit (my original init) - nn.init.xavier_uniform_(module.weight) - if module.bias is not None: - if "mlp" in name: - nn.init.normal_(module.bias, std=1e-6) - else: - nn.init.zeros_(module.bias) - elif isinstance(module, nn.Conv2d): - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, (nn.LayerNorm, 
nn.BatchNorm2d, nn.GroupNorm)): - nn.init.ones_(module.weight) - nn.init.zeros_(module.bias) - elif hasattr(module, "init_weights"): - # NOTE if a parent module contains init_weights method, it can override the init of the - # child modules as this will be called in depth-first order. - module.init_weights() - - -def mixer_s32_224(pretrained=False, **kwargs): - """ Mixer-S/32 224x224 - Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 - """ - model_args = dict(patch_size=32, num_blocks=8, embed_dim=512, **kwargs) - model = MlpMixer(**model_args) - return model diff --git a/python/oneflow/test/expensive/test_compatibility.py b/python/oneflow/test/expensive/test_compatibility.py deleted file mode 100644 index 0157f09b7be..00000000000 --- a/python/oneflow/test/expensive/test_compatibility.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -from oneflow.test_utils.oneflow_pytorch_compatibility import * - - -@flow.unittest.skip_unless_1n1d() -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test gpu cases") -class TestApiCompatibility(flow.unittest.TestCase): - def test_alexnet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_alexnet.py", "alexnet", "cuda", 16, 224 - ) - - def test_resnet50_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_resnet.py", "resnet50", "cuda", 16, 224 - ) - - def test_convmixer_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_convmixer.py", "convmixer_768_32_relu", "cuda", 4, 224 - ) - - def test_densenet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_densenet.py", "densenet121", "cuda", 8, 224 - ) - - def test_ghostnet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_ghostnet.py", "ghost_net", "cuda", 16, 224 - ) - - def test_googlenet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_googlenet.py", "googlenet", "cuda", 8, 224 - ) - - def test_inception_v3_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_inception_v3.py", "inception_v3", "cuda", 4, 299 - ) - - def test_mnasnet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_mnasnet.py", "mnasnet1_0", "cuda", 16, 224 - ) - - # def test_rexnet_compatibility(test_case): - # do_test_train_loss_oneflow_pytorch( - # test_case, "pytorch_rexnet.py", "rexnetv1_1_0", "cuda", 16, 224 - # ) - - def test_rexnetv1_lite_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_rexnetv1_lite.py", "rexnet_lite_1_0", "cuda", 16, 224 - ) - - # def test_res2net_compatibility(test_case): - # do_test_train_loss_oneflow_pytorch( - # test_case, "pytorch_res2net.py", "res2net50", "cuda", 16, 224 - # ) - - def test_shufflenetv2_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - 
test_case, "pytorch_shufflenetv2.py", "shufflenet_v2_x2_0", "cuda", 16, 224 - ) - - def test_squeezenet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_squeezenet.py", "squeezenet1_1", "cuda", 16, 224 - ) - - def test_convnext_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_convnext.py", "convnext_tiny", "cuda", 8, 224 - ) - - # def test_crossformer_compatibility(test_case): - # do_test_train_loss_oneflow_pytorch( - # test_case, - # "pytorch_crossformer.py", - # "crossformer_tiny_patch4_group7_224", - # "cuda", - # 8, - # 224, - # ) - - # def test_efficientnet_compatibility(test_case): - # do_test_train_loss_oneflow_pytorch( - # test_case, "pytorch_efficientnet.py", "efficientnet_b0", "cuda", 8, 224, - # ) - - def test_levit_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_levit.py", "LeViT_128S", "cuda", 8, 224, - ) - - # def test_mlp_mixer_compatibility(test_case): - # do_test_train_loss_oneflow_pytorch( - # test_case, "pytroch_mlp_mixer.py", "mixer_s32_224", "cuda", 8, 224, - # ) - - def test_poolformer_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_poolformer.py", "poolformer_s12", "cuda", 8, 224, - ) - - def test_pvt_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_pvt.py", "pvt_tiny", "cuda", 8, 224, - ) - - def test_resmlp_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_resmlp.py", "resmlp_12", "cuda", 8, 224, - ) - - def test_uniformer_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_uniformer.py", "uniformer_small", "cuda", 8, 224, - ) - - def test_swin_transformer_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, - "pytorch_swin_transformer.py", - "swin_tiny_patch4_window7_224", - "cuda", - 8, - 224, - ) - - def test_senet_compatibility(test_case): - do_test_train_loss_oneflow_pytorch( - test_case, "pytorch_senet.py", "senet154", "cuda", 2, 224, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_module_to_half.py b/python/oneflow/test/expensive/test_compatiblity.py similarity index 56% rename from python/oneflow/test/modules/test_module_to_half.py rename to python/oneflow/test/expensive/test_compatiblity.py index 7abbe7e1197..8b6379efb24 100644 --- a/python/oneflow/test/modules/test_module_to_half.py +++ b/python/oneflow/test/expensive/test_compatiblity.py @@ -13,20 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import unittest -from oneflow.test_utils.automated_test_util import * -import oneflow as flow -import oneflow.unittest +from oneflow.test_utils.oneflow_pytorch_compatiblity import * @flow.unittest.skip_unless_1n1d() -class TestModuleToHalf(flow.unittest.TestCase): - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_module_to_half(test_case): - input = flow.randn(10, 10).to(flow.float16).cuda() - model = flow.nn.Linear(10, 20).half().cuda() - output = model(input) - test_case.assertEqual(output.dtype, flow.float16) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test gpu cases") +class TestApiCompatiblity(flow.unittest.TestCase): + def test_alexnet_compatiblity(test_case): + do_test_train_loss_oneflow_pytorch( + test_case, "pytorch_alexnet.py", "alexnet", "cuda" + ) + + def test_resnet50_compatiblity(test_case): + do_test_train_loss_oneflow_pytorch( + test_case, "pytorch_resnet.py", "resnet50", "cuda" + ) if __name__ == "__main__": diff --git a/python/oneflow/test/expensive/test_convtranspose.py b/python/oneflow/test/expensive/test_convtranspose.py index 8dcd69d6a56..fdce804027b 100644 --- a/python/oneflow/test/expensive/test_convtranspose.py +++ b/python/oneflow/test/expensive/test_convtranspose.py @@ -366,33 +366,6 @@ def test_deconv3d_group_with_random_data(test_case): y = m(x) return y - @autotest(n=3, auto_backward=False) - def test_functional_conv_transpose1d(test_case): - device = random_device() - channels = random(1, 6) - img = random_tensor(ndim=3, dim1=channels).to(device) - kernel = random_tensor(ndim=3, dim0=channels).to(device) - y = torch.nn.functional.conv_transpose1d(img, kernel) - return y - - @autotest(n=3, auto_backward=False) - def test_functional_conv_transpose2d(test_case): - device = random_device() - channels = random(1, 6) - img = random_tensor(ndim=4, dim1=channels).to(device) - kernel = random_tensor(ndim=4, dim0=channels).to(device) - y = torch.nn.functional.conv_transpose2d(img, kernel) - return y - - @autotest(n=3, auto_backward=False) - def test_functional_conv_transpose3d(test_case): - device = random_device() - channels = random(1, 6) - img = random_tensor(ndim=5, dim1=channels).to(device) - kernel = random_tensor(ndim=5, dim0=channels).to(device) - y = torch.nn.functional.conv_transpose3d(img, kernel) - return y - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/expensive/test_id_shuffle.py b/python/oneflow/test/expensive/test_id_shuffle.py index 6eef77c5ae2..76363b2a317 100644 --- a/python/oneflow/test/expensive/test_id_shuffle.py +++ b/python/oneflow/test/expensive/test_id_shuffle.py @@ -19,43 +19,44 @@ from oneflow.test_utils.test_util import GenArgDict import numpy as np import oneflow as flow +import oneflow.unittest from oneflow.test_utils.automated_test_util import * -def _test_id_shuffle(test_case, has_table_id, num_tables): +def _test_id_shuffle(test_case, has_column_id, num_columns): batch_size = 512 - ids = np.random.randint(0, 1000, (batch_size, num_tables), dtype=np.int64) - if has_table_id: - table_ids = ( - ids % num_tables - ) # same id must have same table id, so in this case get table_ids from ids - table_ids_tensor = flow.tensor( - table_ids.astype(np.int32), requires_grad=False + ids = np.random.randint(0, 1000, (batch_size, num_columns), dtype=np.int64) + if has_column_id: + column_ids = ( + ids % num_columns + ) # same id must have same column id, so in this case get column_ids from ids + column_ids_tensor = flow.tensor( + column_ids.astype(np.int32), 
requires_grad=False ).to("cuda") else: - table_ids_tensor = None + column_ids_tensor = None ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids): + def build(self, ids, column_ids): ( num_unique_matrix, inverse_unique_partition_indices, cur_rank_num_unique, cur_rank_unique_ids, - cur_rank_unique_table_ids, + cur_rank_unique_column_ids, cur_rank_inverse_indices, - ) = flow._C.one_embedding_id_shuffle(ids, table_ids, num_tables) + ) = flow._C.one_embedding_id_shuffle(ids, column_ids, num_columns) return ( flow.cast(num_unique_matrix, flow.int32), flow.cast(inverse_unique_partition_indices, flow.int32), flow.cast(cur_rank_num_unique, flow.int32), flow.cast(cur_rank_unique_ids, flow.int32), - flow.cast(cur_rank_unique_table_ids, flow.int32), + flow.cast(cur_rank_unique_column_ids, flow.int32), flow.cast(cur_rank_inverse_indices, flow.int32), ) @@ -65,9 +66,9 @@ def build(self, ids, table_ids): inverse_unique_partition_indices, cur_rank_num_unique, cur_rank_unique_ids, - cur_rank_unique_table_ids, + cur_rank_unique_column_ids, cur_rank_inverse_indices, - ) = graph(ids_tensor, table_ids_tensor) + ) = graph(ids_tensor, column_ids_tensor) np_unique_ids, np_inverse = np.unique(ids, return_inverse=True) np_num_unique = np_unique_ids.size test_case.assertTrue(np.array_equal(np_num_unique, num_unique_matrix[0])) @@ -76,79 +77,37 @@ def build(self, ids, table_ids): inverse_unique_partition_indices ] test_case.assertTrue(np.array_equal(reversed_ids.numpy(), ids)) - if has_table_id: - reversed_table_ids = cur_rank_unique_table_ids[cur_rank_inverse_indices][ + if has_column_id: + reversed_column_ids = cur_rank_unique_column_ids[cur_rank_inverse_indices][ inverse_unique_partition_indices ] - test_case.assertTrue(np.array_equal(reversed_table_ids.numpy(), table_ids)) - # when has_table_id=False, we can not test table ids because in this case same ids not lead to same table id - - -def round_half_away_from_zero(x): - sign = np.sign(x) - abs_val = np.abs(x) - abs_val += 0.5 - floor_val = np.floor(abs_val) - out = floor_val * sign - return out - - -def embedding_shuffle_quantize(np_data, np_dtype): - # When use float16, ComputeType is set to as Float. - np_reduce_data = np_data.astype(np.float32) - abs_max_factor = np.max(np.abs(np_reduce_data), axis=2) - abs_max_factor = np.expand_dims(abs_max_factor, axis=2) - transport_quantize_factor = abs_max_factor.astype(np_dtype) - int8_factor = np.ones(abs_max_factor.shape, dtype=np.float32) * 127.0 - int8_factor = int8_factor.astype(np.float32) - quantize_factor = int8_factor / abs_max_factor - - # Covert to Compute Type. - np_data.astype(np.float32) - np_data = np_data * quantize_factor - np_data = round_half_away_from_zero(np_data) - np_data = np_data.astype(np.int8) - - # Covert to Compute Type. 
- np_data = np_data.astype(np.float32) - dequantize_factor = transport_quantize_factor.astype(np.float32) / int8_factor - np_data = np_data * dequantize_factor - np_data = np_data.astype(np_dtype) - return np_data - - -def _test_embedding_shuffle(test_case, dtype, enable_quantize): - batch_size = 512 - num_tables = 26 - embedding_size = 128 - ids = np.random.randint(0, 1000, (batch_size, num_tables), dtype=np.int64) + test_case.assertTrue(np.array_equal(reversed_column_ids.numpy(), column_ids)) + # when has_column_id=False, we can not test column ids because in this case same ids not lead to same column id - enable_quantized_comm = enable_quantize and embedding_size < 1025 - if enable_quantized_comm: - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "1" - else: - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "0" - table_ids = ( - ids % num_tables - ) # same id must have same table id, so in this case get table_ids from ids +def _test_embedding_shuffle(test_case, dtype): + batch_size = 512 + num_columns = 26 + ids = np.random.randint(0, 1000, (batch_size, num_columns), dtype=np.int64) + column_ids = ( + ids % num_columns + ) # same id must have same column id, so in this case get column_ids from ids if dtype == flow.float16: np_dtype = np.float16 else: np_dtype = np.float32 - data = np.random.rand(1000, embedding_size).astype(np_dtype) - + data = np.random.rand(1000, 128).astype(np_dtype) ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") - table_ids_tensor = flow.tensor(table_ids.astype(np.int32), requires_grad=False).to( - "cuda" - ) + column_ids_tensor = flow.tensor( + column_ids.astype(np.int32), requires_grad=False + ).to("cuda") data_tensor = flow.tensor(data, requires_grad=False).to("cuda") class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids, data): + def build(self, ids, column_ids, data): ( num_unique_matrix, inverse_unique_partition_indices, @@ -156,7 +115,7 @@ def build(self, ids, table_ids, data): cur_rank_unique_ids, _, cur_rank_inverse_indices, - ) = flow._C.one_embedding_id_shuffle(ids, table_ids, num_tables) + ) = flow._C.one_embedding_id_shuffle(ids, column_ids, num_columns) unique_embeddings = flow._C.gather(data, cur_rank_unique_ids, axis=0) embeddings = flow._C.one_embedding_embedding_shuffle( unique_embeddings, @@ -167,47 +126,34 @@ def build(self, ids, table_ids, data): return embeddings graph = TestGraph() - embeddings = graph(ids_tensor, table_ids_tensor, data_tensor) + embeddings = graph(ids_tensor, column_ids_tensor, data_tensor) np_embeddings = data[ids] - # Quantized numpy embedding. 
- if enable_quantized_comm: - np_embeddings = embedding_shuffle_quantize(np_embeddings, np_dtype) - test_case.assertTrue( - np.allclose(embeddings.numpy(), np_embeddings, atol=1e-4, rtol=1e-4) - ) + test_case.assertTrue(np.array_equal(embeddings.numpy(), np_embeddings)) -def _test_embedding_gradient_shuffle(test_case, enable_quantize): +def _test_embedding_gradient_shuffle(test_case): batch_size = 512 - num_tables = 26 + num_columns = 26 embedding_size = 128 - ids = np.random.randint(0, 1000, (batch_size, num_tables), dtype=np.int64) - enable_quantized_comm = enable_quantize and embedding_size < 1025 - if enable_quantized_comm: - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "1" - ids = np.arange(batch_size * num_tables, dtype=np.int64) - np.random.shuffle(ids) - else: - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "0" - - table_ids = ( - ids % num_tables - ) # same id must have same table id, so in this case get table_ids from ids - embedding_grad = np.random.uniform( - low=-1, high=1, size=(batch_size, num_tables, embedding_size) - ).astype(np.float32) - ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") - table_ids_tensor = flow.tensor(table_ids.astype(np.int32), requires_grad=False).to( - "cuda" + ids = np.random.randint(0, 1000, (batch_size, num_columns), dtype=np.int64) + column_ids = ( + ids % num_columns + ) # same id must have same column id, so in this case get column_ids from ids + embedding_grad = np.random.rand(batch_size, num_columns, embedding_size).astype( + np.float32 ) + ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") + column_ids_tensor = flow.tensor( + column_ids.astype(np.int32), requires_grad=False + ).to("cuda") embedding_grad_tensor = flow.tensor(embedding_grad, requires_grad=False).to("cuda") class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids, embedding_grad): + def build(self, ids, column_ids, embedding_grad): ( num_unique_matrix, inverse_unique_partition_indices, @@ -215,7 +161,7 @@ def build(self, ids, table_ids, embedding_grad): cur_rank_unique_ids, _, cur_rank_inverse_indices, - ) = flow._C.one_embedding_id_shuffle(ids, table_ids, num_tables) + ) = flow._C.one_embedding_id_shuffle(ids, column_ids, num_columns) cur_rank_unique_embedding_grad = flow._C.one_embedding_embedding_gradient_shuffle( embedding_grad, num_unique_matrix, @@ -235,96 +181,82 @@ def build(self, ids, table_ids, embedding_grad): cur_rank_unique_ids, cur_rank_inverse_indices, inverse_unique_partition_indices, - ) = graph(ids_tensor, table_ids_tensor, embedding_grad_tensor) + ) = graph(ids_tensor, column_ids_tensor, embedding_grad_tensor) np_unique_ids, np_inverse = np.unique(ids, return_inverse=True) np_num_unique = np_unique_ids.size np_cur_rank_unique_embedding_grad = np.zeros( - cur_rank_unique_embedding_grad.shape, dtype=np.float32 + cur_rank_unique_embedding_grad.shape ).reshape(-1, embedding_size) - - embedding_grad = embedding_grad.reshape(-1, embedding_size) for k in range(np_num_unique): - np_data = sum(embedding_grad[np.where(ids.flatten() == np_unique_ids[k])[0]]) - # Quantize Embedding Gradient. 
- if enable_quantized_comm: - abs_max_factor = np.max(np.abs(np_data)) - int8_factor = np.full(abs_max_factor.shape, 127.0, dtype=np.float32) - quantize_factor = int8_factor / abs_max_factor - np_data = np_data * quantize_factor - np_data = round_half_away_from_zero(np_data) - np_data = np_data.astype(np.int8) - np_data = np_data.astype(np.float32) - dequantize_factor = abs_max_factor / int8_factor - np_data = np_data * dequantize_factor - - np_cur_rank_unique_embedding_grad[k, :] = np_data - + np_cur_rank_unique_embedding_grad[k, :] = sum( + embedding_grad.reshape(-1, embedding_size)[ + np.where(ids.flatten() == np_unique_ids[k])[0] + ] + ) reversed_ids = cur_rank_unique_ids[cur_rank_inverse_indices][ inverse_unique_partition_indices ] test_case.assertTrue(np.array_equal(reversed_ids.numpy(), ids)) - of_cur_rank_embedding_grad = cur_rank_unique_embedding_grad[ - cur_rank_inverse_indices - ][inverse_unique_partition_indices] - of_cur_rank_embedding_grad = flow.reshape( - of_cur_rank_embedding_grad, (-1, embedding_size) - ) - np_cur_rank_embedding_grad = np_cur_rank_unique_embedding_grad[np_inverse] - test_case.assertTrue( np.allclose( - of_cur_rank_embedding_grad.numpy().flatten(), - np_cur_rank_embedding_grad.flatten(), + cur_rank_unique_embedding_grad[cur_rank_inverse_indices][ + inverse_unique_partition_indices + ] + .numpy() + .flatten(), + np_cur_rank_unique_embedding_grad[np_inverse].flatten(), atol=1e-4, rtol=1e-4, ) ) -def _test_unique_key_value(test_case, has_table_id, num_tables): +def _test_unique_key_value(test_case, has_column_id, num_columns): batch_size = 128 - ids = np.random.randint(0, 1000, (batch_size, num_tables), dtype=np.int64) - if has_table_id: - table_ids = ( - ids % num_tables - ) # same id must have same table id, so in this case get table_ids from ids - table_ids_tensor = flow.tensor( - table_ids.astype(np.int32), requires_grad=False + ids = np.random.randint(0, 1000, (batch_size, num_columns), dtype=np.int64) + if has_column_id: + column_ids = ( + ids % num_columns + ) # same id must have same column id, so in this case get column_ids from ids + column_ids_tensor = flow.tensor( + column_ids.astype(np.int32), requires_grad=False ).to("cuda") else: - table_ids_tensor = None + column_ids_tensor = None ids_tensor = flow.tensor(ids, requires_grad=False).to("cuda") class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids): + def build(self, ids, column_ids): ( num_unique, unique_ids, - unique_table_ids, + unique_column_ids, inverse_indices, - ) = flow._C.one_embedding_unique_key_value_pair(ids, table_ids, num_tables) + ) = flow._C.one_embedding_unique_key_value_pair( + ids, column_ids, num_columns + ) return ( flow.cast(num_unique, flow.int32), flow.cast(unique_ids, flow.int32), - flow.cast(unique_table_ids, flow.int32), + flow.cast(unique_column_ids, flow.int32), flow.cast(inverse_indices, flow.int32), ) graph = TestGraph() - (num_unique, unique_ids, unique_table_ids, inverse_indices,) = graph( - ids_tensor, table_ids_tensor + (num_unique, unique_ids, unique_column_ids, inverse_indices,) = graph( + ids_tensor, column_ids_tensor ) np_unique_ids, np_inverse = np.unique(ids, return_inverse=True) np_num_unique = np_unique_ids.size test_case.assertTrue(np.array_equal(np_num_unique, num_unique[0])) reversed_ids = unique_ids[inverse_indices] test_case.assertTrue(np.array_equal(reversed_ids.numpy(), ids)) - if has_table_id: - reversed_table_ids = unique_table_ids[inverse_indices] - 
test_case.assertTrue(np.array_equal(reversed_table_ids.numpy(), table_ids)) + if has_column_id: + reversed_column_ids = unique_column_ids[inverse_indices] + test_case.assertTrue(np.array_equal(reversed_column_ids.numpy(), column_ids)) @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") @@ -332,29 +264,26 @@ def build(self, ids, table_ids): class DataShuffleTestCase(flow.unittest.TestCase): def test_id_shuffle(test_case): arg_dict = OrderedDict() - arg_dict["has_table_id"] = [True, False] - arg_dict["num_tables"] = [1, 26] + arg_dict["has_column_id"] = [True, False] + arg_dict["num_columns"] = [1, 26] for kwargs in GenArgDict(arg_dict): _test_id_shuffle(test_case, **kwargs) def test_embedding_shuffle(test_case): arg_dict = OrderedDict() arg_dict["dtype"] = [flow.float32, flow.float16] - arg_dict["enable_quantize"] = [True, False] - for kwargs in GenArgDict(arg_dict): _test_embedding_shuffle(test_case, **kwargs) def test_embedding_gradient_shuffle(test_case): arg_dict = OrderedDict() - arg_dict["enable_quantize"] = [True, False] for kwargs in GenArgDict(arg_dict): _test_embedding_gradient_shuffle(test_case, **kwargs) def test_unique_key_value(test_case): arg_dict = OrderedDict() - arg_dict["has_table_id"] = [True, False] - arg_dict["num_tables"] = [13, 26, 1] + arg_dict["has_column_id"] = [True, False] + arg_dict["num_columns"] = [13, 26, 1] for kwargs in GenArgDict(arg_dict): _test_unique_key_value(test_case, **kwargs) diff --git a/python/oneflow/test/expensive/test_permute.py b/python/oneflow/test/expensive/test_permute.py index 68f482f97dc..0254c6c96ba 100644 --- a/python/oneflow/test/expensive/test_permute.py +++ b/python/oneflow/test/expensive/test_permute.py @@ -73,7 +73,8 @@ def test_permute(test_case): _test_permute_impl(test_case, *arg) _test_tensor_permute_impl(test_case, *arg) - @autotest(check_graph=False) + @unittest.skip("pytorch 1.9.0 exist not torch.permute api") + @autotest() def test_torch_permute4d_with_random_data(test_case): device = random_device() ndim = 4 @@ -137,6 +138,7 @@ def test_permute3d_tensor_with_random_data(test_case): dim0=random(1, 18).to(int), dim1=random(1, 78).to(int), dim2=random(1, 99).to(int), + dim3=random(1, 98).to(int), ).to(device) y = x.permute(permute_list) return y diff --git a/python/oneflow/test/graph/test_graph_clip_grad_norm.py b/python/oneflow/test/graph/test_graph_clip_grad_norm.py deleted file mode 100644 index d89262aeb62..00000000000 --- a/python/oneflow/test/graph/test_graph_clip_grad_norm.py +++ /dev/null @@ -1,444 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import os -import unittest -import numpy as np - -import oneflow as flow -import oneflow.unittest - - -class MyModule1(flow.nn.Module): - def __init__(self, param): - super().__init__() - self.param = flow.nn.Parameter(param) - - def forward(self, input): - x = flow._C.matmul(input, self.param, transpose_b=True) - return flow._C.gelu(x) - - -class MyModule2(flow.nn.Module): - def __init__(self, param): - super().__init__() - self.param = flow.nn.Parameter(param) - - def forward(self, input, target): - x = flow._C.matmul(input, self.param) - loss = flow._C.sparse_softmax_cross_entropy(x, target) - return loss.mean() - # return loss - - -def _make_optimizer(params, norm_type, max_norm): - return flow.optim.SGD( - [ - { - "params": params, - "lr": 1.0, - "momentum": 0.0, - "clip_grad_max_norm": max_norm, - "clip_grad_norm_type": norm_type, - }, - ] - ) - - -class MyGraph(flow.nn.Graph): - def __init__(self, module1, module2, optimizer=None, acc=1): - super().__init__() - - self.m1 = module1 - self.m2 = module2 - - if ( - module1.param.is_global - and module2.param.is_global - and module1.param.placement != module2.param.placement - ): - self.m1.config.stage_id = 0 - self.m2.config.stage_id = 1 - - if optimizer is not None: - self.add_optimizer(optimizer) - - if acc > 1: - self.config.set_gradient_accumulation_steps(acc) - - def build(self, input, target): - x = self.m1(input) - if x.is_global and target.is_global and x.placement != target.placement: - x = x.to_global(placement=target.placement) - loss = self.m2(x, target) - loss.backward() - return loss - - -class TensorGenerator(object): - def __init__( - self, batch_size=8, feat1=10, feat2=8, device="cuda", parallel_mode=None - ): - input = flow.randn(batch_size, feat1).to(device) - param1 = flow.randn(feat2, feat1).to(device) - param2 = flow.randn(feat2, feat1).to(device) - target = flow.randint(0, 10, (batch_size,)).to(device) - - ranks = np.array(range(flow.env.get_world_size())) - placement = flow.placement(device, ranks) - self.input = input.to_global(placement, sbp=flow.sbp.broadcast) - self.param1 = param1.to_global(placement, sbp=flow.sbp.broadcast) - self.param2 = param2.to_global(placement, sbp=flow.sbp.broadcast) - self.target = target.to_global(placement, sbp=flow.sbp.broadcast) - - self.input_sbp = None - self.target_sbp = None - self.param1_sbp = None - self.param2_sbp = None - self.placement1 = None - self.placement2 = None - - if parallel_mode is not None: - assert isinstance(parallel_mode, str) or isinstance( - parallel_mode, (list, tuple) - ) - - if isinstance(parallel_mode, str): - parallel_mode = [parallel_mode] - - assert all(p.upper() in ("DP", "MP", "PP") for p in parallel_mode) - assert len(parallel_mode) > 0 and len(parallel_mode) <= 2 - - self.input_sbp = [] - self.target_sbp = [] - self.param1_sbp = [] - self.param2_sbp = [] - - has_pp = False - - for p in parallel_mode: - if p == "DP": - self.input_sbp.append(flow.sbp.split(0)) - self.target_sbp.append(flow.sbp.split(0)) - self.param1_sbp.append(flow.sbp.broadcast()) - self.param2_sbp.append(flow.sbp.broadcast()) - elif p == "MP": - self.input_sbp.append(flow.sbp.broadcast()) - self.target_sbp.append(flow.sbp.broadcast()) - self.param1_sbp.append(flow.sbp.split(0)) - self.param2_sbp.append(flow.sbp.split(0)) - elif p == "PP": - ranks = ranks.reshape(2, -1) - self.placement1 = flow.placement(device, ranks[0]) - self.placement2 = flow.placement(device, ranks[1]) - has_pp = True - else: - raise ValueError - - if len(parallel_mode) > 1 and not has_pp: - ranks = 
ranks.reshape(2, -1) - self.placement1 = flow.placement(device, ranks) - self.placement2 = flow.placement(device, ranks) - - if len(self.input_sbp) == 0: - self.input_sbp = None - - if len(self.target_sbp) == 0: - self.target_sbp = None - - if len(self.param1_sbp) == 0: - self.param1_sbp = None - - if len(self.param2_sbp) == 0: - self.param2_sbp = None - - def local_input(self): - return self.input.to_local() - - def local_target(self): - return self.target.to_local() - - def local_param1(self): - return self.param1.clone().to_local() - - def local_param2(self): - return self.param2.clone().to_local() - - def global_input(self): - if self.input_sbp is None and self.placement1 is None: - return self.input - - return self.input.to_global(placement=self.placement1, sbp=self.input_sbp) - - def global_target(self): - if self.target_sbp is None and self.placement2 is None: - return self.target - - return self.target.to_global(placement=self.placement2, sbp=self.target_sbp) - - def global_param1(self): - if self.param1_sbp is None and self.placement1 is None: - return self.param1.clone() - - return self.param1.to_global(placement=self.placement1, sbp=self.param1_sbp) - - def global_param2(self): - if self.param2_sbp is None and self.placement2 is None: - return self.param2.clone() - - return self.param2.to_global(placement=self.placement2, sbp=self.param2_sbp) - - -def _compare_with_eager( - test_case, - *, - batch_size=8, - acc=1, - norm_type=2.0, - max_norm=1.0, - device="cuda", - parallel_mode=None, - rtol=1e-05, - atol=1e-08, -): - gen = TensorGenerator( - batch_size=batch_size, device=device, parallel_mode=parallel_mode - ) - - # eager - m1 = MyModule1(gen.local_param1()) - m2 = MyModule2(gen.local_param2()) - opt = _make_optimizer([m1.param, m2.param], norm_type, max_norm) - x = m1(gen.local_input()) - loss = m2(x, gen.local_target()) - opt.zero_grad() - loss.backward() - opt.clip_grad() - opt.step() - - loss_a = loss.numpy() - grad1_a = m1.param.numpy() - grad2_a = m2.param.numpy() - - # graph - graph_m1 = MyModule1(gen.global_param1()) - graph_m2 = MyModule2(gen.global_param2()) - opt = _make_optimizer([graph_m1.param, graph_m2.param], norm_type, max_norm) - graph = MyGraph(graph_m1, graph_m2, opt, acc) - graph_loss = graph(gen.global_input(), gen.global_target()) - - # debug - # rank = flow.env.get_rank() - # print("") - # print(f"[rank{rank}] eager local loss: {loss}") - - # print( - # f"[rank{rank}] graph_loss placement: {graph_loss.placement}, sbp: {graph_loss.sbp}" - # ) - # print(f"[rank{rank}] graph_loss: {graph_loss}") - - # local_loss = graph_loss.to_local() - # print(f"[rank{rank}] local_loss.numel(): {local_loss.numel()}") - # print(f"[rank{rank}] local_loss: {local_loss}") - - if acc > 1 and graph_loss.numel() == acc: - graph_loss = graph_loss.mean() - - if parallel_mode is None: - loss_b = graph_loss.numpy() - grad1_b = graph.m1.origin.param.numpy() - grad2_b = graph.m2.origin.param.numpy() - else: - ranks = np.array(range(flow.env.get_world_size())) - placement = flow.placement(device, ranks) - loss_b = graph_loss.to_global(placement, flow.sbp.broadcast).to_local().numpy() - grad1_b = graph.m1.origin.param.to_global(placement, flow.sbp.broadcast) - grad1_b = grad1_b.to_local().numpy() - grad2_b = graph.m2.origin.param.to_global(placement, flow.sbp.broadcast) - grad2_b = grad2_b.to_local().numpy() - - # compare - test_case.assertTrue( - np.allclose(loss_a, loss_b, rtol=rtol, atol=atol), f"{loss_a} vs. 
{loss_b}" - ) - test_case.assertTrue( - np.allclose(grad1_a, grad1_b, rtol=rtol, atol=atol), - f"\n{grad1_a}\nvs.\n{grad1_b}", - ) - test_case.assertTrue( - np.allclose(grad2_a, grad2_b, rtol=rtol, atol=atol), - f"\n{grad2_a}\nvs.\n{grad2_b}", - ) - - -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -class TestGraphClipGradNorm(flow.unittest.TestCase): - @flow.unittest.skip_unless_1n1d() - def test_local(test_case): - _compare_with_eager(test_case) - - @flow.unittest.skip_unless_1n1d() - def test_acc(test_case): - _compare_with_eager(test_case, batch_size=8, acc=8) - - @flow.unittest.skip_unless_1n2d() - def test_dp(test_case): - _compare_with_eager(test_case, parallel_mode="DP") - - @flow.unittest.skip_unless_1n2d() - def test_mp(test_case): - _compare_with_eager(test_case, parallel_mode="MP") - - @flow.unittest.skip_unless_1n2d() - def test_pp(test_case): - _compare_with_eager(test_case, parallel_mode="PP") - - @flow.unittest.skip_unless_1n2d() - def test_pp_acc(test_case): - _compare_with_eager(test_case, batch_size=8, acc=8, parallel_mode="PP") - - @flow.unittest.skip_unless_1n4d() - def test_dp_mp(test_case): - _compare_with_eager(test_case, parallel_mode=["DP", "MP"]) - - @flow.unittest.skip_unless_1n4d() - def test_mp_pp(test_case): - _compare_with_eager(test_case, parallel_mode=["MP", "PP"]) - - @flow.unittest.skip_unless_1n4d() - def test_dp_pp(test_case): - _compare_with_eager(test_case, parallel_mode=["DP", "PP"]) - - @flow.unittest.skip_unless_1n4d() - def test_mp_pp_acc(test_case): - _compare_with_eager(test_case, batch_size=8, acc=8, parallel_mode=["MP", "PP"]) - - @flow.unittest.skip_unless_1n4d() - def test_dp_pp_acc(test_case): - _compare_with_eager(test_case, batch_size=8, acc=4, parallel_mode=["DP", "PP"]) - - -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -class TestGraphClipGradNormInf(flow.unittest.TestCase): - @flow.unittest.skip_unless_1n1d() - def test_local(test_case): - _compare_with_eager(test_case, norm_type=float("inf")) - - @flow.unittest.skip_unless_1n1d() - def test_acc(test_case): - _compare_with_eager( - test_case, batch_size=8, acc=8, norm_type=-float("inf"), atol=1e-6 - ) - - @flow.unittest.skip_unless_1n2d() - def test_dp(test_case): - _compare_with_eager( - test_case, - norm_type=float("inf"), - max_norm=2.0, - parallel_mode="DP", - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n2d() - def test_mp(test_case): - _compare_with_eager( - test_case, - norm_type=-float("inf"), - max_norm=3.0, - parallel_mode="MP", - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n2d() - def test_pp(test_case): - _compare_with_eager( - test_case, - norm_type=float("inf"), - max_norm=4.0, - parallel_mode="PP", - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n2d() - def test_pp_acc(test_case): - _compare_with_eager( - test_case, - batch_size=8, - acc=8, - norm_type=-float("inf"), - max_norm=5.0, - parallel_mode="PP", - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n4d() - def test_dp_mp(test_case): - _compare_with_eager( - test_case, - norm_type=float("inf"), - max_norm=1.1, - parallel_mode=["DP", "MP"], - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n4d() - def test_mp_pp(test_case): - _compare_with_eager( - test_case, - norm_type=-float("inf"), - max_norm=1.2, - parallel_mode=["MP", "PP"], - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n4d() - def test_dp_pp(test_case): - _compare_with_eager( - test_case, - norm_type=float("inf"), - max_norm=1.3, - parallel_mode=["DP", "PP"], - atol=1e-6, - ) - - 
@flow.unittest.skip_unless_1n4d() - def test_mp_pp_acc(test_case): - _compare_with_eager( - test_case, - batch_size=8, - acc=8, - norm_type=float("inf"), - max_norm=2.1, - parallel_mode=["MP", "PP"], - atol=1e-6, - ) - - @flow.unittest.skip_unless_1n4d() - def test_dp_pp_acc(test_case): - _compare_with_eager( - test_case, - batch_size=8, - acc=4, - norm_type=-float("inf"), - max_norm=2.2, - parallel_mode=["DP", "PP"], - atol=1e-6, - ) - - -if __name__ == "__main__": - # flow.manual_seed(0) - unittest.main() diff --git a/python/oneflow/test/graph/test_graph_debug.py b/python/oneflow/test/graph/test_graph_debug.py index 4f6dc121b64..7341a508559 100644 --- a/python/oneflow/test/graph/test_graph_debug.py +++ b/python/oneflow/test/graph/test_graph_debug.py @@ -25,7 +25,7 @@ rank = flow.env.get_rank() -def _graph_debug(test_case, v_level=0, ranks=None, max_py_stack_depth=2): +def _graph_debug(test_case, v_level = 0, ranks=None, max_py_stack_depth=2): class DebugGraph(flow.nn.Graph): def __init__(self): super().__init__() @@ -44,9 +44,7 @@ def build(self, x): elif isinstance(ranks, list): rank_list = ranks - if ( - -1 in rank_list or rank in rank_list - ) and v_level >= 0: # v_level == -1 means debug mode is closed + if (-1 in rank_list or rank in rank_list) and v_level >= 0: # v_level == -1 means debug mode is closed test_case.assertTrue(d_g._debug) test_case.assertTrue(d_g.m._debug) print(f"ranks {ranks} rank {rank} debug is opened.") @@ -81,10 +79,9 @@ def test_graph_debug_mode_opened(test_case): def test_graph_debug_max_py_stack_depth_2(test_case): _graph_debug(test_case, max_py_stack_depth=2) - + def test_graph_debug_max_py_stack_depth_8(test_case): _graph_debug(test_case, max_py_stack_depth=8) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/graph/test_graph_sparse_softmax_cross_entropy.py b/python/oneflow/test/graph/test_graph_sparse_softmax_cross_entropy.py deleted file mode 100644 index 4c7fbd51fbc..00000000000 --- a/python/oneflow/test/graph/test_graph_sparse_softmax_cross_entropy.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import os -import unittest -import numpy as np -import oneflow as flow -import oneflow.unittest - - -class CrossEntropyModule(flow.nn.Module): - def __init__(self, pred): - super().__init__() - if pred.is_global: - self.param = flow.nn.Parameter( - flow.zeros( - *pred.shape, - dtype=pred.dtype, - placement=pred.placement, - sbp=pred.sbp, - ) - ) - else: - self.param = flow.nn.Parameter( - flow.zeros(*pred.shape, dtype=pred.dtype, device=pred.device) - ) - - def forward(self, pred, label): - pred = pred + self.param - loss = flow._C.sparse_softmax_cross_entropy(pred, label) - return loss.mean() - - -class CrossEntropyGraph(flow.nn.Graph): - def __init__(self, module): - super().__init__() - self.m = module - self.add_optimizer(flow.optim.SGD([module.param], lr=1.0, momentum=0.0)) - - def build(self, pred, label): - loss = self.m(pred, label) - loss.backward() - return loss - - -def _compare_with_nn_cross_entropy_loss( - test_case, pred, label, pred_sbp=None, label_sbp=None -): - if pred.is_global: - assert label.is_global - pred_ = pred.to_local().detach().clone() - label_ = label.to_local() - else: - pred_ = pred.detach().clone() - label_ = label - - pred_.requires_grad = True - cross_entropy_loss = flow.nn.CrossEntropyLoss() - loss = cross_entropy_loss(pred_, label_) - loss.backward() - - if pred_sbp is not None: - pred = pred.to_global(sbp=pred_sbp) - - if label_sbp is not None: - label = label.to_global(sbp=label_sbp) - - cross_entropy_module = CrossEntropyModule(pred) - cross_entropy_graph = CrossEntropyGraph(cross_entropy_module) - graph_loss = cross_entropy_graph(pred, label) - - loss_a = loss.numpy() - grad_a = pred_.grad.numpy() - if graph_loss.is_local: - loss_b = graph_loss.numpy() - grad_b = -cross_entropy_module.param.numpy() - else: - graph_loss = graph_loss.to_global( - sbp=[flow.sbp.broadcast()] * len(graph_loss.sbp) - ) - loss_b = graph_loss.to_local().numpy() - pred_grad = cross_entropy_module.param.to_global( - sbp=[flow.sbp.broadcast()] * len(cross_entropy_module.param.sbp) - ) - grad_b = -pred_grad.to_local().numpy() - - test_case.assertTrue(np.allclose(loss_a, loss_b), f"{loss_a} vs. 
{loss_b}") - test_case.assertTrue(np.allclose(grad_a, grad_b), f"\n{grad_a}\nvs.\n{grad_b}") - - -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -class TestSparseSoftmaxCrossEntropyGraph(oneflow.unittest.TestCase): - @flow.unittest.skip_unless_1n1d() - def test_local(test_case): - pred = flow.randn(8, 10).to("cuda") - label = flow.randint(0, 10, (8,)).to("cuda") - _compare_with_nn_cross_entropy_loss(test_case, pred, label) - - @flow.unittest.skip_unless_1n2d() - def test_data_split(test_case): - pred = flow.randn(8, 10) - label = flow.randint(0, 10, (8,)) - placement = flow.placement("cuda", list(range(flow.env.get_world_size()))) - pred = pred.to_global(placement=placement, sbp=flow.sbp.broadcast()) - label = label.to_global(placement=placement, sbp=flow.sbp.broadcast()) - _compare_with_nn_cross_entropy_loss( - test_case, pred, label, flow.sbp.split(0), flow.sbp.split(0) - ) - - @flow.unittest.skip_unless_1n2d() - def test_model_split(test_case): - pred = flow.randn(8, 10) - label = flow.randint(0, 10, (8,)) - placement = flow.placement("cuda", list(range(flow.env.get_world_size()))) - pred = pred.to_global(placement=placement, sbp=flow.sbp.broadcast()) - label = label.to_global(placement=placement, sbp=flow.sbp.broadcast()) - _compare_with_nn_cross_entropy_loss( - test_case, pred, label, flow.sbp.split(1), flow.sbp.broadcast() - ) - - @flow.unittest.skip_unless_1n4d() - def test_2d_split(test_case): - pred = flow.randn(8, 10) - label = flow.randint(0, 10, (8,)) - placement = flow.placement( - "cuda", np.array(range(flow.env.get_world_size())).reshape(2, 2) - ) - pred = pred.to_global( - placement=placement, sbp=[flow.sbp.broadcast(), flow.sbp.broadcast()] - ) - label = label.to_global( - placement=placement, sbp=[flow.sbp.broadcast(), flow.sbp.broadcast()] - ) - _compare_with_nn_cross_entropy_loss( - test_case, - pred, - label, - [flow.sbp.split(0), flow.sbp.split(1)], - [flow.sbp.split(0), flow.sbp.broadcast()], - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/graph/test_tvm_frontend_dependency_on_graph.py b/python/oneflow/test/graph/test_tvm_frontend_dependency_on_graph.py index 3e514cd7acb..25d5f358427 100644 --- a/python/oneflow/test/graph/test_tvm_frontend_dependency_on_graph.py +++ b/python/oneflow/test/graph/test_tvm_frontend_dependency_on_graph.py @@ -24,7 +24,7 @@ from alexnet_model import alexnet -class TvmFrontedGraph(flow.nn.Graph): +class Graph(flow.nn.Graph): def __init__(self, module): super().__init__() self.m = module @@ -97,7 +97,7 @@ def forward(self, x): def test_infos_of_nodes(test_case): alexnet_module = alexnet() - alexnet_graph = TvmFrontedGraph(alexnet_module) + alexnet_graph = Graph(alexnet_module) if not alexnet_graph._is_compiled: alexnet_graph._compile(flow.rand(1, 3, 224, 224)) graph_str = repr(alexnet_graph) @@ -181,30 +181,6 @@ def test_infos_of_nodes(test_case): test_case.assertEqual(strides, (4, 4)) test_case.assertEqual(padding_before, (2, 2)) - node_input_list = [] - node_output_list = [] - for node_name in nodes: - node = nodes[node_name] - if is_user_op(node) and node.user_conf.op_type_name == "conv2d": - for input_name in node.user_conf.input: - node_input_paths = getattr(node.user_conf.input[input_name], "s") - for i in node_input_paths: - node_input = i.split("/")[0] - print(node_input) - node_input_list.append(node_input) - for output_name in node.user_conf.output: - node_output_paths = getattr(node.user_conf.output[output_name], "s") - for node_output_path in node_output_paths: - 
node_output_name = node_output_path.split("/")[0] - print(node_output_name) - node_output_list.append(node_output_name) - - test_case.assertEqual("_TvmFrontedGraph_1_input.0.0_2" in node_input_list, True) - test_case.assertEqual("m.features.0.weight" in node_input_list, True) - test_case.assertEqual("m.features.5-maxpool_2d-7" in node_input_list, True) - test_case.assertEqual("m.features.0-conv2d-0" in node_output_list, True) - test_case.assertEqual("m.features.6-conv2d-8" in node_output_list, True) - def test_buffer_convert_dependence(test_case): class SubModule(flow.nn.Module): def __init__(self): @@ -218,7 +194,7 @@ def forward(self, x): return x sub_module = SubModule() - sub_graph = TvmFrontedGraph(sub_module) + sub_graph = Graph(sub_module) graph_str = repr(sub_graph) size_where = 2 diff --git a/python/oneflow/test/modules/test_activation.py b/python/oneflow/test/modules/test_activation.py index c2f6034a6f5..f49755b51c3 100644 --- a/python/oneflow/test/modules/test_activation.py +++ b/python/oneflow/test/modules/test_activation.py @@ -587,18 +587,6 @@ def test_leakyrelu_module_with_random_data(test_case): y = m(x) return y - @autotest(n=5) - def test_leakyrelu_module_with_inplace_arg(test_case): - m = torch.nn.LeakyReLU( - negative_slope=random() | nothing(), inplace=random().to(bool) | nothing() - ) - m.train(random()) - device = random_device() - m.to(device) - x = random_tensor().to(device) - y = m(x) - return y - @autotest() @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") def test_leakyrelu_module_with_half_random_data(test_case): @@ -755,39 +743,6 @@ def test_flow_nn_functional_logsigmoid_with_0dim_data(test_case): return y -@flow.unittest.skip_unless_1n1d() -class TestHardshrinkModule(flow.unittest.TestCase): - @autotest(n=5) - def test_hardshrink_module_with_random_data(test_case): - m = torch.nn.Hardshrink(lambd=random() | nothing()) - m.train(random()) - device = random_device() - m.to(device) - x = random_tensor().to(device) - y = m(x) - return y - - @autotest(n=5) - def test_hardshrink_module_with_0dim_data(test_case): - m = torch.nn.Hardshrink(lambd=random() | nothing()) - m.train(random()) - device = random_device() - m.to(device) - x = random_tensor(ndim=0).to(device) - y = m(x) - return y - - @autotest(auto_backward=False, check_graph=True) - def test_hardshrink_module_with_0_size_data(test_case): - m = torch.nn.Hardshrink(lambd=random() | nothing()) - m.train(random()) - device = random_device() - m.to(device) - x = random_tensor(4, 2, 3, 0, 3).to(device) - y = m(x) - return y - - @flow.unittest.skip_unless_1n1d() class TestSoftshrinkModule(flow.unittest.TestCase): @autotest(n=5) diff --git a/python/oneflow/test/modules/test_addcmul.py b/python/oneflow/test/modules/test_addcmul.py deleted file mode 100644 index c66a99048f2..00000000000 --- a/python/oneflow/test/modules/test_addcmul.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import unittest -from oneflow.test_utils.automated_test_util import * -import oneflow as flow -import oneflow.unittest - - -class TestAddcmul(flow.unittest.TestCase): - @autotest(check_graph=False) - def test_addcmul(test_case): - device = random_device() - ndim = random(low=2).to(int).value() - shape = [random(low=2, high=4) for i in range(ndim)] - - input = random_tensor(len(shape), *shape).to(device) - tensor1 = random_tensor(len(shape), *shape).to(device) - tensor2 = random_tensor(len(shape), *shape).to(device) - value = random(3, 6).to(int) - output = torch.addcmul(input, tensor1, tensor2, value=value) - return output - - @autotest(check_graph=False) - def test_tensor_addcmul(test_case): - device = random_device() - ndim = random(low=2).to(int).value() - shape = [random(low=2, high=4) for i in range(ndim)] - - input = random_tensor(len(shape), *shape).to(device) - tensor1 = random_tensor(len(shape), *shape).to(device) - tensor2 = random_tensor(len(shape), *shape).to(device) - value = random(3, 6).to(int) - output = input.addcmul(tensor1, tensor2, value=value) - return output - - @autotest(check_graph=False) - def test_tensor_addcmul_inplace(test_case): - device = random_device() - ndim = random(low=2).to(int).value() - shape = [random(low=2, high=4) for i in range(ndim)] - - input = random_tensor(len(shape), *shape).to(device) - input = input + 1.0 - tensor1 = random_tensor(len(shape), *shape).to(device) - tensor2 = random_tensor(len(shape), *shape).to(device) - value = random(3, 6).to(int) - input.addcmul_(tensor1, tensor2, value=value) - return input - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_amax.py b/python/oneflow/test/modules/test_amax.py deleted file mode 100644 index 67c1061c9a5..00000000000 --- a/python/oneflow/test/modules/test_amax.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import unittest -from collections import OrderedDict -from oneflow.test_utils.automated_test_util import * -from oneflow.test_utils.test_util import GenArgList - -import oneflow as flow -import numpy as np - - -def __check(test_case, input, dim, keepdim, device): - of_out = flow.amax(input, dim=dim, keepdim=keepdim) - if type(dim) is tuple: - if len(dim) == 0: - dim = None - np_out = np.amax(input.numpy(), axis=dim, keepdims=keepdim) - test_case.assertTrue(np.allclose(of_out.numpy(), np_out, rtol=0.0001, atol=1e-05,)) - - -def _test_amax_with_negative_dim(test_case, device): - input = flow.tensor( - np.random.randn(3, 5, 6, 8), dtype=flow.float32, device=flow.device(device) - ) - dim = random(-4, 0).to(int).value() - keepdim = random_bool().value() - __check(test_case, input, dim, keepdim, device) - - -def _test_amax_with_positive_dim(test_case, device): - input = flow.tensor( - np.random.randn(3, 5, 6, 8), dtype=flow.float32, device=flow.device(device) - ) - dim = random(0, 4).to(int).value() - keepdim = random_bool().value() - __check(test_case, input, dim, keepdim, device) - - -def _test_amax_with_multiple_axes(test_case, device): - input = flow.tensor( - np.random.randn(3, 5, 6, 8), dtype=flow.float32, device=flow.device(device) - ) - axes = set() - num_axes = random(1, 4).to(int).value() - for _ in range(num_axes): - axes.add(random(0, 4).to(int).value()) - keepdim = random_bool().value() - __check(test_case, input, tuple(axes), keepdim, device) - - -def _test_amax_with_empty_dim(test_case, device): - input = flow.tensor( - np.random.randn(3, 5, 6, 8), dtype=flow.float32, device=flow.device(device) - ) - keepdim = random_bool().value() - __check(test_case, input, None, keepdim, device) - - -def _test_amax_keepdim(test_case, device): - input = flow.tensor( - np.random.randn(3, 5, 6, 8), dtype=flow.float32, device=flow.device(device) - ) - dim = random(-4, 4).to(int).value() - keepdim = True - __check(test_case, input, dim, keepdim, device) - - -def _test_amax_not_keepdim(test_case, device): - input = flow.tensor( - np.random.randn(3, 5, 6, 8), dtype=flow.float32, device=flow.device(device) - ) - dim = random(-4, 4).to(int).value() - keepdim = False - __check(test_case, input, dim, keepdim, device) - - -@flow.unittest.skip_unless_1n1d() -class TestAmax(flow.unittest.TestCase): - def test_amax(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [ - _test_amax_with_negative_dim, - _test_amax_with_positive_dim, - _test_amax_with_multiple_axes, - _test_amax_with_empty_dim, - _test_amax_keepdim, - _test_amax_not_keepdim, - ] - arg_dict["device"] = ["cpu", "cuda"] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - - @autotest() - def test_amax_with_random_data_single_dim(test_case): - device = random_device() - ndim = random(1, 6).to(int) - x = random_tensor(ndim=ndim).to(device) - y = torch.amax(x, dim=random(0, ndim), keepdim=random().to(bool)) - return y - - @autotest() - def test_amax_with_random_data_empty_dim(test_case): - device = random_device() - ndim = random(1, 6).to(int) - x = random_tensor(ndim=ndim).to(device) - y = torch.amax(x, dim=None, keepdim=random().to(bool)) - return y - - @autotest() - def test_amax_with_random_data_multi_dims(test_case): - device = random_device() - ndim = random(2, 6).to(int) - x = random_tensor(ndim=ndim).to(device) - dim = set() - for _ in range(random(1, ndim).to(int).value()): - dim.add(random(0, ndim).to(int).value()) - y = torch.amax(x, dim=tuple(dim), keepdim=random().to(bool)) - return y - - -if __name__ == 
"__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_autograd.py b/python/oneflow/test/modules/test_autograd.py index da8aa0a9658..d498b3d001d 100644 --- a/python/oneflow/test/modules/test_autograd.py +++ b/python/oneflow/test/modules/test_autograd.py @@ -127,21 +127,6 @@ def test_grad_grad(test_case): )[0] return x_grad_grad - @autotest(n=10, auto_backward=False, rtol=1e-3, atol=1e-3, check_graph=False) - def test_autograd_multiple_times(test_case): - device = random_device() - ndim = random(1, 4).to(int).value() - dims = [random(0, 10).to(int) for _ in range(ndim)] - x = random_tensor(ndim, *dims, requires_grad=True) - x1 = x.to(device) - y = random_tensor(ndim, *dims, requires_grad=True) - y1 = y.to(device) - z = x1 + y1 - - for _ in range(10): - z.sum().backward() - return (x.grad, y.grad) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_autograd_mode.py b/python/oneflow/test/modules/test_autograd_mode.py index ee8abed9285..76ba59f8c14 100644 --- a/python/oneflow/test/modules/test_autograd_mode.py +++ b/python/oneflow/test/modules/test_autograd_mode.py @@ -15,8 +15,8 @@ """ import unittest - import oneflow as flow + import oneflow.unittest @@ -47,12 +47,12 @@ def func(): func() test_case.assertTrue(flow.is_grad_enabled()) - def test_enable_grad(test_case): - with flow.enable_grad(): + def test_grad_enable(test_case): + with flow.grad_enable(): test_case.assertTrue(flow.is_grad_enabled()) test_case.assertTrue(flow.is_grad_enabled()) - @flow.enable_grad() + @flow.grad_enable() def func(): test_case.assertTrue(flow.is_grad_enabled()) @@ -71,29 +71,6 @@ def func(): func() test_case.assertTrue(flow.is_grad_enabled()) - def test_set_grad_enabled(test_case): - with flow.set_grad_enabled(True): - test_case.assertTrue(flow.is_grad_enabled()) - test_case.assertTrue(flow.is_grad_enabled()) - - @flow.set_grad_enabled(True) - def func(): - test_case.assertTrue(flow.is_grad_enabled()) - - func() - test_case.assertTrue(flow.is_grad_enabled()) - - with flow.set_grad_enabled(False): - test_case.assertFalse(flow.is_grad_enabled()) - test_case.assertTrue(flow.is_grad_enabled()) - - @flow.set_grad_enabled(False) - def func(): - test_case.assertFalse(flow.is_grad_enabled()) - - func() - test_case.assertTrue(flow.is_grad_enabled()) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_activation.py b/python/oneflow/test/modules/test_consistent_activation.py index 77f75681536..bff0f8ce648 100644 --- a/python/oneflow/test/modules/test_consistent_activation.py +++ b/python/oneflow/test/modules/test_consistent_activation.py @@ -38,8 +38,6 @@ def build_module(act_type): return torch.nn.Sigmoid() elif act_type == "hardsigmoid": return torch.nn.Hardsigmoid() - elif act_type == "hardshrink": - return torch.nn.Hardshrink(lambd=random()) elif act_type == "logsigmoid": return torch.nn.LogSigmoid() elif act_type == "hardswish": @@ -160,11 +158,6 @@ def test_hardsigmoid_module(test_case): _test_activation_module(test_case, "hardsigmoid") -class TestHardshrinkModule(flow.unittest.TestCase): - def test_hardshrink_module(test_case): - _test_activation_module(test_case, "hardshrink") - - class TestLogSigmoidModule(flow.unittest.TestCase): def test_logsigmoid_module(test_case): _test_activation_module(test_case, "logsigmoid") diff --git a/python/oneflow/test/modules/test_consistent_addcmul.py b/python/oneflow/test/modules/test_consistent_addcmul.py deleted file mode 100644 index bf876ffef44..00000000000 --- 
a/python/oneflow/test/modules/test_consistent_addcmul.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from oneflow.test_utils.automated_test_util import * -import oneflow as flow -import oneflow.unittest - - -@autotest(n=1, check_graph=False) -def _test_addcmul(test_case, ndim, placement, sbp): - shape = [random(low=2, high=4) * 8 for i in range(ndim)] - - input = random_tensor(ndim, *shape).to_global(placement=placement, sbp=sbp) - tensor1 = random_tensor(len(shape), *shape).to_global(placement=placement, sbp=sbp) - tensor2 = random_tensor(len(shape), *shape).to_global(placement=placement, sbp=sbp) - value = random(3, 6).to(int) - output = torch.addcmul(input, tensor1, tensor2, value=value) - return output - - -class TestModule(flow.unittest.TestCase): - @globaltest - def test_addcmul(test_case): - ndim = random(low=2).to(int).value() - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=ndim): - _test_addcmul(test_case, ndim, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_argsort.py b/python/oneflow/test/modules/test_consistent_argsort.py index 5c2d8cb6188..ebfa00316f5 100644 --- a/python/oneflow/test/modules/test_consistent_argsort.py +++ b/python/oneflow/test/modules/test_consistent_argsort.py @@ -31,7 +31,6 @@ def _test_argsort_with_random_data(test_case, ndim, placement, sbp): return y -@unittest.skip("argsort has bug not found at now.") class TestArgsort(flow.unittest.TestCase): @globaltest def test_argsort(test_case): diff --git a/python/oneflow/test/modules/test_consistent_math_ops.py b/python/oneflow/test/modules/test_consistent_math_ops.py index d037f736839..f560c6af923 100644 --- a/python/oneflow/test/modules/test_consistent_math_ops.py +++ b/python/oneflow/test/modules/test_consistent_math_ops.py @@ -113,7 +113,7 @@ def _test_floordiv_with_scalar(test_case, placement, sbp, ndim): @autotest(n=1, check_graph=False) def _test_arccos(test_case, placement, sbp, ndim): dim_list = [random(1, 3).to(int).value() * 8 for _ in range(ndim)] - x = random_tensor(ndim, *dim_list, low=-1, high=1).to_global(placement, sbp) + x = random_tensor(ndim, *dim_list, low=2, high=3).to_global(placement, sbp) y = torch.arccos(x) return y @@ -121,7 +121,7 @@ def _test_arccos(test_case, placement, sbp, ndim): @autotest(n=1, check_graph=False) def _test_acos(test_case, placement, sbp, ndim): dim_list = [random(1, 3).to(int).value() * 8 for _ in range(ndim)] - x = random_tensor(ndim, *dim_list, low=-1, high=1).to_global(placement, sbp) + x = random_tensor(ndim, *dim_list, low=2, high=3).to_global(placement, sbp) y = torch.acos(x) return y @@ -163,7 +163,7 @@ def _test_atan2(test_case, placement, sbp, ndim): class TestMathOps(flow.unittest.TestCase): @globaltest def test_math_ops(test_case): - ndim = random(1, 3).to(int).value() + ndim = random().to(int).value() for placement in all_placement(): for sbp in 
all_sbp(placement, max_dim=ndim): _test_sinh(test_case, placement, sbp, ndim) diff --git a/python/oneflow/test/modules/test_consistent_randn.py b/python/oneflow/test/modules/test_consistent_randn.py deleted file mode 100644 index 481e29cda71..00000000000 --- a/python/oneflow/test/modules/test_consistent_randn.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict - -import oneflow as flow -import numpy as np -import oneflow.unittest -from oneflow.test_utils.automated_test_util import * - -from oneflow.test_utils.test_util import GenArgDict - - -def _test_consistent_randn(test_case, shape, placement, sbp): - x1 = flow.randn(*shape, placement=placement, sbp=sbp) - x2 = flow.randn(*shape, placement=placement, sbp=sbp) - test_case.assertTrue(not np.allclose(x1.numpy(), x2.numpy(), atol=1e-4, rtol=1e-4)) - test_case.assertEqual(x1.shape, flow.Size(shape)) - test_case.assertEqual(x1.sbp, sbp) - test_case.assertEqual(x1.placement, placement) - - -def _test_different_dtype(test_case, shape, placement, sbp): - x1 = flow.randn(*shape, dtype=flow.float32, placement=placement, sbp=sbp) - x2 = flow.randn(*shape, dtype=flow.float64, placement=placement, sbp=sbp) - test_case.assertTrue(not np.allclose(x1.numpy(), x2.numpy(), atol=1e-4, rtol=1e-4)) - test_case.assertEqual(x1.shape, flow.Size(shape)) - - -def _test_backward(test_case, shape, placement, sbp): - x = flow.randn(*shape, placement=placement, sbp=sbp, requires_grad=True) - y = x.sum() - y.backward() - test_case.assertTrue( - np.allclose(np.ones(shape), x.grad.numpy(), atol=1e-4, rtol=1e-4) - ) - - -def _test_with_generator(test_case, shape, placement, sbp): - gen = flow.Generator() - gen.manual_seed(0) - y1 = flow.randn(*shape, placement=placement, sbp=sbp, generator=gen) - gen.manual_seed(0) - y2 = flow.randn(*shape, placement=placement, sbp=sbp, generator=gen) - test_case.assertTrue(np.allclose(y1.numpy(), y2.numpy(), atol=1e-4, rtol=1e-4)) - - -def _test_randn_tuple_shape(test_case, shape, placement, sbp): - y1 = flow.randn(*shape, placement=placement, sbp=sbp) - y2 = flow.randn(*shape, placement=placement, sbp=sbp) - - test_case.assertTrue(not np.array_equal(y1.numpy(), y2.numpy())) - test_case.assertTrue(shape == y1.shape) - - -def _test_graph_randn(test_case, shape, placement, sbp): - class ConsistentRandnGraph(flow.nn.Graph): - def __init__(self,): - super().__init__() - - def build(self): - x = flow.randn(*shape, placement=placement, sbp=sbp) - return x - - model = ConsistentRandnGraph() - x = model() - - test_case.assertEqual(x.shape, flow.Size(shape)) - test_case.assertEqual(x.sbp, sbp) - test_case.assertEqual(x.placement, placement) - - -class TestRandnConsistent(flow.unittest.TestCase): - @globaltest - def test_randn_consistent(test_case): - shapes = [(8,), (8, 8,), (8, 8, 8)] - for shape in shapes: - for placement in all_placement(): - for sbp in all_sbp( - placement, max_dim=len(shape), 
except_partial_sum=True - ): - _test_consistent_randn(test_case, shape, placement, sbp) - _test_different_dtype(test_case, shape, placement, sbp) - _test_backward(test_case, shape, placement, sbp) - _test_with_generator(test_case, shape, placement, sbp) - _test_randn_tuple_shape(test_case, shape, placement, sbp) - - @flow.unittest.skip_unless_1n2d() - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - @globaltest - def test_randn_graph(test_case): - arg_dict = OrderedDict() - arg_dict["shape"] = [(8,), (8, 8,), (8, 8, 8)] - arg_dict["placement"] = [ - # 1d - flow.placement("cpu", ranks=[0, 1]), - flow.placement("cuda", ranks=[0, 1]), - # 2d - flow.placement("cpu", ranks=[[0, 1],]), - flow.placement("cuda", ranks=[[0, 1],]), - ] - for args in GenArgDict(arg_dict): - shape = args["shape"] - placement = args["placement"] - for sbp in all_sbp(placement, max_dim=len(shape), except_partial_sum=True): - _test_graph_randn(test_case, shape, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_scatter_nd.py b/python/oneflow/test/modules/test_consistent_scatter_nd.py deleted file mode 100644 index 57ddae65dbd..00000000000 --- a/python/oneflow/test/modules/test_consistent_scatter_nd.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import unittest - -import numpy as np -import oneflow as flow -import oneflow.unittest -from oneflow.test_utils.automated_test_util import * - - -def _test_scatter_nd(test_case, placement, sbp): - indices = ( - flow.tensor(np.array([[1], [6], [4]]), dtype=flow.int) - .to_global(flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,]) - .to_global(placement, sbp) - ) - update = ( - flow.tensor(np.array([10.2, 5.1, 12.7]), dtype=flow.float) - .to_global(flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,]) - .to_global(placement, sbp) - .requires_grad_() - ) - output = flow.scatter_nd(indices, update, [8]) - - # forward - of_local = output.to_global( - flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,] - ).to_local() - np_out = np.array([0.0, 10.2, 0.0, 0.0, 12.7, 0.0, 5.1, 0.0]) - test_case.assertTrue(np.allclose(of_local.numpy(), np_out, 1e-4, 1e-4)) - - # backward - output.sum().backward() - of_grad_local = update.grad.to_global( - flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,] - ).to_local() - test_case.assertTrue(np.allclose(of_grad_local.numpy(), np.ones((3)), 1e-4, 1e-4)) - - -class TestScatterNd(flow.unittest.TestCase): - @globaltest - def test_scatter_nd(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_partial_sum=True, except_split=True): - _test_scatter_nd(test_case, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_scatter_ops.py b/python/oneflow/test/modules/test_consistent_scatter_ops.py deleted file mode 100644 index 61d8557bf4d..00000000000 --- a/python/oneflow/test/modules/test_consistent_scatter_ops.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import unittest - -import numpy as np -import oneflow as flow -import oneflow.unittest -from oneflow.test_utils.automated_test_util import * - - -@autotest(n=10, auto_backward=True, check_graph=False) -def _test_scatter_random_data(test_case, placement): - input = random_tensor(ndim=2, dim0=2, dim1=2).to_global( - placement=placement, sbp=random_sbp(placement, max_dim=2) - ) - src = random_tensor(ndim=2, dim0=2, dim1=2).to_global( - placement=placement, sbp=random_sbp(placement, max_dim=2) - ) - index = ( - torch.tensor(np.array([[0, 1], [1, 0]]), dtype=torch.int64) - .to_global(flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,]) - .to_global(placement, sbp=random_sbp(placement, max_dim=2),) - ) - dim = random(0, 2).to(int).value() - return torch.scatter(input, dim, index, src) - - -@autotest(n=10, auto_backward=True, check_graph=False) -def _test_scatter_scalar_random_data(test_case, placement): - input = random_tensor(ndim=2, dim0=2, dim1=2).to_global( - placement=placement, sbp=random_sbp(placement, max_dim=2) - ) - index = ( - torch.tensor(np.array([[0, 1], [1, 0]]), dtype=torch.int64) - .to_global(flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,]) - .to_global(placement, sbp=random_sbp(placement, max_dim=2),) - ) - dim = random(0, 2).to(int).value() - return torch.scatter(input, dim, index, 3.14) - - -@autotest(n=10, auto_backward=True, check_graph=False) -def _test_scatter_add_random_data(test_case, placement): - input = random_tensor(ndim=2, dim0=2, dim1=2).to_global( - placement=placement, sbp=random_sbp(placement, max_dim=2) - ) - src = random_tensor(ndim=2, dim0=2, dim1=2).to_global( - placement=placement, sbp=random_sbp(placement, max_dim=2) - ) - index = ( - torch.tensor(np.array([[0, 1], [1, 0]]), dtype=torch.int64) - .to_global(flow.env.all_device_placement("cpu"), [flow.sbp.broadcast,]) - .to_global(placement, sbp=random_sbp(placement, max_dim=2),) - ) - dim = random(0, 2).to(int).value() - return torch.scatter_add(input, dim, index, src) - - -@flow.unittest.skip_unless_1n2d() -class TestScatterOps(flow.unittest.TestCase): - @globaltest - def test_scatter_ops(test_case): - for placement in all_placement(): - _test_scatter_random_data(test_case, placement) - _test_scatter_scalar_random_data(test_case, placement) - _test_scatter_add_random_data(test_case, placement) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_unbind.py b/python/oneflow/test/modules/test_consistent_unbind.py deleted file mode 100644 index 75fa6f676c3..00000000000 --- a/python/oneflow/test/modules/test_consistent_unbind.py +++ /dev/null @@ -1,43 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import unittest -import oneflow as flow -import oneflow.unittest -from oneflow.test_utils.automated_test_util import * - - -# TODO: the test is dependent on global select op(consistent tensor->stride()) -@unittest.skip("global select op is not currently supported") -@autotest(n=1, check_graph=False) -def _test_unbind(test_case, placement, sbp): - dim_size = random(1, 3).to(int).value() * 8 - rand_dim = random(0, 3).to(int).value() - x = random_tensor(ndim=3, dim0=dim_size, dim1=dim_size, dim2=dim_size).to_global( - placement, sbp - ) - return torch.unbind(x, dim=rand_dim) - - -class TestUnbind(flow.unittest.TestCase): - @globaltest - def test_unbind(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=3): - _test_unbind(test_case, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_upsample.py b/python/oneflow/test/modules/test_consistent_upsample.py deleted file mode 100644 index e364719275b..00000000000 --- a/python/oneflow/test/modules/test_consistent_upsample.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict - -import numpy as np -from oneflow.test_utils.test_util import GenArgList -from oneflow.test_utils.automated_test_util import * - -import oneflow as flow -import oneflow.unittest - - -@autotest(n=1, auto_backward=True, check_graph=False) -def _test_global_upsample2d_nearest(test_case, placement, sbp): - x = random_tensor(ndim=3, dim0=8, dim1=16).to_global(placement, sbp) - print(x) - m = torch.nn.Upsample(scale_factor=random().to(int), mode="nearest",) - y = m(x) - return y - - -@autotest(n=1, auto_backward=True, check_graph=False) -def _test_global_upsample2d_linear(test_case, placement, sbp): - x = random_tensor(ndim=3, dim0=8, dim1=16).to_global(placement, sbp) - m = torch.nn.Upsample( - scale_factor=random().to(int), mode="linear", align_corners=random_bool(), - ) - y = m(x) - return y - - -@autotest(n=1, auto_backward=True, check_graph=False) -def _test_global_upsample2d_bilinear(test_case, placement, sbp): - x = random_tensor(ndim=4, dim0=8, dim1=16).to_global(placement, sbp) - m = torch.nn.Upsample( - scale_factor=random().to(int), mode="bilinear", align_corners=random_bool(), - ) - y = m(x) - return y - - -@autotest(n=1, auto_backward=True, check_graph=False) -def _test_global_upsample2d_bicubic(test_case, placement, sbp): - x = random_tensor(ndim=4, dim0=8, dim1=16).to_global(placement, sbp) - m = torch.nn.Upsample( - scale_factor=random().to(int), mode="bicubic", align_corners=random_bool(), - ) - y = m(x) - return y - - -@autotest(n=1, auto_backward=True, check_graph=False) -def _test_global_upsample2d_trilinear(test_case, placement, sbp): - x = random_tensor(ndim=5, dim0=8, dim1=16).to_global(placement, sbp) - m = torch.nn.Upsample( - scale_factor=random().to(int), mode="trilinear", align_corners=random_bool(), - ) - y = m(x) - 
return y - - -class TestGlobalUpsample2d(flow.unittest.TestCase): - @unittest.skip( - "The nearest interpolate operation in pytorch has bug, https://github.com/pytorch/pytorch/issues/65200" - ) - @globaltest - def test_global_upsample2d_nearest(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_global_upsample2d_nearest(test_case, placement, sbp) - - @globaltest - def test_global_upsample2d_linear(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_global_upsample2d_linear(test_case, placement, sbp) - - @globaltest - def test_global_upsample2d_bilinear(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_global_upsample2d_bilinear(test_case, placement, sbp) - - @globaltest - def test_global_upsample2d_bicubic(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_global_upsample2d_bicubic(test_case, placement, sbp) - - @globaltest - def test_global_upsample2d_trilinear(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_global_upsample2d_trilinear(test_case, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_view.py b/python/oneflow/test/modules/test_consistent_view.py deleted file mode 100644 index 0b611f5c44f..00000000000 --- a/python/oneflow/test/modules/test_consistent_view.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest - -from oneflow.test_utils.automated_test_util import * - -import oneflow as flow -import oneflow.unittest - - -@autotest(n=1, check_graph=False) -def _test_global_view(test_case, placement, sbp): - x = random_tensor(ndim=2, dim0=8, dim1=32).to_global(placement, sbp) - y = x.view(8, 8, 2, -1) - return y - - -@autotest(n=1, check_graph=False) -def _test_global_view_size(test_case, placement, sbp): - x = random_tensor(ndim=2, dim0=8, dim1=32).to_global(placement, sbp) - shape = torch.Size([8, 8, 2, -1]) - y = x.view(shape) - return y - - -class TestGlobalView(flow.unittest.TestCase): - @globaltest - def test_global_view(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_global_view(test_case, placement, sbp) - - @globaltest - def test_global_view_size(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_global_view_size(test_case, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_weight_norm.py b/python/oneflow/test/modules/test_consistent_weight_norm.py deleted file mode 100644 index 13834186482..00000000000 --- a/python/oneflow/test/modules/test_consistent_weight_norm.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import unittest -from collections import OrderedDict - -from oneflow.test_utils.test_util import GenArgList -from oneflow.test_utils.automated_test_util import * -import oneflow as flow -import oneflow.unittest - - -@autotest(n=1, check_graph=False) -def _test_global_weight_norm_with_random_data(test_case, placement, sbp): - dim = random(-2, 2).to(int).value() - liner_model_torch = torch.nn.Linear(8, 16).to_global(placement, sbp) - m = torch.nn.utils.weight_norm(liner_model_torch, name="weight", dim=dim) - return m.weight_g, m.weight_v - - -class TestGlobalWeightNorm(flow.unittest.TestCase): - @globaltest - def test_global_weight_norm_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_global_weight_norm_with_random_data(test_case, placement, sbp) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_where.py b/python/oneflow/test/modules/test_consistent_where.py deleted file mode 100644 index 0be2c1eb0b3..00000000000 --- a/python/oneflow/test/modules/test_consistent_where.py +++ /dev/null @@ -1,371 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import unittest -from collections import OrderedDict - -from oneflow.test_utils.test_util import GenArgList -from oneflow.test_utils.automated_test_util import * -import oneflow as flow -import oneflow.unittest - - -@autotest(n=1, check_graph=False) -def _test_global_where(test_case, placement, sbp): - x = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - y = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - condition = random_tensor(ndim=2, dim0=8, dim1=16, high=2, dtype=int).to_global( - placement, sbp - ) - - condition = condition.to(torch.bool) - - z = torch.where(condition, x, y) - return z - - -@autotest(n=1, check_graph=False) -def _test_global_where_broadcast(test_case, placement, sbp): - x = random_tensor(ndim=3, dim0=8, dim1=16, dim2=1).to_global(placement, sbp) - y = random_tensor(ndim=3, dim0=8, dim1=16, dim2=8).to_global(placement, sbp) - condition = random_tensor( - ndim=3, dim0=8, dim1=16, dim2=1, high=2, dtype=int - ).to_global(placement, sbp) - - condition = condition.to(torch.bool) - - z = torch.where(condition, x, y) - return z - - -@autotest(n=1, check_graph=False) -def _test_global_where_scalar(test_case, placement, sbp): - x = random_tensor(ndim=0).to_global(placement, sbp) - y = random_tensor(ndim=0).to_global(placement, sbp) - condition = random_tensor(ndim=0, high=2, dtype=int).to_global(placement, sbp) - - condition = condition.to(torch.bool) - - z = torch.where(condition, x, y) - return z - - -# Close auto_backward because pytorch raise error: -# PyTorch error: element 0 of tensors does not require grad and does not have a grad_fn -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_where_x_y_none(test_case, placement, sbp): - condition = random_tensor(ndim=2, dim0=8, dim1=8, low=-1, high=1).to_global( - placement, sbp - ) - y = torch.where(condition) - return y[0], y[1] - - -@autotest(n=1, check_graph=False) -def _test_global_where_tensor_with_0dim_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random_tensor(ndim=0).to_global(placement, sbp) - y = random_tensor(ndim=0).to_global(placement, sbp) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, check_graph=False) -def _test_flow_where_tensor_broadcast_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=3, dim0=8, dim1=16, dim2=8).to_global(placement, sbp) - x = random_tensor(ndim=3, dim0=8, dim1=1, dim2=8).to_global(placement, sbp) - y = random_tensor(ndim=3, dim0=8, dim1=16, dim2=1).to_global(placement, sbp) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, check_graph=False) -def _test_flow_where_scalar_x_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random().to(float) - y = ( - random_tensor(ndim=2, dim0=8, dim1=16, dtype=float) - .to_global(placement, sbp) - .to(torch.float64) - ) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, check_graph=False) -def _test_flow_where_scalar_x_broadcast_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=1, dim1=16).to_global(placement, sbp) - x = random().to(float) - y = ( - random_tensor(ndim=2, dim0=8, dim1=1, dtype=float) - .to_global(placement, sbp) - .to(torch.float64) - ) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_x_int_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, 
dim1=16).to_global(placement, sbp) - x = random().to(int) - y = random_tensor(ndim=2, dim0=8, dim1=16, dtype=int).to_global(placement, sbp) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, check_graph=False) -def _test_flow_where_scalar_y_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = ( - random_tensor(ndim=2, dim0=8, dim1=16, dtype=float) - .to_global(placement, sbp) - .to(torch.float64) - ) - y = random().to(float) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, check_graph=False) -def _test_flow_where_scalar_y_broadcast_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=1, dim1=16).to_global(placement, sbp) - x = ( - random_tensor(ndim=2, dim0=8, dim1=1, dtype=float) - .to_global(placement, sbp) - .to(torch.float64) - ) - y = random().to(float) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_y_int_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random_tensor(ndim=2, dim0=8, dim1=16, dtype=int).to_global(placement, sbp) - y = random().to(int) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_tensor_bool_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp).to(torch.bool) - y = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp).to(torch.bool) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_tensor_broadcast_bool_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random_tensor(ndim=2, dim0=1, dim1=16).to_global(placement, sbp).to(torch.bool) - y = random_tensor(ndim=2, dim0=8, dim1=1).to_global(placement, sbp).to(torch.bool) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_x_bool_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random().to(bool) - y = ( - random_tensor(ndim=2, dim0=8, dim1=16, dtype=float) - .to_global(placement, sbp) - .to(torch.bool) - ) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_x_broadcast_bool_with_random_data( - test_case, placement, sbp -): - cond = random_tensor(ndim=2, dim0=1, dim1=16).to_global(placement, sbp) - x = random().to(bool) - y = ( - random_tensor(ndim=2, dim0=8, dim1=1, dtype=float) - .to_global(placement, sbp) - .to(torch.bool) - ) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_y_bool_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = ( - random_tensor(ndim=2, dim0=8, dim1=16, dtype=float) - .to_global(placement, sbp) - .to(torch.bool) - ) - y = random().to(bool) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_y_broadcast_bool_with_random_data( - test_case, placement, sbp -): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = ( - 
random_tensor(ndim=2, dim0=8, dim1=1, dtype=float) - .to_global(placement, sbp) - .to(torch.bool) - ) - y = random().to(bool) - return torch.where(cond > 0, x, y) - - -@autotest(n=1, auto_backward=False, check_graph=False) -def _test_flow_where_scalar_xy_bool_with_random_data(test_case, placement, sbp): - cond = random_tensor(ndim=2, dim0=8, dim1=16).to_global(placement, sbp) - x = random().to(bool) - y = random().to(bool) - return torch.where(cond > 0, x, y) - - -class TestGlobalWhere(flow.unittest.TestCase): - @globaltest - def test_global_where(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_global_where(test_case, placement, sbp) - - @globaltest - def test_global_where_broadcast(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_global_where_broadcast(test_case, placement, sbp) - - @globaltest - def test_global_where_scalar(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_global_where_scalar(test_case, placement, sbp) - - @globaltest - def test_where_x_y_none(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_where_x_y_none(test_case, placement, sbp) - - @globaltest - def test_global_where_tensor_with_0dim_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_global_where_tensor_with_0dim_data(test_case, placement, sbp) - - @globaltest - def test_flow_where_tensor_broadcast_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=1): - _test_flow_where_tensor_broadcast_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_x_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_x_with_random_data(test_case, placement, sbp) - - @globaltest - def test_flow_where_scalar_x_broadcast_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_x_broadcast_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_x_int_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_x_int_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_y_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_y_with_random_data(test_case, placement, sbp) - - @globaltest - def test_flow_where_scalar_y_broadcast_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_y_broadcast_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_y_int_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_y_int_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_tensor_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=2): - _test_flow_where_tensor_bool_with_random_data(test_case, placement, sbp) - - @globaltest - def 
test_flow_where_tensor_broadcast_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_tensor_broadcast_bool_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_x_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_x_bool_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_x_broadcast_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_x_broadcast_bool_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_y_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_y_bool_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_y_broadcast_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_y_broadcast_bool_with_random_data( - test_case, placement, sbp - ) - - @globaltest - def test_flow_where_scalar_xy_bool_with_random_data(test_case): - for placement in all_placement(): - for sbp in all_sbp(placement, except_split=True): - _test_flow_where_scalar_xy_bool_with_random_data( - test_case, placement, sbp - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_consistent_zeropad2d.py b/python/oneflow/test/modules/test_consistent_zeropad2d.py deleted file mode 100644 index 08a4cc92083..00000000000 --- a/python/oneflow/test/modules/test_consistent_zeropad2d.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import unittest -from collections import OrderedDict - -from oneflow.test_utils.test_util import GenArgList -from oneflow.test_utils.automated_test_util import * -import oneflow as flow -import oneflow.unittest - - -@autotest(n=1, check_graph=False) -def _test_global_ZeroPad2d(test_case, placement, sbp, padding): - x = random_tensor(ndim=4, dim0=8, dim1=16, dim2=8, dim3=8,).to_global( - placement, sbp - ) - m = torch.nn.ZeroPad2d(padding) - y = m(x) - return y - - -class TestGlobalZeroPad2dModule(flow.unittest.TestCase): - @globaltest - def test_global_ZeroPad2d(test_case): - arg_dict = OrderedDict() - arg_dict["padding"] = [2, (1, 1, 2, 2)] - for arg in GenArgList(arg_dict): - for placement in all_placement(): - for sbp in all_sbp(placement, max_dim=4): - _test_global_ZeroPad2d(test_case, placement, sbp, *arg) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_conv2d.py b/python/oneflow/test/modules/test_conv2d.py index b26a89ac0e2..7e58a552fcb 100644 --- a/python/oneflow/test/modules/test_conv2d.py +++ b/python/oneflow/test/modules/test_conv2d.py @@ -1850,26 +1850,6 @@ def test_conv2d_with_random_data(test_case): y = m(x) return y - @autotest(check_graph=False) - def test_conv2d_0size_with_random_data(test_case): - channels = random(1, 6) - m = torch.nn.Conv2d( - in_channels=channels, - out_channels=random(1, 20), - kernel_size=random(1, 4), - stride=random() | nothing(), - padding=random(1, 3).to(int) | nothing(), - dilation=random(1, 5) | nothing(), - groups=random(1, 5) | nothing(), - padding_mode=constant("zeros") | nothing(), - ) - m.train(random()) - device = random_device() - m.to(device) - x = random_tensor(ndim=4, dim0=0, dim1=channels).to(device) - y = m(x) - return y - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") @autotest(n=30, check_allclose=False) def test_conv2d_group_with_random_data(test_case): diff --git a/python/oneflow/test/modules/test_conv3d.py b/python/oneflow/test/modules/test_conv3d.py index f7d2038d44b..17445b83c24 100644 --- a/python/oneflow/test/modules/test_conv3d.py +++ b/python/oneflow/test/modules/test_conv3d.py @@ -22,13 +22,14 @@ @flow.unittest.skip_unless_1n1d() class TestConv3DModule(flow.unittest.TestCase): - @autotest(n=3) - def test_nn_functional_conv3d(test_case): - device = random_device() - img = torch.ones((1, 3, 16, 16, 16), requires_grad=True).to(device) - kernel = torch.ones((6, 3, 3, 3, 3), requires_grad=True).to(device) - y = torch.nn.functional.conv3d(img, kernel) - return y + # Disable this test for unknown error + # @autotest(n=3) + # def test_nn_functional_conv3d(test_case): + # device = random_device() + # img = torch.ones((1, 3, 224, 224, 224), requires_grad=True).to(device) + # kernel = torch.ones((6, 3, 3, 3, 3), requires_grad=True).to(device) + # y = torch.nn.functional.conv3d(img, kernel) + # return y @autotest(n=10) def test_conv3d_with_random_data(test_case): diff --git a/python/oneflow/test/modules/test_cum_ops.py b/python/oneflow/test/modules/test_cum_ops.py index 2c2fa0119ac..14e764934e3 100644 --- a/python/oneflow/test/modules/test_cum_ops.py +++ b/python/oneflow/test/modules/test_cum_ops.py @@ -24,7 +24,7 @@ @flow.unittest.skip_unless_1n1d() class TestCumOp(flow.unittest.TestCase): - @autotest(n=5, check_graph=True) + @autotest(n=30, check_graph=True) def test_cumsum(test_case): device = random_device() x = random_tensor().to(device) @@ -41,21 +41,6 @@ def test_cumprod(test_case): z = torch.cumprod(x, dim) return z - def 
test_cumop_with_dtype(test_case): - x = flow.tensor([2, 3, 4]) - cumsum_res = flow.cumsum(x, dim=0, dtype=flow.float) - cumprod_res = flow.cumprod(x, dim=0, dtype=flow.float) - test_case.assertEqual(cumsum_res.dtype, flow.float) - test_case.assertEqual(cumprod_res.dtype, flow.float) - - @autotest(n=5, check_graph=True) - def test_cumsum(test_case): - device = random_device() - x = random_tensor().to(device) - dim = random(0, x.ndim.pytorch).to(int) - y = x.cumsum(dim) - return y - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_deconv2d.py b/python/oneflow/test/modules/test_deconv2d.py index bc69915861b..08de04007be 100644 --- a/python/oneflow/test/modules/test_deconv2d.py +++ b/python/oneflow/test/modules/test_deconv2d.py @@ -891,26 +891,6 @@ def test_deconv2d_with_random_data(test_case): y = m(x) return y - @autotest(check_graph=False) - def test_deconv2d_0size_with_random_data(test_case): - channels = random(1, 6) - m = torch.nn.ConvTranspose2d( - in_channels=channels, - out_channels=random(1, 20), - kernel_size=random(1, 4), - stride=random() | nothing(), - padding=random(1, 3).to(int) | nothing(), - dilation=random(1, 5) | nothing(), - groups=random(1, 5) | nothing(), - padding_mode=constant("zeros") | nothing(), - ) - m.train(random()) - device = random_device() - m.to(device) - x = random_tensor(ndim=4, dim0=0, dim1=channels).to(device) - y = m(x) - return y - @unittest.skip( "Likely to fail the test. This case should run on cpu when the problem is solved." ) diff --git a/python/oneflow/test/modules/test_generator.py b/python/oneflow/test/modules/test_generator.py index e30647a83b8..8d2a90b4676 100644 --- a/python/oneflow/test/modules/test_generator.py +++ b/python/oneflow/test/modules/test_generator.py @@ -157,18 +157,6 @@ def test_set_rng_state(test_case): new_state = flow.get_rng_state() test_case.assertTrue(np.allclose(new_state.numpy(), state.numpy())) - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_tensor_init(test_case): - flow.manual_seed(0) - x = flow.ones(2) - x.uniform_() - - flow.manual_seed(0) - y = flow.ones(2).to("cuda") - y.uniform_() - - test_case.assertTrue(np.allclose(x.numpy(), y.numpy())) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_id_shuffle_global.py b/python/oneflow/test/modules/test_id_shuffle_global.py index 6ed40f6dfbf..55b2aedb07b 100644 --- a/python/oneflow/test/modules/test_id_shuffle_global.py +++ b/python/oneflow/test/modules/test_id_shuffle_global.py @@ -19,6 +19,7 @@ from oneflow.test_utils.test_util import GenArgDict import numpy as np import oneflow as flow +import oneflow.unittest from oneflow.test_utils.automated_test_util import * @@ -26,22 +27,22 @@ max_id = 1000 -def get_tensors(batch_size, num_tables): +def get_tensors(batch_size, num_columns): placement = flow.placement(type="cuda", ranks=list(range(parallel_num))) - ids = np.random.randint(0, max_id, (batch_size, num_tables), dtype=np.int64) + ids = np.random.randint(0, max_id, (batch_size, num_columns), dtype=np.int64) ids_tensor = flow.tensor(ids, requires_grad=False).to_global( placement=placement, sbp=flow.sbp.split(0) ) - table_ids = ( - ids % num_tables - ) # same id must have same table id, so in this case get table_ids from ids - table_ids_tensor = flow.tensor( - table_ids.astype(np.int32), requires_grad=False + column_ids = ( + ids % num_columns + ) # same id must have same column id, so in this case get column_ids from ids + column_ids_tensor = 
flow.tensor( + column_ids.astype(np.int32), requires_grad=False ).to_global(placement=placement, sbp=flow.sbp.split(0)) - return ids_tensor, table_ids_tensor + return ids_tensor, column_ids_tensor -def _test_id_shuffle(test_case, has_table_id, num_tables): +def _test_id_shuffle(test_case, has_column_id, num_columns): batch_size = int(1024 / parallel_num) placement = flow.placement(type="cuda", ranks=list(range(parallel_num))) @@ -49,57 +50,57 @@ class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids): + def build(self, ids, column_ids): ( num_unique_matrix, inverse_unique_partition_indices, cur_rank_num_unique, cur_rank_unique_ids, - cur_rank_unique_table_ids, + cur_rank_unique_column_ids, cur_rank_inverse_indices, - ) = flow._C.one_embedding_id_shuffle(ids, table_ids, num_tables) + ) = flow._C.one_embedding_id_shuffle(ids, column_ids, num_columns) return ( flow.cast(num_unique_matrix, flow.int32), flow.cast(inverse_unique_partition_indices, flow.int32), flow.cast(cur_rank_num_unique, flow.int32), flow.cast(cur_rank_unique_ids, flow.int32), - flow.cast(cur_rank_unique_table_ids, flow.int32), + flow.cast(cur_rank_unique_column_ids, flow.int32), flow.cast(cur_rank_inverse_indices, flow.int32), ) graph = TestGraph() for i in range(10): - ids_tensor, table_ids_tensor = get_tensors(batch_size, num_tables) - if not has_table_id: - table_ids_tensor = None - graph(ids_tensor, table_ids_tensor) + ids_tensor, column_ids_tensor = get_tensors(batch_size, num_columns) + if not has_column_id: + column_ids_tensor = None + graph(ids_tensor, column_ids_tensor) ( num_unique_matrix, inverse_unique_partition_indices, local_cur_rank_num_unique, cur_rank_unique_ids, - cur_rank_unique_table_ids, + cur_rank_unique_column_ids, cur_rank_inverse_indices, - ) = graph(ids_tensor, table_ids_tensor) + ) = graph(ids_tensor, column_ids_tensor) cur_rank_num_unique = local_cur_rank_num_unique.to_local().to_global( placement=placement, sbp=flow.sbp.split(0) ) cur_rank_num_unique_list = [] cur_rank_unique_ids_list = [] - cur_rank_unique_table_ids_list = [] - cur_rank_num_ids = batch_size * num_tables * parallel_num + cur_rank_unique_column_ids_list = [] + cur_rank_num_ids = batch_size * num_columns * parallel_num for i in range(parallel_num): num_unique_i = cur_rank_num_unique.numpy()[i] unique_ids_i = cur_rank_unique_ids.numpy()[ cur_rank_num_ids * i : cur_rank_num_ids * (i + 1) ] - unique_table_ids_i = cur_rank_unique_table_ids.numpy()[ + unique_column_ids_i = cur_rank_unique_column_ids.numpy()[ cur_rank_num_ids * i : cur_rank_num_ids * (i + 1) ] cur_rank_num_unique_list.append(num_unique_i) cur_rank_unique_ids_list.append(np.array(unique_ids_i[0:num_unique_i])) - cur_rank_unique_table_ids_list.append( - np.array(unique_table_ids_i[0:num_unique_i]) + cur_rank_unique_column_ids_list.append( + np.array(unique_column_ids_i[0:num_unique_i]) ) global_ids = ids_tensor.numpy() @@ -116,65 +117,25 @@ def build(self, ids, table_ids): unique_ids.sort() np_unique_ids.sort() test_case.assertTrue(np.array_equal(unique_ids, np_unique_ids)) - if has_table_id: - # test unique table ids - unique_table_ids = np.concatenate(cur_rank_unique_table_ids_list) - unique_table_ids.sort() - global_table_ids = table_ids_tensor.numpy() - np_unique_table_ids = global_table_ids.flatten()[np_unique_index] - np_unique_table_ids.sort() - test_case.assertTrue(np.array_equal(unique_table_ids, np_unique_table_ids)) - - -def round_half_away_from_zero(x): - sign = np.sign(x) - abs_val = np.abs(x) - abs_val += 0.5 - 
floor_val = np.floor(abs_val) - out = floor_val * sign - return out - - -def embedding_shuffle_quantize(np_data, np_dtype): - # When use float16, ComputeType is set to as Float. - np_reduce_data = np_data.astype(np.float32) - abs_max_factor = np.max(np.abs(np_reduce_data), axis=2) - abs_max_factor = np.expand_dims(abs_max_factor, axis=2) - transport_quantize_factor = abs_max_factor.astype(np_dtype) - int8_factor = np.ones(abs_max_factor.shape, dtype=np.float32) * 127.0 - int8_factor = int8_factor.astype(np.float32) - quantize_factor = int8_factor / abs_max_factor - - # Covert to Compute Type. - np_data.astype(np.float32) - np_data = np_data * quantize_factor - np_data = round_half_away_from_zero(np_data) - np_data = np_data.astype(np.int8) - - # Covert to Compute Type. - np_data = np_data.astype(np.float32) - dequantize_factor = transport_quantize_factor.astype(np.float32) / int8_factor - np_data = np_data * dequantize_factor - np_data = np_data.astype(np_dtype) - return np_data - - -def _test_embedding_shuffle(test_case, dtype, enable_quantize): + if has_column_id: + # test unique column ids + unique_column_ids = np.concatenate(cur_rank_unique_column_ids_list) + unique_column_ids.sort() + global_column_ids = column_ids_tensor.numpy() + np_unique_column_ids = global_column_ids.flatten()[np_unique_index] + np_unique_column_ids.sort() + test_case.assertTrue(np.array_equal(unique_column_ids, np_unique_column_ids)) + + +def _test_embedding_shuffle(test_case, dtype): batch_size = int(1024 / parallel_num) placement = flow.placement(type="cuda", ranks=list(range(parallel_num))) - num_tables = 26 - embedding_size = 128 - enable_quantized_comm = enable_quantize and embedding_size < 1025 - if enable_quantized_comm: - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "1" - else: - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "0" - + num_columns = 26 if dtype == flow.float16: np_dtype = np.float16 else: np_dtype = np.float32 - data = np.random.rand(max_id, embedding_size).astype(np_dtype) + data = np.random.rand(max_id, 128).astype(np_dtype) data_tensor = flow.tensor(data, requires_grad=False).to_global( placement=placement, sbp=flow.sbp.broadcast() ) @@ -183,7 +144,7 @@ class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids, data): + def build(self, ids, column_ids, data): ( num_unique_matrix, inverse_unique_partition_indices, @@ -191,7 +152,7 @@ def build(self, ids, table_ids, data): cur_rank_unique_ids, _, cur_rank_inverse_indices, - ) = flow._C.one_embedding_id_shuffle(ids, table_ids, num_tables) + ) = flow._C.one_embedding_id_shuffle(ids, column_ids, num_columns) unique_embeddings = flow._C.gather(data, cur_rank_unique_ids, axis=0) embeddings = flow._C.one_embedding_embedding_shuffle( unique_embeddings, @@ -203,34 +164,21 @@ def build(self, ids, table_ids, data): graph = TestGraph() for i in range(10): - ids_tensor, table_ids_tensor = get_tensors(batch_size, num_tables) - graph(ids_tensor, table_ids_tensor, data_tensor) - embeddings = graph(ids_tensor, table_ids_tensor, data_tensor) + ids_tensor, column_ids_tensor = get_tensors(batch_size, num_columns) + graph(ids_tensor, column_ids_tensor, data_tensor) + embeddings = graph(ids_tensor, column_ids_tensor, data_tensor) global_ids = ids_tensor.numpy() global_data = data_tensor.numpy() np_embeddings = global_data[global_ids] - - # Quantized numpy embedding. 
- if enable_quantized_comm: - np_embeddings = embedding_shuffle_quantize(np_embeddings, np_dtype) - test_case.assertTrue(np.array_equal(embeddings.numpy(), np_embeddings)) -def _test_embedding_gradient_shuffle(test_case, enable_quantize): - np_tolerance = 0 +def _test_embedding_gradient_shuffle(test_case): batch_size = int(1024 / parallel_num) placement = flow.placement(type="cuda", ranks=list(range(parallel_num))) - num_tables = 26 + num_columns = 26 embedding_size = 128 - enable_quantized_comm = enable_quantize and embedding_size < 1025 - if enable_quantized_comm: - np_tolerance = 0.5 - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "1" - else: - np_tolerance = 1e-4 - os.environ["ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM"] = "0" - embedding_grad = np.random.rand(batch_size, num_tables, embedding_size).astype( + embedding_grad = np.random.rand(batch_size, num_columns, embedding_size).astype( np.float32 ) embedding_grad_tensor = flow.tensor(embedding_grad, requires_grad=False).to_global( @@ -241,7 +189,7 @@ class TestGraph(flow.nn.Graph): def __init__(self): super().__init__() - def build(self, ids, table_ids, embedding_grad): + def build(self, ids, column_ids, embedding_grad): ( num_unique_matrix, inverse_unique_partition_indices, @@ -249,7 +197,7 @@ def build(self, ids, table_ids, embedding_grad): cur_rank_unique_ids, _, cur_rank_inverse_indices, - ) = flow._C.one_embedding_id_shuffle(ids, table_ids, num_tables) + ) = flow._C.one_embedding_id_shuffle(ids, column_ids, num_columns) cur_rank_unique_embedding_grad = flow._C.one_embedding_embedding_gradient_shuffle( embedding_grad, num_unique_matrix, @@ -264,14 +212,14 @@ def build(self, ids, table_ids, embedding_grad): graph = TestGraph() for i in range(10): - ids_tensor, table_ids_tensor = get_tensors(batch_size, num_tables) - graph(ids_tensor, table_ids_tensor, embedding_grad_tensor) - ids_tensor, table_ids_tensor = get_tensors(batch_size, num_tables) + ids_tensor, column_ids_tensor = get_tensors(batch_size, num_columns) + graph(ids_tensor, column_ids_tensor, embedding_grad_tensor) + ids_tensor, column_ids_tensor = get_tensors(batch_size, num_columns) ( cur_rank_unique_embedding_grad, local_cur_rank_num_unique, cur_rank_unique_ids, - ) = graph(ids_tensor, table_ids_tensor, embedding_grad_tensor) + ) = graph(ids_tensor, column_ids_tensor, embedding_grad_tensor) cur_rank_num_unique = local_cur_rank_num_unique.to_local().to_global( placement=placement, sbp=flow.sbp.split(0) ) @@ -282,26 +230,13 @@ def build(self, ids, table_ids, embedding_grad): np_cur_rank_unique_embedding_grad = np.zeros((max_id, embedding_size)) for k in range(np_num_unique): unique_id = np_unique_ids[k] - np_data = sum( + np_cur_rank_unique_embedding_grad[unique_id, :] = sum( global_embedding_grad.reshape(-1, embedding_size)[ np.where(global_ids.flatten() == unique_id)[0] ] ) - # Quantize Embedding Gradient. 
- if enable_quantized_comm: - abs_max_factor = np.max(np.abs(np_data)) - int8_factor = np.full(abs_max_factor.shape, 127.0, dtype=np.float32) - quantize_factor = int8_factor / abs_max_factor - np_data = np_data * quantize_factor - np_data = round_half_away_from_zero(np_data) - np_data = np_data.astype(np.int8) - np_data = np_data.astype(np.float32) - dequantize_factor = abs_max_factor / int8_factor - np_data = np_data * dequantize_factor - - np_cur_rank_unique_embedding_grad[unique_id, :] = np_data - - cur_rank_num_ids = batch_size * num_tables * parallel_num + + cur_rank_num_ids = batch_size * num_columns * parallel_num of_unique_embedding_grad = np.zeros((max_id, embedding_size)) for i in range(parallel_num): num_unique_i = cur_rank_num_unique.numpy()[i] @@ -319,8 +254,8 @@ def build(self, ids, table_ids, embedding_grad): np.allclose( of_unique_embedding_grad, np_cur_rank_unique_embedding_grad, - atol=np_tolerance, - rtol=np_tolerance, + atol=1e-4, + rtol=1e-4, ), ) @@ -330,22 +265,19 @@ def build(self, ids, table_ids, embedding_grad): class DataShuffleTestCase(flow.unittest.TestCase): def test_id_shuffle(test_case): arg_dict = OrderedDict() - arg_dict["has_table_id"] = [True, False] - arg_dict["num_tables"] = [1, 26] + arg_dict["has_column_id"] = [True, False] + arg_dict["num_columns"] = [1, 26] for kwargs in GenArgDict(arg_dict): _test_id_shuffle(test_case, **kwargs) def test_embedding_shuffle(test_case): arg_dict = OrderedDict() arg_dict["dtype"] = [flow.float32, flow.float16] - arg_dict["enable_quantize"] = [True, False] - for kwargs in GenArgDict(arg_dict): _test_embedding_shuffle(test_case, **kwargs) def test_embedding_gradient_shuffle(test_case): arg_dict = OrderedDict() - arg_dict["enable_quantize"] = [True, False] for kwargs in GenArgDict(arg_dict): _test_embedding_gradient_shuffle(test_case, **kwargs) diff --git a/python/oneflow/test/modules/test_index_select.py b/python/oneflow/test/modules/test_index_select.py index 3807ef76829..babac342eab 100644 --- a/python/oneflow/test/modules/test_index_select.py +++ b/python/oneflow/test/modules/test_index_select.py @@ -26,7 +26,10 @@ @flow.unittest.skip_unless_1n1d() class TestIndexSelect(flow.unittest.TestCase): - @autotest() + # Not check graph because of one reason: + # Reason 1, This op needs to convert the EagerTensor to a numpy array,so eager tensors supported only. + # Please refer to File "oneflow/api/python/utils/tensor_utils.h", line 49, in EagerTensorToNumpy. + @autotest(check_graph="ValidatedFlase") def test_index_select_by_random(test_case): device = random_device() @@ -57,7 +60,10 @@ def test_index_select_by_random(test_case): return y - @autotest(auto_backward=False) + # Not check graph because of one reason: + # Reason 1, This op needs to convert the EagerTensor to a numpy array,so eager tensors supported only. + # Please refer to File "oneflow/api/python/utils/tensor_utils.h", line 49, in EagerTensorToNumpy. 
+ @autotest(auto_backward=False, check_graph="ValidatedFlase") def test_index_select_bool_by_random(test_case): device = random_device() diff --git a/python/oneflow/test/modules/test_masked_select.py b/python/oneflow/test/modules/test_masked_select.py index 3ae15808da7..ef145ea44d1 100644 --- a/python/oneflow/test/modules/test_masked_select.py +++ b/python/oneflow/test/modules/test_masked_select.py @@ -91,13 +91,6 @@ def test_masked_select(test_case): for arg in GenArgList(arg_dict): arg[0](test_case, *arg[1:]) - def test_masked_select_broadcast(test_case): - x = flow.ones(2, 3, 3) - mask = flow.triu(flow.ones(3, 3), 1) - flow_res = flow.masked_select(x, mask) - np_res = [1, 1, 1, 1, 1, 1] - test_case.assertTrue(np.allclose(flow_res.numpy(), np_res, 1e-05, 1e-05)) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_math_ops.py b/python/oneflow/test/modules/test_math_ops.py index 6b29d8d59e4..804c3e31bff 100644 --- a/python/oneflow/test/modules/test_math_ops.py +++ b/python/oneflow/test/modules/test_math_ops.py @@ -337,7 +337,7 @@ class TestArccos(flow.unittest.TestCase): @autotest() def test_arccos_flow_with_random_data(test_case): device = random_device() - x = random_tensor(low=-1, high=1).to(device) + x = random_tensor(low=2, high=3).to(device) y = torch.arccos(x) return y @@ -347,7 +347,7 @@ class TestAcos(flow.unittest.TestCase): @autotest() def test_acos_flow_with_random_data(test_case): device = random_device() - x = random_tensor(low=-1, high=1).to(device) + x = random_tensor(low=2, high=3).to(device) y = torch.acos(x) return y diff --git a/python/oneflow/test/modules/test_max.py b/python/oneflow/test/modules/test_max.py index 546a22a5ddd..b9969bb2741 100644 --- a/python/oneflow/test/modules/test_max.py +++ b/python/oneflow/test/modules/test_max.py @@ -79,25 +79,6 @@ def test_max_elementwise(test_case): y = random_tensor(ndim, *dims).to(device) return torch.max(x, y) - @autotest(n=5, check_graph=False, check_dtype=True) - def test_max_elementwise_dtype_promotion(test_case): - device = random_device() - ndim = random().to(int).value() - dims = [random(1, 8) for _ in range(ndim)] - x = random_tensor(ndim, *dims, dtype=float).to(device) - y = random_tensor(ndim, *dims, dtype=int).to(device) - return torch.max(x, y) - - @autotest(n=5, check_graph=False, check_dtype=True) - def test_max_broadcast_dtype_promotion(test_case): - device = random_device() - ndim = random().to(int).value() - dims = [random(1, 8) for _ in range(ndim)] - b_dims = [1 for _ in range(ndim)] - x = random_tensor(ndim, *dims, dtype=float).to(device) - y = random_tensor(ndim, *b_dims, dtype=int).to(device) - return torch.max(x, y) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_min.py b/python/oneflow/test/modules/test_min.py index 6ef5e67e291..ef08e7b80b8 100644 --- a/python/oneflow/test/modules/test_min.py +++ b/python/oneflow/test/modules/test_min.py @@ -79,25 +79,6 @@ def test_min_elementwise(test_case): y = random_tensor(ndim, *dims).to(device) return torch.min(x, y) - @autotest(n=5, check_graph=False, check_dtype=True) - def test_min_elementwise_dtype_promotion(test_case): - device = random_device() - ndim = random().to(int).value() - dims = [random(1, 8) for _ in range(ndim)] - x = random_tensor(ndim, *dims, dtype=float).to(device) - y = random_tensor(ndim, *dims, dtype=int).to(device) - return torch.min(x, y) - - @autotest(n=5, check_graph=False, check_dtype=True) - def test_min_broadcast_dtype_promotion(test_case): - device = 
random_device() - ndim = random().to(int).value() - dims = [random(1, 8) for _ in range(ndim)] - b_dims = [1 for _ in range(ndim)] - x = random_tensor(ndim, *dims, dtype=float).to(device) - y = random_tensor(ndim, *b_dims, dtype=int).to(device) - return torch.min(x, y) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_norm.py b/python/oneflow/test/modules/test_norm.py index 7ff9e68674a..35462efc17b 100644 --- a/python/oneflow/test/modules/test_norm.py +++ b/python/oneflow/test/modules/test_norm.py @@ -304,15 +304,6 @@ def test_tuple_dim_norm_with_random_data(test_case): m = torch.linalg.norm(input, ord=ord, dim=dim, keepdim=keepdim) return m - @autotest(n=5) - def test_vector_norm_only_zero_with_random_data(test_case): - device = random_device() - input = random_tensor(ndim=2).to(device) - dim = oneof((-2, -1), (0, 1), (-1, 0)) - keepdim = random().to(bool) - m = torch.linalg.vector_norm(input, ord=0, dim=dim, keepdim=keepdim) - return m - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_one_embedding_adagrad.py b/python/oneflow/test/modules/test_one_embedding_adagrad.py deleted file mode 100644 index 516c27cef99..00000000000 --- a/python/oneflow/test/modules/test_one_embedding_adagrad.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import unittest -from collections import OrderedDict -import tempfile -import os -import numpy as np -from oneflow.test_utils.test_util import GenArgDict -from optimizer_test_util import clip_grad_norm_np - -import oneflow as flow -from oneflow.nn.parameter import Parameter - - -def compare_with_numpy_adagrad( - test_case, weight_decay, lr_decay, scale, learning_rate, train_iters, -): - - num_rows = 500 - embedding_size = 128 - model_shape = (num_rows, embedding_size) - line_size = embedding_size * 2 - - num_valid_seq = np.random.randint(1, num_rows, (train_iters)) - skip_if_seq = [np.random.randint(2) for i in range(train_iters)] - - random_grad_seq = [] - for _ in range(train_iters): - random_grad_seq.append(np.random.uniform(size=model_shape).astype(np.float32)) - - init_value = np.random.uniform(size=(num_rows, line_size)).astype(np.float32) - - down_scale_by = 10 - epsilon = 1e-5 - - def adagrad_by_oneflow(): - unique_embeddings_tensor = flow.tensor(init_value, requires_grad=False).to( - "cuda" - ) - lr_tensor = flow.tensor( - np.array(learning_rate).reshape(1,).astype(np.float32) - ).to("cuda") - down_scale_by_tensor = flow.tensor( - np.array(down_scale_by).astype(np.float32) - ).to("cuda") - - def train_one_iter( - num_valid, unique_embeddings, embedding_grad, skip_if, train_step - ): - return flow._C.one_embedding_adagrad_update( - num_valid, - unique_embeddings, - embedding_grad, - lr_tensor, - down_scale_by_tensor, - skip_if, - train_step, - scale, - weight_decay, - lr_decay, - epsilon, - ) - - for i in range(1, train_iters): - num_valid_tensor = flow.tensor( - np.array(num_valid_seq[i]).reshape(1,).astype(np.int32) - ).to("cuda") - grad_tensor = flow.tensor(random_grad_seq[i]).to("cuda") - skip_if_tensor = flow.tensor( - np.array(skip_if_seq[i]).reshape(1,).astype(np.int64) - ).to("cuda") - step_tensor = flow.tensor(np.array(i).reshape(1,).astype(np.int64)).to( - "cuda" - ) - updated_tensor = train_one_iter( - num_valid_tensor, - unique_embeddings_tensor, - grad_tensor, - skip_if_tensor, - step_tensor, - ) - unique_embeddings_tensor[0 : num_valid_seq[i]] = updated_tensor[ - 0 : num_valid_seq[i] - ] - return unique_embeddings_tensor - - def adagrad_by_numpy(): - x = init_value[:, 0:embedding_size] - st = init_value[:, embedding_size:] - - def train_one_iter(iter, num_valid, grad, model, state): - grad[0:num_valid] = grad[0:num_valid] * (scale / down_scale_by) - lr = learning_rate / (1 + iter * lr_decay) - state[0:num_valid] = ( - state[0:num_valid] + grad[0:num_valid] * grad[0:num_valid] - ) - model[0:num_valid] = ( - model[0:num_valid] - - lr / (np.sqrt(state[0:num_valid]) + epsilon) * grad[0:num_valid] - - lr * weight_decay * model[0:num_valid] - ) - return (model, state) - - for i in range(1, train_iters): - if skip_if_seq[i] > 0: - pass - else: - (x, st) = train_one_iter( - i, int(num_valid_seq[i]), random_grad_seq[i], x, st - ) - - return x, st - - oneflow_res = adagrad_by_oneflow().numpy() - of_model = oneflow_res[:, 0:embedding_size] - of_sum = oneflow_res[:, embedding_size:] - np_model, np_sum = adagrad_by_numpy() - test_case.assertTrue( - np.allclose(of_model.flatten(), np_model.flatten(), rtol=0.001, atol=0.001) - ) - test_case.assertTrue( - np.allclose(of_sum.flatten(), np_sum.flatten(), rtol=0.001, atol=0.001) - ) - - -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -@flow.unittest.skip_unless_1n1d() -class TestOptimizers(flow.unittest.TestCase): - def test_one_embedding_adagrad(test_case): - arg_dict = OrderedDict() - 
arg_dict["weight_decay"] = [0, 0.1] - arg_dict["lr_decay"] = [0, 0.1] - arg_dict["scale"] = [1, 0.1] - arg_dict["learning_rate"] = [0.3, 1.5] - arg_dict["train_iters"] = [10] - for arg in GenArgDict(arg_dict): - compare_with_numpy_adagrad(test_case, **arg) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_one_embedding_adam.py b/python/oneflow/test/modules/test_one_embedding_adam.py deleted file mode 100644 index 2c237ba91a4..00000000000 --- a/python/oneflow/test/modules/test_one_embedding_adam.py +++ /dev/null @@ -1,200 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict -import tempfile -import os -import numpy as np -from oneflow.test_utils.test_util import GenArgDict -from optimizer_test_util import clip_grad_norm_np - -import oneflow as flow -from oneflow.nn.parameter import Parameter - - -def compare_with_numpy_adam( - test_case, - weight_decay, - scale, - learning_rate, - train_iters, - do_bias_correction, - beta1, - beta2, -): - - num_rows = 500 - embedding_size = 128 - model_shape = (num_rows, embedding_size) - line_size = embedding_size * 3 - - num_valid_seq = np.random.randint(1, num_rows, (train_iters)) - skip_if_seq = [np.random.randint(2) for i in range(train_iters)] - - random_grad_seq = [] - for _ in range(train_iters): - random_grad_seq.append(np.random.uniform(size=model_shape).astype(np.float32)) - - init_value = np.random.uniform(size=(num_rows, line_size)).astype(np.float32) - - down_scale_by = 10 - epsilon = 1e-5 - - def adam_by_oneflow(): - unique_embeddings_tensor = flow.tensor(init_value, requires_grad=False).to( - "cuda" - ) - lr_tensor = flow.tensor( - np.array(learning_rate).reshape(1,).astype(np.float32) - ).to("cuda") - down_scale_by_tensor = flow.tensor( - np.array(down_scale_by).astype(np.float32) - ).to("cuda") - - def train_one_iter( - num_valid, - unique_embeddings, - embedding_grad, - skip_if, - bias_correction1, - bias_correction2, - ): - return flow._C.one_embedding_adam_update( - num_valid, - unique_embeddings, - embedding_grad, - lr_tensor, - down_scale_by_tensor, - skip_if, - bias_correction1, - bias_correction2, - scale, - weight_decay, - beta1, - beta2, - epsilon, - do_bias_correction, - ) - - for i in range(1, train_iters): - num_valid_tensor = flow.tensor( - np.array(num_valid_seq[i]).reshape(1,).astype(np.int32) - ).to("cuda") - grad_tensor = flow.tensor(random_grad_seq[i]).to("cuda") - skip_if_tensor = flow.tensor( - np.array(skip_if_seq[i]).reshape(1,).astype(np.int64) - ).to("cuda") - if do_bias_correction: - bias_correction1 = 1.0 - np.power(beta1, i) - bias_correction2 = 1.0 - np.power(beta2, i) - bias_correction1_tensor = flow.tensor( - np.array(bias_correction1).reshape(1,).astype(np.float32) - ).to("cuda") - bias_correction2_tensor = flow.tensor( - np.array(bias_correction2).reshape(1,).astype(np.float32) - ).to("cuda") - else: - bias_correction1_tensor = None - bias_correction2_tensor = 
None - updated_tensor = train_one_iter( - num_valid_tensor, - unique_embeddings_tensor, - grad_tensor, - skip_if_tensor, - bias_correction1_tensor, - bias_correction2_tensor, - ) - unique_embeddings_tensor[0 : num_valid_seq[i]] = updated_tensor[ - 0 : num_valid_seq[i] - ] - return unique_embeddings_tensor - - def adam_by_numpy(): - x = init_value[:, 0:embedding_size] - m = init_value[:, embedding_size : 2 * embedding_size] - v = init_value[:, 2 * embedding_size : 3 * embedding_size] - - def np_train_one_iter(step, num_valid, grad, model, state_m, state_v): - grad[0:num_valid] = grad[0:num_valid] * (scale / down_scale_by) - - bias_correction1 = 1.0 - bias_correction2 = 1.0 - - if do_bias_correction: - bias_correction1 = 1.0 - np.power(beta1, step) - bias_correction2 = 1.0 - np.power(beta2, step) - - state_m[0:num_valid] = ( - beta1 * state_m[0:num_valid] + (1 - beta1) * grad[0:num_valid] - ) - state_v[0:num_valid] = ( - beta2 * state_v[0:num_valid] - + (1 - beta2) * grad[0:num_valid] * grad[0:num_valid] - ) - denom = np.sqrt(state_v[0:num_valid]) / np.sqrt(bias_correction2) + epsilon - - model[0:num_valid] = ( - model[0:num_valid] - - ((learning_rate / bias_correction1) * state_m[0:num_valid] / denom) - - learning_rate * weight_decay * model[0:num_valid] - ) - return (model, state_m, state_v) - - for i in range(1, train_iters): # if step = 0, bias_correction2 is 0 - if skip_if_seq[i] > 0: - pass - else: - (x, m, v) = np_train_one_iter( - i, int(num_valid_seq[i]), random_grad_seq[i], x, m, v - ) - return x, m, v - - oneflow_res = adam_by_oneflow().numpy() - of_model = oneflow_res[:, 0:embedding_size] - of_m = oneflow_res[:, embedding_size : 2 * embedding_size] - of_v = oneflow_res[:, 2 * embedding_size : 3 * embedding_size] - np_model, np_m, np_v = adam_by_numpy() - test_case.assertTrue( - np.allclose(of_model.flatten(), np_model.flatten(), rtol=0.001, atol=0.001) - ) - test_case.assertTrue( - np.allclose(of_m.flatten(), np_m.flatten(), rtol=0.001, atol=0.001) - ) - test_case.assertTrue( - np.allclose(of_v.flatten(), np_v.flatten(), rtol=0.001, atol=0.001) - ) - - -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -@flow.unittest.skip_unless_1n1d() -class TestOptimizers(flow.unittest.TestCase): - def test_one_embedding_adam(test_case): - arg_dict = OrderedDict() - arg_dict["weight_decay"] = [0, 0.1] - arg_dict["scale"] = [1, 0.1] - arg_dict["learning_rate"] = [1, 1.5] - arg_dict["train_iters"] = [10] - arg_dict["do_bias_correction"] = [True, False] - arg_dict["beta1"] = [0.9, 0.8] - arg_dict["beta2"] = [0.9, 0.8] - - for arg in GenArgDict(arg_dict): - compare_with_numpy_adam(test_case, **arg) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_one_embedding_sgd.py b/python/oneflow/test/modules/test_one_embedding_sgd.py deleted file mode 100644 index 9b709229d30..00000000000 --- a/python/oneflow/test/modules/test_one_embedding_sgd.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict -import tempfile -import os -import numpy as np -from oneflow.test_utils.test_util import GenArgDict -from optimizer_test_util import clip_grad_norm_np - -import oneflow as flow -from oneflow.nn.parameter import Parameter - - -def compare_with_numpy_sgd( - test_case, momentum, weight_decay, scale, learning_rate, train_iters, -): - - num_rows = 500 - embedding_size = 128 - model_shape = (num_rows, embedding_size) - line_size = embedding_size * 2 if momentum > 0 else embedding_size - - num_valid_seq = np.random.randint(1, num_rows, (train_iters)) - skip_if_seq = [np.random.randint(2) for i in range(train_iters)] - - random_grad_seq = [] - for _ in range(train_iters): - random_grad_seq.append(np.random.uniform(size=model_shape).astype(np.float32)) - - init_value = np.random.uniform(size=(num_rows, line_size)).astype(np.float32) - - down_scale_by = 10 - - def sgd_by_oneflow(): - unique_embeddings_tensor = flow.tensor(init_value, requires_grad=False).to( - "cuda" - ) - lr_tensor = flow.tensor( - np.array(learning_rate).reshape(1,).astype(np.float32) - ).to("cuda") - down_scale_by_tensor = flow.tensor( - np.array(down_scale_by).astype(np.float32) - ).to("cuda") - - def train_one_iter(num_valid, unique_embeddings, embedding_grad, skip_if): - return flow._C.one_embedding_sgd_update( - num_valid, - unique_embeddings, - embedding_grad, - lr_tensor, - down_scale_by_tensor, - skip_if, - scale, - weight_decay, - momentum, - ) - - for i in range(train_iters): - num_valid_tensor = flow.tensor( - np.array(num_valid_seq[i]).reshape(1,).astype(np.int32) - ).to("cuda") - grad_tensor = flow.tensor(random_grad_seq[i]).to("cuda") - skip_if_tensor = flow.tensor( - np.array(skip_if_seq[i]).reshape(1,).astype(np.int64) - ).to("cuda") - updated_tensor = train_one_iter( - num_valid_tensor, unique_embeddings_tensor, grad_tensor, skip_if_tensor - ) - unique_embeddings_tensor[0 : num_valid_seq[i]] = updated_tensor[ - 0 : num_valid_seq[i] - ] - return unique_embeddings_tensor - - def sgd_by_numpy(): - x = init_value[:, 0:embedding_size] - vt = init_value[:, embedding_size:] - - def train_one_iter(num_valid, grad, model, state): - grad[0:num_valid] = grad[0:num_valid] * (scale / down_scale_by) - next_state = ( - momentum * state[0:num_valid] if momentum > 0 else 0 - ) - learning_rate * grad[0:num_valid] - if momentum > 0: - state[0:num_valid] = next_state - model[0:num_valid] = ( - model[0:num_valid] - + next_state - - learning_rate * weight_decay * model[0:num_valid] - ) - return (model, state) - - for i in range(train_iters): - if skip_if_seq[i] > 0: - pass - else: - (x, vt) = train_one_iter( - int(num_valid_seq[i]), random_grad_seq[i], x, vt - ) - return x, vt - - oneflow_res = sgd_by_oneflow().numpy() - of_model = oneflow_res[:, 0:embedding_size] - of_momentum = oneflow_res[:, embedding_size:] - np_model, np_momentum = sgd_by_numpy() - test_case.assertTrue( - np.allclose(of_model.flatten(), np_model.flatten(), rtol=0.001, atol=0.001) - ) - if momentum > 0: - test_case.assertTrue( - np.allclose( - of_momentum.flatten(), np_momentum.flatten(), rtol=0.001, atol=0.001 - ) - ) - - -@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") -@flow.unittest.skip_unless_1n1d() -class TestOptimizers(flow.unittest.TestCase): - def test_one_embedding_sgd(test_case): - arg_dict = OrderedDict() - arg_dict["momentum"] = [0, 0.9] - arg_dict["weight_decay"] 
= [0, 0.1] - arg_dict["scale"] = [1, 0.1] - arg_dict["learning_rate"] = [1, 0.9] - arg_dict["train_iters"] = [10] - for arg in GenArgDict(arg_dict): - compare_with_numpy_sgd(test_case, **arg) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_parital_fc.py b/python/oneflow/test/modules/test_parital_fc.py index a5792f55447..50967dbad45 100644 --- a/python/oneflow/test/modules/test_parital_fc.py +++ b/python/oneflow/test/modules/test_parital_fc.py @@ -24,18 +24,13 @@ class TestParitalFC(flow.unittest.TestCase): def test_parital_fc(test_case): p = flow.env.all_device_placement("cuda") - w = flow.randn( - 50000, 128, placement=p, sbp=flow.sbp.broadcast, requires_grad=True - ) + w = flow.randn(50000, 128, placement=p, sbp=flow.sbp.broadcast) label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast) num_sample = 5000 out = flow.distributed_partial_fc_sample(w, label, num_sample) test_case.assertTrue(out[0].shape == flow.Size([512])) test_case.assertTrue(out[1].shape == flow.Size([5000])) test_case.assertTrue(out[2].shape == flow.Size([5000, 128])) - # test gradient function - sample_weight = out[2] - sample_weight.sum().backward() if __name__ == "__main__": diff --git a/python/oneflow/test/modules/test_prelu.py b/python/oneflow/test/modules/test_prelu.py index 410ef2eaf2a..02c4fd2e7bd 100644 --- a/python/oneflow/test/modules/test_prelu.py +++ b/python/oneflow/test/modules/test_prelu.py @@ -28,7 +28,7 @@ @flow.unittest.skip_unless_1n1d() class TestPReLU(flow.unittest.TestCase): - @autotest(n=5) + @autotest(n=30) def test_prelu_4dim_module_with_random_data(test_case): device = random_device() x = random_tensor(ndim=4, dim1=3).to(device) @@ -40,7 +40,7 @@ def test_prelu_4dim_module_with_random_data(test_case): y = m(x) return y - @autotest(n=5) + @autotest(n=30) def test_prelu_4dim_default_alpha_module_with_random_data(test_case): device = random_device() x = random_tensor(ndim=4, dim1=3).to(device) @@ -50,7 +50,7 @@ def test_prelu_4dim_default_alpha_module_with_random_data(test_case): y = m(x) return y - @autotest(n=5) + @autotest(n=30) def test_prelu_2dim_module_with_random_data(test_case): device = random_device() x = random_tensor(ndim=2, dim1=3).to(device) diff --git a/python/oneflow/test/modules/test_prod.py b/python/oneflow/test/modules/test_prod.py index 4d543e23a31..313d3a62c77 100644 --- a/python/oneflow/test/modules/test_prod.py +++ b/python/oneflow/test/modules/test_prod.py @@ -51,16 +51,6 @@ def test_reduce_prod_bool_without_dim(test_case): return y - @autotest(auto_backward=False, check_graph=False) - def test_reduce_prod_with_dtype(test_case): - device = random_device() - ndim = random(1, 5).to(int) - x = random_tensor(ndim=ndim, low=1.0, high=4.0, requires_grad=False).to(device) - dim = random(0, ndim).to(int) - y = torch.prod(x, dim, dtype=torch.int32) - - return y - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/modules/test_resnet_load_torch_weight_compatibile.py b/python/oneflow/test/modules/test_resnet_load_torch_weight_compatibile.py deleted file mode 100644 index 002b355fd20..00000000000 --- a/python/oneflow/test/modules/test_resnet_load_torch_weight_compatibile.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest - -import numpy as np -import torch -import torchvision.models as models_torch -import flowvision.models as models_flow - -import oneflow as flow -import oneflow.unittest - - -@flow.unittest.skip_unless_1n1d() -class TestResNet18LoadWeightCompatibile(flow.unittest.TestCase): - def test_resnet18_load_weight_compatibile(test_case): - resnet18_torch = models_torch.resnet18(pretrained=True) - resnet18_flow = models_flow.resnet18() - parameters = resnet18_torch.state_dict() - for key, value in parameters.items(): - val = value.detach().cpu().numpy() - parameters[key] = val - - resnet18_flow.load_state_dict(parameters) - torch_input = torch.randn(1, 3, 224, 224) - flow_input = flow.tensor(torch_input.cpu().numpy()) - torch_output = resnet18_torch(torch_input) - flow_output = resnet18_flow(flow_input) - test_case.assertTrue( - np.allclose( - torch_output.detach().numpy(), flow_output.numpy(), atol=1e-4, rtol=1e-4 - ) - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_roc_auc_score.py b/python/oneflow/test/modules/test_roc_auc_score.py deleted file mode 100644 index f0d00182bef..00000000000 --- a/python/oneflow/test/modules/test_roc_auc_score.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" -import unittest -from collections import OrderedDict - -import numpy as np -import oneflow as flow -from oneflow.test_utils.test_util import GenArgList -from sklearn.metrics import roc_auc_score - - -def _test_roc_auc_score(test_case, label_dtype, pred_dtype): - inputs = [ - {"label": [0, 0, 1, 1], "pred": [0.1, 0.4, 0.35, 0.8], "score": 0.75}, - {"label": [0, 1, 0, 1], "pred": [0.5, 0.5, 0.5, 0.5], "score": 0.5}, - ] - for data in inputs: - label = flow.tensor(data["label"], dtype=label_dtype) - pred = flow.tensor(data["pred"], dtype=pred_dtype) - of_score = flow.roc_auc_score(label, pred) - test_case.assertTrue(np.allclose(of_score.numpy()[0], data["score"])) - - -def _compare_roc_auc_score(test_case, label_dtype, pred_dtype): - n_examples = 16384 - label = np.random.randint(0, 2, n_examples) - pred = np.random.random(n_examples) - score = roc_auc_score(label, pred) - - label = flow.tensor(label, dtype=label_dtype) - pred = flow.tensor(pred, dtype=pred_dtype) - of_score = flow.roc_auc_score(label, pred) - - test_case.assertTrue(np.allclose(of_score.numpy()[0], score)) - - -@flow.unittest.skip_unless_1n1d() -class TestNMS(flow.unittest.TestCase): - def test_roc_auc_score(test_case): - arg_dict = OrderedDict() - arg_dict["test_fun"] = [_test_roc_auc_score, _compare_roc_auc_score] - arg_dict["label_dtype"] = [ - flow.double, - flow.int32, - flow.float, - flow.int64, - flow.int8, - flow.uint8, - ] - arg_dict["pred_dtype"] = [flow.float] - for arg in GenArgList(arg_dict): - arg[0](test_case, *arg[1:]) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_swapdims.py b/python/oneflow/test/modules/test_swapdims.py deleted file mode 100644 index 76d43cb9df7..00000000000 --- a/python/oneflow/test/modules/test_swapdims.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict - -import numpy as np - -from oneflow.test_utils.automated_test_util import * -from oneflow.test_utils.test_util import GenArgList - -import oneflow as flow -import oneflow.unittest - - -@flow.unittest.skip_unless_1n1d() -class Testswapdims(flow.unittest.TestCase): - @autotest(check_graph=True) - def test_swapdims_flow_with_random_data(test_case): - device = random_device() - x = random_tensor(ndim=3).to(device) - y = torch.swapdims(x, np.random.randint(0, 3), np.random.randint(0, 3)) - return y - - @autotest(check_graph=True) - def test_swapdims_flow_with_random_data2(test_case): - device = random_device() - x = random_tensor(ndim=4).to(device) - y = torch.swapdims(x, np.random.randint(0, 4), np.random.randint(0, 4)) - return y - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_unbind.py b/python/oneflow/test/modules/test_unbind.py deleted file mode 100644 index b6520988fec..00000000000 --- a/python/oneflow/test/modules/test_unbind.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. 
All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -from collections import OrderedDict - -import numpy as np - -from oneflow.test_utils.automated_test_util import * -from oneflow.test_utils.test_util import GenArgList - -import oneflow as flow -import oneflow.unittest - - -@flow.unittest.skip_unless_1n1d() -class TestUnbind(flow.unittest.TestCase): - @autotest(n=5, check_graph=True) - def test_unbind_flow_with_random_data1(test_case): - device = random_device() - x = random_tensor(ndim=4).to(device) - y = torch.unbind(x, random(0, 4).to(int)) - return y - - @autotest(n=5, check_graph=True) - def test_unbind_flow_with_random_data2(test_case): - device = random_device() - x = random_tensor(ndim=4).to(device) - y = torch.unbind(x, random(0, 4).to(int)) - return y - - @autotest(n=5, check_graph=True) - def test_unbind_flow_with_random_data3(test_case): - device = random_device() - x = random_tensor(ndim=3).to(device) - y = torch.unbind(x, random(0, 3).to(int)) - return y - - @autotest(n=5, check_graph=True) - def test_unbind_flow_with_random_data4(test_case): - device = random_device() - x = random_tensor(ndim=3).to(device) - y = torch.unbind(x, random(0, 3).to(int)) - return y - - @autotest(n=5, check_graph=True) - def test_unbind_flow_with_random_data5(test_case): - device = random_device() - x = random_tensor(ndim=2).to(device) - y = torch.unbind(x, random(0, 2).to(int)) - return y - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/modules/test_upsample.py b/python/oneflow/test/modules/test_upsample.py index 6da66c713ef..e3f415c729b 100644 --- a/python/oneflow/test/modules/test_upsample.py +++ b/python/oneflow/test/modules/test_upsample.py @@ -377,7 +377,7 @@ def test_upsample2d(test_case): "The nearest interpolate operation in pytorch has bug, https://github.com/pytorch/pytorch/issues/65200" ) @autotest() - def test_upsample2d_nearest(test_case): + def test_upsample2d(test_case): device = random_device() x = random_tensor().to(device) m = torch.nn.Upsample(scale_factor=random().to(float), mode="nearest") @@ -399,18 +399,6 @@ def test_upsample2d_bilinear(test_case): y = m(x) return y - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - @autotest(atol=1e-5) - def test_upsample2d_bicubic(test_case): - x = random_tensor(ndim=4, dim0=16, dim1=8).to("cuda") - m = torch.nn.Upsample( - scale_factor=random().to(float), - mode="bicubic", - align_corners=random_bool(), - ) - y = m(x) - return y - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/tensor/test_new_tensor.py b/python/oneflow/test/tensor/test_new_tensor.py deleted file mode 100644 index c13fde2b9bd..00000000000 --- a/python/oneflow/test/tensor/test_new_tensor.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import os -import unittest -import numpy as np -import oneflow as flow -import oneflow.unittest - - -class TestNewTensor(flow.unittest.TestCase): - @flow.unittest.skip_unless_1n1d() - def test_new_tensor_local_mode_with_default_args(test_case): - tensor = flow.randn(5) - data = [[1, 2], [3, 4]] - new_tensor = tensor.new_tensor(data) - test_case.assertEqual(new_tensor.dtype, tensor.dtype) - test_case.assertEqual(new_tensor.device, tensor.device) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - @flow.unittest.skip_unless_1n1d() - def test_new_tensor_local_mode_with_spec_args(test_case): - tensor = flow.randn(5) - data = [[1, 2], [3, 4]] - new_tensor = tensor.new_tensor(data, flow.int64, "cuda") - test_case.assertEqual(new_tensor.dtype, flow.int64) - test_case.assertEqual(new_tensor.device, flow.device("cuda")) - - @flow.unittest.skip_unless_1n2d() - def test_new_tensor_global_mode_with_default_args(test_case): - placement = flow.placement(type="cpu", ranks=[0, 1]) - sbp = flow.sbp.split(0) - tensor = flow.randn(4, 4, placement=placement, sbp=sbp) - data = [[1, 2], [3, 4]] - new_tensor = tensor.new_tensor(data) - test_case.assertEqual(new_tensor.dtype, tensor.dtype) - test_case.assertEqual(new_tensor.placement, placement) - test_case.assertEqual(new_tensor.sbp, (sbp,)) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - @flow.unittest.skip_unless_1n2d() - def test_new_tensor_global_mode_with_spec_args(test_case): - placement = flow.placement(type="cuda", ranks=[0, 1]) - sbp = flow.sbp.split(0) - tensor = flow.randn(4, 4, placement=placement, sbp=sbp) - data = [[1, 2], [3, 4]] - new_tensor = tensor.new_tensor( - data, placement=placement, sbp=flow.sbp.broadcast - ) - test_case.assertEqual(new_tensor.dtype, tensor.dtype) - test_case.assertEqual(new_tensor.placement, placement) - test_case.assertEqual(new_tensor.sbp, (flow.sbp.broadcast,)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/oneflow/test/tensor/test_tensor_part_1.py b/python/oneflow/test/tensor/test_tensor_part_1.py index 5bbe6624546..8fd4d4fdf5e 100644 --- a/python/oneflow/test/tensor/test_tensor_part_1.py +++ b/python/oneflow/test/tensor/test_tensor_part_1.py @@ -176,7 +176,7 @@ def _test_tensor_init_methods(test_case, tensor_creator, get_numpy): test_case.assertTrue(np.allclose(get_numpy(x), np_zeros)) flow.nn.init.constant_(x, random_fill_val) test_case.assertTrue(np.allclose(get_numpy(x), random_fill_val * np_ones)) - x.zero_() + x.zeros_() test_case.assertTrue(np.array_equal(get_numpy(x), np_zeros)) test_case.assertEqual(flow.nn.init.calculate_gain("conv2d"), 1) test_case.assertEqual(flow.nn.init.calculate_gain("tanh"), 5.0 / 3) @@ -936,7 +936,7 @@ def test_tensor_logical_slice_assign(test_case): def test_zeros_(test_case): shape = (2, 3) x = flow.tensor(np.random.randn(*shape), dtype=flow.float32) - x.zero_() + x.zeros_() test_case.assertTrue(np.allclose(x.numpy(), np.zeros(shape))) @flow.unittest.skip_unless_1n1d() @@ -1046,12 +1046,6 @@ def test_none_equal(test_case): z = None in [xt, yt, zt] test_case.assertTrue(np.array_equal(z, True)) - 
def test_half(test_case): - x = flow.tensor([1], dtype=flow.int64) - test_case.assertTrue(x.dtype == flow.int64) - y = x.half() - test_case.assertTrue(y.dtype == flow.float16) - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/tensor/test_tensor_part_2.py b/python/oneflow/test/tensor/test_tensor_part_2.py index 0c86fc5c0b3..b6f267a864a 100644 --- a/python/oneflow/test/tensor/test_tensor_part_2.py +++ b/python/oneflow/test/tensor/test_tensor_part_2.py @@ -899,14 +899,6 @@ def test_tensor_split_sizes(test_case): res = x.split([1, 2, 3, 1], dim=-2) return torch.cat(res, dim=1) - @flow.unittest.skip_unless_1n1d() - @autotest(n=5, check_graph=True) - def test_tensor_unbind(test_case): - device = random_device() - x = random_tensor(ndim=4).to(device) - y = x.unbind(random(0, 4).to(int)) - return y - @flow.unittest.skip_unless_1n1d() @autotest(n=5, check_graph=True) def test_tensor_swapaxes(test_case): @@ -915,14 +907,6 @@ def test_tensor_swapaxes(test_case): y = x.swapaxes(random(0, 2).to(int), random(0, 2).to(int)) return y - @flow.unittest.skip_unless_1n1d() - @autotest(n=5, check_graph=True) - def test_tensor_swapdimst(test_case): - device = random_device() - x = random_tensor(ndim=3).to(device) - y = x.swapdims(random(0, 3).to(int), random(0, 3).to(int)) - return y - if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py index 711276a1b06..22d2563c108 100644 --- a/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py +++ b/python/oneflow/test_utils/automated_test_util/torch_flow_dual_object.py @@ -19,7 +19,6 @@ import copy import os import warnings -import gc import numpy as np import oneflow as flow @@ -337,7 +336,7 @@ def build(self): print( "Run graph of function: ", repr(oneflow), ) - test_g.debug(2) + test_g.debug(3) test_g_res = test_g() if verbose: print( @@ -367,7 +366,7 @@ def build(self): test_g = TestGraphOfTensorMethod() if verbose: print("Run graph of method: ", repr(oneflow)) - test_g.debug(2) + test_g.debug(3) test_g_res = test_g() if verbose: print( @@ -438,7 +437,7 @@ def oneflow_eager_run_with_graph_check( test_g = get_module_graph_test(graph_train_oneflow, oneflow, *args) if verbose: print("Run graph of module: ", repr(oneflow)) - test_g.debug(2) + test_g.debug(3) # When testing module methods, kwargs are not considered. test_g_res = test_g(*graph_args) if verbose: @@ -451,10 +450,7 @@ def oneflow_eager_run_with_graph_check( # 2. inspect.isfunction(oneflow): Compared with the ordinary flow.xxx, oneflow.nn.modules.math_ops series op exist an extra layer of python wrapper. # 3. inspect.ismethod(oneflow) and "oneflow.nn.modules" in oneflow.__module__: For op that only has Tensor.xxx method, and call oneflow.xxx actually, like masked_fill. 
elif ( - ( - oneflow.__module__ is not None - and ("oneflow.nn.modules" not in oneflow.__module__) - ) + ("oneflow.nn.modules" not in oneflow.__module__) or inspect.isfunction(oneflow) or ( inspect.ismethod(oneflow) and "oneflow.nn.modules" in oneflow.__module__ @@ -847,10 +843,6 @@ def __eq__(self, other): else: return self.pytorch == other - def __del__(self): - # force running gc to avoid the periodic gc related to metaclass - gc.collect() - dual_modules_to_test = [] dual_objects_to_test = [] @@ -937,33 +929,6 @@ def check_basetype_equality(a, b, ignored1, ignored2, check_dtype=False): return a == b -@equality_checker(tuple, tuple) -@equality_checker(list, list) -def check_basetype_equality(a, b, rtol=0.0001, atol=1e-05, check_dtype=False): - if len(a) != len(b): - equality_res = False - else: - for i in range(len(a)): - torch_np = a[i].detach().cpu().numpy() - flow_np = b[i].detach().cpu().numpy() - equality_res = np.allclose( - torch_np, flow_np, rtol=rtol, atol=atol, equal_nan=True, - ) - if check_dtype: - equality_res = equality_res and (torch_np.dtype == flow_np.dtype) - if equality_res == False: - print_note_fake_program() - print("---------Tensor Shape--------") - print(a[i].shape) - print(b[i].shape) - print("---------Tensor dtype--------") - print(a[i].dtype) - print(b[i].dtype) - break - - return equality_res - - @equality_checker(type(None), type(None)) def check_nonetype_equality(a, b, ignored1, ignored2, check_dtype=False): return True @@ -991,8 +956,6 @@ def new_f(test_case, *args, **kwargs): loop_limit = successful_runs_needed * 20 current_run = 0 while successful_runs_needed > 0: - # force running gc to avoid the periodic gc related to metaclass - gc.collect() clear_note_fake_program() if current_run > loop_limit: raise ValueError( diff --git a/python/oneflow/test_utils/oneflow_pytorch_compatibility/__init__.py b/python/oneflow/test_utils/oneflow_pytorch_compatiblity/__init__.py similarity index 100% rename from python/oneflow/test_utils/oneflow_pytorch_compatibility/__init__.py rename to python/oneflow/test_utils/oneflow_pytorch_compatiblity/__init__.py diff --git a/python/oneflow/test_utils/oneflow_pytorch_compatibility/oneflow_pytorch_compatiblity_test.py b/python/oneflow/test_utils/oneflow_pytorch_compatiblity/oneflow_pytorch_compatiblity_test.py similarity index 93% rename from python/oneflow/test_utils/oneflow_pytorch_compatibility/oneflow_pytorch_compatiblity_test.py rename to python/oneflow/test_utils/oneflow_pytorch_compatiblity/oneflow_pytorch_compatiblity_test.py index ca1ee880974..acfd9d6224c 100644 --- a/python/oneflow/test_utils/oneflow_pytorch_compatibility/oneflow_pytorch_compatiblity_test.py +++ b/python/oneflow/test_utils/oneflow_pytorch_compatiblity/oneflow_pytorch_compatiblity_test.py @@ -150,12 +150,8 @@ def get_loss( + [ "import oneflow as torch", "import oneflow.nn as nn", - "import oneflow.nn.init as init", - "import oneflow.nn.functional as F", "from oneflow import Tensor", "from oneflow.nn import Parameter", - "import math", - "from flowvision.layers import *", ] + lines[i:] ) @@ -217,24 +213,20 @@ def get_loss( def do_test_train_loss_oneflow_pytorch( - test_case, - model_path: str, - module_name: str, - device: str = "cuda", - batch_size: int = 16, - img_size: int = 224, + test_case, model_path: str, module_name: str, device: str = "cuda", ): - image_nd = np.random.rand(batch_size, 3, img_size, img_size).astype(np.float32) + batch_size = 16 + image_nd = np.random.rand(batch_size, 3, 224, 224).astype(np.float32) label_nd = np.array([e for e in 
range(batch_size)], dtype=np.int32) oneflow_model_loss = [] pytorch_model_loss = [] with tempfile.TemporaryDirectory() as tmpdirname: pytorch_model_loss = get_loss( - image_nd, label_nd, model_path, module_name, True, device, tmpdirname + image_nd, label_nd, model_path, module_name, True, "cuda", tmpdirname ) oneflow_model_loss = get_loss( - image_nd, label_nd, model_path, module_name, False, device, tmpdirname + image_nd, label_nd, model_path, module_name, False, "cuda", tmpdirname ) if verbose: diff --git a/tools/functional/generate_dispatch_stateful_ops.py b/tools/functional/generate_dispatch_stateful_ops.py index 639fa4648af..73461357909 100644 --- a/tools/functional/generate_dispatch_stateful_ops.py +++ b/tools/functional/generate_dispatch_stateful_ops.py @@ -94,6 +94,10 @@ license + """ +#include + +namespace py = pybind11; + namespace oneflow {{ namespace one {{ namespace functional {{ @@ -108,20 +112,19 @@ license + """ -#include +#include +#include #include "oneflow/api/python/of_api_registry.h" -#include "oneflow/api/python/functional/common.h" -#include "oneflow/api/python/exception/exception.h" #include "oneflow/api/python/functional/function_def.h" -#include "oneflow/api/python/functional/python_arg.h" -#include "oneflow/api/python/functional/python_arg_parser.h" -#include "oneflow/api/python/functional/python_frame.h" +#include "oneflow/api/python/functional/py_function.h" #include "oneflow/api/python/functional/dispatch_stateful_ops.yaml.h" #include "oneflow/api/python/functional/dispatch_stateful_ops.yaml.pybind.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/optional.h" +namespace py = pybind11; + namespace oneflow {{ namespace one {{ namespace functional {{ @@ -132,15 +135,11 @@ namespace functional = one::functional; ONEFLOW_API_PYBIND11_MODULE("_C", m) {{ - static PyMethodDef functions[] = {{ -{1} - {{NULL, NULL, 0, NULL}} - }}; + py::options options; + options.disable_function_signatures(); - PyObject* module = m.ptr(); - if (module) {{ - PyModule_AddFunctions(module, functions); - }} +{1} + options.enable_function_signatures(); }} }} // namespace oneflow diff --git a/tools/functional/generate_functional_api.py b/tools/functional/generate_functional_api.py index 73244abdc81..deca4b529e1 100644 --- a/tools/functional/generate_functional_api.py +++ b/tools/functional/generate_functional_api.py @@ -97,7 +97,9 @@ license + """ -#include +#include + +namespace py = pybind11; namespace oneflow {{ namespace one {{ @@ -113,15 +115,12 @@ license + """ -#include +#include +#include #include "oneflow/api/python/of_api_registry.h" -#include "oneflow/api/python/exception/exception.h" -#include "oneflow/api/python/functional/common.h" #include "oneflow/api/python/functional/function_def.h" -#include "oneflow/api/python/functional/python_arg.h" -#include "oneflow/api/python/functional/python_arg_parser.h" -#include "oneflow/api/python/functional/python_frame.h" +#include "oneflow/api/python/functional/py_function.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/optional.h" #include "oneflow/core/functional/functional.h" @@ -136,15 +135,11 @@ namespace functional = one::functional; ONEFLOW_API_PYBIND11_MODULE("_C", m) {{ - static PyMethodDef functions[] = {{ -{1} - {{NULL, NULL, 0, NULL}} - }}; + py::options options; + options.disable_function_signatures(); - PyObject* module = m.ptr(); - if (module) {{ - PyModule_AddFunctions(module, functions); - }} +{1} + options.enable_function_signatures(); }} }} // namespace oneflow diff --git 
a/tools/functional/generate_tensor_api.py b/tools/functional/generate_tensor_api.py index 05bd9c62697..98278ce5903 100644 --- a/tools/functional/generate_tensor_api.py +++ b/tools/functional/generate_tensor_api.py @@ -93,7 +93,9 @@ license + """ -#include +#include + +namespace py = pybind11; namespace oneflow {{ namespace one {{ @@ -109,20 +111,19 @@ license + """ -#include +#include +#include #include "oneflow/api/python/of_api_registry.h" -#include "oneflow/api/python/exception/exception.h" -#include "oneflow/api/python/functional/common.h" #include "oneflow/api/python/functional/function_def.h" -#include "oneflow/api/python/functional/python_arg.h" -#include "oneflow/api/python/functional/python_arg_parser.h" -#include "oneflow/api/python/functional/python_frame.h" +#include "oneflow/api/python/functional/py_function.h" #include "oneflow/api/python/functional/tensor_api.yaml.h" #include "oneflow/api/python/functional/tensor_api.yaml.pybind.h" #include "oneflow/core/common/maybe.h" #include "oneflow/core/common/optional.h" +namespace py = pybind11; + namespace oneflow {{ namespace one {{ namespace functional {{ @@ -133,15 +134,11 @@ namespace functional = one::functional; ONEFLOW_API_PYBIND11_MODULE("_C", m) {{ - static PyMethodDef functions[] = {{ -{1} - {{NULL, NULL, 0, NULL}} - }}; + py::options options; + options.disable_function_signatures(); - PyObject* module = m.ptr(); - if (module) {{ - PyModule_AddFunctions(module, functions); - }} +{1} + options.enable_function_signatures(); }} }} // namespace oneflow diff --git a/tools/functional/generator.py b/tools/functional/generator.py index 6a0054a655a..6234b6ae7f8 100644 --- a/tools/functional/generator.py +++ b/tools/functional/generator.py @@ -447,12 +447,10 @@ def generate_pybind_for_python( header_fmt = "" for name, blocks in self._blocks.items(): schema_types = [] - max_args_count = 0 for block in blocks: if not block._bind_python: continue signature = block._signature - max_args_count = max(max_args_count, signature._max_args_count) schema_types.append( "functional::{0}".format(signature.get_schema_name()) ) @@ -522,41 +520,16 @@ def generate_pybind_for_python( ) if len(schema_types) > 0: - module_fmt += ' {{"{0}", (PyCFunction)functional::{1}, METH_VARARGS | METH_KEYWORDS, NULL}},\n'.format( - name, name + module_fmt += ' m.def("{0}", &functional::PyFunction<{1}>);\n'.format( + name, ", ".join(schema_types) ) - header_fmt += "\n" - header_fmt += "PyObject* {0}(PyObject* self, PyObject* args, PyObject* kwargs);\n".format( + header_fmt += "\npy::object {0}(const py::args& args, const py::kwargs& kwargs);\n".format( name ) - schema_fmt += "\n" - schema_fmt += "PyObject* {0}(PyObject* self, PyObject* args, PyObject* kwargs) {{\n".format( - name + schema_fmt += "\npy::object {0}(const py::args& args, const py::kwargs& kwargs) {{\n return functional::PyFunction<{1}>(args, kwargs);\n}}\n".format( + name, ", ".join(schema_types) ) - schema_fmt += " HANDLE_ERRORS\n" - schema_fmt += " PythonFrameGuard pf;\n" - schema_fmt += ' static PythonArgParser<{0}> parser("{1}");\n'.format( - ", ".join(schema_types), name - ) - schema_fmt += " ParsedArgs<{0}> r;\n".format(max_args_count) - schema_fmt += " int idx = parser.Parse(args, kwargs, &r);\n" - i = 0 - for block in blocks: - signature = block._signature - schema_fmt += " if (idx == {0}) {{\n".format(i) - params = [] - for j in range(len(signature._args)): - cpp_type = _std_decay(signature._args[j]._cpp_type) - params.append("r[{0}].As<{1}>()".format(j, cpp_type)) - schema_fmt += " return 
CastToPyObject(functional::{0}({1}));\n".format( - signature._name, ", ".join(params) - ) - schema_fmt += " }\n" - i += 1 - schema_fmt += " Py_RETURN_NONE;\n" - schema_fmt += " END_HANDLE_ERRORS\n" - schema_fmt += "}\n" render_file_if_different( target_pybind_header_file, pybind_header_fmt.format(header_fmt)