diff --git a/.github/workflows/canary.yml b/.github/workflows/canary.yml
index 5b053ee21fd..f39b16d050e 100644
--- a/.github/workflows/canary.yml
+++ b/.github/workflows/canary.yml
@@ -55,7 +55,7 @@ jobs:
       - name: Checkout Oneflow-Inc/oneflow
         if: ${{ github.event.inputs.oneflow-ref == '' }}
         uses: actions/checkout@v2
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build manylinux
         id: build-cuda
         with:
diff --git a/.github/workflows/on_merge.yml b/.github/workflows/on_merge.yml
index f327d68d0d3..6085a59da77 100644
--- a/.github/workflows/on_merge.yml
+++ b/.github/workflows/on_merge.yml
@@ -15,6 +15,6 @@ jobs:
     if: github.event.pull_request.merged == true
     runs-on: ubuntu-latest
     steps:
-      - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/update-benchmark-history@support-iree-ci
         name: Update benchmark history
         timeout-minutes: 10
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index a97e72de34d..1e4112a28ba 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -33,7 +33,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-iree-ci
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -74,7 +74,7 @@ jobs:
           python3 -m pip install -U pip setuptools wheel --user
           python3 -m pip install oss2  --user
       - uses: actions/checkout@v2
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build ${{ matrix.entry }}
         if: ${{ matrix.entry !='cpu' }}
         with:
@@ -98,7 +98,7 @@ jobs:
             3.8
             3.9
             3.10
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build ${{ matrix.entry }}
         if: ${{ matrix.entry =='cpu' }}
         with:
diff --git a/.github/workflows/simple.yml b/.github/workflows/simple.yml
index 2f22f7b74d5..1b2064f1a61 100644
--- a/.github/workflows/simple.yml
+++ b/.github/workflows/simple.yml
@@ -245,7 +245,7 @@ jobs:
           repository: Oneflow-Inc/conda-env
           ref: 30a7f00eb48ee9009d85a848e720823e5054c66b
           path: conda-env
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build with gcc7
         if: ${{ matrix.build-type == 'gcc7'}}
         with:
@@ -254,7 +254,7 @@ jobs:
           oneflow-build-env: conda
           conda-env-file: conda-env/dev/gcc7/environment-v2.yml
           conda-env-name: oneflow-dev-gcc7-v2
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build with clang10
         if: ${{ matrix.build-type == 'clang10'}}
         with:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9d465fa372b..c0f79e273ab 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,6 +16,8 @@ env:
   FLOW_VISION_COMMIT: ca8ebc663b58667cf8cd1b6ef0c861522780b7bb
   LIBAI_SRC: libai
   LIBAI_COMMIT: 7d31d9781e5f2d559dc0820f599e0bed798488ca
+  ONEFLOW_IREE_SRC: oneflow_iree
+  ONEFLOW_IREE_COMMIT: 4322cbad2545877b1664aa8e0f17a17f6b5f687c
   TEST_WITH_TORCH_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-pytorch-1.10.0-cuda11.3-cudnn8-runtime:afaf913e02a4ba02db92260daee22f99121cef62
   MLIR_DOCKER_ARGS: "-e ONEFLOW_MLIR_ENABLE_ROUND_TRIP=1 -e ONEFLOW_MLIR_PREFER_NHWC=0 -e ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION=1"
 
@@ -25,7 +27,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.event.pull_request.draft == false && github.base_ref == 'master' && contains(github.event.pull_request.requested_reviewers.*.login, 'oneflow-ci-bot')
     steps:
-      - uses: Oneflow-Inc/get-oneflow/priority-pr@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/priority-pr@support-iree-ci
         name: Check priority PR closed
         id: save-cache
         timeout-minutes: 5
@@ -159,7 +161,7 @@ jobs:
           fi
           echo "is_secrets_accessible=1" >> $GITHUB_ENV
       - name: Wait for GPU slot
-        uses: Oneflow-Inc/get-oneflow/wait-for-gpu@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/wait-for-gpu@support-iree-ci
         if: env.is_secrets_accessible == '1'
         timeout-minutes: 90
         continue-on-error: true
@@ -183,7 +185,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@support-iree-ci
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -230,7 +232,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
@@ -266,7 +268,7 @@ jobs:
           python-versions: |
             3.6
             3.7
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build manylinux ${{ matrix.entry }}
         id: build-cuda
         if: ${{ matrix.entry =='cu102' && !matrix.cache-hit }}
@@ -286,7 +288,7 @@ jobs:
           clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}
           python-versions: |
             3.7
-      - uses: Oneflow-Inc/get-oneflow@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow@support-iree-ci
         name: Build ${{ matrix.entry }}
         if: ${{ matrix.entry == 'llvm13' && !matrix.cache-hit }}
         with:
@@ -325,7 +327,7 @@ jobs:
             })
       - name: Upload packed liboneflow
         if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }}
-        uses: Oneflow-Inc/get-oneflow/digest/upload@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/digest/upload@support-iree-ci
         timeout-minutes: 10
         with:
           digest: ${{ steps.save-cache.outputs.build-digest }}
@@ -336,7 +338,7 @@ jobs:
           dst-dir: cpack
       - name: Upload whl
         if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm13' && matrix.entry != 'cu102_xla' }}
-        uses: Oneflow-Inc/get-oneflow/digest/upload@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/digest/upload@support-iree-ci
         timeout-minutes: 10
         with:
           digest: ${{ steps.save-cache.outputs.build-digest }}
@@ -361,7 +363,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-iree-ci
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -392,7 +394,7 @@ jobs:
         with:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
-      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@support-iree-ci
         name: find cache
         id: find-cache
         timeout-minutes: 5
@@ -456,12 +458,20 @@ jobs:
           # please use a commit here
           ref: ${{ env.LIBAI_COMMIT}}
           path: ${{ env.LIBAI_SRC}}
+      - name: Checkout Oneflow-Inc/oneflow_iree
+        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
+        uses: actions/checkout@v2
+        with:
+          repository: Oneflow-Inc/oneflow_iree
+          # please use a commit here
+          ref: ${{ env.ONEFLOW_IREE_COMMIT}}
+          path: ${{ env.ONEFLOW_IREE_SRC}}
       - name: Remove container
         timeout-minutes: 45
         if: ${{ contains(matrix.runs-on, 'self-hosted') }}
         run: |
           docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
@@ -477,7 +487,7 @@ jobs:
           exit 1
       - name: Download wheel and packed liboneflow
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
-        uses: Oneflow-Inc/get-oneflow/digest/download@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/digest/download@support-iree-ci
         id: download-digest
         timeout-minutes: 10
         with:
@@ -487,7 +497,7 @@ jobs:
           ssh-tank-path: ${{ env.SSH_TANK_PATH }}
       - name: Get primary node
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
-        uses: Oneflow-Inc/get-oneflow/master-address@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/master-address@support-iree-ci
         id: get-primary-node
         with:
           rank: ${{ matrix.rank }}
@@ -560,6 +570,7 @@ jobs:
           docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}}
           docker exec ${TEST_CONTAINER_NAME} python3 -m pip install pybind11 --user
           docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.LIBAI_SRC}}
+          docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.ONEFLOW_IREE_SRC}}
       - name: Module API test (distributed)
         timeout-minutes: 90
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'module' && matrix.device == 'cuda' && fromJson(matrix.is-distributed) }}
@@ -649,12 +660,20 @@ jobs:
           # please use a commit here
           ref: ${{ env.LIBAI_COMMIT}}
           path: ${{ env.LIBAI_SRC}}
+      - name: Checkout Oneflow-Inc/oneflow_iree
+        if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
+        uses: actions/checkout@v2
+        with:
+          repository: Oneflow-Inc/oneflow_iree
+          # please use a commit here
+          ref: ${{ env.ONEFLOW_IREE_COMMIT}}
+          path: ${{ env.ONEFLOW_IREE_SRC}}
       - name: Remove container
         timeout-minutes: 45
         if: ${{ contains(matrix.runs-on, 'self-hosted') }}
         run: |
           docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
@@ -670,7 +689,7 @@ jobs:
           exit 1
       - name: Download wheel and packed liboneflow
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
-        uses: Oneflow-Inc/get-oneflow/digest/download@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/digest/download@support-iree-ci
         id: download-digest
         timeout-minutes: 10
         with:
@@ -782,6 +801,7 @@ jobs:
           docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.FLOW_VISION_SRC}}
           docker exec ${TEST_CONTAINER_NAME} python3 -m pip install pybind11 --user
           docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.LIBAI_SRC}}
+          docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -e ${{ env.ONEFLOW_IREE_SRC}}
       - name: Run OneFlow doctor
         if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
         run: |
@@ -866,7 +886,7 @@ jobs:
               body: "<details>\n <summary>Speed stats:</summary>\n\n ``` \n${{ steps.speed.outputs.stats }}\n ``` \n\n</details>".replace(/\\n/g, '\n')
             })
       - name: Module API test
-        timeout-minutes: 45
+        timeout-minutes: 50
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'module' && !fromJson(matrix.is-distributed) }}
         run: |
           docker exec -e ONEFLOW_TEST_DIR=$PWD/python/oneflow/test/modules ${{ env.TEST_CONTAINER_NAME }} bash ci/test/generic_test_multi_client.sh
@@ -884,6 +904,11 @@ jobs:
           docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.LIBAI_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/models/test_gpt.py
           docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.LIBAI_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/models/test_t5.py
           docker exec -e ONEFLOW_TEST_DEVICE_NUM=4 -w $PWD/${{ env.LIBAI_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m oneflow.distributed.launch --nproc_per_node 4 -m unittest -f tests/models/test_vit.py
+      - name: oneflow_iree test
+        timeout-minutes: 45
+        if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'misc' }}
+        run: |
+          docker exec -w $PWD/${{ env.ONEFLOW_IREE_SRC }} ${{ env.TEST_CONTAINER_NAME }} python3 -m pytest examples
       - name: Expensive tests (models, cases require exclusive access to GPU)
         timeout-minutes: 45
         if: ${{ !fromJson(matrix.cache-hit) && (matrix.test-type == 'speed-test' || (matrix.test-type == 'misc' && matrix.device == 'cpu')) && !fromJson(matrix.is-distributed) }}
@@ -909,7 +934,7 @@ jobs:
       - name: Benchmark Test
         timeout-minutes: 100
         if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }}
-        uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-cuda-1106
+        uses: Oneflow-Inc/get-oneflow/pytest-benchmark@support-iree-ci
         with:
           collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark
           container-name: ${{ env.TEST_CONTAINER_NAME }}
@@ -962,7 +987,7 @@ jobs:
           ref: ${{ github.event.pull_request.head.sha }}
           repository: ${{github.event.pull_request.head.repo.full_name}}
           fetch-depth: 0
-      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-cuda-1106
+      - uses: Oneflow-Inc/get-oneflow/cache-complete@support-iree-ci
         name: Save cache if successful
         id: save-cache
         timeout-minutes: 5
diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake
index b93a12e55fe..0176468ccd6 100644
--- a/cmake/oneflow.cmake
+++ b/cmake/oneflow.cmake
@@ -250,18 +250,21 @@ if("${LLVM_MONO_REPO_URL}" STREQUAL
       "https://github.com/llvm/llvm-project/archive/7eaa84eac3ba935d13f4267d3d533a6c3e1283ed.zip"
    OR "${LLVM_MONO_REPO_URL}" STREQUAL
       "https://github.com/llvm/llvm-project/archive/35e60f5de180aea55ed478298f4b40f04dcc57d1.zip"
+   OR "${LLVM_MONO_REPO_URL}" STREQUAL
+      "https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip"
    OR "${LLVM_MONO_REPO_MD5}" STREQUAL "f2f17229cf21049663b8ef4f2b6b8062"
    OR "${LLVM_MONO_REPO_MD5}" STREQUAL "6b7c6506d5922de9632c8ff012b2f945"
    OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e0ea669a9f0872d35bffda5ec6c5ac6f"
+   OR "${LLVM_MONO_REPO_MD5}" STREQUAL "241a333828bba1efa35aff4c4fc2ce87"
    OR "${LLVM_MONO_REPO_MD5}" STREQUAL "075fbfdf06cb3f02373ea44971af7b03")
   unset(LLVM_MONO_REPO_URL CACHE)
   unset(LLVM_MONO_REPO_MD5 CACHE)
 endif()
 set(LLVM_MONO_REPO_URL
-    "https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip"
+    "https://github.com/llvm/llvm-project/archive/32805e60c9de1f82887cd2af30d247dcabd2e1d3.zip"
     CACHE STRING "")
 use_mirror(VARIABLE LLVM_MONO_REPO_URL URL ${LLVM_MONO_REPO_URL})
-set(LLVM_MONO_REPO_MD5 "241a333828bba1efa35aff4c4fc2ce87" CACHE STRING "")
+set(LLVM_MONO_REPO_MD5 "e412dc61159b5e929b0c94e44b11feb2" CACHE STRING "")
 set(ONEFLOW_BUILD_ROOT_DIR "${PROJECT_BINARY_DIR}")
 add_subdirectory(${PROJECT_SOURCE_DIR}/oneflow/ir)
 if(WITH_MLIR)
diff --git a/oneflow/ir/include/OneFlow/Conversion/SCFToGPU.h b/oneflow/ir/include/OneFlow/Conversion/SCFToGPU.h
deleted file mode 100644
index e6c70591035..00000000000
--- a/oneflow/ir/include/OneFlow/Conversion/SCFToGPU.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#ifndef ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_SCFTOGPU_H_
-#define ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_SCFTOGPU_H_
-
-#include "mlir/Pass/Pass.h"
-
-namespace mlir {
-
-namespace oneflow {
-
-std::unique_ptr<mlir::Pass> createMapSCFToGPUPass();
-
-}  // namespace oneflow
-
-}  // namespace mlir
-
-#endif  // ONEFLOW_IR_INCLUDE_ONEFLOW_CONVERSION_SCFTOGPU_H_
diff --git a/oneflow/ir/include/OneFlow/OneFlowDialect.td b/oneflow/ir/include/OneFlow/OneFlowDialect.td
index 10bfca306c0..94e4d31ac5b 100644
--- a/oneflow/ir/include/OneFlow/OneFlowDialect.td
+++ b/oneflow/ir/include/OneFlow/OneFlowDialect.td
@@ -14,6 +14,7 @@ def OneFlow_Dialect : Dialect {
         "func::FuncDialect"
     ];
     let hasConstantMaterializer = 1;
+    let useDefaultTypePrinterParser = 1;
 }
 
 #endif // ONEFLOW_DIALECT
diff --git a/oneflow/ir/include/OneFlow/OneFlowOps.td b/oneflow/ir/include/OneFlow/OneFlowOps.td
index 405ff4499e0..c22a87143b3 100644
--- a/oneflow/ir/include/OneFlow/OneFlowOps.td
+++ b/oneflow/ir/include/OneFlow/OneFlowOps.td
@@ -288,12 +288,6 @@ def LowerOneFlowToTosaPass : Pass<"lower-oneflow-to-tosa", "ModuleOp"> {
   ];
 }
 
-def MapSCFToGPUPass : Pass<"gpu-greedy-parallel-loop-mapping", "ModuleOp"> {
-  let summary = "Greedily maps all parallel loops to gpu hardware ids";
-  let constructor = "mlir::oneflow::createMapSCFToGPUPass()";
-  let dependentDialects = ["scf::SCFDialect"];
-}
-
 def BufferHostRegisterPass : Pass<"buffer-host-register", "func::FuncOp"> {
   let summary = "";
   let constructor = "mlir::oneflow::createBufferHostRegisterPass()";
diff --git a/oneflow/ir/include/OneFlow/OneFlowPatterns.td b/oneflow/ir/include/OneFlow/OneFlowPatterns.td
index 5ea5d776f36..097d76c5fbb 100644
--- a/oneflow/ir/include/OneFlow/OneFlowPatterns.td
+++ b/oneflow/ir/include/OneFlow/OneFlowPatterns.td
@@ -5,7 +5,7 @@
 include "mlir/IR/PatternBase.td"
 include "OneFlow/OneFlowOps.td"
 include "mlir/Dialect/MemRef/IR/MemRefOps.td"
-include "mlir/Dialect/GPU/GPUOps.td"
+include "mlir/Dialect/GPU/IR/GPUOps.td"
 
 def IsNotNestedInJit: Constraint<CPred<"($0.getDefiningOp()->getParentOfType<::mlir::oneflow::Job>())">, "">;
 def IsScalarTensor: Constraint<CPred<"::mlir::oneflow::IsScalarTensor($0)">, "">;
diff --git a/oneflow/ir/include/OneFlow/Passes.h b/oneflow/ir/include/OneFlow/Passes.h
index 59c05c42d34..7c46d8f3e59 100644
--- a/oneflow/ir/include/OneFlow/Passes.h
+++ b/oneflow/ir/include/OneFlow/Passes.h
@@ -19,13 +19,12 @@ limitations under the License.
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "OneFlow/Conversion/OneFlowToTosa.h"
-#include "OneFlow/Conversion/SCFToGPU.h"
 #include "OneFlow/Transform/BufferHostRegister.h"
 #include "OneFlow/Transform/ConvertInferenceOp.h"
 #include "OneFlow/Transform/OutlineAndFuse.h"
diff --git a/oneflow/ir/install-llvm.cmake b/oneflow/ir/install-llvm.cmake
index e01bba1b36d..d25b1911634 100644
--- a/oneflow/ir/install-llvm.cmake
+++ b/oneflow/ir/install-llvm.cmake
@@ -10,6 +10,7 @@ if(NOT llvm_monorepo_POPULATED)
   execute_process(
     COMMAND
       "${CMAKE_COMMAND}" ${llvm_monorepo_SOURCE_DIR}/llvm
+      -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} # required by newer versions of LLVM
       -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}
       -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}
       -DCMAKE_CUDA_COMPILER_LAUNCHER=${CMAKE_CUDA_COMPILER_LAUNCHER}
diff --git a/oneflow/ir/lib/OneFlow/CMakeLists.txt b/oneflow/ir/lib/OneFlow/CMakeLists.txt
index cdc4ccbb55b..b8d0ce21d1f 100644
--- a/oneflow/ir/lib/OneFlow/CMakeLists.txt
+++ b/oneflow/ir/lib/OneFlow/CMakeLists.txt
@@ -1,7 +1,7 @@
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 message(STATUS "MLIR_DIALECT_LIBS: ${dialect_libs}")
 if(WITH_MLIR_CUDA_CODEGEN)
-  set(MLIR_GPU_LIBS MLIRSCFToGPU MLIRGPUToNVVMTransforms MLIRNVVMToLLVMIRTranslation)
+  set(MLIR_GPU_LIBS MLIRGPUToNVVMTransforms MLIRNVVMToLLVMIRTranslation)
 endif(WITH_MLIR_CUDA_CODEGEN)
 
 set(ONEFLOW_OP_GROUPS
@@ -24,7 +24,6 @@ oneflow_add_mlir_dialect_library(
   OneFlowSupport.cpp
   OneFlowOpFolders.cpp
   Conversion/OneFlowToTosa.cpp
-  Conversion/SCFToGPU.cpp
   Conversion/PTXToCubin.cpp
   Transform/BufferHostRegister.cpp
   Transform/OutlineAndFuse.cpp
@@ -43,6 +42,7 @@ oneflow_add_mlir_dialect_library(
   MLIRTosaToLinalg
   MLIRMemRefToLLVM
   MLIRLinalgToLLVM
+  MLIRSCFToGPU
   MLIRReconcileUnrealizedCasts
   ${MLIR_GPU_LIBS}
   MLIRIR
diff --git a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp
index ec92bb352ec..912ac6c3e0b 100644
--- a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp
+++ b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp
@@ -144,7 +144,7 @@ struct InputOpLowering final : public OpConversionPattern<InputOp> {
     // TODO: more choices to passing data between tosa and oneflow
     const auto newValues = op.input();
     const auto is_block_arg = newValues.dyn_cast<BlockArgument>() != nullptr;
-    if (!is_block_arg) op->emitError("input is not block arg");
+    if (!is_block_arg) { return op->emitError("input is not block arg"); }
     rewriter.replaceOp(op, newValues);
     return success();
   }
@@ -168,10 +168,10 @@ struct VariableOpLowering final : public OpConversionPattern<VariableOp> {
   LogicalResult matchAndRewrite(VariableOp op, OpAdaptor adaptor,
                                 ConversionPatternRewriter& rewriter) const override {
     const auto mgr = ::oneflow::Global<::oneflow::VariableTensorMgr>::Get();
-    if (!mgr) op->emitError("global variable tensor manager miss");
+    if (!mgr) { return op->emitError("global variable tensor manager miss"); }
 
     const auto tensor = mgr->Get(op.op_name().str());
-    if (!tensor) op->emitError("tensor is null");
+    if (!tensor) { return op->emitError("tensor is null"); }
     const auto value = support::TensorToDenseElementsAttr(tensor, rewriter.getContext());
     const auto output = op.output().getType();
 
@@ -204,7 +204,7 @@ struct VariableOpToConstLowering final : public OpConversionPattern<VariableOp>
 
       rewriter.replaceOpWithNewOp<tosa::ConstOp>(op, output, value);
     } else {
-      op->emitError(
+      return op->emitError(
           "OneFlow variable op lower to TOSA const op only support integer and float value now");
     }
 
@@ -327,7 +327,7 @@ struct MaxPool2DOpLowering final : public OpConversionPattern<MaxPool2DOp> {
       return RankedTensorType::get(ranked_type, shape_type.getElementType());
     };
     // TODO: support return indice
-    if (op.return_indices()) op->emitError("not support return indices now");
+    if (op.return_indices()) { return op->emitError("not support return indices now"); }
     auto stride_pairs = get_pair_int64_from_array(op.stride());
     auto kernel_pairs = get_pair_int64_from_array(op.kernel_size());
     auto pad_pairs = get_pair_int64_from_array(op.padding());
diff --git a/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp b/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp
index 35ea2bd8b0e..8c22c3055de 100644
--- a/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp
+++ b/oneflow/ir/lib/OneFlow/Conversion/PTXToCubin.cpp
@@ -17,7 +17,7 @@ limitations under the License.
 This file is ported from mlir/lib/Dialect/GPU/Transforms/SerializeToCubin.cpp
 */
 
-#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
 #ifdef WITH_MLIR_CUDA_CODEGEN
 
 #include "mlir/Pass/Pass.h"
diff --git a/oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp b/oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp
deleted file mode 100644
index 18cb2b4bd74..00000000000
--- a/oneflow/ir/lib/OneFlow/Conversion/SCFToGPU.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "OneFlow/OneFlowOps.h"
-#include <iostream>
-#include <string>
-#include "OneFlow/OneFlowDialect.h"
-#include "OneFlow/Passes.h"
-#include "llvm/ADT/STLExtras.h"
-#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
-#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
-#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
-#include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Linalg/Passes.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/SCF/Passes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Func/Transforms/Passes.h"
-#include "mlir/Dialect/Tensor/Transforms/Passes.h"
-#include "mlir/Dialect/Tosa/IR/TosaOps.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/OpImplementation.h"
-
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassManager.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/Passes.h"
-
-#include "mlir/Dialect/GPU/ParallelLoopMapper.h"
-#include "mlir/Pass/Pass.h"
-
-using namespace mlir;
-
-namespace {
-/// Simple pass for testing the mapping of parallel loops to hardware ids using
-/// a greedy mapping strategy.
-class GpuGreedyParallelLoopMappingPass
-    : public MapSCFToGPUPassBase<GpuGreedyParallelLoopMappingPass> {
-  void runOnOperation() override {
-    Operation* op = getOperation();
-    for (Region& region : op->getRegions()) greedilyMapParallelSCFToGPU(region);
-  }
-};
-}  // namespace
-
-namespace mlir {
-
-namespace oneflow {
-
-std::unique_ptr<Pass> createMapSCFToGPUPass() {
-  return std::make_unique<GpuGreedyParallelLoopMappingPass>();
-}
-
-}  // namespace oneflow
-
-}  // namespace mlir
diff --git a/oneflow/ir/lib/OneFlow/Passes.cpp b/oneflow/ir/lib/OneFlow/Passes.cpp
index 612e0a79a9a..b0f8c71bf57 100644
--- a/oneflow/ir/lib/OneFlow/Passes.cpp
+++ b/oneflow/ir/lib/OneFlow/Passes.cpp
@@ -62,7 +62,7 @@ limitations under the License.
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
-#include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h"
 #endif  // WITH_MLIR_CUDA_CODEGEN
 
@@ -769,9 +769,10 @@ LogicalResult LowerModuleToCUDALLVM(mlir::MLIRContext* context, ModuleOp module)
   AddLowerToLinalgMemRefPasses(pm);
   pm.addNestedPass<func::FuncOp>(
       createConvertLinalgToParallelLoopsPass());  // convert-linalg-to-parallel-loops
-  pm.addPass(createMapSCFToGPUPass());            // gpu-greedy-parallel-loop-mapping
-  pm.addPass(createParallelLoopToGpuPass());      // convert-parallel-loops-to-gpu
-  pm.addPass(createGpuKernelOutliningPass());     // gpu-kernel-outlining
+  pm.addNestedPass<func::FuncOp>(createGpuMapParallelLoopsPass());  // gpu-map-parallel-loops
+  pm.addPass(createParallelLoopToGpuPass());                        // convert-parallel-loops-to-gpu
+  pm.addPass(createGpuLauchSinkIndexComputationsPass());
+  pm.addPass(createGpuKernelOutliningPass());                      // gpu-kernel-outlining
   pm.addNestedPass<func::FuncOp>(createBufferHostRegisterPass());  // buffer-host-register
   pm.addPass(createCanonicalizerPass());                           // canonicalize
   // -pass-pipeline='gpu.module([PASS1][PASS2]...)'
@@ -781,6 +782,7 @@ LogicalResult LowerModuleToCUDALLVM(mlir::MLIRContext* context, ModuleOp module)
   pm.addNestedPass<gpu::GPUModuleOp>(createSerializeToCubinPass());      // out-of-tree-gpu-to-cubin
   pm.addNestedPass<func::FuncOp>(createGpuCopyArgPass());                // buffer-host-register
   pm.addPass(createGpuToLLVMConversionPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());  // reconcile-unrealized-casts
   if (enable_ir_printing) pm.enableIRPrinting();
   return pm.run(module);
 }
diff --git a/oneflow/ir/oneflow-extension/CMakeLists.txt b/oneflow/ir/oneflow-extension/CMakeLists.txt
index 8a0b21aa8f3..e7e2f1fbd18 100644
--- a/oneflow/ir/oneflow-extension/CMakeLists.txt
+++ b/oneflow/ir/oneflow-extension/CMakeLists.txt
@@ -11,7 +11,7 @@ oneflow_add_mlir_library(
   MLIRIR
   MLIRParser
   MLIRPass
-  MLIRSPIRV
+  MLIRSPIRVDialect
   MLIRTranslateLib
   MLIRSupport
   MLIROneFlow
diff --git a/oneflow/ir/oneflow-opt/oneflow-opt.cpp b/oneflow/ir/oneflow-opt/oneflow-opt.cpp
index 0496d741603..f8b35f58d59 100644
--- a/oneflow/ir/oneflow-opt/oneflow-opt.cpp
+++ b/oneflow/ir/oneflow-opt/oneflow-opt.cpp
@@ -47,7 +47,7 @@ int32_t main(int32_t argc, char** argv) {
   mlir::registerAllPasses();
   mlir::registerTestOneFlowTraitsPass();
   mlir::registerLowerOneFlowToTosaPassPass();
-  mlir::registerMapSCFToGPUPassPass();
+  mlir::registerGpuMapParallelLoopsPassPass();
   mlir::registerBufferHostRegisterPassPass();
   mlir::registerGpuCopyArgPassPass();
 #ifdef WITH_MLIR_CUDA_CODEGEN
diff --git a/oneflow/ir/oneflow-runner/CMakeLists.txt b/oneflow/ir/oneflow-runner/CMakeLists.txt
index d594362192b..9c5a601af5f 100644
--- a/oneflow/ir/oneflow-runner/CMakeLists.txt
+++ b/oneflow/ir/oneflow-runner/CMakeLists.txt
@@ -16,7 +16,7 @@ target_link_libraries(
           MLIRExecutionEngine
           MLIRIR
           MLIRJitRunner
-          MLIRLLVMIR
+          MLIRLLVMIRTransforms
           MLIRLLVMToLLVMIRTranslation
           MLIRToLLVMIRTranslationRegistration
           MLIRParser
diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt b/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt
index 5ce5c097953..539021f8f54 100644
--- a/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt
+++ b/oneflow/ir/oneflow-translate/lib/OneFlow/CMakeLists.txt
@@ -14,7 +14,7 @@ oneflow_add_mlir_library(
   MLIRIR
   MLIRParser
   MLIRPass
-  MLIRSPIRV
+  MLIRSPIRVDialect
   MLIRTranslateLib
   MLIRSupport
   MLIROneFlow
diff --git a/oneflow/ir/test/Frontend/test_iree_resnet.py b/oneflow/ir/test/Frontend/test_iree_resnet.py
deleted file mode 100644
index c538a66b575..00000000000
--- a/oneflow/ir/test/Frontend/test_iree_resnet.py
+++ /dev/null
@@ -1,108 +0,0 @@
-"""
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-# RUN: python3 %s
-
-from oneflow_iree.compiler import Runner
-from flowvision.models import resnet50
-import oneflow as flow
-import oneflow.unittest
-import unittest
-import os
-import numpy as np
-import time
-
-os.environ["ONEFLOW_MLIR_ENABLE_ROUND_TRIP"] = "1"
-os.environ["ONEFLOW_MLIR_ENABLE_CODEGEN_FUSERS"] = "1"
-
-
-def _test_iree_resnet_cpu(test_case):
-    model = resnet50(pretrained=True)
-    model.eval()
-
-    class GraphModuleForIree(flow.nn.Graph):
-        def __init__(self):
-            super().__init__()
-            self.model = model
-
-        def build(self, x):
-            return self.model(x)
-
-    class GraphModuleForOFMLIR(flow.nn.Graph):
-        def __init__(self):
-            super().__init__()
-            self.model = model
-
-        def build(self, x):
-            return self.model(x)
-
-    func = Runner(GraphModuleForIree, return_numpy=True)
-    input = flow.ones([1, 3, 224, 224])
-    f = GraphModuleForOFMLIR()
-    for iter in range(2):
-        iree_output = func(input)
-        graph_output = f(input)
-        graph_output = graph_output.cpu().detach().numpy()
-        # the rtol accumulate layer by layer
-        test_case.assertTrue(
-            np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3)
-        )
-
-
-def _test_iree_resnet_cuda(test_case):
-    model = resnet50(pretrained=True).cuda()
-    model.eval()
-
-    class GraphModuleForIree(flow.nn.Graph):
-        def __init__(self):
-            super().__init__()
-            self.model = model
-
-        def build(self, x):
-            return self.model(x)
-
-    class GraphModuleForOFMLIR(flow.nn.Graph):
-        def __init__(self):
-            super().__init__()
-            self.model = model
-
-        def build(self, x):
-            return self.model(x)
-
-    func = Runner(GraphModuleForIree, return_numpy=True)
-    input = flow.ones([1, 3, 224, 224]).cuda()
-    f = GraphModuleForOFMLIR()
-    for iter in range(2):
-        iree_output = func(input)
-        graph_output = f(input)
-        graph_output = graph_output.cpu().detach().numpy()
-        # the rtol accumulate layer by layer
-        test_case.assertTrue(
-            np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3)
-        )
-
-
-@flow.unittest.skip_unless_1n1d()
-class TestIreeResnet(oneflow.unittest.TestCase):
-    def test_iree_resnet_cpu(test_case):
-        _test_iree_resnet_cpu(test_case)
-
-    @unittest.skipUnless(oneflow.sysconfig.with_cuda(), "only test cpu cases")
-    def test_iree_resnet_cuda(test_case):
-        _test_iree_resnet_cuda(test_case)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/oneflow/ir/test/Frontend/test_iree_runner.py b/oneflow/ir/test/Frontend/test_iree_runner.py
deleted file mode 100644
index a0caa90fecd..00000000000
--- a/oneflow/ir/test/Frontend/test_iree_runner.py
+++ /dev/null
@@ -1,71 +0,0 @@
-"""
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-# RUN: python3 %s
-
-from oneflow_iree.compiler import Runner
-import oneflow as flow
-import oneflow.unittest
-import unittest
-import numpy as np
-
-
-class RELU(flow.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.relu = flow.nn.ReLU()
-
-    def forward(self, x):
-        return self.relu(x)
-
-
-class GraphModule(flow.nn.Graph):
-    def __init__(self):
-        super().__init__()
-        self.fw = RELU()
-
-    def build(self, x):
-        return self.fw(x)
-
-
-def _test_check_iree_runner(test_case):
-    func = Runner(GraphModule, return_numpy=True).cuda()
-    # run on iree cuda backend
-    input = flow.Tensor([-1.0, 1.0])
-    output = func(input)
-    test_case.assertTrue(np.allclose(output, [0.0, 1.0]))
-    # change input shape
-    input = flow.Tensor([-1.0, 1.0, -1])
-    output = func(input)
-    test_case.assertTrue(np.allclose(output, [0.0, 1.0, 0.0]))
-    # change on iree cpu backend
-    func = func.cpu()
-    input = flow.Tensor([-1.0, 0.0, 1.0])
-    output = func(input)
-    test_case.assertTrue(np.allclose(output, [0.0, 0.0, 1.0]))
-    # change input shape
-    input = flow.Tensor([-1, 1.0])
-    output = func(input)
-    test_case.assertTrue(np.allclose(output, [0.0, 1.0]))
-
-
-@flow.unittest.skip_unless_1n1d()
-class TestCheckIreeRunner(oneflow.unittest.TestCase):
-    def test_check_iree_runner(test_case):
-        _test_check_iree_runner(test_case)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir
index 34ee5b499dc..3115bad55c6 100644
--- a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir
+++ b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir
@@ -4,7 +4,7 @@
 // RUN: -tensor-bufferize -func-bufferize -buffer-results-to-out-params \
 // RUN: -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm \
 // RUN: -convert-func-to-llvm -convert-memref-to-llvm -reconcile-unrealized-casts --print-after-all \
-// RUN: | oneflow-translate -mlir-to-llvmir | clang -x ir - -c -o test.o
+// RUN: | oneflow-translate -mlir-to-llvmir
 
 builtin.module {
   func.func @Graph_0(%arg0: tensor<2xf32>) -> tensor<2xf32> {
diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir b/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir
index a6a7db89b1b..9eaf154ac6f 100644
--- a/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir
+++ b/oneflow/ir/test/OneFlow/cuda_code_gen/fuse_cast_scale.mlir
@@ -1,4 +1,4 @@
-// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-greedy-parallel-loop-mapping \
+// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-map-parallel-loops \
 // RUN: -convert-parallel-loops-to-gpu -gpu-kernel-outlining -buffer-host-register -canonicalize \
 // RUN: -pass-pipeline='gpu.module(strip-debuginfo,lower-affine,convert-gpu-to-nvvm,out-of-tree-gpu-to-cubin)' \
 // RUN: --func-bufferize -buffer-results-to-out-params -gpu-copy-arg --tensor-bufferize \
@@ -12,7 +12,7 @@
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext \
 // RUN:   --entry-point-result=void
 
-// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-greedy-parallel-loop-mapping \
+// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-map-parallel-loops \
 // RUN: -convert-parallel-loops-to-gpu -gpu-kernel-outlining -buffer-host-register -canonicalize \
 // RUN: -pass-pipeline='gpu.module(strip-debuginfo,lower-affine,convert-gpu-to-nvvm,out-of-tree-gpu-to-cubin)' \
 // RUN: --func-bufferize --tensor-bufferize \
@@ -25,13 +25,13 @@
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
 // RUN:   --entry-point-result=void
 
-func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> {
+func.func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> {
   %0 = "oneflow.cast"(%arg0) {device_name = ["@0:0"], device_tag = "cuda", dtype = 2 : i32, hierarchy = [1], op_name = "Cast_289", output_lbns = ["Cast_289/out_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xi64>) -> tensor<3x3xf32>
   %1 = "oneflow.scalar_mul_by_tensor"(%0, %arg1) {device_name = ["@0:0"], device_tag = "cuda", hierarchy = [1], op_name = "ScalarMulByTensor_290", output_lbns = ["ScalarMulByTensor_290/y_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xf32>, tensor<1xf32>) -> tensor<3x3xf32>
   return %1 : tensor<3x3xf32>
 }
 
-func @main()  {
+func.func @main()  {
   %a_data = memref.alloc() : memref<3x3xi64>
   %b_data = memref.alloc() : memref<1xf32>
   %a = bufferization.to_tensor %a_data : memref<3x3xi64>
@@ -40,15 +40,15 @@ func @main()  {
   %c = call @Cast_289__FUSE__ScalarMulByTensor_290(%a, %b) : (tensor<3x3xi64>, tensor<1xf32>) -> (tensor<3x3xf32>)
   %c_buffer = bufferization.to_memref %c : memref<3x3xf32>
   %cast_c_buffer = memref.cast %c_buffer : memref<3x3xf32> to memref<*xf32>
-  call @print_memref_f32(%cast_c_buffer) : (memref<*xf32>) -> ()
+  call @printMemrefF32(%cast_c_buffer) : (memref<*xf32>) -> ()
   // TODO: use real number
   // CHECK: [3, 3]
 
   %cast_a_data = memref.cast %a_data : memref<3x3xi64> to memref<*xi64>
   %cast_b_data = memref.cast %b_data : memref<1xf32> to memref<*xf32>
-  call @print_memref_i64(%cast_a_data) : (memref<*xi64>) -> ()
-  call @print_memref_f32(%cast_b_data) : (memref<*xf32>) -> ()
+  call @printMemrefI64(%cast_a_data) : (memref<*xi64>) -> ()
+  call @printMemrefF32(%cast_b_data) : (memref<*xf32>) -> ()
   return
 }
-func private @print_memref_f32(memref<*xf32>)
-func private @print_memref_i64(memref<*xi64>)
+func.func private @printMemrefF32(memref<*xf32>)
+func.func private @printMemrefI64(memref<*xi64>)
diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir
index 3371acad706..f63e65b7431 100644
--- a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir
+++ b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_copy_arg.mlir
@@ -1,8 +1,8 @@
-// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-greedy-parallel-loop-mapping \
+// RUN: oneflow-opt %s -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -convert-linalg-to-parallel-loops -gpu-map-parallel-loops \
 // RUN: -convert-parallel-loops-to-gpu -gpu-kernel-outlining -buffer-host-register -canonicalize \
 // RUN: -pass-pipeline='gpu.module(strip-debuginfo,lower-affine,convert-gpu-to-nvvm,out-of-tree-gpu-to-cubin)' \
 // RUN: --func-bufferize -buffer-results-to-out-params -gpu-copy-arg
-func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> {
+func.func @Cast_289__FUSE__ScalarMulByTensor_290(%arg0: tensor<3x3xi64>, %arg1: tensor<1xf32>) -> tensor<3x3xf32> {
   %0 = "oneflow.cast"(%arg0) {device_name = ["@0:0"], device_tag = "cuda", dtype = 2 : i32, hierarchy = [1], op_name = "Cast_289", output_lbns = ["Cast_289/out_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xi64>) -> tensor<3x3xf32>
   %1 = "oneflow.scalar_mul_by_tensor"(%0, %arg1) {device_name = ["@0:0"], device_tag = "cuda", hierarchy = [1], op_name = "ScalarMulByTensor_290", output_lbns = ["ScalarMulByTensor_290/y_0"], scope_symbol_id = 4611686018427478014 : i64} : (tensor<3x3xf32>, tensor<1xf32>) -> tensor<3x3xf32>
   return %1 : tensor<3x3xf32>
diff --git a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir
index c5aac6f8e94..6f3d14cf212 100644
--- a/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir
+++ b/oneflow/ir/test/OneFlow/cuda_code_gen/gpu_runner.mlir
@@ -8,7 +8,7 @@
 // RUN:   --entry-point-result=void \
 // RUN: | FileCheck %s
 // CHECK: [{{(35, ){34}35}}]
-func @main() {
+func.func @main() {
   %arg = memref.alloc() : memref<35xf32>
   %dst = memref.cast %arg : memref<35xf32> to memref<?xf32>
   %one = arith.constant 1 : index
@@ -28,8 +28,8 @@ func @main() {
     memref.store %res, %dst[%tx] : memref<?xf32>
     gpu.terminator
   }
-  call @print_memref_f32(%cast_dst) : (memref<*xf32>) -> ()
+  call @printMemrefF32(%cast_dst) : (memref<*xf32>) -> ()
   return
 }
 
-func private @print_memref_f32(memref<*xf32>)
+func.func private @printMemrefF32(memref<*xf32>)
diff --git a/oneflow/ir/test/OneFlow/lower_to_tosa.mlir b/oneflow/ir/test/OneFlow/lower_to_tosa.mlir
index df5f91c3129..f65ed33275c 100644
--- a/oneflow/ir/test/OneFlow/lower_to_tosa.mlir
+++ b/oneflow/ir/test/OneFlow/lower_to_tosa.mlir
@@ -1,8 +1,7 @@
 // RUN: oneflow-opt -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops -linalg-bufferize -tensor-bufferize -func-bufferize -buffer-results-to-out-params -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -convert-func-to-llvm -convert-memref-to-llvm -reconcile-unrealized-casts --print-after-all %s
-// RUN: oneflow-opt -lower-oneflow-to-tosa -pass-pipeline="func.func(tosa-to-linalg)" -cse --linalg-fuse-elementwise-ops  -linalg-bufferize -tensor-bufferize -func-bufferize -buffer-results-to-out-params  -finalizing-bufferize -canonicalize %s
 
 module  {
-  func @Cast_1__FUSE__ScalarMulByTensor_2(%arg0: tensor<96x96xi64>, %arg1: tensor<1xf32>) -> tensor<96x96xf32> {
+  func.func @Cast_1__FUSE__ScalarMulByTensor_2(%arg0: tensor<96x96xi64>, %arg1: tensor<1xf32>) -> tensor<96x96xf32> {
     %0 = "oneflow.cast"(%arg0) {device_name = ["0:0"], device_tag = "cpu", dtype = 2 : i32, hierarchy = [1], op_name = "Cast_1", op_type_name = "cast", scope_symbol_id = 4611686018427416574 : i64} : (tensor<96x96xi64>) -> tensor<96x96xf32>
     %1 = "oneflow.scalar_mul_by_tensor"(%0, %arg1) {device_name = ["0:0"], device_tag = "cpu", hierarchy = [1], op_name = "ScalarMulByTensor_2", op_type_name = "scalar_mul_by_tensor", scope_symbol_id = 4611686018427416574 : i64} : (tensor<96x96xf32>, tensor<1xf32>) -> tensor<96x96xf32>
     return %1 : tensor<96x96xf32>
diff --git a/oneflow/ir/test/OneFlow/traits.mlir b/oneflow/ir/test/OneFlow/traits.mlir
index ed8eb3a5678..55506828b84 100644
--- a/oneflow/ir/test/OneFlow/traits.mlir
+++ b/oneflow/ir/test/OneFlow/traits.mlir
@@ -1,17 +1,17 @@
 // RUN: oneflow-opt -test-oneflow-trait-folder %s | FileCheck %s
 
-// CHECK-LABEL: func @testSingleIdempotent
+// CHECK-LABEL: func.func @testSingleIdempotent
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testSingleIdempotent(%arg0 : tensor<f32>) -> tensor<f32> {
+func.func @testSingleIdempotent(%arg0 : tensor<f32>) -> tensor<f32> {
   // CHECK: [[IDEMPOTENT:%.+]] = "oneflow.relu"([[ARG0]])
   %0 = "oneflow.relu"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   // CHECK: return [[IDEMPOTENT]]
   return %0: tensor<f32>
 }
 
-// CHECK-LABEL: func @testDoubleIdempotent
+// CHECK-LABEL: func.func @testDoubleIdempotent
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testDoubleIdempotent(%arg0: tensor<f32>) -> tensor<f32> {
+func.func @testDoubleIdempotent(%arg0: tensor<f32>) -> tensor<f32> {
   // CHECK: [[IDEMPOTENT:%.+]] = "oneflow.relu"([[ARG0]])
   %0 = "oneflow.relu"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   %1 = "oneflow.relu"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
@@ -19,9 +19,9 @@ func @testDoubleIdempotent(%arg0: tensor<f32>) -> tensor<f32> {
   return %1: tensor<f32>
 }
 
-// CHECK-LABEL: func @testTripleIdempotent
+// CHECK-LABEL: func.func @testTripleIdempotent
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testTripleIdempotent(%arg0: tensor<f32>) -> tensor<f32> {
+func.func @testTripleIdempotent(%arg0: tensor<f32>) -> tensor<f32> {
   // CHECK: [[IDEMPOTENT:%.+]] = "oneflow.relu"([[ARG0]])
   %0 = "oneflow.relu"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   %1 = "oneflow.relu"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
@@ -30,18 +30,18 @@ func @testTripleIdempotent(%arg0: tensor<f32>) -> tensor<f32> {
   return %2: tensor<f32>
 }
 
-// CHECK-LABEL: func @testDoubleInvolution
+// CHECK-LABEL: func.func @testDoubleInvolution
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testDoubleInvolution(%arg0: tensor<f32>) -> tensor<f32> {
+func.func @testDoubleInvolution(%arg0: tensor<f32>) -> tensor<f32> {
   %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   %1 = "oneflow.negative"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   // CHECK: return [[ARG0]]
   return %1: tensor<f32>
 }
 
-// CHECK-LABEL: func @testTripleInvolution
+// CHECK-LABEL: func.func @testTripleInvolution
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testTripleInvolution(%arg0: tensor<f32>) -> tensor<f32> {
+func.func @testTripleInvolution(%arg0: tensor<f32>) -> tensor<f32> {
   // CHECK: [[INVOLUTION:%.+]] = "oneflow.negative"([[ARG0]])
   %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   %1 = "oneflow.negative"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
@@ -50,9 +50,9 @@ func @testTripleInvolution(%arg0: tensor<f32>) -> tensor<f32> {
   return %2: tensor<f32>
 }
 
-// CHECK-LABEL: func @testFailedInvolutionFoldDueToDifferentPlacement
+// CHECK-LABEL: func.func @testFailedInvolutionFoldDueToDifferentPlacement
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testFailedInvolutionFoldDueToDifferentPlacement(%arg0: tensor<f32>) -> tensor<f32> {
+func.func @testFailedInvolutionFoldDueToDifferentPlacement(%arg0: tensor<f32>) -> tensor<f32> {
   %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   %1 = "oneflow.negative"(%0) {device_tag = "cuda", op_name = "Relu_2", op_type_name = "relu", device_name = ["1:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   // CHECK: [[INVOLUTION:%.+]] = "oneflow.negative"(%1)
@@ -61,9 +61,9 @@ func @testFailedInvolutionFoldDueToDifferentPlacement(%arg0: tensor<f32>) -> ten
   return %2: tensor<f32>
 }
 
-// CHECK-LABEL: func @testFailedInvolutionFoldDueToDifferentDevice
+// CHECK-LABEL: func.func @testFailedInvolutionFoldDueToDifferentDevice
 // CHECK-SAME:  ([[ARG0:%.+]]: tensor<f32>)
-func @testFailedInvolutionFoldDueToDifferentDevice(%arg0: tensor<f32>) -> tensor<f32> {
+func.func @testFailedInvolutionFoldDueToDifferentDevice(%arg0: tensor<f32>) -> tensor<f32> {
   %0 = "oneflow.negative"(%arg0) {device_tag = "cuda", op_name = "Relu_1", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   %1 = "oneflow.negative"(%0) {device_tag = "cpu", op_name = "Relu_2", op_type_name = "relu", device_name = ["0:0-0"], scope_symbol_id = 4611686018427420670 : i64} : (tensor<f32>) -> tensor<f32>
   // CHECK: [[INVOLUTION:%.+]] = "oneflow.negative"(%1)
diff --git a/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py b/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py
index 88d7c307c1a..8202c49ae89 100644
--- a/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py
+++ b/oneflow/ir/test/OneFlow/with_cuda/test_conv_bn_auto_nhwc.py
@@ -52,7 +52,7 @@ def build(self, *input):
     lazy_res = graph(data)
 
     test_case.assertTrue(
-        np.allclose(eager_res.numpy(), lazy_res.numpy(), rtol=1e-4, atol=1e-4)
+        np.allclose(eager_res.numpy(), lazy_res.numpy(), rtol=1e-2, atol=1e-2)
     )
 
 
diff --git a/python/oneflow/nn/graph/graph.py b/python/oneflow/nn/graph/graph.py
index d6583810e5d..1952cea3699 100644
--- a/python/oneflow/nn/graph/graph.py
+++ b/python/oneflow/nn/graph/graph.py
@@ -531,7 +531,7 @@ def _shallow_repr(self):
         return shallow_repr
 
     def _ops_repr(self):
-        r"""Generate operators' string representation of this graph 
+        r"""Generate operators' string representation of this graph
         """
         if self._is_compiled and self._compiled_graph_proto is not None:
             module_conf = self._compiled_graph_proto.module_name2module_conf[self.name]
@@ -1360,6 +1360,13 @@ def __getattr__(self, name: str):
         )
 
     def __del__(self):
+        # Ensure vm has finished running this graph.
+        if self._session._env.is_shutting_down():
+            # Once Python is shutting down, it is no longer safe to call oneflow._oneflow_internal.eager.
+            # The shutdown path already performs a sync in SwitchToShuttingDownPhase,
+            # so it is safe to skip the sync here.
+            return
+        oneflow._oneflow_internal.eager.Sync()
         current_env_enable_mlir_inference_opt = os.getenv(
             "ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"
         )
@@ -1369,13 +1376,6 @@ def __del__(self):
             os.environ[
                 "ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"
             ] = self.env_enable_mlir_inference_opt
-        # Ensure vm has finished running this graph.
-        if self._session._env.is_shutting_down():
-            # After python shutting down, it's not safe to call oneflow._oneflow_internal.eager.
-            # But shutting down will do sync in SwitchToShuttingDownPhase.
-            # So it's safe to skip sync here.
-            return
-        oneflow._oneflow_internal.eager.Sync()
         oneflow._oneflow_internal.ClearVariableTensorMgr()
 
     def __ensure_input_tensors_contiguous(self, *args, **kwargs):
diff --git a/python/oneflow/test/graph/test_comb2d.py b/python/oneflow/test/graph/test_comb2d.py
index aac2a5e12a5..7b746017bdb 100644
--- a/python/oneflow/test/graph/test_comb2d.py
+++ b/python/oneflow/test/graph/test_comb2d.py
@@ -24,7 +24,7 @@
 import oneflow.unittest
 
 
-class TestModule(nn.Module):
+class _TestModule(nn.Module):
     def forward(self, x):
         sbp_1ds = [
             flow.sbp.broadcast,
@@ -62,7 +62,7 @@ def build(self, x):
 @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
 class TestLazyAllSbpCombinationTesting(flow.unittest.TestCase):
     def test_lazy_boxing_2d_all_combination(test_case):
-        model = TestModule()
+        model = _TestModule()
         graph = _TestGraph(model)
 
         x = flow.ones(
diff --git a/python/oneflow/test/graph/test_graph_ofrecord_reader.py b/python/oneflow/test/graph/test_graph_ofrecord_reader.py
index 16b4f161e13..35dcd4d376c 100644
--- a/python/oneflow/test/graph/test_graph_ofrecord_reader.py
+++ b/python/oneflow/test/graph/test_graph_ofrecord_reader.py
@@ -90,9 +90,6 @@ def build(self):
         reader_g = GraphReader()
         image, label = reader_g()
 
-        print(image)
-        print(label)
-
 
 if __name__ == "__main__":
     unittest.main()